diff --git a/cmake/mlu.cmake b/cmake/mlu.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..b73ab16462b83e952807289d511fdb95ad74c6cd
--- /dev/null
+++ b/cmake/mlu.cmake
@@ -0,0 +1,61 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(NOT LITE_WITH_MLU)
+  return()
+endif()
+
+if(NOT DEFINED NEUWARE_HOME)
+  set(NEUWARE_HOME $ENV{NEUWARE_HOME})
+  if(NOT NEUWARE_HOME)
+    message(FATAL_ERROR "Must set NEUWARE_HOME or the env var NEUWARE_HOME when LITE_WITH_MLU=ON")
+  endif()
+endif()
+
+message(STATUS "LITE_WITH_MLU: ${LITE_WITH_MLU}")
+find_path(CNML_INC NAMES cnml.h
+  PATHS ${NEUWARE_HOME}/include NO_DEFAULT_PATH)
+if(NOT CNML_INC)
+  message(FATAL_ERROR "Cannot find cnml.h in ${NEUWARE_HOME}/include")
+endif()
+
+find_path(CNRT_INC NAMES cnrt.h
+  PATHS ${NEUWARE_HOME}/include NO_DEFAULT_PATH)
+if(NOT CNRT_INC)
+  message(FATAL_ERROR "Cannot find cnrt.h in ${NEUWARE_HOME}/include")
+endif()
+
+include_directories("${NEUWARE_HOME}/include")
+
+find_library(CNML_LIB_FILE NAMES cnml
+  PATHS ${NEUWARE_HOME}/lib64)
+
+if(NOT CNML_LIB_FILE)
+  message(FATAL_ERROR "Cannot find the CNML library in ${NEUWARE_HOME}/lib64")
+else()
+  message(STATUS "Found CNML library: ${CNML_LIB_FILE}")
+  add_library(cnml_lib SHARED IMPORTED GLOBAL)
+  set_property(TARGET cnml_lib PROPERTY IMPORTED_LOCATION ${CNML_LIB_FILE})
+endif()
+
+find_library(CNRT_LIB_FILE NAMES cnrt
+  PATHS ${NEUWARE_HOME}/lib64)
+
+if(NOT CNRT_LIB_FILE)
+  message(FATAL_ERROR "Cannot find the CNRT library in ${NEUWARE_HOME}/lib64")
+else()
+  message(STATUS "Found CNRT library: ${CNRT_LIB_FILE}")
+  add_library(cnrt_lib SHARED IMPORTED GLOBAL)
+  set_property(TARGET cnrt_lib PROPERTY IMPORTED_LOCATION ${CNRT_LIB_FILE})
+endif()
diff --git a/lite/backends/mlu/CMakeLists.txt b/lite/backends/mlu/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..29c90b422044be4e6a7aa9f4a8da45018a41f11a
--- /dev/null
+++ b/lite/backends/mlu/CMakeLists.txt
@@ -0,0 +1,7 @@
+if(NOT LITE_WITH_MLU)
+  return()
+endif()
+
+message(STATUS "Lite with MLU backend")
+
+lite_cc_library(target_wrapper_mlu SRCS target_wrapper.cc DEPS cnml_lib cnrt_lib)
diff --git a/lite/backends/mlu/mlu_utils.h b/lite/backends/mlu/mlu_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..08dd355e8100a48363704168d264f6116ae58a79
--- /dev/null
+++ b/lite/backends/mlu/mlu_utils.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <cnml.h>
+#include <cnrt.h>
+#include <glog/logging.h>
+
+/*
+ * This file contains some MLU specific utils.
+ */
+
+#define CNRT_CALL(msg)                                      \
+  CHECK_EQ(static_cast<cnrtRet_t>(msg), CNRT_RET_SUCCESS)   \
+      << (msg)                                              \
+      << " MLU CNRT: " << cnrtGetErrorStr(static_cast<cnrtRet_t>(msg))
+
+#define CNML_CALL(msg)                                            \
+  CHECK_EQ(static_cast<cnmlStatus_t>(msg), CNML_STATUS_SUCCESS)   \
+      << (msg) << " MLU CNML: "                                   \
+      << ::paddle::lite::mlu::CnmlErrorInfo(static_cast<int>(msg))
+
+namespace paddle {
+namespace lite {
+namespace mlu {
+
+static const char* CnmlErrorInfo(int error) {
+  switch (error) {
+#define LITE_CNML_ERROR_INFO(xx) \
+  case xx:                       \
+    return #xx;                  \
+    break;
+    LITE_CNML_ERROR_INFO(CNML_STATUS_NODEVICE);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_SUCCESS);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_DOMAINERR);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_INVALIDARG);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_LENGTHERR);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_OUTOFRANGE);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_RANGEERR);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_OVERFLOWERR);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_UNDERFLOWERR);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_INVALIDPARAM);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_BADALLOC);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_BADTYPEID);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_BADCAST);
+    LITE_CNML_ERROR_INFO(CNML_STATUS_UNSUPPORT);
+#undef LITE_CNML_ERROR_INFO
+    default:
+      return "unknown error";
+      break;
+  }
+}
+
+}  // namespace mlu
+}  // namespace lite
+}  // namespace paddle
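Since these checking macros are used throughout the backend, a minimal usage sketch follows (assuming a CNRT v4-style API; `cnrtInit`, `cnrtGetDeviceHandle` and `cnrtSetCurrentDevice` are the calls Paddle-Lite's MLU runtime uses elsewhere, but they are not part of this patch):

```cpp
// Sketch: initializing an MLU device with the CNRT_CALL macro above.
#include "lite/backends/mlu/mlu_utils.h"

void InitMluDevice(int dev_id) {
  CNRT_CALL(cnrtInit(0)) << " cnrt init failed";
  cnrtDev_t dev;
  CNRT_CALL(cnrtGetDeviceHandle(&dev, dev_id)) << " get device handle failed";
  CNRT_CALL(cnrtSetCurrentDevice(dev)) << " set current device failed";
}
```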
diff --git a/lite/backends/mlu/target_wrapper.cc b/lite/backends/mlu/target_wrapper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2385f69246a163830e0df855082d728da2743e02
--- /dev/null
+++ b/lite/backends/mlu/target_wrapper.cc
@@ -0,0 +1,91 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/backends/mlu/target_wrapper.h"
+
+#include <memory>
+
+#include "lite/backends/mlu/mlu_utils.h"
+
+namespace paddle {
+namespace lite {
+namespace mlu {
+
+void cnrtMemcpyHtoD(void* dst, const void* src, size_t size) {
+  CNRT_CALL(cnrtMemcpy(
+      dst, const_cast<void*>(src), size, CNRT_MEM_TRANS_DIR_HOST2DEV))
+      << " cnrt memcpy htod failed";
+}
+
+void cnrtMemcpyDtoH(void* dst, const void* src, size_t size) {
+  CNRT_CALL(cnrtMemcpy(
+      dst, const_cast<void*>(src), size, CNRT_MEM_TRANS_DIR_DEV2HOST))
+      << " cnrt memcpy dtoh failed";
+}
+
+}  // namespace mlu
+
+size_t TargetWrapperMlu::num_devices() {
+  uint32_t dev_count = 0;
+  CNRT_CALL(cnrtGetDeviceCount(&dev_count)) << " cnrt get device count failed";
+  LOG(INFO) << "Current MLU device count: " << dev_count;
+  return dev_count;
+}
+
+void* TargetWrapperMlu::Malloc(size_t size) {
+  void* ptr{};
+  CNRT_CALL(cnrtMalloc(&ptr, size)) << " cnrt malloc failed";
+  // LOG(INFO) << "Malloc mlu ptr: " << ptr << " with size: " << size;
+  return ptr;
+}
+
+void TargetWrapperMlu::Free(void* ptr) {
+  CNRT_CALL(cnrtFree(ptr)) << " cnrt free failed";
+}
+
+void TargetWrapperMlu::MemcpySync(void* dst,
+                                  const void* src,
+                                  size_t size,
+                                  IoDirection dir) {
+  // LOG(INFO) << "dst: " << dst << " src: " << src << " size: " << size
+  //           << " dir: " << static_cast<int>(dir);
+  switch (dir) {
+    case IoDirection::DtoD: {
+      // No direct device-to-device copy is used here; stage through a
+      // temporary host buffer instead.
+      std::unique_ptr<char[]> cpu_tmp_ptr(new char[size]);
+      mlu::cnrtMemcpyDtoH(cpu_tmp_ptr.get(), src, size);
+      mlu::cnrtMemcpyHtoD(dst, cpu_tmp_ptr.get(), size);
+      break;
+    }
+    case IoDirection::HtoD:
+      mlu::cnrtMemcpyHtoD(dst, src, size);
+      break;
+    case IoDirection::DtoH:
+      mlu::cnrtMemcpyDtoH(dst, src, size);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported IoDirection: " << static_cast<int>(dir);
+  }
+}
+
+// void TargetWrapperMlu::MemcpyAsync(void* dst,
+//                                    const void* src,
+//                                    size_t size,
+//                                    IoDirection dir,
+//                                    const queue_t& queue) {
+//   LOG(WARNING) << "MLU does not support MemcpyAsync yet; using MemcpySync.";
+//   MemcpySync(dst, src, size, dir);
+// }
+
+}  // namespace lite
+}  // namespace paddle
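As a quick sanity check of the wrapper's contract (a sketch, not part of the patch; it only uses APIs defined in this diff), a host-to-device-to-host round trip looks like:

```cpp
#include <vector>
#include "lite/backends/mlu/target_wrapper.h"

using paddle::lite::IoDirection;
using paddle::lite::TargetWrapperMlu;

void RoundTripExample() {
  std::vector<float> src(1024, 1.0f), dst(1024, 0.0f);
  const size_t bytes = src.size() * sizeof(float);
  void* dev = TargetWrapperMlu::Malloc(bytes);
  TargetWrapperMlu::MemcpySync(dev, src.data(), bytes, IoDirection::HtoD);
  TargetWrapperMlu::MemcpySync(dst.data(), dev, bytes, IoDirection::DtoH);
  TargetWrapperMlu::Free(dev);  // dst now holds the same values as src
}
```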
diff --git a/lite/backends/mlu/target_wrapper.h b/lite/backends/mlu/target_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d9e10806f78e56f50b04d408dab219c923456fc
--- /dev/null
+++ b/lite/backends/mlu/target_wrapper.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "lite/backends/mlu/mlu_utils.h"
+#include "lite/core/target_wrapper.h"
+
+namespace paddle {
+namespace lite {
+
+using TargetWrapperMlu = TargetWrapper<TARGET(kMLU)>;
+
+template <>
+class TargetWrapper<TARGET(kMLU)> {
+ public:
+  using queue_t = cnrtQueue_t;
+
+  static size_t num_devices();
+  static size_t maximum_queue() { return 0; }  // TODO(zhangshijin): figure this out.
+
+  static size_t GetCurDevice() { return 0; }
+
+  static void CreateQueue(queue_t* queue) {}
+  static void DestroyQueue(const queue_t& queue) {}
+
+  static void QueueSync(const queue_t& queue) {}
+
+  static void* Malloc(size_t size);
+  static void Free(void* ptr);
+
+  static void MemcpySync(void* dst,
+                         const void* src,
+                         size_t size,
+                         IoDirection dir);
+  // static void MemcpyAsync(void* dst,
+  //                         const void* src,
+  //                         size_t size,
+  //                         IoDirection dir,
+  //                         const queue_t& queue);
+};
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d6240888d0806486f478511ef81ba8179b46ab43
--- /dev/null
+++ b/lite/core/mir/mlu_postprocess_pass.cc
@@ -0,0 +1,499 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/mlu_postprocess_pass.h"
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+#include "lite/core/mir/graph_visualize_pass.h"
+#include "lite/core/mir/pass_registry.h"
+#include "lite/operators/subgraph_op.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
+                                           const std::string& cast_arg_name,
+                                           SSAGraph* graph,
+                                           Node* cur_node,
+                                           Node* inst_node,
+                                           const Type* cast_type) {
+  // create the arg node
+  auto* cast_arg = graph->NewArgumentNode(cast_arg_name);
+  cast_arg->AsArg().type = cast_type;
+  inst_node->AsStmt().op()->scope()->Var(cast_arg_name);
+
+  // create the stmt node
+  auto* cast_inst = graph->NewInstructNode();
+  // create op
+  auto cast_op = LiteOpRegistry::Global().Create(op_type);
+  CHECK(cast_op) << "create op [" << op_type << "] failed";
+  cpp::OpDesc op_desc;
+  op_desc.SetType(op_type);
+  if (op_type == "cast") {
+    op_desc.SetAttr("in_dtype", 5);   // FP32
+    op_desc.SetAttr("out_dtype", 4);  // FP16
+    op_desc.SetInput("X", {cur_node->AsArg().name});
+    op_desc.SetOutput("Out", {cast_arg_name});
+  } else if (op_type == "transpose") {
+    // NCHW -> NHWC
+    op_desc.SetAttr<std::vector<int>>("axis", {0, 2, 3, 1});
+    op_desc.SetInput("X", {cur_node->AsArg().name});
+    op_desc.SetOutput("Out", {cast_arg_name});
+  } else if (op_type == "io_copy") {
+    op_desc.SetInput("Input", {cur_node->AsArg().name});
+    op_desc.SetOutput("Out", {cast_arg_name});
+  } else {
+    CHECK(0) << "Unsupported cast type";
+  }
+  cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope());
+  // create kernels
+  auto kernels = cast_op->CreateKernels(graph->valid_places());
+  std::vector<std::unique_ptr<KernelBase>> selected_kernels;
+  bool is_found = false;
+  for (auto& kernel : kernels) {
+    if (op_type == "cast") {
+      const Type* in_arg_ty = kernel->GetInputDeclType("X");
+      if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) {
+        is_found = true;
+      }
+    } else if (op_type == "transpose") {
+      is_found = true;
+    } else if (op_type == "io_copy") {
+      const Type* in_arg_ty = kernel->GetInputDeclType("Input");
+      const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
+      if (TargetCompatibleTo(*in_arg_ty, *cur_node->AsArg().type) &&
+          TargetCompatibleTo(*out_arg_ty, *cast_type)) {
+        is_found = true;
+      }
+    } else {
+      CHECK(0) << "Unsupported cast type";
+    }
+    if (is_found) {
+      selected_kernels.emplace_back(std::move(kernel));
+      // we pick the kernel
+      cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op);
+      auto& stmt = cast_inst->AsStmt();
+      stmt.picked_kernel().SetContext(
+          ContextScheduler::Global().NewContext(stmt.picked_kernel().target()));
+      break;
+    }
+  }
+  CHECK(is_found) << "Can't find a cast kernel for cast op: "
+                  << cur_node->AsArg().name << "->" << op_type;
+  // modify links
+  DirectedLink(cur_node, cast_inst);
+  DirectedLink(cast_inst, cast_arg);
+  return cast_arg;
+}
+
+Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
+                                          const std::string& cast_arg_name,
+                                          SSAGraph* graph,
+                                          Node* cur_node,
+                                          Node* inst_node,
+                                          const Type* cast_type) {
+  // create the arg node
+  auto* cast_arg = graph->NewArgumentNode(cast_arg_name);
+  cast_arg->AsArg().type = cast_type;
+  auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name);
+  // for CastAfter, manually set the tensor's type
+  var->GetMutable<::paddle::lite::Tensor>();
+
+  // create the stmt node
+  auto* cast_inst = graph->NewInstructNode();
+  // create op
+  auto cast_op = LiteOpRegistry::Global().Create(op_type);
+  CHECK(cast_op) << "create op [" << op_type << "] failed";
+  cpp::OpDesc op_desc;
+  op_desc.SetType(op_type);
+  if (op_type == "cast") {
+    op_desc.SetAttr("in_dtype", 4);   // FP16
+    op_desc.SetAttr("out_dtype", 5);  // FP32
+    op_desc.SetInput("X", {cast_arg_name});
+    op_desc.SetOutput("Out", {cur_node->AsArg().name});
+  } else if (op_type == "transpose") {
+    // NHWC -> NCHW
+    op_desc.SetAttr<std::vector<int>>("axis", {0, 3, 1, 2});
+    op_desc.SetInput("X", {cast_arg_name});
+    op_desc.SetOutput("Out", {cur_node->AsArg().name});
+  } else if (op_type == "io_copy") {
+    op_desc.SetInput("Input", {cast_arg_name});
+    op_desc.SetOutput("Out", {cur_node->AsArg().name});
+  } else {
+    CHECK(0) << "Unsupported cast type";
+  }
+
+  cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope());
+
+  // create kernels
+  auto kernels = cast_op->CreateKernels(graph->valid_places());
+  std::vector<std::unique_ptr<KernelBase>> selected_kernels;
+  bool is_found = false;
+  for (auto& kernel : kernels) {
+    if (op_type == "cast") {
+      const Type* in_arg_ty = kernel->GetInputDeclType("X");
+      if (PrecisionCompatibleTo(*in_arg_ty, *cast_type)) {
+        is_found = true;
+      }
+    } else if (op_type == "transpose") {
+      is_found = true;
+    } else if (op_type == "io_copy") {
+      const Type* in_arg_ty = kernel->GetInputDeclType("Input");
+      const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
+      if (TargetCompatibleTo(*in_arg_ty, *cast_type) &&
+          TargetCompatibleTo(*out_arg_ty, *cur_node->AsArg().type)) {
+        is_found = true;
+      }
+    } else {
+      CHECK(0) << "Unsupported cast type";
+    }
+    if (is_found) {
+      selected_kernels.emplace_back(std::move(kernel));
+      // we pick the kernel
+      cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op);
+      auto& stmt = cast_inst->AsStmt();
+      stmt.picked_kernel().SetContext(
+          ContextScheduler::Global().NewContext(stmt.picked_kernel().target()));
+      break;
+    }
+  }
+  CHECK(is_found) << "Can't find a cast kernel for cast op: "
+                  << cur_node->AsArg().name << "->" << op_type;
+  // modify links
+  DirectedLink(cast_arg, cast_inst);
+  DirectedLink(cast_inst, cur_node);
+  return cast_arg;
+}
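For reference, the numeric `in_dtype`/`out_dtype` codes set on the cast op above follow fluid's `framework.proto` VarType numbering (my reading of that proto, summarized here as a sketch):

```cpp
// Subset of fluid's VarType::Type codes used by the "cast" op.
enum class FluidDType : int {
  kBool = 0,
  kInt16 = 1,
  kInt32 = 2,
  kInt64 = 3,
  kFP16 = 4,  // "out_dtype" in InsertCastBefore, "in_dtype" in InsertCastAfter
  kFP32 = 5,  // "in_dtype" in InsertCastBefore, "out_dtype" in InsertCastAfter
};
```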
+
+void MLUPostprocessPass::InsertBefore(SSAGraph* graph,
+                                      Node* head_node,
+                                      Node* inst_node,
+                                      const Type* inst_type) {
+  const auto* head_type = head_node->AsArg().type;
+
+  // break the original link
+  RemoveDirectedLink(head_node, inst_node);
+
+  auto* cur_node = head_node;
+  const auto name_prefix =
+      head_node->AsArg().name + string_format("_%p", inst_node) + "/trans_";
+
+  // layout cast node
+  if (head_type->layout() != inst_type->layout()) {
+    cur_node = InsertCastBefore(
+        "transpose",
+        name_prefix + "transpose",
+        graph,
+        cur_node,
+        inst_node,
+        LiteType::GetTensorTy(
+            head_type->target(), head_type->precision(), inst_type->layout()));
+  }
+
+  // precision cast node
+  if (head_type->precision() != inst_type->precision()) {
+    cur_node = InsertCastBefore(
+        "cast",
+        name_prefix + "cast",
+        graph,
+        cur_node,
+        inst_node,
+        LiteType::GetTensorTy(
+            head_type->target(), inst_type->precision(), inst_type->layout()));
+  }
+
+  // io copy
+  cur_node = InsertCastBefore(
+      "io_copy",
+      name_prefix + "io_copy",
+      graph,
+      cur_node,
+      inst_node,
+      LiteType::GetTensorTy(
+          inst_type->target(), inst_type->precision(), inst_type->layout()));
+
+  // connect cur_node to inst_node
+  DirectedLink(cur_node, inst_node);
+
+  // reset opdesc and update kernel information
+  UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(),
+                head_node->AsArg().name,
+                cur_node->AsArg().name);
+  // for subgraph op, modify the BlockDesc
+  auto* sub_block_desc = dynamic_cast<paddle::lite::operators::SubgraphOp*>(
+                             inst_node->AsStmt().op().get())
+                             ->GetSubBlock();
+  for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) {
+    auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i);
+    UpdateInputTo(
+        sub_block_op_desc, head_node->AsArg().name, cur_node->AsArg().name);
+  }
+
+  // recreate the op
+  RecreateOp(inst_node, graph);
+
+  graph->CheckValid();
+}
+
+void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node,
+                                              const Type** arg_type,
+                                              SSAGraph* graph) {
+  CHECK(inst_node->IsStmt());
+  constexpr auto subgraph_target = TARGET(kMLU);
+  constexpr auto subgraph_layout = DATALAYOUT(kNHWC);
+
+  // get the subgraph's valid precisions
+  const auto& places = graph->valid_places();
+  std::set<::paddle::lite_api::PrecisionType> prec_set;
+  for (const auto& place : places) {
+    if (place.target == TARGET(kMLU)) {
+      prec_set.insert(place.precision);
+    }
+  }
+
+  // get the subgraph op's type info
+  size_t kernel_size = inst_node->AsStmt().kernels().size();
+  CHECK_GT(kernel_size, 0);
+  VLOG(4) << "subgraph kernel size: " << kernel_size;
+
+  for (size_t i = 0; i < kernel_size; ++i) {
+    auto* kernel = inst_node->AsStmt().kernels()[i].get();
+    VLOG(4) << i << "th kernel: " << TargetToStr(kernel->target()) << ", "
+            << PrecisionToStr(kernel->precision()) << ", "
+            << DataLayoutToStr(kernel->layout());
+  }
+
+  for (size_t i = 0; i < kernel_size; ++i) {
+    auto* kernel = inst_node->AsStmt().kernels()[i].get();
+    CHECK(kernel->target() == subgraph_target);
+    CHECK(kernel->layout() == subgraph_layout);
+    if (prec_set.count(kernel->precision()) == 1) {
+      const auto subgraph_precision = kernel->precision();
+      CHECK(subgraph_precision == PRECISION(kFloat) ||
+            subgraph_precision == PRECISION(kFP16))
+          << "MLU node has unsupported precision";
+      VLOG(4) << "picked kernel precision: "
+              << PrecisionToStr(subgraph_precision);
+      *arg_type = LiteType::GetTensorTy(
+          subgraph_target, subgraph_precision, subgraph_layout);
+      break;
+    }
+  }
+}
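The layout cast that InsertBefore creates is just a transpose with `axis = {0, 2, 3, 1}`. As a plain-C++ sketch of what that permutation does to a host buffer (a hypothetical helper, not part of the patch):

```cpp
#include <vector>

// NCHW -> NHWC permutation, the host-side equivalent of the inserted
// "transpose" op with axis = {0, 2, 3, 1}.
std::vector<float> ToNHWC(const std::vector<float>& nchw,
                          int n, int c, int h, int w) {
  std::vector<float> nhwc(nchw.size());
  for (int in = 0; in < n; ++in)
    for (int ic = 0; ic < c; ++ic)
      for (int ih = 0; ih < h; ++ih)
        for (int iw = 0; iw < w; ++iw)
          nhwc[((in * h + ih) * w + iw) * c + ic] =
              nchw[((in * c + ic) * h + ih) * w + iw];
  return nhwc;
}
```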
+
+bool MLUPostprocessPass::NeedInsert(Node* node, const Type* inst_type) {
+  CHECK(node->IsArg());
+
+  // some ops, e.g. batch_norm, have unused output nodes
+  if (node->outlinks.size() == 0) {
+    return false;
+  }
+
+  // check whether the node is a weight or persistent
+  bool is_persist = node->AsArg().is_weight || node->AsArg().is_persist;
+  if (is_persist) {
+    VLOG(4) << "Persistent arg name: " << node->AsArg().name
+            << " is_weight: " << node->AsArg().is_weight
+            << " is_persist: " << node->AsArg().is_persist;
+    return false;
+  }
+
+  const auto target = node->AsArg().type->target();
+  const auto precision = node->AsArg().type->precision();
+  const auto layout = node->AsArg().type->layout();
+  VLOG(4) << "arg name: " << node->AsArg().name
+          << " type: " << TargetToStr(target) << ", "
+          << PrecisionToStr(precision) << ", " << DataLayoutToStr(layout);
+
+  // do not insert nodes if the previous node is already on the MLU
+  if (target == inst_type->target()) {
+    CHECK(layout == inst_type->layout()) << "MLU node has wrong layout";
+    return false;
+  }
+
+  return true;
+}
+
+void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
+                                     Node* tail_node,
+                                     Node* inst_node,
+                                     const Type* inst_type) {
+  const auto* tail_type = tail_node->AsArg().type;
+
+  // break the original link
+  RemoveDirectedLink(inst_node, tail_node);
+
+  auto* cur_node = tail_node;
+  const auto name_prefix =
+      tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_";
+
+  // layout cast node
+  if (tail_type->layout() != inst_type->layout()) {
+    cur_node = InsertCastAfter(
+        "transpose",
+        name_prefix + "transpose",
+        graph,
+        cur_node,
+        inst_node,
+        LiteType::GetTensorTy(
+            tail_type->target(), tail_type->precision(), inst_type->layout()));
+  }
+
+  // precision cast node
+  if (tail_type->precision() != inst_type->precision()) {
+    cur_node = InsertCastAfter(
+        "cast",
+        name_prefix + "cast",
+        graph,
+        cur_node,
+        inst_node,
+        LiteType::GetTensorTy(
+            tail_type->target(), inst_type->precision(), inst_type->layout()));
+  }
+
+  // io copy
+  cur_node = InsertCastAfter(
+      "io_copy",
+      name_prefix + "io_copy",
+      graph,
+      cur_node,
+      inst_node,
+      LiteType::GetTensorTy(
+          inst_type->target(), inst_type->precision(), inst_type->layout()));
+
+  // connect cur_node to inst_node
+  DirectedLink(inst_node, cur_node);
+
+  // reset opdesc and update kernel information
+  UpdateOutputTo(inst_node->AsStmt().op()->mutable_op_info(),
+                 tail_node->AsArg().name,
+                 cur_node->AsArg().name);
+  // for subgraph op, modify the BlockDesc
+  auto* sub_block_desc = dynamic_cast<paddle::lite::operators::SubgraphOp*>(
+                             inst_node->AsStmt().op().get())
+                             ->GetSubBlock();
+  for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) {
+    auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i);
+    UpdateOutputTo(
+        sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
+  }
+
+  // recreate the op
+  RecreateOp(inst_node, graph);
+
+  graph->CheckValid();
+}
+
+void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) {
+  auto original_selected_kernel =
+      std::move(inst_node->AsStmt().kernels().front());
+  auto updated_op_info = *inst_node->AsStmt().mutable_op_info();
+
+  inst_node->AsStmt().ResetOp(updated_op_info, graph->valid_places());
+  inst_node->AsStmt().kernels().clear();
+  inst_node->AsStmt().kernels().emplace_back(
+      std::move(original_selected_kernel));
+  for (auto& kernel : inst_node->AsStmt().kernels()) {
+    VLOG(4) << "kernel info: " << kernel->name();
+    inst_node->AsStmt().op()->AttachKernel(kernel.get());
+  }
+}
+
+void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
+  for (auto& node : graph->mutable_nodes()) {
+    if (!node.IsStmt()) continue;
+    if (node.AsStmt().op_type() == "feed") {
+      for (auto& out : node.outlinks) {
+        bool change = true;
+        for (auto& inst : out->outlinks) {
+          if (inst->AsStmt().op_type() != "subgraph") {
+            change = false;
+            break;
+          }
+        }
+        if (change) {
+          const auto* old_type = out->AsArg().type;
+          out->AsArg().type =
+              LiteType::GetTensorTy(old_type->target(),
+                                    old_type->precision(),
+                                    ::paddle::lite_api::DataLayoutType::kNHWC,
+                                    old_type->device());
+        }
+      }
+    }
+    if (node.AsStmt().op_type() == "fetch") {
+      for (auto& inp : node.inlinks) {
+        bool change = true;
+        for (auto& inst : inp->inlinks) {
+          if (inst->AsStmt().op_type() != "subgraph") {
+            change = false;
+            break;
+          }
+        }
+        if (change) {
+          const auto* old_type = inp->AsArg().type;
+          inp->AsArg().type =
+              LiteType::GetTensorTy(old_type->target(),
+                                    old_type->precision(),
+                                    ::paddle::lite_api::DataLayoutType::kNHWC,
+                                    old_type->device());
+        }
+      }
+    }
+  }
+}
+
+void MLUPostprocessPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  // Currently, for non-persistent input and output args, the MLU subgraph op
+  // only supports the float16/float32 data types.
+  // Two situations arise:
+  //   1: feed->arg_in->subgraph->...;  2: ...->subgraph->arg_out->fetch.
+  // arg_in and arg_out are assumed to be NHWC, which the user should be
+  // aware of. Thus here we change these args' layout to NHWC.
+  ModifyLayout(graph.get());
+
+  // insert io_copy, layout and precision casts on the subgraph's inputs
+  // and outputs
+  for (auto& node : graph->mutable_nodes()) {
+    if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") {
+      const Type* subgraph_arg_type = nullptr;
+      GetSubgraphOpArgType(&node, &subgraph_arg_type, graph.get());
+
+      auto links_tmp = node.inlinks;
+      for (auto p_in : links_tmp) {
+        if (NeedInsert(p_in, subgraph_arg_type)) {
+          InsertBefore(graph.get(), p_in, &node, subgraph_arg_type);
+        }
+      }
+      links_tmp.assign(node.outlinks.begin(), node.outlinks.end());
+      for (auto p_out : links_tmp) {
+        if (NeedInsert(p_out, subgraph_arg_type)) {
+          InsertAfter(graph.get(), p_out, &node, subgraph_arg_type);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(mlu_postprocess_pass, paddle::lite::mir::MLUPostprocessPass)
+    .BindTargets({TARGET(kMLU)});
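Taken together, the pass turns a CPU-side input of an MLU subgraph into a chain of inserted ops. Conceptually (a sketch; the names follow the `"_%p" + "/trans_"` prefix built above, with `0xADDR` standing in for the node pointer):

```cpp
// x (kHost, kFloat, kNCHW)
//   -> transpose -> x_0xADDR/trans_transpose (kHost, kFloat, kNHWC)
//   -> cast      -> x_0xADDR/trans_cast      (kHost, kFP16,  kNHWC)
//   -> io_copy   -> x_0xADDR/trans_io_copy   (kMLU,  kFP16,  kNHWC)
//   -> subgraph
// Outputs run the same chain in reverse via InsertAfter.
```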
diff --git a/lite/core/mir/mlu_postprocess_pass.h b/lite/core/mir/mlu_postprocess_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ffcbc952a44abea272bdd22467d86cd04baa207
--- /dev/null
+++ b/lite/core/mir/mlu_postprocess_pass.h
@@ -0,0 +1,114 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "lite/core/mir/pass.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+static void UpdateInputTo(cpp::OpDesc* desc,
+                          const std::string& from,
+                          const std::string& to) {
+  for (auto& item : *desc->mutable_inputs()) {
+    for (auto& input : item.second) {
+      if (input == from) {
+        input = to;
+      }
+    }
+  }
+  if (desc->Type() != "subgraph") return;
+  auto input_names =
+      desc->GetAttr<std::vector<std::string>>("input_data_names");
+  for (size_t i = 0; i < input_names.size(); ++i) {
+    if (input_names[i] == from) {
+      input_names[i] = to;
+    }
+  }
+  desc->SetAttr<std::vector<std::string>>("input_data_names", input_names);
+}
+
+static void UpdateOutputTo(cpp::OpDesc* desc,
+                           const std::string& from,
+                           const std::string& to) {
+  for (auto& item : *desc->mutable_outputs()) {
+    for (auto& output : item.second) {
+      if (output == from) {
+        output = to;
+      }
+    }
+  }
+  if (desc->Type() != "subgraph") return;
+  auto output_names =
+      desc->GetAttr<std::vector<std::string>>("output_data_names");
+  for (size_t i = 0; i < output_names.size(); ++i) {
+    if (output_names[i] == from) {
+      output_names[i] = to;
+    }
+  }
+  desc->SetAttr<std::vector<std::string>>("output_data_names", output_names);
+}
+
+/*
+ * This pass adapts the argument nodes that feed or follow an MLU subgraph op,
+ * inserting the layout/precision/io_copy casts the subgraph requires.
+ */
+class MLUPostprocessPass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+
+ private:
+  void GetSubgraphOpArgType(Node* inst_node,
+                            const Type** arg_type,
+                            SSAGraph* graph);
+
+  void ModifyLayout(SSAGraph* graph);
+
+  bool NeedInsert(Node* node, const Type* inst_type);
+
+  void InsertBefore(SSAGraph* graph,
+                    Node* head_node,
+                    Node* inst_node,
+                    const Type* type);
+
+  void InsertAfter(SSAGraph* graph,
+                   Node* tail_node,
+                   Node* inst_node,
+                   const Type* type);
+
+  Node* InsertCastBefore(const std::string& op_type,
+                         const std::string& cast_arg_name,
+                         SSAGraph* graph,
+                         Node* cur_node,
+                         Node* inst_node,
+                         const Type* cast_type);
+
+  Node* InsertCastAfter(const std::string& op_type,
+                        const std::string& cast_arg_name,
+                        SSAGraph* graph,
+                        Node* cur_node,
+                        Node* inst_node,
+                        const Type* cast_type);
+
+  void RecreateOp(Node* inst_node, SSAGraph* graph);
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
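A small usage sketch of `UpdateInputTo` (with hypothetical tensor names; the same pattern applies to `UpdateOutputTo`):

```cpp
#include <string>
#include <vector>
#include "lite/core/mir/mlu_postprocess_pass.h"

void RenameSubgraphInputExample() {
  paddle::lite::cpp::OpDesc desc;
  desc.SetType("subgraph");
  desc.SetInput("Inputs", {"x"});
  desc.SetAttr<std::vector<std::string>>("input_data_names", {"x"});
  // Redirect the op to read the casted tensor instead of "x"; both the
  // input slot and the "input_data_names" attribute are rewritten.
  paddle::lite::mir::UpdateInputTo(&desc, "x", "x/trans_io_copy");
}
```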
diff --git a/lite/core/mir/subgraph_cast_display_pass.cc b/lite/core/mir/subgraph_cast_display_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3a2c94d23298fcb607de0bf821d0dc92c95da7bb
--- /dev/null
+++ b/lite/core/mir/subgraph_cast_display_pass.cc
@@ -0,0 +1,111 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+
+#include "lite/core/mir/pass.h"
+#include "lite/core/mir/pass_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+class SubgraphCastDisplayPass : public DebugPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
+    VLOG(3) << "== Argument types ==";
+    for (auto& node : graph->mutable_nodes()) {
+      if (!node.IsArg()) continue;
+
+      auto* type = node.AsArg().type;
+      if (type) {
+        VLOG(3) << "* ARG " << node.AsArg().name << " type: " << *type;
+      } else {
+        VLOG(3) << "* ARG " << node.AsArg().name << " type: UNK";
+      }
+    }
+    VLOG(3) << "---------------------";
+
+    VLOG(0) << "== SubgraphOp Debug Info ==";
+    for (auto& node : graph->mutable_nodes()) {
+      if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") {
+        VLOG(0) << "FOUND SUBGRAPH OP";
+        display_debug_info(node, "subgraph");
+        break;
+      }
+    }
+    VLOG(0) << "---------------------";
+  }
+
+  void display_debug_info(const Node& node,
+                          std::string op_type,
+                          bool display_in_nodes = true,
+                          bool display_out_nodes = true) {
+    CHECK(node.IsStmt());
+    VLOG(0) << node.AsStmt();
+    if (display_in_nodes) {
+      for (auto p_in_arg_node : node.inlinks) {
+        CHECK(p_in_arg_node->IsArg());
+        VLOG(0) << "* ARG[IN] " << p_in_arg_node->AsArg().name
+                << " type: " << *p_in_arg_node->AsArg().type
+                << " is_weight: " << p_in_arg_node->AsArg().is_weight
+                << " is_persist: " << p_in_arg_node->AsArg().is_persist
+                << " input_count: " << p_in_arg_node->inlinks.size();
+        if (p_in_arg_node->inlinks.size() == 0) {
+          VLOG(0) << "** END with No Op";
+        }
+        for (auto p_in_stmt_node : p_in_arg_node->inlinks) {
+          CHECK(p_in_stmt_node->IsStmt());
+          std::string stmt_op_type = p_in_stmt_node->AsStmt().op_type();
+          if (stmt_op_type == "cast" || stmt_op_type == "transpose" ||
+              stmt_op_type == "io_copy") {
+            display_debug_info(*p_in_stmt_node, stmt_op_type, true, false);
+          } else {
+            VLOG(0) << "** END with op type: " << stmt_op_type;
+          }
+        }
+      }
+    }
+    if (display_out_nodes) {
+      for (auto p_out_arg_node : node.outlinks) {
+        CHECK(p_out_arg_node->IsArg());
+        VLOG(0) << "* ARG[OUT] " << p_out_arg_node->AsArg().name
+                << " type: " << *p_out_arg_node->AsArg().type
+                << " is_weight: " << p_out_arg_node->AsArg().is_weight
+                << " is_persist: " << p_out_arg_node->AsArg().is_persist
+                << " output_count: " << p_out_arg_node->outlinks.size();
+        if (p_out_arg_node->outlinks.size() == 0) {
+          VLOG(0) << "** END with No Op";
+        }
+        for (auto p_out_stmt_node : p_out_arg_node->outlinks) {
+          CHECK(p_out_stmt_node->IsStmt());
+          std::string stmt_op_type = p_out_stmt_node->AsStmt().op_type();
+          if (stmt_op_type == "cast" || stmt_op_type == "transpose" ||
+              stmt_op_type == "io_copy") {
+            display_debug_info(*p_out_stmt_node, stmt_op_type, false, true);
+          } else {
+            VLOG(0) << "** END with op type: " << stmt_op_type;
+          }
+        }
+      }
+    }
+  }
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(subgraph_cast_display_pass,
+                  paddle::lite::mir::SubgraphCastDisplayPass)
+    .BindTargets({TARGET(kAny)});
diff --git a/lite/kernels/mlu/CMakeLists.txt b/lite/kernels/mlu/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1c41f05ca0cb23013418654f195394f88adf05b1
--- /dev/null
+++ b/lite/kernels/mlu/CMakeLists.txt
@@ -0,0 +1,8 @@
+if(NOT LITE_WITH_MLU)
+  return()
+endif()
+
+add_subdirectory(bridges)
+add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${mlu_subgraph_bridges})
+add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
+add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
diff --git a/lite/kernels/mlu/bridges/CMakeLists.txt b/lite/kernels/mlu/bridges/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..302d580ee1594f983e516d42da6f57221b3b33c8
--- /dev/null
+++ b/lite/kernels/mlu/bridges/CMakeLists.txt
@@ -0,0 +1,41 @@
+if(NOT LITE_WITH_MLU)
+  return()
+endif()
+
+lite_cc_library(subgraph_bridge_utility_mlu SRCS utility.cc DEPS ${mlu_builder_libs} tensor)
+lite_cc_library(subgraph_bridge_tensor_mlu SRCS tensor.cc DEPS ${mlu_builder_libs})
+lite_cc_library(subgraph_bridge_graph_mlu SRCS graph.cc DEPS subgraph_bridge_utility_mlu subgraph_bridge_tensor_mlu)
+
+set(mlu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_engine subgraph_bridge_utility_mlu subgraph_bridge_graph_mlu)
+
+lite_cc_library(subgraph_bridge_act_op_mlu SRCS act_op.cc DEPS ${mlu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_batch_norm_op_mlu SRCS batch_norm_op.cc DEPS ${mlu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_conv_op_mlu SRCS conv_op.cc DEPS ${mlu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_elementwise_ops_mlu SRCS elementwise_ops.cc DEPS ${mlu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_pool_op_mlu SRCS pool_op.cc DEPS ${mlu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_softmax_op_mlu SRCS softmax_op.cc DEPS ${mlu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_fc_op_mlu SRCS fc_op.cc DEPS ${mlu_subgraph_bridge_deps})
+set(mlu_subgraph_bridges
+    subgraph_bridge_registry
+    subgraph_bridge_utility_mlu
+    subgraph_bridge_graph_mlu
+    subgraph_bridge_act_op_mlu
+    subgraph_bridge_conv_op_mlu
+    subgraph_bridge_elementwise_ops_mlu
+    subgraph_bridge_pool_op_mlu
+    subgraph_bridge_softmax_op_mlu
+    subgraph_bridge_fc_op_mlu
+    subgraph_bridge_batch_norm_op_mlu
+    CACHE INTERNAL "mlu_subgraph_bridges")
+
+
+# lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges})
+# lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+# lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+# lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+# lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+# lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+# lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+# lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+
+message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}")
b/lite/kernels/mlu/bridges/act_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..50291ec297f9d035f8a7fbe1b525f8ece27bfeb6
--- /dev/null
+++ b/lite/kernels/mlu/bridges/act_op.cc
@@ -0,0 +1,57 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/mlu/bridges/graph.h"
+#include "lite/kernels/mlu/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[MLU] Converting " + op_type + "...";
+
+  // Create act node and set params from op
+  auto x_var_name = op_info->Input("X").front();
+  auto out_var_name = op_info->Output("Out").front();
+  auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
+  auto output_dims = output->dims().Vectorize();
+  auto output_tensor = graph->AddNode(
+      out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType());
+  CHECK(graph->HasNode(x_var_name));
+  auto input_tensor = graph->GetNode(x_var_name);
+  cnmlActiveFunction_t act_type = OpTypeToCNMLActType(op_type);
+  cnmlBaseOp_t activation_op;
+  CNML_CALL(cnmlCreateActiveOp(&activation_op,
+                               act_type,
+                               input_tensor->mlu_tensor(),
+                               output_tensor->mlu_tensor()));
+  graph->FuseOp(activation_op);
+  return SUCCESS;
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter);
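`OpTypeToCNMLActType` comes from the bridge utility header. A sketch of the mapping it presumably implements (the `CNML_ACTIVE_*` enum names are assumptions about `cnmlActiveFunction_t` in cnml.h; the real helper lives in lite/kernels/mlu/bridges/utility.cc):

```cpp
#include <string>
#include "lite/backends/mlu/mlu_utils.h"

// Assumed op-type -> CNML activation mapping (sketch only).
cnmlActiveFunction_t OpTypeToCNMLActTypeSketch(const std::string& op_type) {
  if (op_type == "relu") return CNML_ACTIVE_RELU;
  if (op_type == "sigmoid") return CNML_ACTIVE_SIGMOID;
  if (op_type == "tanh") return CNML_ACTIVE_TANH;
  LOG(FATAL) << "Unsupported activation type: " << op_type;
  return CNML_ACTIVE_NONE;
}
```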
diff --git a/lite/kernels/mlu/bridges/act_op_test.cc b/lite/kernels/mlu/bridges/act_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..51cdc52dc6da764ab0c2d720b9159fd8b0a2c0df
--- /dev/null
+++ b/lite/kernels/mlu/bridges/act_op_test.cc
@@ -0,0 +1,156 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <cmath>
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+#include "lite/kernels/mlu/bridges/test_helper.h"
+#include "lite/kernels/npu/bridges/registry.h"
+#include "lite/operators/activation_ops.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+int ActConverter(void* ctx, OpLite* op, KernelBase* kernel);
+
+template <typename dtype>
+void FillTensor(Tensor* x, float lower = -2, float upper = 2);
+
+void act_ref(const std::shared_ptr<operators::ActivationOp> op) {
+  Scope* scope = op->scope();
+  const OpInfo* op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto x = scope->FindTensor("x");
+  auto out = scope->FindMutableTensor("out");
+  auto out_ref = scope->FindMutableTensor("out_ref");
+  out->Resize(x->dims());
+  out_ref->Resize(x->dims());
+  auto x_data = x->data<float>();
+  auto out_data = out->mutable_data<float>();
+  CHECK_EQ(x->numel(), out->numel());
+
+  // "sigmoid","relu","tanh","relu_clipped","leaky_relu","softsign","hard_sigmoid"
+  if (op_type == "sigmoid") {
+    for (size_t i = 0; i < out->numel(); i++) {
+      out_data[i] = 1.f / (1.f + std::exp(-x_data[i]));
+    }
+  } else if (op_type == "relu") {
+    for (size_t i = 0; i < out->numel(); i++) {
+      out_data[i] = std::max(0.f, x_data[i]);
+    }
+  } else if (op_type == "tanh") {
+    for (size_t i = 0; i < out->numel(); i++) {
+      out_data[i] = (std::exp(x_data[i]) - std::exp(-x_data[i])) /
+                    (std::exp(x_data[i]) + std::exp(-x_data[i]));
+    }
+  } else if (op_type == "relu_clipped") {
+    auto relu_clipped_coef = op_info->GetAttr<float>("Relu_clipped_coef");
+    for (size_t i = 0; i < out->numel(); i++) {
+      out_data[i] = std::min(std::max(0.f, x_data[i]), relu_clipped_coef);
+    }
+  } else if (op_type == "relu6") {
+    for (size_t i = 0; i < out->numel(); i++) {
+      out_data[i] = std::min(std::max(0.f, x_data[i]), 6.f);
+    }
+  } else if (op_type == "leaky_relu") {
+    auto alpha = op_info->GetAttr<float>("alpha");
+    for (size_t i = 0; i < out->numel(); i++) {
+      out_data[i] = std::max(x_data[i], x_data[i] * alpha);
+    }
+  } else if (op_type == "softsign") {
+    for (size_t i = 0; i < out->numel(); i++) {
+      out_data[i] = x_data[i] / (1 + std::abs(x_data[i]));
+    }
+  } else if (op_type == "hard_sigmoid") {
+    auto slope = op_info->GetAttr<float>("slope");
+    auto offset = op_info->GetAttr<float>("offset");
+    for (size_t i = 0; i < out->numel(); i++) {
+      out_data[i] = std::min(1.f, slope * x_data[i] + offset);
+      out_data[i] = std::max(0.f, out_data[i]);
+    }
+  } else {
+    LOG(FATAL) << "unsupported activation type: " << op_type;
+  }
+}
+
+void test_act(std::vector<int64_t> x_shape, std::string op_type) {
+  // prepare input&output variables
+  Scope scope;
+  std::string x_var_name("x");
+  std::string out_var_name("out");
+  std::string out_ref_var_name("out_ref");
+  auto* x = scope.NewTensor(x_var_name);
+  auto* out = scope.NewTensor(out_var_name);
+  auto* out_ref = scope.NewTensor(out_ref_var_name);
+  x->Resize(x_shape);
+
+  // initialize input&output data
+  FillTensor<float>(x, 2, 8);
+
+  // initialize op desc
+  cpp::OpDesc opdesc;
+  opdesc.SetType(op_type);
+  opdesc.SetInput("X", {x_var_name});
+  opdesc.SetOutput("Out", {out_var_name});
+  if (op_type == "relu_clipped") {
+    opdesc.SetAttr("Relu_clipped_coef", 3.f);
+  } else if (op_type == "relu6") {
+    opdesc.SetAttr("Relu_clipped_coef", 6.f);
+  } else if (op_type == "leaky_relu") {
+    opdesc.SetAttr("alpha", 0.02f);
+  } else if (op_type == "hard_sigmoid") {
+    opdesc.SetAttr("slope", 0.2f);
+    opdesc.SetAttr("offset", 0.5f);
+  }
+
+  // create and convert op to MLU model, then run it on the MLU
+  auto op = CreateOp<operators::ActivationOp>(opdesc, &scope);
+  // execute reference implementation and save to output tensor
+  act_ref(op);
+  out_ref->CopyDataFrom(*out);
+
+  LaunchOp(op, {x_var_name}, {out_var_name});
+
+  // compare results
+  auto* out_data = out->mutable_data<float>();
+  auto* out_ref_data = out_ref->mutable_data<float>();
+  for (int i = 0; i < out->dims().production(); i++) {
+    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
+  }
+}
+
+TEST(MLUBridges, activation) {
+  std::vector<std::vector<int64_t>> shapes{{1}, {2, 3}, {1, 2, 3, 4}};
+  std::vector<std::string> types{"sigmoid", "relu", "tanh"};
+  for (auto x_shape : shapes) {
+    for (auto op_type : types) {
+      test_act(x_shape, op_type);
+    }
+  }
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(MLU, relu, paddle::lite::subgraph::mlu::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(MLU,
+                         sigmoid,
+                         paddle::lite::subgraph::mlu::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(MLU, tanh, paddle::lite::subgraph::mlu::ActConverter);
diff --git a/lite/kernels/mlu/bridges/batch_norm_op.cc b/lite/kernels/mlu/bridges/batch_norm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d95a5115c96c10a8881f50c44fee9881c6a9e218
--- /dev/null
+++ b/lite/kernels/mlu/bridges/batch_norm_op.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cmath>
+#include "lite/kernels/mlu/bridges/graph.h"
+#include "lite/kernels/mlu/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[MLU] Converting " + op_type + "...";
+
+  // Get input vars and op attributes
+  auto x_var_name = op_info->Input("X").front();
+  auto scale_var_name = op_info->Input("Scale").front();
+  auto bias_var_name = op_info->Input("Bias").front();
+  auto mean_var_name = op_info->Input("Mean").front();
+  auto variance_var_name = op_info->Input("Variance").front();
+  auto y_var_name = op_info->Output("Y").front();
+  auto epsilon = op_info->GetAttr<float>("epsilon");
+
+  auto output = scope->FindVar(y_var_name)->GetMutable<Tensor>();
+  auto output_dims = output->dims().Vectorize();
+  auto output_tensor = graph->AddNode(
+      y_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType());
+
+  CHECK(graph->HasNode(x_var_name));
+
+  auto mean = scope->FindVar(mean_var_name)->GetMutable<Tensor>();
+  auto mean_dims = mean->dims().Vectorize();
+  auto mean_tensor = graph->AddNode(
+      mean_var_name, mean_dims, CNML_CONST, CNML_CNHW, graph->FPType());
+
+  auto variance = scope->FindVar(variance_var_name)->GetMutable<Tensor>();
+  auto variance_dims = variance->dims().Vectorize();
+  auto variance_tensor = graph->AddNode(
+      variance_var_name, variance_dims, CNML_CONST, CNML_CNHW, graph->FPType());
+
+  auto scale = scope->FindVar(scale_var_name)->GetMutable<Tensor>();
+  auto bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();
+
+  int co = static_cast<int>(mean_dims[0]);
+
+  // Fold scale, bias and epsilon into the variance/mean tensors, so that
+  // CNML's y = (x - mean') * var' reproduces
+  // scale * (x - mean) / sqrt(var + eps) + bias.
+  for (int i = 0; i < co; ++i) {
+    variance->mutable_data<float>()[i] =
+        scale->data<float>()[i] / sqrtf(variance->data<float>()[i] + epsilon);
+    mean->mutable_data<float>()[i] =
+        mean->data<float>()[i] -
+        bias->data<float>()[i] / variance->data<float>()[i];
+  }
+
+  auto input_tensor = graph->GetNode(x_var_name);
+  cnmlBaseOp_t bn_op;
+  CNML_CALL(cnmlCreateBatchNormOpForward(&bn_op,
+                                         input_tensor->mlu_tensor(),
+                                         output_tensor->mlu_tensor(),
+                                         mean_tensor->mlu_tensor(),
+                                         variance_tensor->mlu_tensor()));
+
+  graph->BindConstData(variance_var_name, variance);
+  graph->BindConstData(mean_var_name, mean);
+  graph->FuseOp(bn_op);
+
+  return SUCCESS;
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(batch_norm,
+                         kMLU,
+                         paddle::lite::subgraph::mlu::BatchNormConverter);
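The in-place rewrite of `variance` and `mean` above folds scale, bias and epsilon into the two tensors CNML consumes. A standalone check of that algebra (a sketch, not part of the patch):

```cpp
#include <cmath>

// Canonical batch norm.
float BnRef(float x, float scale, float bias, float mean, float var, float eps) {
  return scale * (x - mean) / std::sqrt(var + eps) + bias;
}

// What the converter computes: alpha = scale / sqrt(var + eps),
// mean' = mean - bias / alpha, y = (x - mean') * alpha.
float BnFolded(float x, float scale, float bias, float mean, float var, float eps) {
  const float alpha = scale / std::sqrt(var + eps);
  const float mean_p = mean - bias / alpha;
  return (x - mean_p) * alpha;  // algebraically equal to BnRef
}
```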
diff --git a/lite/kernels/mlu/bridges/batch_norm_op_test.cc b/lite/kernels/mlu/bridges/batch_norm_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..47e291bf3d83e8ce85216e86505817be6ed8b106
--- /dev/null
+++ b/lite/kernels/mlu/bridges/batch_norm_op_test.cc
@@ -0,0 +1,186 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/batch_norm_op.h"
+#include <gtest/gtest.h>
+#include <cmath>
+#include "lite/core/op_registry.h"
+#include "lite/kernels/mlu/bridges/test_helper.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel);
+
+template <typename dtype>
+void batch_norm_ref(const std::shared_ptr<operators::BatchNormOp> op) {
+  Scope* scope = op->scope();
+  const OpInfo* op_info = op->op_info();
+  auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
+  auto y = scope->FindVar(op_info->Output("Y").front())->GetMutable<Tensor>();
+  auto bias =
+      scope->FindVar(op_info->Input("Bias").front())->GetMutable<Tensor>();
+  auto scale =
+      scope->FindVar(op_info->Input("Scale").front())->GetMutable<Tensor>();
+  auto mean =
+      scope->FindVar(op_info->Input("Mean").front())->GetMutable<Tensor>();
+  auto variance =
+      scope->FindVar(op_info->Input("Variance").front())->GetMutable<Tensor>();
+
+  auto x_data = x->data<dtype>();
+  auto y_data = y->mutable_data<dtype>();
+  auto scale_data = scale->mutable_data<dtype>();
+  auto bias_data = bias->mutable_data<dtype>();
+  auto mean_data = mean->mutable_data<dtype>();
+  auto variance_data = variance->mutable_data<dtype>();
+  DDim x_dims = x->dims();
+
+  float epsilon = op_info->GetAttr<float>("epsilon");
+  // float momentum = op_info->GetAttr<float>("momentum");
+  auto data_layout = op_info->GetAttr<std::string>("data_layout");
+
+  bool global_stats = op_info->GetAttr<bool>("use_global_stats");
+  if (global_stats) {
+    int64_t outer_size = 0;
+    int64_t channel_size = 0;
+    int64_t inner_size = 0;
+    if (data_layout == "NCHW") {
+      outer_size = x_dims[0];
+      channel_size = x_dims[1];
+      inner_size = x_dims.Slice(2, x_dims.size()).production();
+    } else {
+      LOG(FATAL) << "Unknown storage order: " << data_layout;
+    }
+    auto x_ptr = x_data;
+    auto y_ptr = y_data;
+    for (int o = 0; o < outer_size; o++) {
+      for (int c = 0; c < channel_size; c++) {
+        for (int i = 0; i < inner_size; i++) {
+          dtype norm_x =
+              (*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon);
+          *y_ptr = norm_x * scale_data[c] + bias_data[c];
+          x_ptr++;
+          y_ptr++;
+        }
+      }
+    }
+  }
+}
+
+void test_batch_norm(
+    int bs, int ic, int ih, int iw, float epsilon, float momentum) {
+  // prepare input&output variables
+  Scope scope;
+  std::string x_var_name = "x";
+  std::string out_var_name = "out";
+  std::string out_ref_var_name = "out_ref";
+  std::string scale_var_name = "scale";
+  std::string bias_var_name = "bias";
+  std::string mean_var_name = "mean";
+  std::string variance_var_name = "variance";
+  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
+  auto* scale = scope.Var(scale_var_name)->GetMutable<Tensor>();
+  auto* bias = scope.Var(bias_var_name)->GetMutable<Tensor>();
+  auto* mean = scope.Var(mean_var_name)->GetMutable<Tensor>();
+  auto* variance = scope.Var(variance_var_name)->GetMutable<Tensor>();
+  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
+  auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
+  x->Resize({bs, ic, ih, iw});
+  scale->Resize({ic});
+  bias->Resize({ic});
+  mean->Resize({ic});
+  variance->Resize({ic});
+
+  // initialize input&output data
+  FillTensor<float>(x, -100, 100);
+  FillTensor<float>(scale, -6.7, 13.78);
+  FillTensor<float>(bias, -12.11, 12.94);
+  FillTensor<float>(mean, -23.45, 67.89);
+  // variance > 0
+  FillTensor<float>(variance, 1.5f, 76.78f);
+
+  // initialize op desc
+  cpp::OpDesc opdesc;
+  opdesc.SetType("batch_norm");
+  opdesc.SetInput("X", {x_var_name});
+  opdesc.SetInput("Scale", {scale_var_name});
+  opdesc.SetInput("Bias", {bias_var_name});
+  opdesc.SetInput("Mean", {mean_var_name});
+  opdesc.SetInput("Variance", {variance_var_name});
+  opdesc.SetOutput("Y", {out_var_name});
+  opdesc.SetAttr("is_test", 1);
+  opdesc.SetAttr("use_global_stats", true);
+  opdesc.SetAttr("epsilon", epsilon);
+  opdesc.SetAttr("momentum", momentum);
+  opdesc.SetAttr("data_layout", std::string("NCHW"));
+
+  // create and convert op to MLU model, then run it on the MLU
+  auto op = CreateOp<operators::BatchNormOp>(opdesc, &scope);
+  // execute reference implementation and save to output tensor
+  batch_norm_ref<float>(op);
+  out_ref->CopyDataFrom(*out);
+
+  // the MLU kernel consumes NHWC, so transpose the input before launching
+  Tensor input_trans;
+  input_trans.Resize({bs, ic, ih, iw});
+  transpose(x->mutable_data<float>(),
+            input_trans.mutable_data<float>(),
+            {bs, ic, ih, iw},
+            {0, 2, 3, 1});
+
+  out->Resize({bs, ih, iw, ic});
+  x->CopyDataFrom(input_trans);
+  x->Resize({bs, ih, iw, ic});
+
+  LaunchOp(op, {x_var_name}, {out_var_name});
+
+  // compare results, transposing the NHWC output back to NCHW
+  auto* out_data = out->mutable_data<float>();
+  auto* out_ref_data = out_ref->mutable_data<float>();
+  Tensor output_trans;
+  output_trans.Resize({bs, ic, ih, iw});
+  transpose(out_data,
+            output_trans.mutable_data<float>(),
+            {bs, ih, iw, ic},
+            {0, 3, 1, 2});
+  out_data = output_trans.mutable_data<float>();
+  for (int i = 0; i < out->dims().production(); i++) {
+    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
+  }
+}
+
+TEST(MLUBridges, batch_norm) {
+  for (auto bs : {1, 4, 7}) {
+    for (auto ic : {1, 4, 7}) {
+      for (auto ih : {1, 4, 7}) {
+        for (auto iw : {1, 4, 7}) {
+          for (auto epsilon : {1e-4f, 1e-5f}) {
+            for (auto momentum : {0.9f, 0.99f}) {
+              test_batch_norm(bs, ic, ih, iw, epsilon, momentum);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(MLU,
+                         batch_norm,
+                         paddle::lite::subgraph::mlu::BatchNormConverter);
diff --git a/lite/kernels/mlu/bridges/conv_op.cc b/lite/kernels/mlu/bridges/conv_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e9fdacdca92398cee9f5e01b3f34e41e672274b5
--- /dev/null
+++ b/lite/kernels/mlu/bridges/conv_op.cc
@@ -0,0 +1,200 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/conv_op.h"
+#include <algorithm>
+#include "lite/kernels/mlu/bridges/graph.h"
+#include "lite/kernels/mlu/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto* graph = static_cast<Graph*>(ctx);
+  const auto* op_info = op->op_info();
+  const auto* scope = op->scope();
+  VLOG(3) << "[MLU] Converting " << op_info->Type() << "...";
+
+  // Get input, filter and op attributes
+  const auto input_var_name = op_info->Input("Input").front();
+  const auto& input_dims_nhwc =
+      scope->FindVar(input_var_name)->GetMutable<Tensor>()->dims();
+  const auto input_dims = DimNHWC2NCHW(input_dims_nhwc);
+  const auto filter_var_name = op_info->Input("Filter").front();
+  auto* filter = scope->FindVar(filter_var_name)->GetMutable<Tensor>();
+  const auto& filter_dims = filter->dims();
+  const auto output_var_name = op_info->Output("Output").front();
+  const auto bs = input_dims[0];
+  const auto oc = filter_dims[0];
+  CHECK_EQ(input_dims.size(), 4);
+  CHECK_EQ(filter_dims.size(), 4);
+  const auto strides = op_info->GetAttr<std::vector<int>>("strides");
+  auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
+  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
+  CHECK_EQ(strides.size(), 2L);
+  CHECK_EQ(dilations.size(), 2L);
+  if (paddings.size() == 2L) {
+    // expand {pad_h, pad_w} to {pad_h, pad_h, pad_w, pad_w}
+    for (size_t i = 0; i < strides.size(); ++i) {
+      int copy_pad = *(paddings.begin() + 2 * i);
+      paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
+    }
+  }
+  CHECK_EQ(paddings.size(), 4L)
+      << "Paddings size should be the same or twice as the input size.";
+
+  const std::string padding_algorithm =
+      op_info->HasAttr("padding_algorithm")
+          ? op_info->GetAttr<std::string>("padding_algorithm")
+          : "";
+
+  operators::UpdatePaddingAndDilation(&paddings,
+                                      &dilations,
+                                      strides,
+                                      padding_algorithm,
+                                      input_dims,
+                                      filter_dims);
+
+  std::vector<int64_t> output_shape({bs, oc});
+  for (size_t i = 0; i < 2; i++) {
+    const int dkernel = dilations[i] * (filter_dims[2 + i] - 1) + 1;
+    output_shape.push_back(
+        (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) /
+            strides[i] +
+        1);
+  }
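The loop above is the standard dilated-convolution output-shape rule. As a standalone helper with a worked value (a sketch, not part of the patch):

```cpp
// o = (i + pad_before + pad_after - (d * (k - 1) + 1)) / s + 1
int ConvOutDim(int i, int k, int p0, int p1, int s, int d) {
  const int dkernel = d * (k - 1) + 1;   // effective kernel extent
  return (i + p0 + p1 - dkernel) / s + 1;
}
// e.g. ConvOutDim(32, 3, 1, 1, 1, 1) == 32 (a "same" 3x3 convolution)
```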
+
+  const auto output_shape_nhwc = DimNCHW2NHWC(output_shape);
+  const auto output_tensor = graph->AddNode(output_var_name,
+                                            output_shape_nhwc,
+                                            CNML_TENSOR,
+                                            CNML_NHWC,
+                                            graph->FPType());
+  scope->FindVar(output_var_name)
+      ->GetMutable<::paddle::lite::Tensor>()
+      ->Resize(output_shape_nhwc);
+
+  // Create filter node
+  const auto filter_tensor = graph->AddNode(filter_var_name,
+                                            filter_dims.Vectorize(),
+                                            CNML_FILTER,
+                                            CNML_NCHW,
+                                            graph->FPType());
+  const auto weight_scale =
+      op_info->GetAttr<std::vector<float>>("weight_scale");
+
+  if (filter->precision() == PrecisionType::kUnk ||
+      filter->precision() == PrecisionType::kInt8) {
+    // dequantize the int8 weights back to float, then lay them out as NHWC
+    std::vector<float> filter_dequant(filter->data_size());
+    dequant(filter_dequant.data(),
+            filter->mutable_data<int8_t>(),
+            1,
+            filter_dims[0],
+            filter_dims[1] * filter_dims[2] * filter_dims[3],
+            weight_scale);
+    transpose(filter_dequant.data(),
+              filter->mutable_data<float>(),
+              {static_cast<int>(filter_dims[0]),
+               static_cast<int>(filter_dims[1]),
+               static_cast<int>(filter_dims[2]),
+               static_cast<int>(filter_dims[3])},
+              {0, 2, 3, 1});
+    filter->set_precision(PrecisionType::kFloat);
+  } else if (filter->precision() != PrecisionType::kFloat) {
+    LOG(FATAL) << "Unsupported weight precision!";
+  }
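`dequant` here is the bridge utility that expands the int8 filter back to float with per-output-channel scales. A sketch of the assumed semantics (the real helper lives in lite/kernels/mlu/bridges/utility.cc):

```cpp
#include <cstdint>
#include <vector>

// Assumed behavior of dequant(dst, src, 1, oc, inner, scales):
// real_value = int8_value * scales[channel] for each output channel.
void DequantPerChannelSketch(float* dst, const int8_t* src, int oc, int inner,
                             const std::vector<float>& scales) {
  for (int c = 0; c < oc; ++c) {
    for (int i = 0; i < inner; ++i) {
      dst[c * inner + i] = static_cast<float>(src[c * inner + i]) * scales[c];
    }
  }
}
```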
"; + + // Get input, filter and op attributes + const auto input_var_name = op_info->Input("Input").front(); + const auto& input_dims_nhwc = + scope->FindVar(input_var_name)->GetMutable()->dims(); + const auto input_dims = DimNHWC2NCHW(input_dims_nhwc); + const auto filter_var_name = op_info->Input("Filter").front(); + auto* filter = scope->FindVar(filter_var_name)->GetMutable(); + const auto& filter_dims = filter->dims(); + const auto output_var_name = op_info->Output("Output").front(); + const auto bs = input_dims[0]; + const auto oc = filter_dims[0]; + CHECK_EQ(input_dims.size(), 4); + CHECK_EQ(filter_dims.size(), 4); + const auto strides = op_info->GetAttr>("strides"); + auto dilations = op_info->GetAttr>("dilations"); + auto paddings = op_info->GetAttr>("paddings"); + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "Paddings size should be the same or twice as the input size."; + + const std::string padding_algorithm = + op_info->HasAttr("padding_algorithm") + ? op_info->GetAttr("padding_algorithm") + : ""; + + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); + + std::vector output_shape({bs, oc}); + for (size_t i = 0; i < 2; i++) { + const int dkernel = dilations[i] * (filter_dims[2 + i] - 1) + 1; + output_shape.push_back( + (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) / + strides[i] + + 1); + } + + const auto output_shape_nhwc = DimNCHW2NHWC(output_shape); + const auto output_tensor = graph->AddNode(output_var_name, + output_shape_nhwc, + CNML_TENSOR, + CNML_NHWC, + graph->FPType()); + scope->FindVar(output_var_name) + ->GetMutable<::paddle::lite::Tensor>() + ->Resize(output_shape_nhwc); + + // Create filter node + const auto filter_tensor = graph->AddNode(filter_var_name, + filter_dims.Vectorize(), + CNML_FILTER, + CNML_NCHW, + graph->FPType()); + const auto weight_scale = + op_info->GetAttr>("weight_scale"); + + if (filter->precision() == PrecisionType::kUnk || + filter->precision() == PrecisionType::kInt8) { + std::vector filter_dequant(filter->data_size()); + dequant(filter_dequant.data(), + filter->mutable_data(), + 1, + filter_dims[0], + filter_dims[1] * filter_dims[2] * filter_dims[3], + weight_scale); + transpose(filter_dequant.data(), + filter->mutable_data(), + {static_cast(filter_dims[0]), + static_cast(filter_dims[1]), + static_cast(filter_dims[2]), + static_cast(filter_dims[3])}, + {0, 2, 3, 1}); + filter->set_precision(PrecisionType::kFloat); + } else if (filter->precision() != PrecisionType::kFloat) { + LOG(FATAL) << "UnSupported weight precision!"; + } + + cnmlConvOpParam_t conv_param; + CNML_CALL(cnmlCreateConvOpParam(&conv_param, + strides[0], + strides[1], + dilations[0], + dilations[1], + paddings[0] * 2, + paddings[2] * 2)); + std::string bias_var_name; + std::shared_ptr bias_tensor; + if (HasInputArg(op_info, scope, "Bias")) { + const DDim output_dims(output_shape); + bias_var_name = op_info->Input("Bias").front(); + auto* bias = scope->FindVar(bias_var_name)->GetMutable(); + const auto& bias_dims = bias->dims(); + const auto bias_data_size = bias_dims.production(); + const auto output_data_size = output_dims.production(); + std::vector bias_shape; + if (bias_data_size == oc) { + // 0: {oc} + bias_shape = {oc}; + } else if 
(bias_data_size == output_data_size / bs) { + LOG(FATAL) << "Unsupported ... ..."; + // 1: {1, oc, oh, ow} + bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]}; + } else if (bias_data_size == output_data_size) { + LOG(FATAL) << "Unsupported ... ..."; + // 2: {n, oc, oh, ow} + bias_shape = output_dims.Vectorize(); + } else { + LOG(ERROR) << "[MLU] Bias dimension " << bias_dims + << " isn't supported in conv2d Op when output dimension is " + << output_dims; + } + bias_tensor = graph->AddNode(bias_var_name, + bias_dims.Vectorize(), + CNML_CONST, + CNML_CNHW, + graph->FPType()); + graph->BindConstData(bias_var_name, bias); + } + cnmlBaseOp_t conv_op; + const auto input_scale = op_info->GetAttr("input_scale"); + CNML_CALL(cnmlCreateConvOpForward( + &conv_op, + conv_param, + graph->GetNode(input_var_name)->mlu_tensor(), + output_tensor->mlu_tensor(), + filter_tensor->mlu_tensor(), + bias_tensor ? bias_tensor->mlu_tensor() : nullptr)); + + graph->SetComputingDataType( + conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale); + graph->SetComputingDataType( + conv_op, + filter_tensor->mlu_tensor(), + 1 / *min_element(weight_scale.begin(), weight_scale.end())); + CNML_CALL(cnmlSetOperationComputingLayout(conv_op, CNML_NHWC)); + if (HasInputArg(op_info, scope, "Bias")) { + auto* bias = scope->FindVar(bias_var_name)->GetMutable(); + graph->BindConstData(bias_var_name, bias); + } + graph->BindConstData(filter_var_name, filter); + graph->FuseOp(conv_op); + CNML_CALL(cnmlDestroyConvOpParam(&conv_param)); + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(conv2d, + kMLU, + paddle::lite::subgraph::mlu::ConvConverter); +REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d, + kMLU, + paddle::lite::subgraph::mlu::ConvConverter); diff --git a/lite/kernels/mlu/bridges/conv_op_test.cc b/lite/kernels/mlu/bridges/conv_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e8ef9ba04fd6126f00f4ee2ff869495929bfdc9a --- /dev/null +++ b/lite/kernels/mlu/bridges/conv_op_test.cc @@ -0,0 +1,350 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
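The conv bridge above rebuilds float filters from int8 weights with one scale per output channel before binding them as CNML const data. A minimal standalone sketch of that per-channel dequantization (the name and signature here are illustrative; the real dequant() helper lives in the MLU bridge utility code and is not shown in this diff):

#include <cstdint>
#include <vector>

// Illustrative only: dequantize int8 weights, one scale per output channel.
void DequantPerChannel(float* dst,
                       const int8_t* src,
                       int channels,
                       int chw,  // elements per output channel
                       const std::vector<float>& scales) {
  for (int c = 0; c < channels; ++c) {
    for (int i = 0; i < chw; ++i) {
      // Each output channel carries its own quantization scale.
      dst[c * chw + i] = src[c * chw + i] * scales[c];
    }
  }
}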
+
+#include "lite/operators/conv_op.h"
+#include <gtest/gtest.h>
+#include <random>
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+#include "lite/kernels/mlu/bridges/test_helper.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+int ConvConverter(void* ctx, OpLite* op);
+
+void conv_ref(const std::shared_ptr<operators::ConvOpLite> op) {
+  Scope* scope = op->scope();
+  const OpInfo* op_info = op->op_info();
+  auto input =
+      scope->FindVar(op_info->Input("Input").front())->GetMutable<Tensor>();
+  auto filter =
+      scope->FindVar(op_info->Input("Filter").front())->GetMutable<Tensor>();
+  auto output =
+      scope->FindVar(op_info->Output("Output").front())->GetMutable<Tensor>();
+  std::vector<int32_t> strides =
+      op_info->GetAttr<std::vector<int32_t>>("strides");
+  std::vector<int32_t> paddings =
+      op_info->GetAttr<std::vector<int32_t>>("paddings");
+  int32_t groups = op_info->GetAttr<int32_t>("groups");
+  std::vector<int32_t> dilations =
+      op_info->GetAttr<std::vector<int32_t>>("dilations");
+  bool fuse_relu = op_info->GetAttr<bool>("fuse_relu");
+  auto input_dims = input->dims();
+  auto filter_dims = filter->dims();
+  auto output_dims = output->dims();
+  auto input_data = input->mutable_data<float>();
+  auto filter_data = filter->mutable_data<float>();
+  auto output_data = output->mutable_data<float>();
+  int kernel_w = filter_dims[3];
+  int kernel_h = filter_dims[2];
+  int stride_w = strides[1];
+  int stride_h = strides[0];
+  int dila_w = dilations[1];
+  int dila_h = dilations[0];
+  int pad_w = paddings[2];
+  int pad_h = paddings[0];
+  int batch_size = input_dims[0];
+  int in_ch_size = input_dims[1];
+  int in_h = input_dims[2];
+  int in_w = input_dims[3];
+  int out_ch_size = output_dims[1];
+  int out_h = output_dims[2];
+  int out_w = output_dims[3];
+  int out_c_group = out_ch_size / groups;
+  int in_c_group = in_ch_size / groups;
+  Tensor* bias = nullptr;
+  float* bias_data = nullptr;
+  bool is_channel_bias = false;
+  if (op_info->HasInput("Bias")) {
+    auto bias_var_names = op_info->Input("Bias");
+    if (bias_var_names.size() > 0) {
+      auto bias_var_name = bias_var_names.front();
+      bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();
+      auto bias_dims = bias->dims();
+      is_channel_bias = bias_dims.production() == out_ch_size;
+      bias_data = bias->mutable_data<float>();
+    }
+  }
+  for (int n = 0; n < batch_size; ++n) {
+    for (int g = 0; g < groups; ++g) {
+      for (int oc = 0; oc < out_c_group; ++oc) {
+        for (int oh = 0; oh < out_h; ++oh) {
+          for (int ow = 0; ow < out_w; ++ow) {
+            int out_idx = n * groups * out_c_group * out_h * out_w +
+                          g * out_c_group * out_h * out_w + oc * out_h * out_w +
+                          oh * out_w + ow;
+            float out_value =
+                bias_data != nullptr
+                    ? (is_channel_bias ? bias_data[g * out_c_group + oc]
+                                       : bias_data[out_idx])
+                    : 0;
+            // out_value *= beta;
+            for (int ic = 0; ic < in_c_group; ++ic) {
+              for (int kh = 0; kh < kernel_h; ++kh) {
+                for (int kw = 0; kw < kernel_w; ++kw) {
+                  int iw = ow * stride_w - pad_w + kw * (dila_w);
+                  int ih = oh * stride_h - pad_h + kh * (dila_h);
+                  if (iw < 0 || iw >= in_w) continue;
+                  if (ih < 0 || ih >= in_h) continue;
+                  int in_idx = n * in_ch_size * in_h * in_w +
+                               g * in_c_group * in_h * in_w + ic * in_h * in_w +
+                               ih * in_w + iw;
+                  int filter_idx =
+                      g * out_c_group * in_c_group * kernel_h * kernel_w +
+                      oc * in_c_group * kernel_h * kernel_w +
+                      ic * kernel_h * kernel_w + kh * kernel_w + kw;
+                  out_value += input_data[in_idx] * filter_data[filter_idx];
+                }
+              }
+            }
+            if (fuse_relu) {
+              out_value = out_value > 0 ?
out_value : 0; + } + output_data[out_idx] = out_value; + } + } + } + } + } +} + +void test_conv(int bs, + int ic, + int oc, + int ih, + int iw, + bool has_bias, + bool is_channel_bias, + bool fuse_relu, + bool depthwise, + int dilation, + int stride, + int padding, + int kernel) { + // prepare input&output variables + Scope scope; + std::string input_var_name("input"); + std::string filter_var_name("filter"); + std::string filter_int_var_name("filter_int"); + std::string bias_var_name("bias"); + std::string output_var_name("output"); + std::string output_ref_var_name("output_ref"); + auto* input = scope.Var(input_var_name)->GetMutable(); + auto* filter = scope.Var(filter_var_name)->GetMutable(); + auto* filter_int = scope.Var(filter_int_var_name)->GetMutable(); + auto* bias = scope.Var(bias_var_name)->GetMutable(); + auto* output = scope.Var(output_var_name)->GetMutable(); + auto* output_ref = scope.Var(output_ref_var_name)->GetMutable(); + + // get group size and input&filter shape + int groups = 1; + if (depthwise) { // depthwise convolution ? + groups = oc = ic; + } + std::vector input_shape = {bs, ic, ih, iw}; + std::vector filter_shape = {oc, ic / groups, kernel, kernel}; + std::vector output_shape({bs, oc}); + for (size_t i = 0; i < 2; i++) { + const int dkernel = dilation * (kernel - 1) + 1; + int output_size = (input_shape[i + 2] + 2 * padding - dkernel) / stride + 1; + output_shape.push_back(output_size); + } + input->Resize(input_shape); + filter->Resize(filter_shape); + filter_int->Resize(filter_shape); + // initialize input&output data + FillTensor(filter_int, -4, 4); + float filter_scale = 1. / 16; + float input_scale = 1. / 8; + + Tensor input_int; + input_int.Resize(input_shape); + FillTensor(&input_int, -127, 127); + for (int i = 0; i < input->data_size(); i++) { + input->mutable_data()[i] = input_int.data()[i] * input_scale; + } + for (int i = 0; i < filter->data_size(); i++) { + filter->mutable_data()[i] = + filter_int->data()[i] * filter_scale; + } + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType(depthwise ? "depthwise_conv2d" : "conv2d"); + opdesc.SetInput("Input", {input_var_name}); + opdesc.SetInput("Filter", {filter_var_name}); + opdesc.SetOutput("Output", {output_var_name}); + opdesc.SetAttr("dilations", std::vector({dilation, dilation})); + opdesc.SetAttr("strides", std::vector({stride, stride})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); + opdesc.SetAttr("groups", groups); + opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); + if (has_bias) { + if (is_channel_bias) { + bias->Resize({oc}); + } else { + bias->Resize({output_shape}); + } + FillTensor(bias); + opdesc.SetInput("Bias", {bias_var_name}); + } + + auto op_cpu = CreateOp(opdesc, &scope); + // execute reference implementation and save to output tensor('out') + conv_ref(op_cpu); + output_ref->CopyDataFrom(*output); + + // initialize op desc + cpp::OpDesc opdesc_mlu; + opdesc_mlu.SetType(depthwise ? 
"depthwise_conv2d" : "conv2d"); + opdesc_mlu.SetInput("Input", {input_var_name}); + opdesc_mlu.SetInput("Filter", {filter_int_var_name}); + opdesc_mlu.SetOutput("Output", {output_var_name}); + opdesc_mlu.SetAttr("dilations", std::vector({dilation, dilation})); + opdesc_mlu.SetAttr("strides", std::vector({stride, stride})); + opdesc_mlu.SetAttr( + "paddings", std::vector({padding, padding, padding, padding})); + opdesc_mlu.SetAttr("groups", groups); + opdesc_mlu.SetAttr("fuse_relu", static_cast(fuse_relu)); + + opdesc_mlu.SetAttr("weight_scale", std::vector(oc, filter_scale)); + opdesc_mlu.SetAttr("input_scale", input_scale); + + if (has_bias) { + if (is_channel_bias) { + bias->Resize({oc}); + } else { + bias->Resize({output_shape}); + } + FillTensor(bias); + opdesc_mlu.SetInput("Bias", {bias_var_name}); + } + + for (int i = 0; i < bs; i++) { + for (int j = 0; j < ic; j++) { + for (int k = 0; k < ih * iw; k++) { + input->mutable_data()[i * ic * ih * iw + k * ic + j] = + input_int.data()[i * ic * ih * iw + j * ih * iw + k] * + input_scale; + } + } + } + + input->Resize({bs, ih, iw, ic}); + output->Resize( + {output_shape[0], output_shape[2], output_shape[3], output_shape[1]}); + + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc_mlu, &scope); + LaunchOp(op, {input_var_name}, {output_var_name}); + // compare results + auto* output_data = output->mutable_data(); + auto* output_ref_data = output_ref->mutable_data(); + Tensor output_trans; + output_trans.Resize({output_shape}); + transpose(output_data, + output_trans.mutable_data(), + {static_cast(output_shape[0]), + static_cast(output_shape[2]), + static_cast(output_shape[3]), + static_cast(output_shape[1])}, + {0, 3, 1, 2}); + output_data = output_trans.mutable_data(); + for (int i = 0; i < output->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } +} + +TEST(MLUBridges, conv) { +#if 1 + for (auto bs : {1}) { + for (auto ic : {3}) { + for (auto oc : {32}) { + for (auto ih : {13}) { + for (auto iw : {13}) { + for (auto has_bias : {false}) { + for (auto is_channel_bias : {true}) { + for (auto fuse_relu : {false}) { + for (auto depthwise : {false}) { + for (auto dilation : {1}) { + for (auto stride : {1}) { + for (auto kernel : {3}) { + // std::vector paddings = {kernel / 2}; + std::vector paddings = {0}; + if (kernel / 2 != 0) { + // paddings.push_back(0); + } + for (auto padding : paddings) { + VLOG(3) << "bs: " << bs << " ic: " << ic + << " oc: " << oc << " ih: " << ih + << " iw: " << iw + << " has_bias: " << has_bias + << " is_channel_bias: " << is_channel_bias + << " fuse_relu: " << fuse_relu + << " depthwise: " << depthwise + << " dilation: " << dilation + << " stride: " << stride + << " padding: " << padding + << " kernel: " << kernel; + test_conv(bs, + ic, + oc, + ih, + iw, + has_bias, + is_channel_bias, + fuse_relu, + depthwise, + dilation, + stride, + padding, + kernel); + } + } + } + } + } + } + } + } + } + } + } + } + } +#else + test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 1, 3); + test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 0, 3); + test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 2, 5); + test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 0, 5); +#endif +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(MLU, + conv2d, + paddle::lite::subgraph::mlu::ConvConverter); +REGISTER_SUBGRAPH_BRIDGE(MLU, + depthwise_conv2d, + 
paddle::lite::subgraph::mlu::ConvConverter); diff --git a/lite/kernels/mlu/bridges/elementwise_ops.cc b/lite/kernels/mlu/bridges/elementwise_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..4ef949925d20e0a2cb1c7f25d840e2041d79dd7a --- /dev/null +++ b/lite/kernels/mlu/bridges/elementwise_ops.cc @@ -0,0 +1,153 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +std::vector CvtYShape(const Tensor& x, Tensor* y, int axis) { + auto x_dims = x.dims(); + CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x"; + auto y_dims = y->dims(); + CHECK_GE(x_dims.size(), y_dims.size()); + + if (axis < 0) { + axis += x_dims.size(); + } + + std::vector y_new_shape(y_dims.Vectorize()); + if (y_new_shape.size() == 4UL) { + return y_new_shape; + } + for (int i = 0; i < axis; i++) { + y_new_shape.insert(y_new_shape.begin(), 1); + } + while (y_new_shape.size() < 4) { + y_new_shape.push_back(1); + } + CHECK_EQ(y_new_shape.size(), 4UL); + return y_new_shape; +} + +int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X").front(); + auto y_var_name = op_info->Input("Y").front(); + auto out_var_name = op_info->Output("Out").front(); + auto axis = op_info->GetAttr("axis"); + + auto x_tensor = graph->GetNode(x_var_name); + auto x = scope->FindTensor(x_var_name); + std::shared_ptr y_tensor; + if (graph->HasNode(y_var_name)) { + y_tensor = graph->GetNode(y_var_name); + } else { + auto y = scope->FindMutableTensor(y_var_name); + auto y_new_shape = CvtYShape(*x, y, axis); + // all subgraph input tensor are built at first + // If we can not find the tensor, it should be const tensor + y_tensor = graph->AddNode( + y_var_name, y_new_shape, CNML_CONST, CNML_NCHW, graph->FPType()); + graph->BindConstData(y_var_name, y); + } + + auto output_tensor = graph->AddNode(out_var_name, + x->dims().Vectorize(), + CNML_TENSOR, + CNML_NHWC, + graph->FPType()); + + cnmlBaseOp_t elementwise_op; + if (op_type == "elementwise_add") { + CNML_CALL(cnmlCreateBroadcastAddOp(&elementwise_op, + x_tensor->mlu_tensor(), + y_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + } else if (op_type == "fusion_elementwise_add_activation") { + auto mid_tensor = graph->AddNode(out_var_name + "_mid", + x->dims().Vectorize(), + CNML_TENSOR, + CNML_NHWC, + graph->FPType()); + CNML_CALL(cnmlCreateBroadcastAddOp(&elementwise_op, + x_tensor->mlu_tensor(), + y_tensor->mlu_tensor(), + mid_tensor->mlu_tensor())); + } else if (op_type == "elementwise_sub") { 
+ CNML_CALL(cnmlCreateBroadcastSubOp(&elementwise_op, + x_tensor->mlu_tensor(), + y_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + } else if (op_type == "elementwise_mul") { + CNML_CALL(cnmlCreateBroadcastMultOp(&elementwise_op, + x_tensor->mlu_tensor(), + y_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + } else if (op_type == "elementwise_div") { + CNML_CALL(cnmlCreateRealDivOp(&elementwise_op, + x_tensor->mlu_tensor(), + y_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + } else { + LOG(WARNING) << "[MLU] Unsupported op type: " << op_type; + return FAILED; + } + + graph->FuseOp(elementwise_op); + cnmlBaseOp_t act_op; + if (op_type == "fusion_elementwise_add_activation") { + auto mid_tensor = graph->GetNode(out_var_name + "_mid"); + auto type_string = op_info->GetAttr("act_type"); + cnmlActiveFunction_t act_type = OpTypeToCNMLActType(type_string); + CNML_CALL(cnmlCreateActiveOp(&act_op, + act_type, + mid_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + graph->FuseOp(act_op); + } + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(elementwise_add, + kMLU, + paddle::lite::subgraph::mlu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, + kMLU, + paddle::lite::subgraph::mlu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(elementwise_sub, + kMLU, + paddle::lite::subgraph::mlu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(elementwise_mul, + kMLU, + paddle::lite::subgraph::mlu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(elementwise_div, + kMLU, + paddle::lite::subgraph::mlu::ElementwiseConverter); diff --git a/lite/kernels/mlu/bridges/elementwise_ops_test.cc b/lite/kernels/mlu/bridges/elementwise_ops_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..388aa68600e180945d19e1a4e4728cf26bf801e1 --- /dev/null +++ b/lite/kernels/mlu/bridges/elementwise_ops_test.cc @@ -0,0 +1,198 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
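CvtYShape above pads the Y operand to rank 4 so CNML's broadcast ops see matching layouts: `axis` leading 1s are prepended, then trailing 1s are appended. The same rule as a standalone sketch (the helper name here is hypothetical):

#include <cstdint>
#include <vector>

// Pad a Y shape to rank 4: `axis` leading 1s, then trailing 1s.
std::vector<int64_t> PadYShapeTo4D(std::vector<int64_t> y_shape, int axis) {
  if (y_shape.size() == 4) return y_shape;
  y_shape.insert(y_shape.begin(), axis, 1);
  while (y_shape.size() < 4) y_shape.push_back(1);
  return y_shape;
}
// e.g. X = {1, 2, 3, 4}, Y = {2}, axis = 1  ->  Y' = {1, 2, 1, 1}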
+ +#include "lite/operators/elementwise_ops.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int ElementwiseConverter(void* ctx, OpLite* op); + +template +void elementwise_add_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindTensor("x"); + auto y = scope->FindTensor("y"); + auto out = scope->FindMutableTensor("out_ref"); + out->Resize(x->dims()); + + auto x_data = x->data(); + auto y_data = y->data(); + auto out_data = out->mutable_data(); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + int axis = op_info->GetAttr("axis"); + + if (axis < 0) { + axis += x_dims.size(); + } + int batch = 1; + int channels = y->numel(); + int num = x->numel() / channels / batch; + // do elementwise add/sub/max... + std::string op_type = op_info->Type(); + if (op_type == "elementwise_add") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr + diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (op_type == "elementwise_sub") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr - diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (op_type == "elementwise_mul") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr * diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (op_type == "elementwise_div") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr / diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (op_type == "elementwise_max") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = std::max(*din_ptr, diny_data); + dout_ptr++; + din_ptr++; + } + } + } + } else { + LOG(FATAL) << "unsupported Elementwise type: " << op_type; + } +} + +void test_elementwise_add(const std::vector& x_shape, + const std::vector& y_shape, + int axis, + std::string elt_type) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string y_var_name = "y"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* y = scope.Var(y_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = 
scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(x_shape); + y->Resize(y_shape); + + // initialize input&output data + FillTensor(x, 1, 3); + FillTensor(y, 1, 3); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("elementwise_" + elt_type); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetInput("Y", {y_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", axis); + + // create and convert op to NPU model, then run it on NPU + auto op = CreateOp(opdesc, &scope); + + // execute reference implementation and save to output tensor + elementwise_add_ref(op); + out_ref->CopyDataFrom(*out); + + LaunchOp(op, {x_var_name, y_var_name}, {out_var_name}); + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(MLUBridges, elementwise_add) { + for (auto elt_type : {"add", "sub", "mul", "div"}) { + // test_elementwise_add({1, 2, 3, 4}, {2}, 1, elt_type); + // test_elementwise_add({1, 2, 3, 4}, {1, 2, 1, 1}, 1, elt_type); + test_elementwise_add({1, 2, 3, 4}, {1, 2, 3, 4}, 3, elt_type); + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(MLU, + elementwise_add, + paddle::lite::subgraph::mlu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(MLU, + elementwise_sub, + paddle::lite::subgraph::mlu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(MLU, + elementwise_mul, + paddle::lite::subgraph::mlu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(MLU, + elementwise_div, + paddle::lite::subgraph::mlu::ElementwiseConverter); diff --git a/lite/kernels/mlu/bridges/fc_op.cc b/lite/kernels/mlu/bridges/fc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..43a75daa2b3d2d6200f3607e213ab62ee6ba3cdb --- /dev/null +++ b/lite/kernels/mlu/bridges/fc_op.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
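The FC bridge that follows registers W with shape {w_dims[1], w_dims[0]}, i.e. {N, K}, and later re-lays out Paddle's row-major {K, N} weights to match, under the assumption that cnmlCreateMlpOp consumes output-channel-major weights. A sketch of that repacking (hypothetical name, mirroring the loop in FCConverter below):

#include <vector>

// Repack a row-major {K, N} matrix into {N, K}.
void RepackKNToNK(const std::vector<float>& kn, int k, int n, float* nk) {
  for (int i = 0; i < n; ++i) {    // output channel
    for (int j = 0; j < k; ++j) {  // input feature
      nk[i * k + j] = kn[j * n + i];
    }
  }
}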
+
+#include "lite/kernels/mlu/bridges/graph.h"
+#include "lite/kernels/mlu/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[MLU] Converting " + op_type + "...";
+
+  auto x_var_name = op_info->Input("Input").front();
+  auto w_var_name = op_info->Input("W").front();
+  auto output_var_name = op_info->Output("Out").front();
+
+  // int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
+  auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
+  auto w = scope->FindVar(w_var_name)->GetMutable<Tensor>();
+  auto x_dims = x->dims();
+  auto w_dims = w->dims();
+
+  CHECK_GE(x_dims.size(), 2UL);
+  CHECK_EQ(w_dims.size(), 2UL);
+
+  // Create w node
+  std::vector<int64_t> w_shape{w_dims[1], w_dims[0]};
+  auto w_tensor = graph->AddNode(
+      w_var_name, w_shape, CNML_FILTER, CNML_NCHW, graph->FPType());
+
+  auto input_scale = op_info->GetAttr<float>("input_scale");
+
+  std::vector<int64_t> output_shape_nhwc({1, 1, 1, w_dims[1]});
+  auto output_tensor = graph->AddNode(output_var_name,
+                                      output_shape_nhwc,
+                                      CNML_TENSOR,
+                                      CNML_NHWC,
+                                      graph->FPType());
+  scope->FindVar(output_var_name)
+      ->GetMutable<::paddle::lite::Tensor>()
+      ->Resize(output_shape_nhwc);
+
+  std::string bias_var_name;
+  std::shared_ptr<MLUTensor> bias_tensor;
+  // Add bias node if bias tensor exists
+  if (HasInputArg(op_info, scope, "Bias")) {
+    bias_var_name = op_info->Input("Bias").front();
+    auto bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();
+    auto bias_dims = bias->dims();
+    CHECK(!graph->HasNode(bias_var_name));
+    // CHECK_EQ(bias_dims.production(), n);
+
+    bias_tensor = graph->AddNode(bias_var_name,
+                                 bias_dims.Vectorize(),
+                                 CNML_CONST,
+                                 CNML_CNHW,
+                                 graph->FPType());
+    graph->BindConstData(bias_var_name, bias);
+  }
+  cnmlBaseOp_t fc_op;
+  CNML_CALL(cnmlCreateMlpOp(&fc_op,
+                            graph->GetNode(x_var_name)->mlu_tensor(),
+                            output_tensor->mlu_tensor(),
+                            w_tensor->mlu_tensor(),
+                            bias_tensor ?
bias_tensor->mlu_tensor() : nullptr)); + graph->SetComputingDataType( + fc_op, graph->GetNode(x_var_name)->mlu_tensor(), 1 / input_scale); + auto weight_scale = op_info->GetAttr>("weight_scale"); + + // LOG(INFO) << "W precision " << int(w->precision()); + if (w->precision() == PrecisionType::kUnk || + w->precision() == PrecisionType::kInt8) { + std::vector w_dequant(w->data_size()); + dequant(w_dequant.data(), + w->mutable_data(), + 1, + w_dims[1], + w_dims[0], + weight_scale); + for (int i = 0; i < w_dims[1]; i++) { + for (int j = 0; j < w_dims[0]; j++) { + w->mutable_data()[i * w_dims[0] + j] = + w_dequant[i + j * w_dims[1]]; + } + } + w->set_precision(PrecisionType::kFloat); + } else if (w->precision() != PrecisionType::kFloat) { + LOG(FATAL) << "UnSupported weight precision!"; + } + // graph->BindConstData(w_var_name, w_dequant.data()); + graph->BindConstData(w_var_name, w); + + graph->SetComputingDataType( + fc_op, + w_tensor->mlu_tensor(), + 1 / *min_element(weight_scale.begin(), weight_scale.end())); + + graph->FuseOp(fc_op); + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(fc, kMLU, paddle::lite::subgraph::mlu::FCConverter); diff --git a/lite/kernels/mlu/bridges/fc_op_test.cc b/lite/kernels/mlu/bridges/fc_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7e5cfdb32e7d993f32403dc764462575181f9d4d --- /dev/null +++ b/lite/kernels/mlu/bridges/fc_op_test.cc @@ -0,0 +1,173 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
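The test below fabricates quantized data by scaling random integers, matching the symmetric scheme the converter declares through SetComputingDataType (which receives 1/scale). A minimal sketch of that scheme, with an illustrative name:

#include <cmath>
#include <cstdint>

// Symmetric int8 quantization: real = q * scale, q = round(real / scale).
int8_t QuantizeSym(float real, float scale) {
  float q = std::round(real / scale);
  if (q > 127.f) q = 127.f;    // saturate to the int8 range
  if (q < -127.f) q = -127.f;
  return static_cast<int8_t>(q);
}
// e.g. scale = 1.f / 1024: real 0.1f -> q = 102, which dequantizes
// back to ~0.0996f; real 0.5f saturates to q = 127.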
+ +#include "lite/operators/fc_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int FCConverter(void* ctx, OpLite* op); + +void fc_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto input = + scope->FindVar(op_info->Input("Input").front())->GetMutable(); + auto w = scope->FindVar(op_info->Input("W").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + int32_t in_num_col_dims = op_info->GetAttr("in_num_col_dims"); + Tensor* bias = nullptr; + float* bias_data = nullptr; + if (op_info->HasInput("Bias")) { + auto bias_var_names = op_info->Input("Bias"); + if (bias_var_names.size() > 0) { + auto bias_var_name = bias_var_names.front(); + bias = scope->FindVar(bias_var_name)->GetMutable(); + bias_data = bias->mutable_data(); + } + } + auto input_data = input->data(); + auto w_data = w->mutable_data(); + auto out_data = out->mutable_data(); + auto in_mat_dims = input->dims().Flatten2D(in_num_col_dims); + int out_num_classes = w->dims()[1]; + const int M = in_mat_dims[0]; + const int K = in_mat_dims[1]; + const int N = out_num_classes; + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + out_data[m * N + n] = 0; + for (int k = 0; k < K; ++k) { + out_data[m * N + n] += input_data[m * K + k] * w_data[k * N + n]; + } + } + } + if (bias_data != nullptr) { + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + out_data[m * N + n] += bias_data[n]; + } + } + } +} + +void test_fc(const std::vector& input_shape, + const std::vector& w_shape, + int in_num_col_dims, + bool has_bias) { + CHECK_EQ(w_shape.size(), 2UL); + + Scope scope; + std::string input_var_name("Input"); + std::string w_var_name("W"); + std::string w_int_var_name("W_int"); + std::string bias_var_name("Bias"); + std::string out_var_name("Out"); + std::string out_ref_var_name("out_ref"); + auto* input = scope.Var(input_var_name)->GetMutable(); + auto* w = scope.Var(w_var_name)->GetMutable(); + auto* w_int = scope.Var(w_int_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + input->Resize(input_shape); + w->Resize(w_shape); + w_int->Resize(w_shape); + + FillTensor(w_int, -127, 127); + float w_scale = 1. / 1024; + float input_scale = 1. 
/ 8; + + Tensor input_int; + input_int.Resize(input_shape); + FillTensor(&input_int, -127, 127); + for (int i = 0; i < input->data_size(); i++) { + input->mutable_data()[i] = input_int.data()[i] * input_scale; + } + + for (int i = 0; i < w->data_size(); i++) { + w->mutable_data()[i] = w_int->data()[i] * w_scale; + } + + // create fc op + cpp::OpDesc fc_op_desc; + fc_op_desc.SetType("fc"); + fc_op_desc.SetInput("Input", {input_var_name}); + fc_op_desc.SetInput("W", {w_var_name}); + fc_op_desc.SetOutput("Out", {out_var_name}); + fc_op_desc.SetAttr("in_num_col_dims", static_cast(in_num_col_dims)); + if (has_bias) { + auto* bias = scope.Var(bias_var_name)->GetMutable(); + bias->Resize({w_shape[1]}); + FillTensor(bias); + fc_op_desc.SetInput("Bias", {bias_var_name}); + } + + auto fc_op = CreateOp(fc_op_desc, &scope); + fc_ref(fc_op); + out_ref->CopyDataFrom(*out); + + // create fc imlu op + cpp::OpDesc fc_op_desc_mlu; + fc_op_desc_mlu.SetType("fc"); + fc_op_desc_mlu.SetInput("Input", {input_var_name}); + fc_op_desc_mlu.SetInput("W", {w_int_var_name}); + fc_op_desc_mlu.SetOutput("Out", {out_var_name}); + fc_op_desc_mlu.SetAttr("in_num_col_dims", static_cast(in_num_col_dims)); + + fc_op_desc_mlu.SetAttr("weight_scale", + std::vector(w_shape[1], w_scale)); + fc_op_desc_mlu.SetAttr("input_scale", input_scale); + if (has_bias) { + fc_op_desc_mlu.SetInput("Bias", {bias_var_name}); + } + + auto fc_op_mlu = CreateOp(fc_op_desc_mlu, &scope); + input->Resize({static_cast(input_shape[0]), + static_cast(input_shape[2]), + static_cast(input_shape[3]), + static_cast(input_shape[1])}); + out->Resize({static_cast(input_shape[0]), static_cast(w_shape[1])}); + LaunchOp(fc_op_mlu, {input_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(MLUBridges, fc) { + for (bool use_bias : {true, false}) { + // test_fc({1, 8, 8, 1}, {64, 4}, 1, use_bias); + // test_fc({1, 5, 5, 1}, {25, 7}, 1, use_bias); + // test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias); + test_fc({1, 1024, 1, 1}, {1024, 32}, 1, use_bias); + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(MLU, fc, paddle::lite::subgraph::mlu::FCConverter); diff --git a/lite/kernels/mlu/bridges/graph.cc b/lite/kernels/mlu/bridges/graph.cc new file mode 100644 index 0000000000000000000000000000000000000000..27c6ab2597fa6930b14c4c4e34750030608167b6 --- /dev/null +++ b/lite/kernels/mlu/bridges/graph.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
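graph.cc and graph.h below wrap a single cnmlFusionOp. The intended lifecycle, as a commented sketch (setup details omitted; the core-version value and shapes are placeholders, not taken from this diff):

// Graph graph;
// graph.SetFPType(PrecisionType::kFloat);
// auto x = graph.AddNode("x", {1, 3, 224, 224});   // input data node
// ...op converters add CNML ops, each calling graph.FuseOp(op)...
// graph.AddInput(x);
// graph.AddOutput(graph.GetNode("out"));
// graph.Compile(core_version, /*core_number=*/1);  // freezes the fusion op
// graph.Compute(forward_param, queue);             // enqueue and sync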
+
+#include "lite/kernels/mlu/bridges/graph.h"
+#include <memory>
+#include <utility>
+#include "lite/kernels/mlu/bridges/utility.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+std::shared_ptr<MLUTensor> Graph::AddNode(const std::string& name,
+                                          std::vector<int64_t> shape,
+                                          cnmlTensorType_t tensor_type,
+                                          cnmlDataOrder_t data_order,
+                                          cnmlDataType_t mlu_dtype,
+                                          void* raw_ptr) {
+  CHECK(!HasNode(name));
+  auto node = std::shared_ptr<MLUTensor>(
+      new MLUTensor(shape, tensor_type, data_order, mlu_dtype));
+  node->set_mlu_ptr(raw_ptr);
+  nodes_.insert(std::make_pair(name, node));
+  return node;
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/mlu/bridges/graph.h b/lite/kernels/mlu/bridges/graph.h
new file mode 100644
index 0000000000000000000000000000000000000000..140900a2dde004281945e50fb1c72d09b58befa1
--- /dev/null
+++ b/lite/kernels/mlu/bridges/graph.h
@@ -0,0 +1,166 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "lite/core/op_lite.h"
+#include "lite/core/tensor.h"
+#include "lite/kernels/mlu/bridges/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+// The context of the converters, used when converting the ops of a subgraph
+// to the MLU IR graph
+class Graph {
+ public:
+  Graph() { CNML_CALL(cnmlCreateFusionOp(&fusion_op_)); }
+
+  ~Graph() {
+    CNML_CALL(cnmlDestroyFusionOp(&fusion_op_));
+    for (auto op : ops_) {
+      CNML_CALL(cnmlDestroyBaseOp(&op));
+    }
+  }
+
+  // Data node
+  std::shared_ptr<MLUTensor> AddNode(
+      const std::string& name,
+      std::vector<int64_t> shape,
+      cnmlTensorType_t tensor_type = CNML_TENSOR,
+      cnmlDataOrder_t data_order = CNML_NCHW,
+      cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32,
+      void* raw_ptr = nullptr);
+
+  std::shared_ptr<MLUTensor> GetNode(const std::string& name) {
+    CHECK(HasNode(name)) << "[MLU] Node " << name << " not found.";
+    return nodes_.at(name);
+  }
+
+  bool HasNode(const std::string& name) {
+    return nodes_.find(name) != nodes_.end();
+  }
+
+  void AddInput(std::shared_ptr<MLUTensor> tensor) {
+    inputs_.push_back(tensor->mlu_tensor());
+    input_tensors_.push_back(tensor);
+  }
+
+  void AddOutput(std::shared_ptr<MLUTensor> tensor) {
+    outputs_.push_back(tensor->mlu_tensor());
+    output_tensors_.push_back(tensor);
+  }
+
+  void FuseOp(cnmlBaseOp_t op) { CNML_CALL(cnmlFuseOp(op, fusion_op_)); }
+
+  void Compile(cnmlCoreVersion_t core_version, int core_number) {
+    CNML_CALL(cnmlSetFusionIO(fusion_op_,
+                              inputs_.data(),
+                              inputs_.size(),
+                              outputs_.data(),
+                              outputs_.size()));
+    CNML_CALL(cnmlSetFusionOpCorenum(fusion_op_, core_number));
+    CNML_CALL(cnmlSetFusionOpCoreVersion(fusion_op_, core_version));
+    CNML_CALL(cnmlCompileFusionOp_V2(fusion_op_));
+    for (auto in : input_tensors_) {
+      input_addrs_.push_back(in->mlu_data());
+    }
+    for (auto out : output_tensors_) {
+      output_addrs_.push_back(out->mlu_data());
+    }
+  }
+
+  void
Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) { + CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_, + input_addrs_.data(), + input_addrs_.size(), + output_addrs_.data(), + output_addrs_.size(), + &forward_param, + que)); + CNRT_CALL(cnrtSyncQueue(que)); + } + + void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) { + const float* data = tensor->data(); + size_t len = tensor->data_size(); + if (fp_type_ == CNML_DATA_FLOAT32) { + CNML_CALL(cnmlBindConstData_V2( + nodes_[tensor_name]->mlu_tensor(), + const_cast(static_cast(data)), + false)); + } else if (fp_type_ == CNML_DATA_FLOAT16) { + auto* data_fp16 = tensor->mutable_data<::paddle::lite::fluid::float16>(); + for (size_t i = 0; i < len; ++i) { + data_fp16[i] = static_cast<::paddle::lite::fluid::float16>(data[i]); + } + CNML_CALL(cnmlBindConstData_V2(nodes_[tensor_name]->mlu_tensor(), + static_cast(data_fp16), + false)); + } else { + CHECK(0); + } + } + + void SetComputingDataType(cnmlBaseOp_t op, + cnmlTensor_t tensor, + float scale, + cnmlDataType_t data_type = CNML_DATA_INT8) { + cnmlQuantizedParam_t quant_param; + CNML_CALL( + cnmlCreateQuantizedParam(&quant_param, scale2position(scale), 1, 0.0)); + CNML_CALL( + cnmlSetOperationComputingDataType(op, tensor, data_type, quant_param)); + CNML_CALL(cnmlDestroyQuantizedParam(&quant_param)); + } + + void SetFPType(::paddle::lite_api::PrecisionType type) { + switch (type) { + case ::paddle::lite_api::PrecisionType::kFP16: + fp_type_ = CNML_DATA_FLOAT16; + break; + case ::paddle::lite_api::PrecisionType::kFloat: + fp_type_ = CNML_DATA_FLOAT32; + break; + default: + CHECK(0); + } + } + + cnmlDataType_t FPType() { return fp_type_; } + + private: + cnmlDataType_t fp_type_{CNML_DATA_FLOAT32}; + std::unordered_map> nodes_; + std::vector inputs_; + std::vector outputs_; + std::vector input_addrs_; + std::vector output_addrs_; + std::vector> input_tensors_; + std::vector> output_tensors_; + std::vector ops_; + cnmlFusionOp_t fusion_op_; +}; + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/mlu/bridges/paddle_use_bridges.h b/lite/kernels/mlu/bridges/paddle_use_bridges.h new file mode 100644 index 0000000000000000000000000000000000000000..1b12970afadd4e3bdcd7568c05bc15583ccbaaae --- /dev/null +++ b/lite/kernels/mlu/bridges/paddle_use_bridges.h @@ -0,0 +1,24 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
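SetComputingDataType above turns a float scale into an int8 fixed-point position via scale2position(), whose definition is not part of this diff. One plausible reading, sketched with a hypothetical name:

#include <cmath>

// Assumed semantics: pick the position p with scale ~= 2^-p, so an int8
// value q represents q * 2^-p.
int Scale2PositionSketch(float scale) {
  return static_cast<int>(-std::floor(std::log2(scale)));
}
// e.g. the converters pass 1 / input_scale: for input_scale = 1.f / 8
// that argument is 8.f, giving p = -3, while 1.f / 8 itself gives p = 3.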
+ +#pragma once + +USE_SUBGRAPH_BRIDGE(relu, kMLU); +USE_SUBGRAPH_BRIDGE(conv2d, kMLU); +USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU); +USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU); +USE_SUBGRAPH_BRIDGE(pool2d, kMLU); +USE_SUBGRAPH_BRIDGE(softmax, kMLU); +USE_SUBGRAPH_BRIDGE(batch_norm, kMLU); +USE_SUBGRAPH_BRIDGE(fc, kMLU); diff --git a/lite/kernels/mlu/bridges/pool_op.cc b/lite/kernels/mlu/bridges/pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3119b6c77dca10641c7c7c32072969fedb1ecef6 --- /dev/null +++ b/lite/kernels/mlu/bridges/pool_op.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/pool_op.h" +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +inline cnmlPoolMode_t ToCnmlPoolMode(const std::string& pool_mode) { + cnmlPoolMode_t cnml_pool_mode; + if (pool_mode == "max") { + cnml_pool_mode = CNML_POOL_MAX; + } else if (pool_mode == "avg") { + cnml_pool_mode = CNML_POOL_AVG; + } else { + CHECK(false) << "Unexpected pool mode " << pool_mode; + } + + return cnml_pool_mode; +} + +int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get input, and attributes + auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindTensor(x_var_name); + auto input_dims_nhwc = x->dims(); + const auto input_dims = DimNHWC2NCHW(input_dims_nhwc); + auto output_var_name = op_info->Output("Out").front(); + auto pooling_type = op_info->GetAttr("pooling_type"); + auto ceil_mode = op_info->GetAttr("ceil_mode"); + auto paddings = op_info->GetAttr>("paddings"); + auto global_pooling = op_info->GetAttr("global_pooling"); + auto ksize = op_info->GetAttr>("ksize"); + auto strides = op_info->GetAttr>("strides"); + + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + int pad_height = paddings[0]; + int pad_width = paddings[2]; + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + bool adaptive = false; + if (op_info->HasAttr("adaptive")) { + adaptive = op_info->GetAttr("adaptive"); + } + lite::operators::UpdatePadding(&paddings, + global_pooling, + adaptive, + padding_algorithm, + x->dims(), + strides, + ksize); + + std::vector output_shape({input_dims[0], input_dims[1]}); + for (size_t i = 0; i < 2; i++) { + output_shape.push_back( + (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - ksize[0]) / + strides[i] + + 1); + } + + 
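+  // The loop above applies the usual pooling shape rule,
+  //   out = (in + pad_before + pad_after - ksize) / stride + 1,
+  // e.g. in = 6, ksize = 2, stride = 1, pad = 0  ->  (6 - 2) / 1 + 1 = 5.
+  // Note that ksize[0] is used for both spatial dims in this computation.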
auto output_shape_nhwc = DimNCHW2NHWC(output_shape); + auto output_tensor = graph->AddNode(output_var_name, + output_shape_nhwc, + CNML_TENSOR, + CNML_NHWC, + graph->FPType()); + scope->FindVar(output_var_name) + ->GetMutable<::paddle::lite::Tensor>() + ->Resize(output_shape_nhwc); + + cnmlPoolOpParam_t pool_param; + CNML_CALL( + cnmlCreatePoolOpParam_V2(&pool_param, + ksize[0], + ksize[1], + strides[0], + strides[1], + pad_height, + pad_width, + 1, // dilation + 1, + ToCnmlPoolMode(pooling_type), + ceil_mode ? CNML_POOL_KVALID : CNML_POOL_KFULL, + true, /* real */ + 1 /* blend factor */)); + cnmlBaseOp_t pool_op; + CNML_CALL(cnmlCreatePoolOp(&pool_op, + pool_param, + graph->GetNode(x_var_name)->mlu_tensor(), + output_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyPoolOpParam(&pool_param)); + graph->FuseOp(pool_op); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(pool2d, + kMLU, + paddle::lite::subgraph::mlu::PoolConverter); diff --git a/lite/kernels/mlu/bridges/pool_op_test.cc b/lite/kernels/mlu/bridges/pool_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..29ef68781f4a99ebcc20901dabab6ee22a258424 --- /dev/null +++ b/lite/kernels/mlu/bridges/pool_op_test.cc @@ -0,0 +1,280 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
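The pool bridge above, like conv, converts dim vectors with the DimNHWC2NCHW / DimNCHW2NHWC helpers from the bridge utility header, which this diff does not show. Their assumed behavior, sketched:

#include <cstdint>
#include <vector>

// Assumed behavior of DimNCHW2NHWC: reorder a 4-D dim vector from NCHW
// to NHWC. (The real helper lives in lite/kernels/mlu/bridges/utility.h
// and may differ in signature.)
std::vector<int64_t> DimNCHW2NHWCSketch(const std::vector<int64_t>& d) {
  return {d[0], d[2], d[3], d[1]};  // N, H, W, C
}
// DimNHWC2NCHW is the inverse: {d[0], d[3], d[1], d[2]}.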
+ +#include "lite/operators/pool_op.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int PoolConverter(void* ctx, OpLite* op); + +void pool_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto& in_dims = x->dims(); + auto& out_dims = out->dims(); + + const float* src_ptr = x->data(); + float* dst_ptr = out->mutable_data(); + + std::vector ksize = op_info->GetAttr>("ksize"); + std::vector strides = op_info->GetAttr>("strides"); + std::vector paddings = op_info->GetAttr>("paddings"); + bool exclusive = op_info->GetAttr("exclusive"); + std::string pooling_type = op_info->GetAttr("pooling_type"); + bool global_pooling = op_info->GetAttr("global_pooling"); + + int in_n = in_dims[0]; + int in_c = in_dims[1]; + int in_h = in_dims[2]; + int in_w = in_dims[3]; + int size_in_n = in_c * in_h * in_w; + int size_in_c = in_h * in_w; + + int out_h = out_dims[2]; + int out_w = out_dims[3]; + int size_out_n = in_c * out_h * out_w; + int size_out_c = out_h * out_w; + + int window_h = ksize[0]; + int window_w = ksize[1]; + int stride_h = strides[0]; + int stride_w = strides[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; + + if (global_pooling == true) { + for (int n = 0; n < in_n; ++n) { + for (int c = 0; c < in_c; ++c) { + const float* src = src_ptr + n * size_in_n + c * size_in_c; + float res = src[0]; + if (pooling_type == "max") { + for (int i = 1; i < size_in_c; ++i) { + float cur_val = src[i]; + res = cur_val > res ? cur_val : res; + } + } else if (pooling_type == "avg") { + for (int i = 1; i < size_in_c; ++i) { + float cur_val = src[i]; + res += cur_val; + } + res /= size_in_c; + } + dst_ptr[n * size_out_n + c] = res; + } + } + } else { + for (int n = 0; n < in_n; ++n) { + for (int c = 0; c < in_c; ++c) { + for (int h = 0; h < out_h; ++h) { + int sh = h * stride_h; + int eh = sh + window_h; + sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; + eh = (eh - pad_h) > in_h ? in_h : eh - pad_h; + for (int w = 0; w < out_w; ++w) { + int sw = w * stride_w; + int ew = sw + window_w; + sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; + ew = (ew - pad_w) > in_w ? in_w : ew - pad_w; + int pooling_size = (ew - sw) * (eh - sh); + if (pooling_size == 0) continue; + float res = 0.f; + for (int kh = sh; kh < eh; ++kh) { + for (int kw = sw; kw < ew; ++kw) { + int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw; + if (kh == sh && kw == sw) { + res = src_ptr[src_idx]; + } else { + if (pooling_type == "max") { + res = res >= src_ptr[src_idx] ? 
res : src_ptr[src_idx]; + } + if (pooling_type == "avg") { + res += src_ptr[src_idx]; + } + } + } + } + if (pooling_type == "avg") { + if (exclusive) { + res /= pooling_size; + } else { + res /= window_h * window_w; + } + } + dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res; + } + } + } + } + } +} + +void test_pool(int bs, + int ic, + int ih, + int iw, + std::string pooling_type, + bool ceil_mode, + bool global_pooling, + bool exclusive, + int ksize, + int stride, + int padding) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("pool2d"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("pooling_type", pooling_type); + opdesc.SetAttr("ksize", std::vector({ksize, ksize})); + opdesc.SetAttr("global_pooling", global_pooling); + opdesc.SetAttr("exclusive", exclusive); + opdesc.SetAttr("ceil_mode", ceil_mode); + opdesc.SetAttr("strides", std::vector({stride, stride})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); + + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + // execute reference implementation and save to output tensor + pool_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input_trans; + input_trans.Resize({bs, ic, ih, iw}); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {bs, ic, ih, iw}, + {0, 2, 3, 1}); + + auto os = out->dims(); + out->Resize({static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}); + x->CopyDataFrom(input_trans); + x->Resize({bs, ih, iw, ic}); + + LaunchOp(op, {x_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + Tensor output_trans; + output_trans.Resize(out->dims()); + transpose(out_data, + output_trans.mutable_data(), + {static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(MLUBridges, pool) { + // for (auto pooling_type : {"max", "avg"}) { + // for (auto ceil_mode : {true, false}) { + // for (auto global_pooling : {/*true, */ false}) { + // for (auto exclusive : {true /*, false*/}) { + // for (auto ksize : {2, 3}) { + // for (auto stride : {1, 2}) { + // for (auto padding : {0, 1}) { + // for (auto bs : {1, 3}) { + // for (auto ic : {1, 3}) { + // for (auto ih : {3, 7}) { + // for (auto iw : {3, 7}) { + // test_pool(bs, + // ic, + // ih, + // iw, + // pooling_type, + // ceil_mode, + // global_pooling, + // exclusive, + // ksize, + // stride, + // padding); + // } + // } + // } + // } + // } + // } + // } + // } + // } + // } + // } + + for (auto pooling_type : {"max", "avg"}) { + for (auto ceil_mode : {true, false}) { + bool global_pooling = false; + bool exclusive = true; + int ksize = 2; + int stride = 1; + int padding = 0; + int bs = 6; + int ic = 6; + int ih = 6; + int iw = 6; + test_pool(bs, + ic, + ih, + iw, + pooling_type, + ceil_mode, + 
global_pooling, + exclusive, + ksize, + stride, + padding); + } + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(MLU, + pool2d, + paddle::lite::subgraph::mlu::PoolConverter); diff --git a/lite/kernels/mlu/bridges/softmax_op.cc b/lite/kernels/mlu/bridges/softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b9e2b1116dc95ec276f8d85a5669cec45d98ea39 --- /dev/null +++ b/lite/kernels/mlu/bridges/softmax_op.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get op's attributes + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + // nchw axis to nhwc aixs + int nchw_to_nhwc_aixs_map[4] = {0, 3, 1, 2}; + int axis = 1; + if (op_info->HasAttr("axis")) { + axis = op_info->GetAttr("axis"); + if (axis < 0) { + axis = output_dims.size() + axis; + } + } + + int nhwc_axis = nchw_to_nhwc_aixs_map[axis]; + + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType()); + cnmlBaseOp_t softmax_op; + CNML_CALL(cnmlCreateNdSoftmaxOp(&softmax_op, + nhwc_axis, + graph->GetNode(x_var_name)->mlu_tensor(), + output_tensor->mlu_tensor())); + graph->FuseOp(softmax_op); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(softmax, + kMLU, + paddle::lite::subgraph::mlu::SoftmaxConverter); diff --git a/lite/kernels/mlu/bridges/softmax_op_test.cc b/lite/kernels/mlu/bridges/softmax_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7ceb050d8008f8186fdd737c394d8fe8dc0ffd7f --- /dev/null +++ b/lite/kernels/mlu/bridges/softmax_op_test.cc @@ -0,0 +1,176 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/softmax_op.h"
+#include <gtest/gtest.h>
+#include <limits>
+#include "lite/core/op_registry.h"
+#include "lite/kernels/mlu/bridges/test_helper.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel);
+
+template <typename dtype>
+void softmax_ref(const std::shared_ptr<operators::SoftmaxOp> op) {
+  Scope* scope = op->scope();
+  const OpInfo* op_info = op->op_info();
+  auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
+  auto out =
+      scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
+  auto x_data = x->data<dtype>();
+  auto out_data = out->mutable_data<dtype>();
+  DDim x_dims = x->dims();
+
+  auto x_rank = x_dims.size();
+  int axis = op_info->GetAttr<int>("axis");
+  if (axis < 0) {
+    axis += x_rank;
+  }
+  int axis_size = x_dims[axis];
+  int outer_num = x_dims.Slice(0, axis).production();
+  int inner_num = x_dims.Slice(axis + 1, x_rank).production();
+  int compute_size = outer_num * inner_num;
+  for (int i = 0; i < compute_size; i++) {
+    int idx_inner = i % inner_num;
+    int idx_outer = (i / inner_num) * axis_size;
+    int start = idx_outer * inner_num + idx_inner;
+    int offset;
+
+    offset = start;
+    dtype max_data = std::numeric_limits<dtype>::lowest();
+    for (int j = 0; j < axis_size; j++) {
+      max_data = x_data[offset] > max_data ? x_data[offset] : max_data;
+      offset += inner_num;
+    }
+
+    offset = start;
+    dtype sum_data = (dtype)0;
+    for (int j = 0; j < axis_size; j++) {
+      out_data[offset] = exp(x_data[offset] - max_data);
+      sum_data += out_data[offset];
+      offset += inner_num;
+    }
+
+    offset = start;
+    for (int j = 0; j < axis_size; j++) {
+      out_data[offset] /= sum_data;
+      offset += inner_num;
+    }
+  }
+}
+
+void test_softmax(const std::vector<int64_t>& input_shape, int axis) {
+  // prepare input&output variables
+  Scope scope;
+  std::string x_var_name = "x";
+  std::string out_var_name = "out";
+  std::string out_ref_var_name = "out_ref";
+  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
+  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
+  auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
+  x->Resize(input_shape);
+
+  // initialize input&output data
+  FillTensor<float>(x);
+
+  // initialize op desc
+  cpp::OpDesc opdesc;
+  opdesc.SetType("softmax");
+  opdesc.SetInput("X", {x_var_name});
+  opdesc.SetOutput("Out", {out_var_name});
+  opdesc.SetAttr("axis", axis);
+
+  // create and convert op to MLU model, then run it on MLU
+  auto op = CreateOp<operators::SoftmaxOp>(opdesc, &scope);
+  // execute reference implementation and save to output tensor
+  softmax_ref<float>(op);
+  out_ref->CopyDataFrom(*out);
+
+  int bs = x->dims()[0];
+  int ic = x->dims()[1];
+  int ih = x->dims()[2];
+  int iw = x->dims()[3];
+  Tensor input_trans;
+  input_trans.Resize({bs, ic, ih, iw});
+  transpose(x->mutable_data<float>(),
+            input_trans.mutable_data<float>(),
+            {bs, ic, ih, iw},
+            {0, 2, 3, 1});
+
+  out->Resize({bs, ih, iw, ic});
+  x->CopyDataFrom(input_trans);
+  x->Resize({bs, ih, iw, ic});
+
+  LaunchOp(op, {x_var_name}, {out_var_name});
+
+  // compare results
+  auto* out_data = out->mutable_data<float>();
+  auto* out_ref_data = out_ref->mutable_data<float>();
+  Tensor output_trans;
+  output_trans.Resize({bs, ic, ih, iw});
+  transpose(out_data,
+            output_trans.mutable_data<float>(),
+            {bs, ih, iw, ic},
+            {0, 3, 1, 2});
+  out_data = output_trans.mutable_data<float>();
+  for (int i = 0; i < out->dims().production(); i++) {
+    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
+  }
+}
+
+TEST(MLUBridges, softmax) {
+  // test_softmax({1, 4}, -1);
+  // // Bug exists in HiAI DDK when the number of items > 16500
+  // test_softmax({1, 16500}, -1);
+  // test_softmax({1, 4}, 0);
+  // test_softmax({1, 4}, 1);
+  // test_softmax({3, 4}, -1);
+  // test_softmax({3, 4}, 0);
+  // test_softmax({3, 4}, 1);
+  // test_softmax({1, 4, 7}, -1);
+  // test_softmax({1, 4, 7}, 0);
+  // // Bug exists in HiAI DDK when axis is 1 and iw > 1
+  // // test_softmax({1, 4, 7}, 1);
+  // test_softmax({1, 4, 1}, 1);
+  // test_softmax({1, 4, 7}, 2);
+  // test_softmax({3, 4, 7}, -1);
+  // test_softmax({3, 4, 7}, 0);
+  // test_softmax({3, 4, 1}, 1);
+  // test_softmax({3, 4, 7}, 2);
+  test_softmax({1, 4, 7, 9}, -1);
+  test_softmax({1, 4, 7, 9}, 0);
+  test_softmax({1, 4, 7, 9}, 1);
+  // Bug exists in HiAI DDK when axis is 2 and iw > 1
+  // test_softmax({1, 4, 7, 9}, 2);
+  test_softmax({1, 4, 7, 1}, 2);
+  test_softmax({1, 4, 7, 9}, 3);
+  test_softmax({3, 4, 7, 9}, -1);
+  test_softmax({3, 4, 7, 9}, 0);
+  test_softmax({3, 4, 7, 9}, 1);
+  test_softmax({3, 4, 7, 1}, 2);
+  test_softmax({3, 4, 7, 9}, 3);
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(softmax,
+                         kMLU,
+                         paddle::lite::subgraph::mlu::SoftmaxConverter);
diff --git a/lite/kernels/mlu/bridges/tensor.cc b/lite/kernels/mlu/bridges/tensor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..be7e1f09beaee61dace598b958ab4f95f14b38f8
--- /dev/null
+++ b/lite/kernels/mlu/bridges/tensor.cc
@@ -0,0 +1,271 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/mlu/bridges/tensor.h"
+#include <climits>
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+MLUTensor::MLUTensor(const std::vector<int64_t>& shape,
+                     cnmlTensorType_t tensor_type,
+                     cnmlDataOrder_t data_order,
+                     cnmlDataType_t mlu_dtype)
+    : mlu_tensor_(nullptr), tensor_type_(tensor_type), mlu_ptr_(nullptr) {
+  std::vector<int> int_shape;
+  for (auto i : shape) {
+    if (i <= INT_MAX) {
+      int_shape.push_back(i);
+    } else {
+      LOG(FATAL) << "Shape size is beyond the limitation of MLUTensor!";
+    }
+  }
+  remember(int_shape, tensor_type, mlu_dtype, data_order);
+}
+
+void MLUTensor::remember(const std::vector<int>& shape,
+                         cnmlTensorType_t tensor_type,
+                         cnmlDataType_t mlu_dtype,
+                         cnmlDataOrder_t shape_order) {
+  tensor_type_ = tensor_type;
+  mlu_dtype_ = mlu_dtype;
+
+  int size = 4;
+  if (shape.size() > 4 || shape_order == CNML_ARRAY) {
+    size = shape.size();
+  }
+  shape_.resize(size);
+  if (shape.size() <= 4) {
+    switch (shape_order) {
+      case CNML_NCHW:
+        shape_[0] = shape.size() > 0 ? shape[0] : 1;
+        shape_[3] = shape.size() > 1 ? shape[1] : 1;
+        shape_[1] = shape.size() > 2 ? shape[2] : 1;
+        shape_[2] = shape.size() > 3 ? shape[3] : 1;
+        break;
+      case CNML_NCWH:
+        shape_[0] = shape.size() > 0 ? shape[0] : 1;
+        shape_[3] = shape.size() > 1 ? shape[1] : 1;
+        shape_[2] = shape.size() > 3 ? shape[3] : 1;
+        shape_[1] = shape.size() > 2 ? shape[2] : 1;
+        break;
+      case CNML_NHWC:
+        shape_[0] = shape.size() > 0 ? shape[0] : 1;
+        shape_[3] = shape.size() > 3 ? shape[3] : 1;
+        shape_[1] = shape.size() > 1 ? shape[1] : 1;
+        shape_[2] = shape.size() > 2 ? shape[2] : 1;
+        break;
+      case CNML_NHCW:
+        shape_[0] = shape.size() > 0 ? shape[0] : 1;
+        shape_[3] = shape.size() > 2 ? shape[2] : 1;
+        shape_[1] = shape.size() > 1 ? shape[1] : 1;
+        shape_[2] = shape.size() > 3 ? shape[3] : 1;
+        break;
+      case CNML_NWCH:
+        shape_[0] = shape.size() > 0 ? shape[0] : 1;
+        shape_[3] = shape.size() > 2 ? shape[2] : 1;
+        shape_[1] = shape.size() > 3 ? shape[3] : 1;
+        shape_[2] = shape.size() > 1 ? shape[1] : 1;
+        break;
+      case CNML_NWHC:
+        shape_[0] = shape.size() > 0 ? shape[0] : 1;
+        shape_[3] = shape.size() > 3 ? shape[3] : 1;
+        shape_[1] = shape.size() > 2 ? shape[2] : 1;
+        shape_[2] = shape.size() > 1 ? shape[1] : 1;
+        break;
+      case CNML_CNHW:
+        shape_[0] = shape.size() > 1 ? shape[1] : 1;
+        shape_[3] = shape.size() > 0 ? shape[0] : 1;
+        shape_[1] = shape.size() > 2 ? shape[2] : 1;
+        shape_[2] = shape.size() > 3 ? shape[3] : 1;
+        break;
+      case CNML_CNWH:
+        shape_[0] = shape.size() > 1 ? shape[1] : 1;
+        shape_[3] = shape.size() > 0 ? shape[0] : 1;
+        shape_[1] = shape.size() > 3 ? shape[3] : 1;
+        shape_[2] = shape.size() > 2 ? shape[2] : 1;
+        break;
+      case CNML_CHWN:
+        shape_[0] = shape.size() > 3 ? shape[3] : 1;
+        shape_[3] = shape.size() > 0 ? shape[0] : 1;
+        shape_[1] = shape.size() > 1 ? shape[1] : 1;
+        shape_[2] = shape.size() > 2 ? shape[2] : 1;
+        break;
+      case CNML_CHNW:
+        shape_[0] = shape.size() > 2 ? shape[2] : 1;
+        shape_[3] = shape.size() > 0 ? shape[0] : 1;
+        shape_[1] = shape.size() > 1 ? shape[1] : 1;
+        shape_[2] = shape.size() > 3 ? shape[3] : 1;
+        break;
+      case CNML_CWNH:
+        shape_[0] = shape.size() > 2 ? shape[2] : 1;
+        shape_[3] = shape.size() > 0 ? shape[0] : 1;
+        shape_[1] = shape.size() > 3 ? shape[3] : 1;
+        shape_[2] = shape.size() > 1 ? shape[1] : 1;
+        break;
+      case CNML_CWHN:
+        shape_[0] = shape.size() > 3 ? shape[3] : 1;
+        shape_[3] = shape.size() > 0 ? shape[0] : 1;
+        shape_[1] = shape.size() > 2 ? shape[2] : 1;
+        shape_[2] = shape.size() > 1 ? shape[1] : 1;
+        break;
+      case CNML_HNCW:
+        shape_[0] = shape.size() > 1 ? shape[1] : 1;
+        shape_[3] = shape.size() > 2 ? shape[2] : 1;
+        shape_[1] = shape.size() > 0 ? shape[0] : 1;
+        shape_[2] = shape.size() > 3 ? shape[3] : 1;
+        break;
+      case CNML_HNWC:
+        shape_[0] = shape.size() > 1 ? shape[1] : 1;
+        shape_[3] = shape.size() > 3 ? shape[3] : 1;
+        shape_[1] = shape.size() > 0 ? shape[0] : 1;
+        shape_[2] = shape.size() > 2 ? shape[2] : 1;
+        break;
+      case CNML_HCWN:
+        shape_[0] = shape.size() > 3 ? shape[3] : 1;
+        shape_[3] = shape.size() > 1 ? shape[1] : 1;
+        shape_[1] = shape.size() > 0 ? shape[0] : 1;
+        shape_[2] = shape.size() > 2 ? shape[2] : 1;
+        break;
+      case CNML_HCNW:
+        shape_[0] = shape.size() > 2 ? shape[2] : 1;
+        shape_[3] = shape.size() > 1 ? shape[1] : 1;
+        shape_[1] = shape.size() > 0 ? shape[0] : 1;
+        shape_[2] = shape.size() > 3 ? shape[3] : 1;
+        break;
+      case CNML_HWNC:
+        shape_[0] = shape.size() > 2 ? shape[2] : 1;
+        shape_[3] = shape.size() > 3 ? shape[3] : 1;
+        shape_[1] = shape.size() > 0 ? shape[0] : 1;
+        shape_[2] = shape.size() > 1 ? shape[1] : 1;
+        break;
+      case CNML_HWCN:
+        shape_[0] = shape.size() > 3 ? shape[3] : 1;
+        shape_[3] = shape.size() > 2 ? shape[2] : 1;
+        shape_[1] = shape.size() > 0 ? shape[0] : 1;
+        shape_[2] = shape.size() > 1 ? shape[1] : 1;
+        break;
+      case CNML_WNCH:
+        shape_[0] = shape.size() > 1 ? shape[1] : 1;
+        shape_[3] = shape.size() > 2 ? shape[2] : 1;
+        shape_[1] = shape.size() > 3 ? shape[3] : 1;
+        shape_[2] = shape.size() > 0 ? shape[0] : 1;
+        break;
+      case CNML_WNHC:
+        shape_[0] = shape.size() > 1 ? shape[1] : 1;
+        shape_[3] = shape.size() > 3 ? shape[3] : 1;
+        shape_[1] = shape.size() > 2 ? shape[2] : 1;
+        shape_[2] = shape.size() > 0 ? shape[0] : 1;
+        break;
+      case CNML_WCHN:
+        shape_[0] = shape.size() > 3 ? shape[3] : 1;
+        shape_[3] = shape.size() > 1 ? shape[1] : 1;
+        shape_[1] = shape.size() > 2 ? shape[2] : 1;
+        shape_[2] = shape.size() > 0 ? shape[0] : 1;
+        break;
+      case CNML_WCNH:
+        shape_[0] = shape.size() > 2 ? shape[2] : 1;
+        shape_[3] = shape.size() > 1 ? shape[1] : 1;
+        shape_[1] = shape.size() > 3 ? shape[3] : 1;
+        shape_[2] = shape.size() > 0 ? shape[0] : 1;
+        break;
+      case CNML_WHNC:
+        shape_[0] = shape.size() > 2 ? shape[2] : 1;
+        shape_[3] = shape.size() > 3 ? shape[3] : 1;
+        shape_[1] = shape.size() > 1 ? shape[1] : 1;
+        shape_[2] = shape.size() > 0 ? shape[0] : 1;
+        break;
+      case CNML_WHCN:
+        shape_[0] = shape.size() > 3 ? shape[3] : 1;
+        shape_[3] = shape.size() > 2 ? shape[2] : 1;
+        shape_[1] = shape.size() > 1 ? shape[1] : 1;
+        shape_[2] = shape.size() > 0 ? shape[0] : 1;
+        break;
+      case CNML_ARRAY:
+        shape_ = shape;
+        break;
+      default:
+        LOG(FATAL) << "Unsupported mluDataOrder! "
+                   << static_cast<int>(shape_order);
+        break;
+    }
+  } else {
+    switch (shape_order) {
+      case CNML_NCDHW:
+        shape_[0] = shape[0];
+        shape_[4] = shape[1];
+        shape_[1] = shape[2];
+        shape_[2] = shape[3];
+        shape_[3] = shape[4];
+        break;
+      case CNML_NDHWC:
+        shape_[0] = shape[0];
+        shape_[4] = shape[4];
+        shape_[1] = shape[1];
+        shape_[2] = shape[2];
+        shape_[3] = shape[3];
+        break;
+      case CNML_DHWCN:
+        shape_[0] = shape[4];
+        shape_[4] = shape[3];
+        shape_[1] = shape[0];
+        shape_[2] = shape[1];
+        shape_[3] = shape[2];
+        break;
+      case CNML_ARRAY:
+        shape_ = shape;
+        break;
+      default:
+        // fall back to treating the shape as NCDHW
+        shape_[0] = shape[0];
+        shape_[4] = shape[1];
+        shape_[1] = shape[2];
+        shape_[2] = shape[3];
+        shape_[3] = shape[4];
+        break;
+    }
+  }
+  dim_ = shape_.size();
+}
+
+void MLUTensor::Create() {
+  if (mlu_tensor_ == nullptr) {
+    CNML_CALL(cnmlCreateTensor_V2(&mlu_tensor_, tensor_type_));
+    std::vector<int> dim_shape(shape_);
+    int* dim_strides = nullptr;
+    CNML_CALL(cnmlSetTensorShape_V2(
+        mlu_tensor_, dim_, dim_shape.data(), dim_strides));
+    CNML_CALL(cnmlSetTensorDataType(mlu_tensor_, mlu_dtype_));
+  }
+}
+
+cnmlTensor_t MLUTensor::mlu_tensor() {
+  Create();
+  return mlu_tensor_;
+}
+
+MLUTensor::~MLUTensor() {
+  if (mlu_tensor_ != nullptr) {
+    CNML_CALL(cnmlDestroyTensor(&mlu_tensor_));
+    mlu_tensor_ = nullptr;
+  }
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/mlu/bridges/tensor.h b/lite/kernels/mlu/bridges/tensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..7bb2e1b20334e359b2db0ecf1fe61e16175413dc
--- /dev/null
+++ b/lite/kernels/mlu/bridges/tensor.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "lite/kernels/mlu/bridges/utility.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+class MLUTensor {
+ public:
+  MLUTensor()
+      : mlu_tensor_(nullptr),
+        tensor_type_(CNML_TENSOR),
+        mlu_dtype_(CNML_DATA_FLOAT32) {}
+
+  void set_mlu_ptr(void* mlu_data) { mlu_ptr_ = mlu_data; }
+
+  MLUTensor(const std::vector<int64_t>& shape,
+            cnmlTensorType_t tensor_type = CNML_TENSOR,
+            cnmlDataOrder_t data_order = CNML_NCHW,
+            cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32);
+
+  void remember(const std::vector<int>& shape,
+                cnmlTensorType_t tensor_type,
+                cnmlDataType_t mlu_dtype,
+                cnmlDataOrder_t shape_order);
+  void Create();
+  cnmlTensor_t mlu_tensor();
+  void* mlu_data() {
+    CHECK(mlu_ptr_ != nullptr);
+    return mlu_ptr_;
+  }
+
+  ~MLUTensor();
+
+ private:
+  cnmlTensor_t mlu_tensor_;
+
+  std::vector<int> shape_;
+  cnmlTensorType_t tensor_type_;
+  cnmlDataType_t mlu_dtype_;
+  int dim_{0};
+  cnmlDataOrder_t data_order_;
+  void* mlu_ptr_;
+};
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/mlu/bridges/test_helper.cc b/lite/kernels/mlu/bridges/test_helper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cf2d7bd6c1ec5634bb0d7556a16166ac0b0bcb45
--- /dev/null
+++ b/lite/kernels/mlu/bridges/test_helper.cc
@@ -0,0 +1,111 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
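+//
+// Test scaffolding shared by the MLU bridge unit tests: LaunchOp() below
+// builds a one-op CNML graph, stages the inputs host-to-device, compiles the
+// fused op for MLU270, runs it, and copies the outputs back so callers can
+// compare against a host reference implementation.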
+
+#include "lite/kernels/mlu/bridges/test_helper.h"
+#include <vector>
+#include "lite/core/device_info.h"
+#include "lite/core/op_registry.h"
+#include "lite/kernels/mlu/bridges/utility.h"
+#include "lite/kernels/mlu/subgraph_compute.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+void LaunchOp(const std::shared_ptr<lite::OpLite> op,
+              const std::vector<std::string>& input_var_names,
+              const std::vector<std::string>& output_var_names) {
+  CNRT_CALL(cnrtInit(0));
+  SetMluDevice(0);
+  cnrtQueue_t queue_;
+  cnrtInvokeFuncParam_t forward_param;
+  u32_t affinity = 1;
+  int data_param = 1;
+  forward_param.data_parallelism = &data_param;
+  forward_param.affinity = &affinity;
+  forward_param.end = CNRT_PARAM_END;
+  CNRT_CALL(cnrtCreateQueue(&queue_));
+  cnrtDev_t dev_handle;
+  CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, 0));
+  CNRT_CALL(cnrtSetCurrentDevice(dev_handle));
+  auto scope = op->scope();
+  auto op_type = op->op_info()->Type();
+  paddle::lite::subgraph::mlu::Graph graph;
+  // convert op to IR graph
+  const auto& bridges = subgraph::Registry::Instance();
+  CHECK(bridges.Exists(op_type, TARGET(kMLU)));
+
+  // Convert all of input data vars and added into the MLU IR graph
+  for (auto& input_name : input_var_names) {
+    auto input_tensor = scope->FindMutableTensor(input_name);
+    CHECK(input_tensor);
+    Tensor temp_input;
+    temp_input.Resize(input_tensor->dims().Vectorize());
+    temp_input.CopyDataFrom(*input_tensor);
+    auto input_node =
+        graph.AddNode(input_name,
+                      input_tensor->dims().Vectorize(),
+                      CNML_TENSOR,
+                      CNML_NHWC,
+                      graph.FPType(),
+                      reinterpret_cast<void*>(
+                          input_tensor->mutable_data<float>(TARGET(kMLU))));
+    CHECK(input_node);
+    CNRT_CHECK(cnrtMemcpy(input_tensor->mutable_data<float>(),
+                          temp_input.mutable_data<float>(),
+                          sizeof(float) * input_tensor->dims().production(),
+                          CNRT_MEM_TRANS_DIR_HOST2DEV));
+  }
+  bridges.Select(op_type, TARGET(kMLU))(
+      reinterpret_cast<void*>(&graph), const_cast<OpLite*>(op.get()), nullptr);
+
+  for (auto& output_name : output_var_names) {
+    if (graph.HasNode(output_name)) {
+      graph.AddOutput(graph.GetNode(output_name));
+    }
+    auto output_tensor = scope->FindMutableTensor(output_name);
+    void* p_data =
+        static_cast<void*>(output_tensor->mutable_data<float>(TARGET(kMLU)));
+    auto node = graph.GetNode(output_name);
+    CHECK(p_data);
+    node->set_mlu_ptr(p_data);
+  }
+  for (auto& input_name : input_var_names) {
+    graph.AddInput(graph.GetNode(input_name));
+  }
+
+  graph.Compile(CNML_MLU270, 1);
+
+  graph.Compute(forward_param, queue_);
+  for (auto& output_name : output_var_names) {
+    auto output_tensor = scope->FindMutableTensor(output_name);
+    Tensor temp_out;
+    temp_out.Resize(output_tensor->dims().Vectorize());
+    CNRT_CHECK(cnrtMemcpy(temp_out.mutable_data<float>(TARGET(kHost)),
+                          output_tensor->mutable_data<float>(),
+                          sizeof(float) * output_tensor->dims().production(),
+                          CNRT_MEM_TRANS_DIR_DEV2HOST));
+    output_tensor->mutable_data<float>(TARGET(kHost));
+    output_tensor->CopyDataFrom(temp_out);
+  }
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+// USE_LITE_OP(graph_op);
+// USE_LITE_KERNEL(graph_op, kMLU, kFloat, kNHWC, def);
diff --git a/lite/kernels/mlu/bridges/test_helper.h b/lite/kernels/mlu/bridges/test_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..4da9e72dfcc5a81a68467f7622e2c16aedb2ded5
--- /dev/null
+++ b/lite/kernels/mlu/bridges/test_helper.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <random>
+#include <string>
+#include <vector>
+#include "lite/core/op_lite.h"
+#include "lite/kernels/mlu/bridges/utility.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+template <typename T>
+std::shared_ptr<T> CreateOp(const cpp::OpDesc& opdesc, lite::Scope* scope) {
+  auto op = std::make_shared<T>(opdesc.Type());
+  op->SetValidPlaces(
+      {Place{TARGET(kHost), PRECISION(kFloat)},
+       Place{TARGET(kX86), PRECISION(kFloat)},
+       Place{TARGET(kMLU), PRECISION(kFloat), DATALAYOUT(kNHWC)}});
+  CHECK(op->Attach(opdesc, scope));
+  CHECK(op->CheckShape());
+  CHECK(op->InferShape());
+  return op;
+}
+
+// T is the target data type
+// R is the range data type, e.g. int, half
+template <typename T, typename R = T>
+void FillTensor(Tensor* x,
+                T lower = static_cast<T>(-2),
+                T upper = static_cast<T>(2)) {
+  static unsigned int seed = 100;
+  std::mt19937 rng(seed++);
+  std::uniform_real_distribution<double> uniform_dist(0, 1);
+
+  T* x_data = x->mutable_data<T>();
+  for (int i = 0; i < x->dims().production(); ++i) {
+    auto r = uniform_dist(rng) * (upper - lower) + lower;
+    x_data[i] = static_cast<T>(static_cast<R>(r));
+  }
+}
+
+void LaunchOp(const std::shared_ptr<lite::OpLite> op,
+              const std::vector<std::string>& input_var_names,
+              const std::vector<std::string>& output_var_names);
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/mlu/bridges/utility.cc b/lite/kernels/mlu/bridges/utility.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f18a46518c09a69803a069ce40c1d7e3c01e9eca
--- /dev/null
+++ b/lite/kernels/mlu/bridges/utility.cc
@@ -0,0 +1,111 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
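+//
+// Layout/quantization helpers shared by the MLU bridges. transpose() permutes
+// a dense 4-D float buffer; a minimal NCHW-to-NHWC sketch (hypothetical
+// sizes):
+//   std::vector<float> src(2 * 3 * 4 * 5), dst(src.size());
+//   transpose(src.data(), dst.data(), {2, 3, 4, 5}, {0, 2, 3, 1});
+// scale2position() assumes a power-of-two scale and returns -log2(scale)
+// truncated toward zero.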
+
+#include "lite/kernels/mlu/bridges/utility.h"
+#include <algorithm>
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+void transpose(float* input_data,
+               float* output_data,
+               std::vector<int> input_shape,
+               std::vector<int> axis) {
+  int old_index = -1;
+  int new_index = -1;
+  int dim[4] = {0};
+  std::vector<int> shape = input_shape;
+  for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) {
+    for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) {
+      for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) {
+        for (dim[3] = 0; dim[3] < input_shape[3]; dim[3]++) {
+          old_index = dim[0] * shape[1] * shape[2] * shape[3] +
+                      dim[1] * shape[2] * shape[3] + dim[2] * shape[3] +
+                      dim[3];
+          new_index =
+              dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] +
+              dim[axis[1]] * shape[axis[2]] * shape[axis[3]] +
+              dim[axis[2]] * shape[axis[3]] + dim[axis[3]];
+          output_data[new_index] = input_data[old_index];
+        }
+      }
+    }
+  }
+}
+
+int scale2position(float scale) { return static_cast<int>(-std::log2(scale)); }
+
+void dequant(float* dst, int8_t* src, size_t size, float scale) {
+  for (size_t i = 0; i < size; ++i) {
+    dst[i] = static_cast<float>(src[i]) * scale;
+  }
+}
+
+void dequant(float* dst,
+             int8_t* src,
+             size_t size_o,
+             size_t size,
+             size_t size_in,
+             std::vector<float> scales) {
+  for (size_t out = 0; out < size_o; ++out) {
+    for (size_t s = 0; s < size; ++s) {
+      auto scale = scales[s];
+      for (size_t in = 0; in < size_in; ++in) {
+        size_t idx = in + s * size_in + out * size_in * size;
+        dst[idx] = static_cast<float>(src[idx]) * scale;
+      }
+    }
+  }
+}
+
+cnmlActiveFunction_t OpTypeToCNMLActType(std::string op_type) {
+  if (op_type == "relu") {
+    return CNML_ACTIVE_RELU;
+  } else if (op_type == "sigmoid") {
+    return CNML_ACTIVE_SIGMOID;
+  } else if (op_type == "tanh") {
+    return CNML_ACTIVE_TANH;
+  } else if (op_type == "relu1") {
+    return CNML_ACTIVE_RELU1;
+  } else if (op_type == "relu6") {
+    return CNML_ACTIVE_RELU6;
+  } else if (op_type == "hard_sigmoid") {
+    return CNML_ACTIVE_HARD_SIGMOID;
+  }
+  LOG(FATAL) << "Unsupported CNML op type " << op_type;
+  return CNML_ACTIVE_NONE;
+}
+
+bool HasInputArg(const OpInfo* op_info,
+                 const Scope* scope,
+                 const std::string& argname) {
+  auto iarg_names = op_info->input_argnames();
+  if (std::find(iarg_names.begin(), iarg_names.end(), argname) !=
+      iarg_names.end()) {
+    auto inputs = op_info->Input(argname);
+    if (inputs.empty()) {
+      return false;
+    }
+    auto var_name = inputs.front();
+    auto var = scope->FindVar(var_name);
+    return var != nullptr;
+  } else {
+    return false;
+  }
+}
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/mlu/bridges/utility.h b/lite/kernels/mlu/bridges/utility.h
new file mode 100644
index 0000000000000000000000000000000000000000..2af8274e07713300277f7280f12e6d1fcb47c3c2
--- /dev/null
+++ b/lite/kernels/mlu/bridges/utility.h
@@ -0,0 +1,93 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
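+//
+// Declarations for the helpers defined in utility.cc plus small layout/type
+// utilities. FPTypeTraits maps a lite precision tag to its host C++ type,
+// e.g. typename FPTypeTraits<PRECISION(kFloat)>::T is float, which is how
+// subgraph_compute.h picks the element type of MLU output buffers.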
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <memory>
+#include <string>
+#include <vector>
+#include "lite/backends/mlu/mlu_utils.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/tensor.h"
+#include "lite/fluid/data_type.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+void transpose(float* input_data,
+               float* output_data,
+               std::vector<int> input_shape,
+               std::vector<int> axis);
+int scale2position(float scale);
+void dequant(float* dst, int8_t* src, size_t size, float scale);
+
+void dequant(float* dst,
+             int8_t* src,
+             size_t size_o,
+             size_t size,
+             size_t size_in,
+             std::vector<float> scales);
+
+template <typename T>
+std::vector<T> recip(std::vector<T> x);
+// Type/tensor converters for converting Paddle type/tensor to MLU type/tensor
+bool HasInputArg(const OpInfo* op_info,
+                 const Scope* scope,
+                 const std::string& argname);
+
+cnmlActiveFunction_t OpTypeToCNMLActType(std::string op_type);
+
+inline const ::paddle::lite::DDimLite DimNHWC2NCHW(
+    const ::paddle::lite::DDimLite& dim) {
+  return ::paddle::lite::DDimLite(
+      std::vector<int64_t>({dim[0], dim[3], dim[1], dim[2]}));
+}
+
+inline const ::paddle::lite::DDimLite DimNCHW2NHWC(
+    const ::paddle::lite::DDimLite& dim) {
+  return ::paddle::lite::DDimLite(
+      std::vector<int64_t>({dim[0], dim[2], dim[3], dim[1]}));
+}
+
+inline const std::vector<int64_t> DimNHWC2NCHW(
+    const std::vector<int64_t>& dim) {
+  return std::vector<int64_t>({dim[0], dim[3], dim[1], dim[2]});
+}
+
+inline const std::vector<int64_t> DimNCHW2NHWC(
+    const std::vector<int64_t>& dim) {
+  return std::vector<int64_t>({dim[0], dim[2], dim[3], dim[1]});
+}
+
+template <paddle::lite_api::PrecisionType>
+struct FPTypeTraits {};
+
+template <>
+struct FPTypeTraits<paddle::lite_api::PrecisionType::kFloat> {
+  typedef float T;
+};
+
+template <>
+struct FPTypeTraits<paddle::lite_api::PrecisionType::kFP16> {
+  typedef ::paddle::lite::fluid::float16 T;
+};
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/mlu/calib_compute.cc b/lite/kernels/mlu/calib_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a3be9968bd0aeeb02541374a8ce390e3601ba22f
--- /dev/null
+++ b/lite/kernels/mlu/calib_compute.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
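+//
+// The calib kernels convert tensors between fp32 and int8 at subgraph
+// boundaries. The Run() bodies below are stubs for now; the commented-out
+// code sketches the intended conversion via the lite::arm::math
+// type-transform routines.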
+
+#include "lite/kernels/mlu/calib_compute.h"
+#include <vector>
+#include "lite/backends/arm/math/type_trans.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/type_system.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace mlu {
+
+void CalibComputeFp32ToInt8::Run() {
+  // auto& param = this->Param<param_t>();
+  // std::vector<float> scale = {param.scale};
+  // const auto* din = param.input->data<float>();
+  // auto* dout = param.output->mutable_data<int8_t>();
+  // lite::arm::math::fp32_to_int8(
+  //     din, dout, scale.data(), 1, 1, param.input->numel());
+  // return;
+}
+
+void CalibComputeInt8ToFp32::Run() {
+  // auto& param = this->Param<param_t>();
+  // const auto* din = param.input->data<int8_t>();
+  // std::vector<float> scale = {param.scale};
+  // auto* dout = param.output->mutable_data<float>();
+  // lite::arm::math::int8_to_fp32(
+  //     din, dout, scale.data(), 1, 1, param.input->numel());
+  // return;
+}
+
+}  // namespace mlu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(calib,
+                     kMLU,
+                     kInt8,
+                     kNCHW,
+                     paddle::lite::kernels::mlu::CalibComputeFp32ToInt8,
+                     fp32_to_int8)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kFloat))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kInt8))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(calib,
+                     kMLU,
+                     kInt8,
+                     kNCHW,
+                     paddle::lite::kernels::mlu::CalibComputeInt8ToFp32,
+                     int8_to_fp32)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kInt8))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kFloat))})
+    .Finalize();
+REGISTER_LITE_KERNEL(calib_once,
+                     kMLU,
+                     kInt8,
+                     kNCHW,
+                     paddle::lite::kernels::mlu::CalibComputeFp32ToInt8,
+                     fp32_to_int8)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kFloat))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kInt8))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(calib_once,
+                     kMLU,
+                     kInt8,
+                     kNCHW,
+                     paddle::lite::kernels::mlu::CalibComputeInt8ToFp32,
+                     int8_to_fp32)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kInt8))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kFloat))})
+    .Finalize();
diff --git a/lite/kernels/mlu/calib_compute.h b/lite/kernels/mlu/calib_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c5988c165c69b488dc653150a596fb96e45cde3
--- /dev/null
+++ b/lite/kernels/mlu/calib_compute.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "lite/core/kernel.h"
+#include "lite/operators/calib_op.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace mlu {
+
+class CalibComputeFp32ToInt8
+    : public KernelLite<TARGET(kMLU), PRECISION(kInt8), DATALAYOUT(kNCHW)> {
+ public:
+  using param_t = operators::CalibParam;
+
+  void Run() override;
+
+  ~CalibComputeFp32ToInt8() override{};
+
+ private:
+};
+
+class CalibComputeInt8ToFp32
+    : public KernelLite<TARGET(kMLU), PRECISION(kInt8), DATALAYOUT(kNCHW)> {
+ public:
+  using param_t = operators::CalibParam;
+
+  void Run() override;
+
+  ~CalibComputeInt8ToFp32() override{};
+
+ private:
+};
+
+}  // namespace mlu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/mlu/io_copy_compute.cc b/lite/kernels/mlu/io_copy_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bc6e1838d70383edb3dcc65d7a9b0f627719e963
--- /dev/null
+++ b/lite/kernels/mlu/io_copy_compute.cc
@@ -0,0 +1,154 @@
+// Copyright (c) 2019 Cambricon Authors. All Rights Reserved.
+
+#include <memory>
+#include "lite/backends/mlu/target_wrapper.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/type_system.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace mlu {
+
+using TargetW = TargetWrapper<TARGET(kMLU)>;
+
+// Host to MLU memory.
+void CopyFromHostSync(void* target, const void* source, size_t size) {
+  TargetW::MemcpySync(target, source, size, IoDirection::HtoD);
+}
+
+// MLU to Host memory.
+void CopyToHostSync(void* target, const void* source, size_t size) {
+  TargetW::MemcpySync(target, source, size, IoDirection::DtoH);
+}
+
+/*
+ * This kernel copies a tensor from host to MLU space.
+ */
+template <PrecisionType Precision>
+class IoCopyHostToMluCompute
+    : public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
+ public:
+  using handler_t = KernelBase::type_infer_handler_t;
+  using param_t = operators::IoCopyParam;
+
+  void Run() override {
+    auto& param = this->template Param<param_t>();
+    CHECK(param.x->target() == TARGET(kHost) ||
+          param.x->target() == TARGET(kX86));
+    auto mem_size = param.x->memory_size();
+    // LOG(INFO) << "copy size " << mem_size;
+    auto* data = param.y->mutable_data(TARGET(kMLU), mem_size);
+    CopyFromHostSync(data, param.x->raw_data(), mem_size);
+  }
+
+  std::unique_ptr<handler_t> GetTypeInferHandler() override {
+    std::unique_ptr<handler_t> res(new handler_t);
+    *res = [](const std::map<std::string, const Type*>& inputs,
+              const std::string& out) -> const Type* {
+      CHECK(!inputs.empty());
+      auto* type = inputs.at("Input");
+      CHECK(type->target() == TARGET(kHost));
+
+      auto out_place = type->place();
+      out_place.target = TARGET(kMLU);
+      auto* out_type = Type::Get(type->id(),
+                                 out_place.target,
+                                 out_place.precision,
+                                 out_place.layout,
+                                 out_place.device);
+      return out_type;
+    };
+    return res;
+  }
+
+  std::string doc() const override { return "Copy IO from HOST to MLU"; }
+};
+
+/*
+ * This kernel copies a tensor from MLU to host space.
+ */
+template <PrecisionType Precision>
+class IoCopyMluToHostCompute
+    : public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
+ public:
+  void Run() override {
+    auto& param = this->template Param<operators::IoCopyParam>();
+    CHECK(param.x->target() == TARGET(kMLU));
+    auto mem_size = param.x->memory_size();
+    auto* data = param.y->mutable_data(TARGET(kHost), mem_size);
+    CopyToHostSync(data, param.x->raw_data(), mem_size);
+  }
+
+  std::string doc() const override { return "Copy IO from MLU to HOST"; }
+};
+
+}  // namespace mlu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(
+    io_copy,
+    kMLU,
+    kFloat,
+    kNHWC,
+    paddle::lite::kernels::mlu::IoCopyHostToMluCompute<PRECISION(kFloat)>,
+    host_to_device_kFloat)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    io_copy,
+    kMLU,
+    kFP16,
+    kNHWC,
+    paddle::lite::kernels::mlu::IoCopyHostToMluCompute<PRECISION(kFP16)>,
+    host_to_device_kFP16)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    io_copy,
+    kMLU,
+    kFloat,
+    kNHWC,
+    paddle::lite::kernels::mlu::IoCopyMluToHostCompute<PRECISION(kFloat)>,
+    device_to_host_kFloat)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    io_copy,
+    kMLU,
+    kFP16,
+    kNHWC,
+    paddle::lite::kernels::mlu::IoCopyMluToHostCompute<PRECISION(kFP16)>,
+    device_to_host_kFP16)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+    .Finalize();
+
+// kMLU,
+// kFloat,
+// kNHWC,
+// paddle::lite::kernels::mlu::IoCopyHostToMluCompute,
+// host_to_device)
+// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
+// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
+// .Finalize();
+//
+//
+// kMLU,
+// kFloat,
+// kNHWC,
+// paddle::lite::kernels::mlu::IoCopyMluToHostCompute,
+// device_to_host)
+// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
+// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+// .Finalize();
diff --git a/lite/kernels/mlu/subgraph_compute.cc b/lite/kernels/mlu/subgraph_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..73ca9dcc20a6311d33e5cff6c6ed6be08f3c7a1f
--- /dev/null
+++ b/lite/kernels/mlu/subgraph_compute.cc
@@ -0,0 +1,52 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
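+//
+// This translation unit only registers the MLU subgraph kernel for kFloat
+// and kFP16; the SubgraphEngine/SubgraphCompute templates are header-only in
+// subgraph_compute.h.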
+
+#include "lite/kernels/mlu/subgraph_compute.h"
+#include <memory>
+#include <string>
+#include <vector>
+#include "lite/core/op_registry.h"
+#include "lite/core/type_system.h"
+#include "lite/kernels/mlu/bridges/paddle_use_bridges.h"
+#include "lite/kernels/mlu/bridges/utility.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace mlu {}  // namespace mlu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(
+    subgraph,
+    kMLU,
+    kFloat,
+    kNHWC,
+    paddle::lite::kernels::mlu::SubgraphCompute<PRECISION(kFloat)>,
+    def_kFloat)
+    .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))})
+    .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    subgraph,
+    kMLU,
+    kFP16,
+    kNHWC,
+    paddle::lite::kernels::mlu::SubgraphCompute<PRECISION(kFP16)>,
+    def_FP16)
+    .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))})
+    .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))})
+    .Finalize();
diff --git a/lite/kernels/mlu/subgraph_compute.h b/lite/kernels/mlu/subgraph_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..06fc791fe7d07ba759e2ed0f9c6187432e195186
--- /dev/null
+++ b/lite/kernels/mlu/subgraph_compute.h
@@ -0,0 +1,168 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "lite/api/paddle_place.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/type_system.h"
+#include "lite/core/types.h"
+#include "lite/kernels/mlu/bridges/graph.h"
+#include "lite/kernels/npu/bridges/engine.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace mlu {
+
+template <PrecisionType Precision>
+class SubgraphEngine : public subgraph::Engine {
+ public:
+  SubgraphEngine(KernelContext* ctx,
+                 int block_idx,
+                 cpp::BlockDesc* block_desc,
+                 const std::vector<std::string>& input_names,
+                 const std::vector<std::string>& output_names,
+                 Scope* scope,
+                 ::paddle::lite_api::PrecisionType type)
+      : subgraph::Engine(
+            ctx, block_idx, block_desc, input_names, output_names, scope) {
+    graph_.SetFPType(type);
+  }
+
+ protected:
+  int BuildDeviceProgram() override {
+    int status = 0;
+    // Convert all of input data vars and added into the MLU IR graph
+    for (auto& input_name : input_names_) {
+      auto input_tensor = scope_->FindMutableTensor(input_name);
+      CHECK(input_tensor);
+      auto input_node =
+          graph_.AddNode(input_name,
+                         input_tensor->dims().Vectorize(),
+                         CNML_TENSOR,
+                         CNML_NHWC,
+                         graph_.FPType(),
+                         const_cast<void*>(input_tensor->raw_data()));
+      CHECK(input_node);
+      // MLU doesn't support dynamic dimensions/shapes, so need to rebuild
+      // the program when the shape of any input tensor is changed.
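+      // Requesting REBUILD_WHEN_SHAPE_CHANGED below makes subgraph::Engine
+      // run BuildDeviceProgram() again whenever the input dims differ from
+      // the cached ones, instead of reusing the compiled CNML fusion op.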
+      status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
+    }
+    LOG(INFO) << "START TO CONVERT ";
+    // Convert all of ops and its weights and added into the MLU IR graph
+    const auto& bridges = subgraph::Registry::Instance();
+    for (auto& inst : origin_program_) {
+      auto op = inst.op();
+      CHECK(op);
+      op->CheckShape();
+      op->InferShape();
+      std::string op_type = op->op_info()->Type();
+      if (!bridges.Exists(op_type, TARGET(kMLU))) {
+        LOG(INFO) << "MLU bridges doesn't support op_type: " << op_type;
+        return subgraph::FAILED;
+      }
+      auto kernel = inst.kernel();
+      status |= bridges.Select(op_type, TARGET(kMLU))(
+          reinterpret_cast<void*>(&graph_),
+          const_cast<OpLite*>(op),
+          const_cast<KernelBase*>(kernel));
+      if (subgraph::CHECK_FAILED(status)) {
+        return subgraph::FAILED;
+      }
+    }
+    // Obtain the output nodes of the MLU IR graph and build the graph to MLU
+    // runtime
+    std::vector<std::string> valid_output_names;
+    for (auto& output_name : output_names_) {
+      if (graph_.HasNode(output_name)) {
+        graph_.AddOutput(graph_.GetNode(output_name));
+        auto output_tensor = scope_->FindMutableTensor(output_name);
+        void* p_data = static_cast<void*>(
+            output_tensor->mutable_data<
+                typename paddle::lite::subgraph::mlu::FPTypeTraits<
+                    Precision>::T>(TARGET(kMLU)));
+        auto node = graph_.GetNode(output_name);
+        CHECK(p_data);
+        node->set_mlu_ptr(p_data);
+        valid_output_names.push_back(output_name);
+      }
+    }
+    for (auto& input_name : input_names_) {
+      graph_.AddInput(graph_.GetNode(input_name));
+    }
+    CHECK(!valid_output_names.empty()) << "[MLU] no valid output names";
+    // auto& mlu_context = this->ctx_->template As<MLUContext>();
+    // auto core_version = mlu_context.MLUCoreVersion();
+    // auto core_number = mlu_context.MLUCoreNumber();
+    // graph_.Compile(core_version, core_number);
+    return status;
+  }
+
+  int LaunchDeviceProgram() override {
+    // auto& mlu_context = this->ctx_->template As<MLUContext>();
+    // auto exec_queue = mlu_context.exec_queue();
+    // u32_t affinity = mlu_context.affinity();
+    // cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
+    // int data_param = 1;
+    // forward_param.data_parallelism = &data_param;
+    // forward_param.affinity = &affinity;
+    // forward_param.end = CNRT_PARAM_END;
+    // graph_.Compute(forward_param, exec_queue);
+    return 0;
+  }
+
+  paddle::lite::subgraph::mlu::Graph graph_;
+};
+
+template <PrecisionType Precision>
+class SubgraphCompute
+    : public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
+ public:
+  using param_t = operators::SubgraphParam;
+
+  void PrepareForRun() override {
+    auto& param = this->template Param<param_t>();
+    // LOG(INFO) << "SUBGRAPH Prepare RUN index " << param.sub_block_idx;
+    engine_.reset(new SubgraphEngine<Precision>(this->ctx_.get(),
+                                                param.sub_block_idx,
+                                                param.sub_block_desc,
+                                                param.input_data_names,
+                                                param.output_data_names,
+                                                param.scope,
+                                                this->precision()));
+    CHECK(engine_);
+    engine_->Build();
+  }
+
+  void Run() override {
+    CHECK(engine_);
+    engine_->Launch();
+  }
+
+  virtual ~SubgraphCompute() = default;
+
+ private:
+  std::unique_ptr<SubgraphEngine<Precision>> engine_;
+};
+
+}  // namespace mlu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/tools/build_mlu.sh b/lite/tools/build_mlu.sh
new file mode 100755
index 0000000000000000000000000000000000000000..1912efda5edc6e436cc84dbdf9919a99e1ed3279
--- /dev/null
+++ b/lite/tools/build_mlu.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+set -ex
+
+# global variables with default value
+NEUWARE_HOME="${NEUWARE_HOME}"  # MLU SDK (Neuware) root
+TARGET_NAME="all"               # default target
+BUILD_EXTRA=OFF                 # ON(with sequence ops)/OFF
+WITH_TESTING=OFF                # ON/OFF
+
+function print_usage {
+  echo -e "\nUSAGE:"
+  echo
+  echo "----------------------------------------"
+  echo -e "--neuware_home=<Neuware installation dir>"
+  echo -e "--build_extra=<ON|OFF, default OFF>"
"--target_name=" + echo "----------------------------------------" + echo +} + +# readonly variables with default value +readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \ + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \ + -DWITH_PYTHON=OFF \ + -DLITE_WITH_ARM=OFF" + +readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-1} + +readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz +readonly workspace=$(pwd) + +function prepare_thirdparty { + if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then + rm -rf $workspace/third-party + + if [ ! -f $workspace/third-party-05b862.tar.gz ]; then + wget $THIRDPARTY_TAR + fi + tar xzf third-party-05b862.tar.gz + else + # git submodule update --init --recursive + echo "third-party is in ready" + fi +} + +# for code gen, a source file is generated after a test, but is dependended by some targets in cmake. +# here we fake an empty file to make cmake works. +function prepare_workspace { + # in build directory + # 1. Prepare gen_code file + GEN_CODE_PATH_PREFIX=lite/gen_code + mkdir -p ./${GEN_CODE_PATH_PREFIX} + touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc + + # 2.Prepare debug tool + DEBUG_TOOL_PATH_PREFIX=lite/tools/debug + mkdir -p ./${DEBUG_TOOL_PATH_PREFIX} + # cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/ + + # clone submodule + # git submodule update --init --recursive + prepare_thirdparty +} + +function build_mlu { + build_dir=${workspace}/build.lite.mlu + mkdir -p $build_dir + cd $build_dir + + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib" + prepare_workspace + cmake .. \ + ${CMAKE_COMMON_OPTIONS} \ + -DWITH_GPU=OFF \ + -DWITH_MKLDNN=OFF \ + -DLITE_WITH_X86=ON \ + -DWITH_MKL=ON \ + -DLITE_WITH_MLU=ON \ + -DLITE_BUILD_EXTRA=${BUILD_EXTRA} \ + -DWITH_TESTING=${WITH_TESTING} \ + -DMLU_SDK_ROOT=${XPU_SDK_ROOT} + + make $TARGET_NAME -j$NUM_CORES_FOR_COMPILE + + cd - + echo "Done" +} + +function main { + # Parse command line. + for i in "$@"; do + case $i in + --target_name=*) + TARGET_NAME="${i#*=}" + shift + ;; + --build_extra=*) + BUILD_EXTRA="${i#*=}" + shift + ;; + --neuware_home=*) + NEUWARE_HOME="${i#*=}" + shift + ;; + build) + build_mlu + shift + ;; + full_publish) + TARGET_NAME=publish_inference + build_mlu + shift + ;; + *) + # unknown option + print_usage + exit 1 + ;; + esac + done +} + +main $@