Unverified commit 95e7f6f3 authored by: Z zhangshijin, committed by: GitHub

[MLU] resnet50 supported on MLU,test=develop (#3087)

* [MLU] support resnet50 on MLU

* [MLU] support resnet50 on MLU
Parent 9f09cf8e
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_MLU)
return()
endif()
if(NOT DEFINED NEUWARE_HOME)
set(NEUWARE_HOME $ENV{NEUWARE_HOME})
if(NOT NEUWARE_HOME)
message(FATAL_ERROR "Must set NEUWARE_HOME or env NEUWARE_HOME when LITE_WITH_MLU=ON")
endif()
endif()
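# A minimal configure sketch; the install path below is only an example, point
# NEUWARE_HOME at your local Neuware SDK:
#   cmake -DLITE_WITH_MLU=ON -DNEUWARE_HOME=/usr/local/neuware ..
# NEUWARE_HOME may equivalently be exported as an environment variable, as
# handled above.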
message(STATUS "LITE_WITH_MLU: ${LITE_WITH_MLU}")
find_path(CNML_INC NAMES cnml.h
PATHS ${NEUWARE_HOME}/include NO_DEFAULT_PATH)
if(NOT CNML_INC)
message(FATAL_ERROR "Can not find cnml.h in ${NEUWARE_HOME}/include")
endif()
find_path(CNRT_INC NAMES cnrt.h
PATHS ${NEUWARE_HOME}/include NO_DEFAULT_PATH)
if(NOT CNRT_INC)
message(FATAL_ERROR "Can not find cnrt.h in ${NEUWARE_HOME}/include")
endif()
include_directories("${NEUWARE_HOME}/include")
find_library(CNML_LIB_FILE NAMES cnml
PATHS ${NEUWARE_HOME}/lib64)
if(NOT CNML_LIB_FILE)
message(FATAL_ERROR "Can not find CNML Library in ${NEUWARE_HOME}/lib64")
else()
message(STATUS "Found CNML Library: ${CNML_LIB_FILE}")
add_library(cnml_lib SHARED IMPORTED GLOBAL)
set_property(TARGET cnml_lib PROPERTY IMPORTED_LOCATION ${CNML_LIB_FILE})
endif()
find_library(CNRT_LIB_FILE NAMES cnrt
PATHS ${NEUWARE_HOME}/lib64)
if(NOT CNRT_LIB_FILE)
message(FATAL_ERROR "Can not find CNRT Library in ${NEUWARE_HOME}/lib64")
else()
message(STATUS "Found CNRT Library: ${CNRT_LIB_FILE}")
add_library(cnrt_lib SHARED IMPORTED GLOBAL)
set_property(TARGET cnrt_lib PROPERTY IMPORTED_LOCATION ${CNRT_LIB_FILE})
endif()
if(NOT LITE_WITH_MLU)
return()
endif()
message(STATUS "Lite with MLU backend")
lite_cc_library(target_wrapper_mlu SRCS target_wrapper.cc DEPS cnml_lib cnrt_lib)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cnml.h>
#include <cnrt.h>
#include <lite/utils/cp_logging.h>
/*
* This file contains some MLU-specific utils.
*/
#define CNRT_CALL(msg) \
CHECK_EQ(static_cast<cnrtRet_t>(msg), CNRT_RET_SUCCESS) \
<< (msg) \
<< " MLU CNRT: " << cnrtGetErrorStr(static_cast<cnrtRet_t>(msg))
#define CNML_CALL(msg) \
CHECK_EQ(static_cast<cnmlStatus_t>(msg), CNML_STATUS_SUCCESS) \
<< (msg) << " MLU CNML: " \
<< ::paddle::lite::mlu::CnmlErrorInfo(static_cast<int>(msg))
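// Usage sketch: wrap every CNRT/CNML call with these macros so a failing
// status aborts with a readable message, e.g. (as done in target_wrapper.cc
// and the bridges below):
//   CNRT_CALL(cnrtMalloc(&ptr, size)) << " cnrt malloc failed";
//   CNML_CALL(cnmlCreateActiveOp(&act_op, act_type, in_tensor, out_tensor));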
namespace paddle {
namespace lite {
namespace mlu {
static const char* CnmlErrorInfo(int error) {
switch (error) {
#define LITE_CNML_ERROR_INFO(xx) \
case xx: \
return #xx; \
break;
LITE_CNML_ERROR_INFO(CNML_STATUS_NODEVICE);
LITE_CNML_ERROR_INFO(CNML_STATUS_SUCCESS);
LITE_CNML_ERROR_INFO(CNML_STATUS_DOMAINERR);
LITE_CNML_ERROR_INFO(CNML_STATUS_INVALIDARG);
LITE_CNML_ERROR_INFO(CNML_STATUS_LENGTHERR);
LITE_CNML_ERROR_INFO(CNML_STATUS_OUTOFRANGE);
LITE_CNML_ERROR_INFO(CNML_STATUS_RANGEERR);
LITE_CNML_ERROR_INFO(CNML_STATUS_OVERFLOWERR);
LITE_CNML_ERROR_INFO(CNML_STATUS_UNDERFLOWERR);
LITE_CNML_ERROR_INFO(CNML_STATUS_INVALIDPARAM);
LITE_CNML_ERROR_INFO(CNML_STATUS_BADALLOC);
LITE_CNML_ERROR_INFO(CNML_STATUS_BADTYPEID);
LITE_CNML_ERROR_INFO(CNML_STATUS_BADCAST);
LITE_CNML_ERROR_INFO(CNML_STATUS_UNSUPPORT);
#undef LITE_CNML_ERROR_INFO
default:
return "unknown error";
break;
}
}
} // namespace mlu
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/mlu/target_wrapper.h"
#include <memory>
#include "lite/backends/mlu/mlu_utils.h"
namespace paddle {
namespace lite {
namespace mlu {
void cnrtMemcpyHtoD(void* dst, const void* src, size_t size) {
CNRT_CALL(cnrtMemcpy(
dst, const_cast<void*>(src), size, CNRT_MEM_TRANS_DIR_HOST2DEV))
<< " cnrt memcpy htod failed";
}
void cnrtMemcpyDtoH(void* dst, const void* src, size_t size) {
CNRT_CALL(cnrtMemcpy(
dst, const_cast<void*>(src), size, CNRT_MEM_TRANS_DIR_DEV2HOST))
<< " cnrt memcpy dtoh failed";
}
} // namespace mlu
size_t TargetWrapperMlu::num_devices() {
uint32_t dev_count = 0;
CNRT_CALL(cnrtGetDeviceCount(&dev_count)) << " cnrt get device count failed";
LOG(INFO) << "Current MLU device count: " << dev_count;
return dev_count;
}
void* TargetWrapperMlu::Malloc(size_t size) {
void* ptr{};
CNRT_CALL(cnrtMalloc(&ptr, size)) << " cnrt malloc failed";
// LOG(INFO) << "Malloc mlu ptr: " << ptr << " with size: " << size;
return ptr;
}
void TargetWrapperMlu::Free(void* ptr) {
CNRT_CALL(cnrtFree(ptr)) << " cnrt free failed";
}
void TargetWrapperMlu::MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir) {
// LOG(INFO) << "dst: " << dst << " src: " << src << " size: " << size
//<< " dir: " << (int)dir;
switch (dir) {
case IoDirection::DtoD: {
std::unique_ptr<char[]> cpu_tmp_ptr(new char[size]);
mlu::cnrtMemcpyDtoH(cpu_tmp_ptr.get(), src, size);
mlu::cnrtMemcpyHtoD(dst, cpu_tmp_ptr.get(), size);
break;
}
case IoDirection::HtoD:
mlu::cnrtMemcpyHtoD(dst, src, size);
break;
case IoDirection::DtoH:
mlu::cnrtMemcpyDtoH(dst, src, size);
break;
default:
LOG(FATAL) << "Unsupported IoDirection" << static_cast<int>(dir);
}
}
// void TargetWrapperMlu::MemcpyAsync(void* dst,
// const void* src,
// size_t size,
// IoDirection dir,
// const stream_t& stream) {
// LOG(WARNING) << "Mlu unsupported MemcpyAsync now, use MemcpySync.";
// MemcpySync(dst, src, size, dir);
// }
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/backends/mlu/mlu_utils.h"
#include "lite/core/target_wrapper.h"
namespace paddle {
namespace lite {
using TargetWrapperMlu = TargetWrapper<TARGET(kMLU)>;
template <>
class TargetWrapper<TARGET(kMLU)> {
public:
using queue_t = cnrtQueue_t;
static size_t num_devices();
static size_t maxinum_queue() { return 0; } // TODO(zhangshijin): figure this out.
static size_t GetCurDevice() { return 0; }
static void CreateQueue(queue_t* queue) {}
static void DestroyQueue(const queue_t& queue) {}
static void QueueSync(const queue_t& queue) {}
static void* Malloc(size_t size);
static void Free(void* ptr);
static void MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir);
// static void MemcpyAsync(void* dst,
// const void* src,
// size_t size,
// IoDirection dir,
// const queue_t& queue);
};
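// A hypothetical usage sketch of the wrapper declared above (names follow the
// declarations in this header, error handling omitted):
//   void* dev = TargetWrapperMlu::Malloc(byte_size);
//   TargetWrapperMlu::MemcpySync(dev, host_ptr, byte_size, IoDirection::HtoD);
//   TargetWrapperMlu::Free(dev);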
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/mlu_postprocess_pass.h"
#include <list>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "lite/core/mir/graph_visualize_pass.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/operators/subgraph_op.h"
namespace paddle {
namespace lite {
namespace mir {
Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
const std::string& cast_arg_name,
SSAGraph* graph,
Node* cur_node,
Node* inst_node,
const Type* cast_type) {
// create the arg node
auto* cast_arg = graph->NewArgumentNode(cast_arg_name);
cast_arg->AsArg().type = cast_type;
inst_node->AsStmt().op()->scope()->Var(cast_arg_name);
// create the stmt node
auto* cast_inst = graph->NewInstructNode();
// create op
auto cast_op = LiteOpRegistry::Global().Create(op_type);
CHECK(cast_op) << "create op [" << op_type << "] failed";
cpp::OpDesc op_desc;
op_desc.SetType(op_type);
if (op_type == "cast") {
op_desc.SetAttr<int>("in_dtype", 5); // FP32
op_desc.SetAttr<int>("out_dtype", 4); // FP16
op_desc.SetInput("X", {cur_node->AsArg().name});
op_desc.SetOutput("Out", {cast_arg_name});
} else if (op_type == "transpose") {
// NCHW -> NHWC
op_desc.SetAttr<std::vector<int>>("axis", {0, 2, 3, 1});
op_desc.SetInput("X", {cur_node->AsArg().name});
op_desc.SetOutput("Out", {cast_arg_name});
} else if (op_type == "io_copy") {
op_desc.SetInput("Input", {cur_node->AsArg().name});
op_desc.SetOutput("Out", {cast_arg_name});
} else {
CHECK(0) << "Unsupport cast type";
}
cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope());
// create kernels
auto kernels = cast_op->CreateKernels(graph->valid_places());
std::vector<std::unique_ptr<KernelBase>> selected_kernels;
bool is_found = false;
for (auto& kernel : kernels) {
if (op_type == "cast") {
const Type* in_arg_ty = kernel->GetInputDeclType("X");
if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) {
is_found = true;
}
} else if (op_type == "transpose") {
is_found = true;
} else if (op_type == "io_copy") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
if (TargetCompatibleTo(*in_arg_ty, *cur_node->AsArg().type) &&
TargetCompatibleTo(*out_arg_ty, *cast_type)) {
is_found = true;
}
} else {
CHECK(0) << "Unsupport cast type";
}
if (is_found) {
selected_kernels.emplace_back(std::move(kernel));
// we pick the kernel
cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op);
auto& stmt = cast_inst->AsStmt();
stmt.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(stmt.picked_kernel().target()));
break;
}
}
CHECK(is_found) << "Can't find a Cast kernel for Cast op: "
<< cur_node->AsArg().name << "->" << op_type;
// modify links
DirectedLink(cur_node, cast_inst);
DirectedLink(cast_inst, cast_arg);
return cast_arg;
}
Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
const std::string& cast_arg_name,
SSAGraph* graph,
Node* cur_node,
Node* inst_node,
const Type* cast_type) {
// create the arg node
auto* cast_arg = graph->NewArgumentNode(cast_arg_name);
cast_arg->AsArg().type = cast_type;
auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name);
// for CastAfter, manually set the tensor's type
var->GetMutable<::paddle::lite::Tensor>();
// create the stmt node
auto* cast_inst = graph->NewInstructNode();
// create op
auto cast_op = LiteOpRegistry::Global().Create(op_type);
CHECK(cast_op) << "create op [" << op_type << "] failed";
cpp::OpDesc op_desc;
op_desc.SetType(op_type);
if (op_type == "cast") {
op_desc.SetAttr<int>("in_dtype", 4); // FP32
op_desc.SetAttr<int>("out_dtype", 5); // FP16
op_desc.SetInput("X", {cast_arg_name});
op_desc.SetOutput("Out", {cur_node->AsArg().name});
} else if (op_type == "transpose") {
// NHWC -> NCHW
op_desc.SetAttr<std::vector<int>>("axis", {0, 3, 1, 2});
op_desc.SetInput("X", {cast_arg_name});
op_desc.SetOutput("Out", {cur_node->AsArg().name});
} else if (op_type == "io_copy") {
op_desc.SetInput("Input", {cast_arg_name});
op_desc.SetOutput("Out", {cur_node->AsArg().name});
} else {
CHECK(0) << "Unsupport cast type";
}
cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope());
// create kernels
auto kernels = cast_op->CreateKernels(graph->valid_places());
std::vector<std::unique_ptr<KernelBase>> selected_kernels;
bool is_found = false;
for (auto& kernel : kernels) {
if (op_type == "cast") {
const Type* in_arg_ty = kernel->GetInputDeclType("X");
if (PrecisionCompatibleTo(*in_arg_ty, *cast_type)) {
is_found = true;
}
} else if (op_type == "transpose") {
is_found = true;
} else if (op_type == "io_copy") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
if (TargetCompatibleTo(*in_arg_ty, *cast_type) &&
TargetCompatibleTo(*out_arg_ty, *cur_node->AsArg().type)) {
is_found = true;
}
} else {
CHECK(0) << "Unsupport cast type";
}
if (is_found) {
selected_kernels.emplace_back(std::move(kernel));
// we pick the kernel
cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op);
auto& stmt = cast_inst->AsStmt();
stmt.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(stmt.picked_kernel().target()));
break;
}
}
CHECK(is_found) << "Can't find a Cast kernel for Cast op: "
<< cur_node->AsArg().name << "->" << op_type;
// modify links
DirectedLink(cast_arg, cast_inst);
DirectedLink(cast_inst, cur_node);
return cast_arg;
}
void MLUPostprocessPass::InsertBefore(SSAGraph* graph,
Node* head_node,
Node* inst_node,
const Type* inst_type) {
const auto* head_type = head_node->AsArg().type;
// break original link
RemoveDirectedLink(head_node, inst_node);
auto* cur_node = head_node;
const auto name_prefix =
head_node->AsArg().name + string_format("_%p", inst_node) + "/trans_";
// layout cast node
if (head_type->layout() != inst_type->layout()) {
cur_node = InsertCastBefore(
"transpose",
name_prefix + "transpose",
graph,
cur_node,
inst_node,
LiteType::GetTensorTy(
head_type->target(), head_type->precision(), inst_type->layout()));
}
// precision cast node
if (head_type->precision() != inst_type->precision()) {
cur_node = InsertCastBefore(
"cast",
name_prefix + "cast",
graph,
cur_node,
inst_node,
LiteType::GetTensorTy(
head_type->target(), inst_type->precision(), inst_type->layout()));
}
// io copy
cur_node = InsertCastBefore(
"io_copy",
name_prefix + "io_copy",
graph,
cur_node,
inst_node,
LiteType::GetTensorTy(
inst_type->target(), inst_type->precision(), inst_type->layout()));
// connect cur_node to inst_node
DirectedLink(cur_node, inst_node);
// reset opdesc and update kernel information
UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(),
head_node->AsArg().name,
cur_node->AsArg().name);
// for subgraph op, modify the BlockDesc
auto* sub_block_desc = dynamic_cast<paddle::lite::operators::SubgraphOp*>(
inst_node->AsStmt().op().get())
->GetSubBlock();
for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) {
auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i);
UpdateInputTo(
sub_block_op_desc, head_node->AsArg().name, cur_node->AsArg().name);
}
// recreate the op
RecreateOp(inst_node, graph);
graph->CheckValid();
}
void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node,
const Type** arg_type,
SSAGraph* graph) {
CHECK(inst_node->IsStmt());
constexpr auto subgraph_target = TARGET(kMLU);
constexpr auto subgraph_layout = DATALAYOUT(kNHWC);
// get subgraph's valid precision
const auto& places = graph->valid_places();
std::set<::paddle::lite_api::PrecisionType> prec_set;
for (const auto& place : places) {
if (place.target == TARGET(kMLU)) {
prec_set.insert(place.precision);
}
}
// get subgraph op's type info
size_t kernel_size = inst_node->AsStmt().kernels().size();
CHECK_GT(kernel_size, 0);
VLOG(4) << "subgraph kernel size: " << kernel_size;
for (size_t i = 0; i < kernel_size; ++i) {
auto* kernel = inst_node->AsStmt().kernels()[i].get();
VLOG(4) << i << "th kernel: " << TargetToStr(kernel->target()) << ", "
<< PrecisionToStr(kernel->precision()) << ", "
<< DataLayoutToStr(kernel->layout());
}
for (size_t i = 0; i < kernel_size; ++i) {
auto* kernel = inst_node->AsStmt().kernels()[i].get();
CHECK(kernel->target() == subgraph_target);
CHECK(kernel->layout() == subgraph_layout);
if (prec_set.count(kernel->precision()) == 1) {
const auto subgraph_precision = kernel->precision();
CHECK(subgraph_precision == PRECISION(kFloat) ||
subgraph_precision == PRECISION(kFP16))
<< "Mlu node has unsupport precision";
VLOG(4) << "picked kernel precision: "
<< PrecisionToStr(subgraph_precision);
*arg_type = LiteType::GetTensorTy(
subgraph_target, subgraph_precision, subgraph_layout);
break;
}
}
}
bool MLUPostprocessPass::NeedInsert(Node* node, const Type* inst_type) {
CHECK(node->IsArg());
// some ops, for example batch_norm, have useless output nodes
if (node->outlinks.size() == 0) {
return false;
}
// check if node is weight or persistent
bool is_persist = node->AsArg().is_weight || node->AsArg().is_persist;
if (is_persist) {
VLOG(4) << "Persistent arg name: " << node->AsArg().name
<< " is_weight: " << node->AsArg().is_weight
<< " is_persist: " << node->AsArg().is_persist;
return false;
}
const auto target = node->AsArg().type->target();
const auto precision = node->AsArg().type->precision();
const auto layout = node->AsArg().type->layout();
VLOG(4) << "arg name: " << node->AsArg().name
<< " type: " << TargetToStr(target) << ", "
<< PrecisionToStr(precision) << ", " << DataLayoutToStr(layout);
// do not insert nodes if previous node is on mlu already
if (target == inst_type->target()) {
CHECK(layout == inst_type->layout()) << "MLU node has wrong layout";
return false;
}
return true;
}
void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
Node* tail_node,
Node* inst_node,
const Type* inst_type) {
const auto* tail_type = tail_node->AsArg().type;
// break original link
RemoveDirectedLink(inst_node, tail_node);
auto* cur_node = tail_node;
const auto name_prefix =
tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_";
// layout cast node
if (tail_type->layout() != inst_type->layout()) {
cur_node = InsertCastAfter(
"transpose",
name_prefix + "transpose",
graph,
cur_node,
inst_node,
LiteType::GetTensorTy(
tail_type->target(), tail_type->precision(), inst_type->layout()));
}
// precision cast node
if (tail_type->precision() != inst_type->precision()) {
cur_node = InsertCastAfter(
"cast",
name_prefix + "cast",
graph,
cur_node,
inst_node,
LiteType::GetTensorTy(
tail_type->target(), inst_type->precision(), inst_type->layout()));
}
// io copy
cur_node = InsertCastAfter(
"io_copy",
name_prefix + "io_copy",
graph,
cur_node,
inst_node,
LiteType::GetTensorTy(
inst_type->target(), inst_type->precision(), inst_type->layout()));
// connect cur_node to inst_node
DirectedLink(inst_node, cur_node);
// reset opdesc and update kernel information
UpdateOutputTo(inst_node->AsStmt().op()->mutable_op_info(),
tail_node->AsArg().name,
cur_node->AsArg().name);
// for subgraph op, modify the BlockDesc
auto* sub_block_desc = dynamic_cast<paddle::lite::operators::SubgraphOp*>(
inst_node->AsStmt().op().get())
->GetSubBlock();
for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) {
auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i);
UpdateOutputTo(
sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
}
// recreate the op
RecreateOp(inst_node, graph);
graph->CheckValid();
}
void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) {
auto original_selected_kernel =
std::move(inst_node->AsStmt().kernels().front());
auto updated_op_info = *inst_node->AsStmt().mutable_op_info();
inst_node->AsStmt().ResetOp(updated_op_info, graph->valid_places());
inst_node->AsStmt().kernels().clear();
inst_node->AsStmt().kernels().emplace_back(
std::move(original_selected_kernel));
for (auto& kernel : inst_node->AsStmt().kernels()) {
VLOG(4) << "kernel info: " << kernel->name();
inst_node->AsStmt().op()->AttachKernel(kernel.get());
}
}
void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
for (auto& node : graph->mutable_nodes()) {
if (!node.IsStmt()) continue;
if (node.AsStmt().op_type() == "feed") {
for (auto& out : node.outlinks) {
bool change = true;
for (auto& inst : out->outlinks) {
if (inst->AsStmt().op_type() != "subgraph") {
change = false;
break;
}
}
if (change) {
const auto* old_type = out->AsArg().type;
out->AsArg().type =
LiteType::GetTensorTy(old_type->target(),
old_type->precision(),
::paddle::lite_api::DataLayoutType::kNHWC,
old_type->device());
}
}
}
if (node.AsStmt().op_type() == "fetch") {
for (auto& inp : node.inlinks) {
bool change = true;
for (auto& inst : inp->inlinks) {
if (inst->AsStmt().op_type() != "subgraph") {
change = false;
break;
}
}
if (change) {
const auto* old_type = inp->AsArg().type;
inp->AsArg().type =
LiteType::GetTensorTy(old_type->target(),
old_type->precision(),
::paddle::lite_api::DataLayoutType::kNHWC,
old_type->device());
}
}
}
}
}
void MLUPostprocessPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// Currently, for non-persistent input and output args, the MLU subgraph op
// only supports the float16/float32 data types, in the following two
// situations:
// 1: feed->arg_in->subgraph->...   2: ...->subgraph->arg_out->fetch;
// arg_in and arg_out are assumed to be NHWC, which the user should be aware
// of. Thus we change these args' layout to NHWC here.
ModifyLayout(graph.get());
// insert io_copy, layout and precision cast of subgraph's inputs and outputs
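// e.g. (a sketch, assuming a host/NCHW/float32 arg feeding an MLU/NHWC/fp16
// subgraph) InsertBefore builds the chain:
//   arg -> transpose(NCHW->NHWC) -> cast(fp32->fp16) -> io_copy(host->MLU) -> subgraph
// and InsertAfter builds the mirrored chain for the subgraph's outputs.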
for (auto& node : graph->mutable_nodes()) {
if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") {
const Type* subgraph_arg_type = nullptr;
GetSubgraphOpArgType(&node, &subgraph_arg_type, graph.get());
auto links_tmp = node.inlinks;
for (auto p_in : links_tmp) {
if (NeedInsert(p_in, subgraph_arg_type)) {
InsertBefore(graph.get(), p_in, &node, subgraph_arg_type);
}
}
links_tmp.assign(node.outlinks.begin(), node.outlinks.end());
for (auto p_out : links_tmp) {
if (NeedInsert(p_out, subgraph_arg_type)) {
InsertAfter(graph.get(), p_out, &node, subgraph_arg_type);
}
}
}
}
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(mlu_postprocess_pass, paddle::lite::mir::MLUPostprocessPass)
.BindTargets({TARGET(kMLU)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "lite/core/mir/pass.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace mir {
static void UpdateInputTo(cpp::OpDesc* desc,
const std::string& from,
const std::string& to) {
for (auto& item : *desc->mutable_inputs()) {
for (auto& input : item.second) {
if (input == from) {
input = to;
}
}
}
if (desc->Type() != "subgraph") return;
auto input_names =
desc->GetAttr<std::vector<std::string>>("input_data_names");
for (size_t i = 0; i < input_names.size(); ++i) {
if (input_names[i] == from) {
input_names[i] = to;
}
}
desc->SetAttr<std::vector<std::string>>("input_data_names", input_names);
}
static void UpdateOutputTo(cpp::OpDesc* desc,
const std::string& from,
const std::string& to) {
for (auto& item : *desc->mutable_outputs()) {
for (auto& output : item.second) {
if (output == from) {
output = to;
}
}
}
if (desc->Type() != "subgraph") return;
auto output_names =
desc->GetAttr<std::vector<std::string>>("output_data_names");
for (size_t i = 0; i < output_names.size(); ++i) {
if (output_names[i] == from) {
output_names[i] = to;
}
}
desc->SetAttr<std::vector<std::string>>("output_data_names", output_names);
}
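// Example with hypothetical names: UpdateInputTo(desc, "conv_in",
// "conv_in_0x1234/trans_io_copy") renames the op input and, for subgraph ops,
// the matching entry of the "input_data_names" attribute; UpdateOutputTo does
// the same for outputs and "output_data_names".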
/*
* This pass adapts the argument nodes that precede or follow an MLU subgraph
* op, inserting io_copy, layout and precision cast nodes so their types match
* the subgraph.
* */
class MLUPostprocessPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
private:
void GetSubgraphOpArgType(Node* inst_node,
const Type** arg_type,
SSAGraph* graph);
void ModifyLayout(SSAGraph* graph);
bool NeedInsert(Node* node, const Type* inst_type);
void InsertBefore(SSAGraph* graph,
Node* head_node,
Node* inst_node,
const Type* type);
void InsertAfter(SSAGraph* graph,
Node* tail_node,
Node* inst_node,
const Type* type);
Node* InsertCastBefore(const std::string& op_type,
const std::string& cast_arg_name,
SSAGraph* graph,
Node* cur_node,
Node* inst_node,
const Type* cast_type);
Node* InsertCastAfter(const std::string& op_type,
const std::string& cast_arg_name,
SSAGraph* graph,
Node* cur_node,
Node* inst_node,
const Type* cast_type);
void RecreateOp(Node* inst_node, SSAGraph* graph);
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/pass.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
class SubgraphCastDisplayPass : public DebugPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
VLOG(3) << "== Argument types ==";
for (auto& node : graph->mutable_nodes()) {
if (!node.IsArg()) continue;
auto* type = node.AsArg().type;
if (type) {
VLOG(3) << "* ARG " << node.AsArg().name << " type: " << *type;
} else {
VLOG(3) << "* ARG " << node.AsArg().name << " type: UNK";
}
}
VLOG(3) << "---------------------";
//
VLOG(0) << "== SubgraphOp Debug Info ==";
for (auto& node : graph->mutable_nodes()) {
if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") {
VLOG(0) << "FOUND SUBGRAPH OP";
display_debug_info(node, "subgraph");
break;
}
}
VLOG(0) << "---------------------";
}
void display_debug_info(const Node& node,
std::string op_type,
bool display_in_nodes = true,
bool display_out_nodes = true) {
CHECK(node.IsStmt());
VLOG(0) << node.AsStmt();
if (display_in_nodes) {
for (auto p_in_arg_node : node.inlinks) {
CHECK(p_in_arg_node->IsArg());
VLOG(0) << "* ARG[IN] " << p_in_arg_node->AsArg().name
<< " type: " << *p_in_arg_node->AsArg().type
<< " is_weight: " << p_in_arg_node->AsArg().is_weight
<< " is_persist: " << p_in_arg_node->AsArg().is_persist
<< " input_count: " << p_in_arg_node->inlinks.size();
if (p_in_arg_node->inlinks.size() == 0) {
VLOG(0) << "** END with No Op";
}
for (auto p_in_stmt_node : p_in_arg_node->inlinks) {
CHECK(p_in_stmt_node->IsStmt());
std::string stmt_op_type = p_in_stmt_node->AsStmt().op_type();
if (stmt_op_type == "cast" || stmt_op_type == "transpose" ||
stmt_op_type == "io_copy") {
display_debug_info(*p_in_stmt_node, stmt_op_type, true, false);
} else {
VLOG(0) << "** END with op type: " << stmt_op_type;
}
}
}
}
if (display_out_nodes) {
for (auto p_out_arg_node : node.outlinks) {
CHECK(p_out_arg_node->IsArg());
VLOG(0) << "* ARG[OUT] " << p_out_arg_node->AsArg().name
<< " type: " << *p_out_arg_node->AsArg().type
<< " is_weight: " << p_out_arg_node->AsArg().is_weight
<< " is_persist: " << p_out_arg_node->AsArg().is_persist
<< " output_count: " << p_out_arg_node->outlinks.size();
if (p_out_arg_node->outlinks.size() == 0) {
VLOG(0) << "** END with No Op";
}
for (auto p_out_stmt_node : p_out_arg_node->outlinks) {
CHECK(p_out_stmt_node->IsStmt());
std::string stmt_op_type = p_out_stmt_node->AsStmt().op_type();
if (stmt_op_type == "cast" || stmt_op_type == "transpose" ||
stmt_op_type == "io_copy") {
display_debug_info(*p_out_stmt_node, stmt_op_type, false, true);
} else {
VLOG(0) << "** END with op type: " << stmt_op_type;
}
}
}
}
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(subgraph_cast_display_pass,
paddle::lite::mir::SubgraphCastDisplayPass)
.BindTargets({TARGET(kAny)});
if(NOT LITE_WITH_MLU)
return()
endif()
add_subdirectory(bridges)
add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${mlu_subgraph_bridges})
add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
if(NOT LITE_WITH_MLU)
return()
endif()
lite_cc_library(subgraph_bridge_utility_mlu SRCS utility.cc DEPS ${mlu_builder_libs} tensor)
lite_cc_library(subgraph_bridge_tensor_mlu SRCS tensor.cc DEPS ${mlu_builder_libs})
lite_cc_library(subgraph_bridge_graph_mlu SRCS graph.cc DEPS subgraph_bridge_utility_mlu subgraph_bridge_tensor_mlu)
set(mlu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_engine subgraph_bridge_utility_mlu subgraph_bridge_graph_mlu)
lite_cc_library(subgraph_bridge_act_op_mlu SRCS act_op.cc DEPS ${mlu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_batch_norm_op_mlu SRCS batch_norm_op.cc DEPS ${mlu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_conv_op_mlu SRCS conv_op.cc DEPS ${mlu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_elementwise_ops_mlu SRCS elementwise_ops.cc DEPS ${mlu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_pool_op_mlu SRCS pool_op.cc DEPS ${mlu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_softmax_op_mlu SRCS softmax_op.cc DEPS ${mlu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_fc_op_mlu SRCS fc_op.cc DEPS ${mlu_subgraph_bridge_deps})
set(mlu_subgraph_bridges
subgraph_bridge_registry
subgraph_bridge_utility_mlu
subgraph_bridge_graph_mlu
subgraph_bridge_act_op_mlu
subgraph_bridge_conv_op_mlu
subgraph_bridge_elementwise_ops_mlu
subgraph_bridge_pool_op_mlu
subgraph_bridge_softmax_op_mlu
subgraph_bridge_fc_op_mlu
subgraph_bridge_batch_norm_op_mlu
CACHE INTERNAL "mlu_subgraph_bridges")
# lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges})
# lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
// Create act node and set params from op
auto x_var_name = op_info->Input("X").front();
auto out_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize();
auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType());
CHECK(graph->HasNode(x_var_name));
auto input_tensor = graph->GetNode(x_var_name);
cnmlActiveFunction_t act_type = OpTypeToCNMLActType(op_type);
cnmlBaseOp_t activation_op;
CNML_CALL(cnmlCreateActiveOp(&activation_op,
act_type,
input_tensor->mlu_tensor(),
output_tensor->mlu_tensor()));
graph->FuseOp(activation_op);
return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/operators/activation_ops.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int ActConverter(void* ctx, OpLite* op, KernelBase* kernel);
template void FillTensor<float, int>(Tensor* x, float lower, float upper);
void act_ref(const std::shared_ptr<operators::ActivationOp> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto op_type = op_info->Type();
auto x = scope->FindTensor("x");
auto out = scope->FindMutableTensor("out");
auto out_ref = scope->FindMutableTensor("out_ref");
out->Resize(x->dims());
out_ref->Resize(x->dims());
auto x_data = x->data<float>();
auto out_data = out->mutable_data<float>();
CHECK_EQ(x->numel(), out->numel());
// "sigmoid","relu","tanh","relu_clipped","leaky_relu","softsign","hard_sigmoid"
if (op_type == "sigmoid") {
for (size_t i = 0; i < out->numel(); i++) {
out_data[i] = 1.f / (1.f + std::exp(-x_data[i]));
}
} else if (op_type == "relu") {
for (size_t i = 0; i < out->numel(); i++) {
out_data[i] = std::max(0.f, x_data[i]);
}
} else if (op_type == "tanh") {
for (size_t i = 0; i < out->numel(); i++) {
out_data[i] = (std::exp(x_data[i]) - std::exp(-x_data[i])) /
(std::exp(x_data[i]) + std::exp(-x_data[i]));
}
} else if (op_type == "relu_clipped") {
auto relu_clipped_coef = op_info->GetAttr<float>("Relu_clipped_coef");
for (size_t i = 0; i < out->numel(); i++) {
out_data[i] = std::min(std::max(0.f, x_data[i]), relu_clipped_coef);
}
} else if (op_type == "relu6") {
for (size_t i = 0; i < out->numel(); i++) {
out_data[i] = std::min(std::max(0.f, x_data[i]), 6.f);
}
} else if (op_type == "leaky_relu") {
auto alpha = op_info->GetAttr<float>("alpha");
for (size_t i = 0; i < out->numel(); i++) {
out_data[i] = std::max(x_data[i], x_data[i] * alpha);
}
} else if (op_type == "softsign") {
for (size_t i = 0; i < out->numel(); i++) {
out_data[i] = x_data[i] / (1 + std::abs(x_data[i]));
}
} else if (op_type == "hard_sigmoid") {
auto slope = op_info->GetAttr<float>("slope");
auto offset = op_info->GetAttr<float>("offset");
for (size_t i = 0; i < out->numel(); i++) {
out_data[i] = std::min(1.f, slope * x_data[i] + offset);
out_data[i] = std::max(0.f, out_data[i]);
}
} else {
LOG(FATAL) << "unsupported activation type: " << op_type;
}
}
void test_act(std::vector<int64_t> x_shape, std::string op_type) {
// prepare input&output variables
Scope scope;
std::string x_var_name("x");
std::string out_var_name("out");
std::string out_ref_var_name("out_ref");
auto* x = scope.NewTensor(x_var_name);
auto* out = scope.NewTensor(out_var_name);
auto* out_ref = scope.NewTensor(out_ref_var_name);
x->Resize(x_shape);
// initialize input&output data
FillTensor<float>(x, 2, 8);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType(op_type);
opdesc.SetInput("X", {x_var_name});
opdesc.SetOutput("Out", {out_var_name});
if (op_type == "relu_clipped") {
opdesc.SetAttr("Relu_clipped_coef", 3.f);
} else if (op_type == "relu6") {
opdesc.SetAttr("Relu_clipped_coef", 6.f);
} else if (op_type == "leaky_relu") {
opdesc.SetAttr("alpha", 0.02f);
} else if (op_type == "hard_sigmoid") {
opdesc.SetAttr("slope", 0.2f);
opdesc.SetAttr("offset", 0.5f);
}
// create and convert op to MLU model, then run it on MLU
auto op = CreateOp<operators::ActivationOp>(opdesc, &scope);
// execute reference implementation and save to output tensor
act_ref(op);
out_ref->CopyDataFrom(*out);
LaunchOp(op, {x_var_name}, {out_var_name});
// compare results
auto* out_data = out->mutable_data<float>();
auto* out_ref_data = out_ref->mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
}
}
TEST(MLUBridges, activation) {
std::vector<std::vector<int64_t>> shapes{{1}, {2, 3}, {1, 2, 3, 4}};
std::vector<std::string> types{"sigmoid", "relu", "tanh"};
for (auto x_shape : shapes) {
for (auto op_type : types) {
test_act(x_shape, op_type);
}
}
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, relu, paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU,
sigmoid,
paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU, tanh, paddle::lite::subgraph::mlu::ActConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
// Get input vars and op attributes
auto x_var_name = op_info->Input("X").front();
auto scale_var_name = op_info->Input("Scale").front();
auto bias_var_name = op_info->Input("Bias").front();
auto mean_var_name = op_info->Input("Mean").front();
auto variance_var_name = op_info->Input("Variance").front();
auto y_var_name = op_info->Output("Y").front();
auto epsilon = op_info->GetAttr<float>("epsilon");
auto output = scope->FindVar(y_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize();
auto output_tensor = graph->AddNode(
y_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType());
CHECK(graph->HasNode(x_var_name));
auto mean = scope->FindVar(mean_var_name)->GetMutable<Tensor>();
auto mean_dims = mean->dims().Vectorize();
auto mean_tensor = graph->AddNode(
mean_var_name, mean_dims, CNML_CONST, CNML_CNHW, graph->FPType());
auto variance = scope->FindVar(variance_var_name)->GetMutable<Tensor>();
auto variance_dims = variance->dims().Vectorize();
auto variance_tensor = graph->AddNode(
variance_var_name, variance_dims, CNML_CONST, CNML_CNHW, graph->FPType());
auto scale = scope->FindVar(scale_var_name)->GetMutable<Tensor>();
auto bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();
int co = static_cast<int>(mean_dims[0]);
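// Fold scale/bias into the variance/mean buffers so that the fused CNML batch
// norm (assumed here to compute alpha * (x - mean')) reproduces
// scale * (x - mean) / sqrt(var + eps) + bias:
//   alpha = scale / sqrt(var + eps)  (stored in variance)
//   mean' = mean - bias / alpha      (stored in mean)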
for (int i = 0; i < co; ++i) {
variance->mutable_data<float>()[i] =
scale->data<float>()[i] / sqrtf(variance->data<float>()[i] + epsilon);
mean->mutable_data<float>()[i] =
mean->data<float>()[i] -
bias->data<float>()[i] / variance->data<float>()[i];
}
auto input_tensor = graph->GetNode(x_var_name);
cnmlBaseOp_t bn_op;
CNML_CALL(cnmlCreateBatchNormOpForward(&bn_op,
input_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
mean_tensor->mlu_tensor(),
variance_tensor->mlu_tensor()));
graph->BindConstData(variance_var_name, variance);
graph->BindConstData(mean_var_name, mean);
graph->FuseOp(bn_op);
return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(batch_norm,
kMLU,
paddle::lite::subgraph::mlu::BatchNormConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/batch_norm_op.h"
#include <gtest/gtest.h>
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel);
template <typename dtype>
void batch_norm_ref(const std::shared_ptr<operators::BatchNormOp> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
auto y = scope->FindVar(op_info->Output("Y").front())->GetMutable<Tensor>();
auto bias =
scope->FindVar(op_info->Input("Bias").front())->GetMutable<Tensor>();
auto scale =
scope->FindVar(op_info->Input("Scale").front())->GetMutable<Tensor>();
auto mean =
scope->FindVar(op_info->Input("Mean").front())->GetMutable<Tensor>();
auto variance =
scope->FindVar(op_info->Input("Variance").front())->GetMutable<Tensor>();
auto x_data = x->data<dtype>();
auto y_data = y->mutable_data<dtype>();
auto scale_data = scale->mutable_data<dtype>();
auto bias_data = bias->mutable_data<dtype>();
auto mean_data = mean->mutable_data<dtype>();
auto variance_data = variance->mutable_data<dtype>();
DDim x_dims = x->dims();
float epsilon = op_info->GetAttr<float>("epsilon");
// float momentum = op_info->GetAttr<float>("momentum");
auto data_layout = op_info->GetAttr<std::string>("data_layout");
bool global_stats = op_info->GetAttr<bool>("use_global_stats");
if (global_stats) {
int64_t outer_size = 0;
int64_t channel_size = 0;
int64_t inner_size = 0;
if (data_layout == "NCHW") {
outer_size = x_dims[0];
channel_size = x_dims[1];
inner_size = x_dims.Slice(2, x_dims.size()).production();
} else {
LOG(FATAL) << "Unknown storage order: " << data_layout;
}
auto x_ptr = x_data;
auto y_ptr = y_data;
for (int o = 0; o < outer_size; o++) {
for (int c = 0; c < channel_size; c++) {
for (int i = 0; i < inner_size; i++) {
dtype norm_x =
(*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon);
*y_ptr = norm_x * scale_data[c] + bias_data[c];
x_ptr++;
y_ptr++;
}
}
}
}
}
void test_batch_norm(
int bs, int ic, int ih, int iw, float epsilon, float momentum) {
// prepare input&output variables
Scope scope;
std::string x_var_name = "x";
std::string out_var_name = "out";
std::string out_ref_var_name = "out_ref";
std::string scale_var_name = "scale";
std::string bias_var_name = "bias";
std::string mean_var_name = "mean";
std::string variance_var_name = "variance";
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* scale = scope.Var(scale_var_name)->GetMutable<Tensor>();
auto* bias = scope.Var(bias_var_name)->GetMutable<Tensor>();
auto* mean = scope.Var(mean_var_name)->GetMutable<Tensor>();
auto* variance = scope.Var(variance_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
x->Resize({bs, ic, ih, iw});
scale->Resize({ic});
bias->Resize({ic});
mean->Resize({ic});
variance->Resize({ic});
// initialize input&output data
FillTensor<float, float>(x, -100, 100);
FillTensor<float, float>(scale, -6.7, 13.78);
FillTensor<float, float>(bias, -12.11, 12.94);
FillTensor<float, float>(mean, -23.45, 67.89);
// variance > 0
FillTensor<float, float>(variance, 1.5f, 76.78f);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("batch_norm");
opdesc.SetInput("X", {x_var_name});
opdesc.SetInput("Scale", {scale_var_name});
opdesc.SetInput("Bias", {bias_var_name});
opdesc.SetInput("Mean", {mean_var_name});
opdesc.SetInput("Variance", {variance_var_name});
opdesc.SetOutput("Y", {out_var_name});
opdesc.SetAttr("is_test", 1);
opdesc.SetAttr("use_global_stats", true);
opdesc.SetAttr("epsilon", epsilon);
opdesc.SetAttr("momentum", momentum);
opdesc.SetAttr("data_layout", std::string("NCHW"));
// create and convert op to MLU model, then run it on MLU
auto op = CreateOp<operators::BatchNormOp>(opdesc, &scope);
// execute reference implementation and save to output tensor
batch_norm_ref<float>(op);
out_ref->CopyDataFrom(*out);
Tensor input_trans;
input_trans.Resize({bs, ic, ih, iw});
transpose(x->mutable_data<float>(),
input_trans.mutable_data<float>(),
{bs, ic, ih, iw},
{0, 2, 3, 1});
out->Resize({bs, ih, iw, ic});
x->CopyDataFrom(input_trans);
x->Resize({bs, ih, iw, ic});
LaunchOp(op, {x_var_name}, {out_var_name});
// compare results
auto* out_data = out->mutable_data<float>();
auto* out_ref_data = out_ref->mutable_data<float>();
Tensor output_trans;
output_trans.Resize({bs, ic, ih, iw});
transpose(out_data,
output_trans.mutable_data<float>(),
{bs, ih, iw, ic},
{0, 3, 1, 2});
out_data = output_trans.mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
}
}
TEST(MLUBridges, batch_norm) {
for (auto bs : {1, 4, 7}) {
for (auto ic : {1, 4, 7}) {
for (auto ih : {1, 4, 7}) {
for (auto iw : {1, 4, 7}) {
for (auto epsilon : {1e-4f, 1e-5f}) {
for (auto momentum : {0.9f, 0.99f}) {
test_batch_norm(bs, ic, ih, iw, epsilon, momentum);
}
}
}
}
}
}
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU,
batch_norm,
paddle::lite::subgraph::mlu::BatchNormConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/conv_op.h"
#include <algorithm>
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto* graph = static_cast<Graph*>(ctx);
const auto* op_info = op->op_info();
const auto* scope = op->scope();
VLOG(3) << "[MLU] Converting " << op_info->Type() << "... ";
// Get input, filter and op attributes
const auto input_var_name = op_info->Input("Input").front();
const auto& input_dims_nhwc =
scope->FindVar(input_var_name)->GetMutable<Tensor>()->dims();
const auto input_dims = DimNHWC2NCHW(input_dims_nhwc);
const auto filter_var_name = op_info->Input("Filter").front();
auto* filter = scope->FindVar(filter_var_name)->GetMutable<Tensor>();
const auto& filter_dims = filter->dims();
const auto output_var_name = op_info->Output("Output").front();
const auto bs = input_dims[0];
const auto oc = filter_dims[0];
CHECK_EQ(input_dims.size(), 4);
CHECK_EQ(filter_dims.size(), 4);
const auto strides = op_info->GetAttr<std::vector<int>>("strides");
auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
CHECK_EQ(strides.size(), 2L);
CHECK_EQ(dilations.size(), 2L);
if (paddings.size() == 2L) {
for (size_t i = 0; i < strides.size(); ++i) {
int copy_pad = *(paddings.begin() + 2 * i);
paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
}
}
CHECK_EQ(paddings.size(), 4L)
<< "Paddings size should be the same or twice as the input size.";
const std::string padding_algorithm =
op_info->HasAttr("padding_algorithm")
? op_info->GetAttr<std::string>("padding_algorithm")
: "";
operators::UpdatePaddingAndDilation(&paddings,
&dilations,
strides,
padding_algorithm,
input_dims,
filter_dims);
std::vector<int64_t> output_shape({bs, oc});
for (size_t i = 0; i < 2; i++) {
const int dkernel = dilations[i] * (filter_dims[2 + i] - 1) + 1;
output_shape.push_back(
(input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) /
strides[i] +
1);
}
const auto output_shape_nhwc = DimNCHW2NHWC(output_shape);
const auto output_tensor = graph->AddNode(output_var_name,
output_shape_nhwc,
CNML_TENSOR,
CNML_NHWC,
graph->FPType());
scope->FindVar(output_var_name)
->GetMutable<::paddle::lite::Tensor>()
->Resize(output_shape_nhwc);
// Create filter node
const auto filter_tensor = graph->AddNode(filter_var_name,
filter_dims.Vectorize(),
CNML_FILTER,
CNML_NCHW,
graph->FPType());
const auto weight_scale =
op_info->GetAttr<std::vector<float>>("weight_scale");
if (filter->precision() == PrecisionType::kUnk ||
filter->precision() == PrecisionType::kInt8) {
std::vector<float> filter_dequant(filter->data_size());
dequant(filter_dequant.data(),
filter->mutable_data<int8_t>(),
1,
filter_dims[0],
filter_dims[1] * filter_dims[2] * filter_dims[3],
weight_scale);
transpose(filter_dequant.data(),
filter->mutable_data<float>(),
{static_cast<int>(filter_dims[0]),
static_cast<int>(filter_dims[1]),
static_cast<int>(filter_dims[2]),
static_cast<int>(filter_dims[3])},
{0, 2, 3, 1});
filter->set_precision(PrecisionType::kFloat);
} else if (filter->precision() != PrecisionType::kFloat) {
LOG(FATAL) << "UnSupported weight precision!";
}
cnmlConvOpParam_t conv_param;
CNML_CALL(cnmlCreateConvOpParam(&conv_param,
strides[0],
strides[1],
dilations[0],
dilations[1],
paddings[0] * 2,
paddings[2] * 2));
std::string bias_var_name;
std::shared_ptr<MLUTensor> bias_tensor;
if (HasInputArg(op_info, scope, "Bias")) {
const DDim output_dims(output_shape);
bias_var_name = op_info->Input("Bias").front();
auto* bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();
const auto& bias_dims = bias->dims();
const auto bias_data_size = bias_dims.production();
const auto output_data_size = output_dims.production();
std::vector<int64_t> bias_shape;
if (bias_data_size == oc) {
// 0: {oc}
bias_shape = {oc};
} else if (bias_data_size == output_data_size / bs) {
LOG(FATAL) << "Unsupported ... ...";
// 1: {1, oc, oh, ow}
bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]};
} else if (bias_data_size == output_data_size) {
LOG(FATAL) << "Unsupported ... ...";
// 2: {n, oc, oh, ow}
bias_shape = output_dims.Vectorize();
} else {
LOG(ERROR) << "[MLU] Bias dimension " << bias_dims
<< " isn't supported in conv2d Op when output dimension is "
<< output_dims;
}
bias_tensor = graph->AddNode(bias_var_name,
bias_dims.Vectorize(),
CNML_CONST,
CNML_CNHW,
graph->FPType());
graph->BindConstData(bias_var_name, bias);
}
cnmlBaseOp_t conv_op;
const auto input_scale = op_info->GetAttr<float>("input_scale");
CNML_CALL(cnmlCreateConvOpForward(
&conv_op,
conv_param,
graph->GetNode(input_var_name)->mlu_tensor(),
output_tensor->mlu_tensor(),
filter_tensor->mlu_tensor(),
bias_tensor ? bias_tensor->mlu_tensor() : nullptr));
graph->SetComputingDataType(
conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale);
graph->SetComputingDataType(
conv_op,
filter_tensor->mlu_tensor(),
1 / *std::min_element(weight_scale.begin(), weight_scale.end()));
CNML_CALL(cnmlSetOperationComputingLayout(conv_op, CNML_NHWC));
if (HasInputArg(op_info, scope, "Bias")) {
auto* bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();
graph->BindConstData(bias_var_name, bias);
}
graph->BindConstData(filter_var_name, filter);
graph->FuseOp(conv_op);
CNML_CALL(cnmlDestroyConvOpParam(&conv_param));
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(conv2d,
kMLU,
paddle::lite::subgraph::mlu::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d,
kMLU,
paddle::lite::subgraph::mlu::ConvConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/conv_op.h"
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel);
void conv_ref(const std::shared_ptr<operators::ConvOpLite> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto input =
scope->FindVar(op_info->Input("Input").front())->GetMutable<Tensor>();
auto filter =
scope->FindVar(op_info->Input("Filter").front())->GetMutable<Tensor>();
auto output =
scope->FindVar(op_info->Output("Output").front())->GetMutable<Tensor>();
std::vector<int32_t> strides =
op_info->GetAttr<std::vector<int32_t>>("strides");
std::vector<int32_t> paddings =
op_info->GetAttr<std::vector<int32_t>>("paddings");
int32_t groups = op_info->GetAttr<int32_t>("groups");
std::vector<int32_t> dilations =
op_info->GetAttr<std::vector<int32_t>>("dilations");
bool fuse_relu = op_info->GetAttr<bool>("fuse_relu");
auto input_dims = input->dims();
auto filter_dims = filter->dims();
auto output_dims = output->dims();
auto input_data = input->mutable_data<float>();
auto filter_data = filter->mutable_data<float>();
auto output_data = output->mutable_data<float>();
int kernel_w = filter_dims[3];
int kernel_h = filter_dims[2];
int stride_w = strides[1];
int stride_h = strides[0];
int dila_w = dilations[1];
int dila_h = dilations[0];
int pad_w = paddings[2];
int pad_h = paddings[0];
int batch_size = input_dims[0];
int in_ch_size = input_dims[1];
int in_h = input_dims[2];
int in_w = input_dims[3];
int out_ch_size = output_dims[1];
int out_h = output_dims[2];
int out_w = output_dims[3];
int out_c_group = out_ch_size / groups;
int in_c_group = in_ch_size / groups;
Tensor* bias = nullptr;
float* bias_data = nullptr;
bool is_channel_bias = false;
if (op_info->HasInput("Bias")) {
auto bias_var_names = op_info->Input("Bias");
if (bias_var_names.size() > 0) {
auto bias_var_name = bias_var_names.front();
bias = scope->FindVar(bias_var_name)->GetMutable<lite::Tensor>();
auto bias_dims = bias->dims();
is_channel_bias = bias_dims.production() == out_ch_size;
bias_data = bias->mutable_data<float>();
}
}
for (int n = 0; n < batch_size; ++n) {
for (int g = 0; g < groups; ++g) {
for (int oc = 0; oc < out_c_group; ++oc) {
for (int oh = 0; oh < out_h; ++oh) {
for (int ow = 0; ow < out_w; ++ow) {
int out_idx = n * groups * out_c_group * out_h * out_w +
g * out_c_group * out_h * out_w + oc * out_h * out_w +
oh * out_w + ow;
float out_value =
bias_data != nullptr
? (is_channel_bias ? bias_data[g * out_c_group + oc]
: bias_data[out_idx])
: 0;
// + out_value *= beta;
for (int ic = 0; ic < in_c_group; ++ic) {
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
int iw = ow * stride_w - pad_w + kw * (dila_w);
int ih = oh * stride_h - pad_h + kh * (dila_h);
if (iw < 0 || iw >= in_w) continue;
if (ih < 0 || ih >= in_h) continue;
int in_idx = n * in_ch_size * in_h * in_w +
g * in_c_group * in_h * in_w + ic * in_h * in_w +
ih * in_w + iw;
int filter_idx =
g * out_c_group * in_c_group * kernel_h * kernel_w +
oc * in_c_group * kernel_h * kernel_w +
ic * kernel_h * kernel_w + kh * kernel_w + kw;
out_value += input_data[in_idx] * filter_data[filter_idx];
}
}
}
if (fuse_relu) {
out_value = out_value > 0 ? out_value : 0;
}
output_data[out_idx] = out_value;
}
}
}
}
}
}
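// Note: conv_ref above is a plain direct convolution over NCHW data with
// grouping. For groups == 1 the output index reduces to
//   out_idx = ((n * out_ch_size + oc) * out_h + oh) * out_w + ow,
// which is the NCHW layout the MLU results are transposed back into before
// the comparison in test_conv below.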
void test_conv(int bs,
int ic,
int oc,
int ih,
int iw,
bool has_bias,
bool is_channel_bias,
bool fuse_relu,
bool depthwise,
int dilation,
int stride,
int padding,
int kernel) {
// prepare input&output variables
Scope scope;
std::string input_var_name("input");
std::string filter_var_name("filter");
std::string filter_int_var_name("filter_int");
std::string bias_var_name("bias");
std::string output_var_name("output");
std::string output_ref_var_name("output_ref");
auto* input = scope.Var(input_var_name)->GetMutable<Tensor>();
auto* filter = scope.Var(filter_var_name)->GetMutable<Tensor>();
auto* filter_int = scope.Var(filter_int_var_name)->GetMutable<Tensor>();
auto* bias = scope.Var(bias_var_name)->GetMutable<Tensor>();
auto* output = scope.Var(output_var_name)->GetMutable<Tensor>();
auto* output_ref = scope.Var(output_ref_var_name)->GetMutable<Tensor>();
// get group size and input&filter shape
int groups = 1;
if (depthwise) { // depthwise convolution ?
groups = oc = ic;
}
std::vector<int64_t> input_shape = {bs, ic, ih, iw};
std::vector<int64_t> filter_shape = {oc, ic / groups, kernel, kernel};
std::vector<int64_t> output_shape({bs, oc});
for (size_t i = 0; i < 2; i++) {
const int dkernel = dilation * (kernel - 1) + 1;
int output_size = (input_shape[i + 2] + 2 * padding - dkernel) / stride + 1;
output_shape.push_back(output_size);
}
input->Resize(input_shape);
filter->Resize(filter_shape);
filter_int->Resize(filter_shape);
// initialize input&output data
FillTensor<int8_t, int8_t>(filter_int, -4, 4);
float filter_scale = 1. / 16;
float input_scale = 1. / 8;
Tensor input_int;
input_int.Resize(input_shape);
FillTensor<int8_t, int8_t>(&input_int, -127, 127);
for (int i = 0; i < input->data_size(); i++) {
input->mutable_data<float>()[i] = input_int.data<int8_t>()[i] * input_scale;
}
for (int i = 0; i < filter->data_size(); i++) {
filter->mutable_data<float>()[i] =
filter_int->data<int8_t>()[i] * filter_scale;
}
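// The float input/filter prepared here are fake-quantized copies of the int8
// tensors: e.g. with filter_scale = 1/16 an int8 weight of 3 becomes
// 3 * (1/16) = 0.1875, so the CPU reference (float) and the MLU path
// (int8 data plus scales) see the same numeric values.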
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType(depthwise ? "depthwise_conv2d" : "conv2d");
opdesc.SetInput("Input", {input_var_name});
opdesc.SetInput("Filter", {filter_var_name});
opdesc.SetOutput("Output", {output_var_name});
opdesc.SetAttr("dilations", std::vector<int32_t>({dilation, dilation}));
opdesc.SetAttr("strides", std::vector<int32_t>({stride, stride}));
opdesc.SetAttr("paddings",
std::vector<int32_t>({padding, padding, padding, padding}));
opdesc.SetAttr("groups", groups);
opdesc.SetAttr("fuse_relu", static_cast<bool>(fuse_relu));
if (has_bias) {
if (is_channel_bias) {
bias->Resize({oc});
} else {
bias->Resize({output_shape});
}
FillTensor<float>(bias);
opdesc.SetInput("Bias", {bias_var_name});
}
auto op_cpu = CreateOp<operators::ConvOpLite>(opdesc, &scope);
// execute reference implementation and save to output tensor('out')
conv_ref(op_cpu);
output_ref->CopyDataFrom(*output);
// initialize op desc
cpp::OpDesc opdesc_mlu;
opdesc_mlu.SetType(depthwise ? "depthwise_conv2d" : "conv2d");
opdesc_mlu.SetInput("Input", {input_var_name});
opdesc_mlu.SetInput("Filter", {filter_int_var_name});
opdesc_mlu.SetOutput("Output", {output_var_name});
opdesc_mlu.SetAttr("dilations", std::vector<int32_t>({dilation, dilation}));
opdesc_mlu.SetAttr("strides", std::vector<int32_t>({stride, stride}));
opdesc_mlu.SetAttr(
"paddings", std::vector<int32_t>({padding, padding, padding, padding}));
opdesc_mlu.SetAttr("groups", groups);
opdesc_mlu.SetAttr("fuse_relu", static_cast<bool>(fuse_relu));
opdesc_mlu.SetAttr("weight_scale", std::vector<float>(oc, filter_scale));
opdesc_mlu.SetAttr("input_scale", input_scale);
if (has_bias) {
if (is_channel_bias) {
bias->Resize({oc});
} else {
bias->Resize({output_shape});
}
FillTensor<float>(bias);
opdesc_mlu.SetInput("Bias", {bias_var_name});
}
for (int i = 0; i < bs; i++) {
for (int j = 0; j < ic; j++) {
for (int k = 0; k < ih * iw; k++) {
input->mutable_data<float>()[i * ic * ih * iw + k * ic + j] =
input_int.data<int8_t>()[i * ic * ih * iw + j * ih * iw + k] *
input_scale;
}
}
}
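// The loop above re-lays out the input from NCHW to NHWC in place:
// element (n, c, h, w) moves from offset n*ic*ih*iw + c*ih*iw + (h*iw + w)
// to offset n*ic*ih*iw + (h*iw + w)*ic + c, matching the CNML_NHWC input
// tensor expected by the MLU bridge.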
input->Resize({bs, ih, iw, ic});
output->Resize(
{output_shape[0], output_shape[2], output_shape[3], output_shape[1]});
// create and convert op to MLU model, then run it on MLU
auto op = CreateOp<operators::ConvOpLite>(opdesc_mlu, &scope);
LaunchOp(op, {input_var_name}, {output_var_name});
// compare results
auto* output_data = output->mutable_data<float>();
auto* output_ref_data = output_ref->mutable_data<float>();
Tensor output_trans;
output_trans.Resize({output_shape});
transpose(output_data,
output_trans.mutable_data<float>(),
{static_cast<int>(output_shape[0]),
static_cast<int>(output_shape[2]),
static_cast<int>(output_shape[3]),
static_cast<int>(output_shape[1])},
{0, 3, 1, 2});
output_data = output_trans.mutable_data<float>();
for (int i = 0; i < output->dims().production(); i++) {
VLOG(5) << i;
EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
}
}
TEST(MLUBridges, conv) {
#if 1
for (auto bs : {1}) {
for (auto ic : {3}) {
for (auto oc : {32}) {
for (auto ih : {13}) {
for (auto iw : {13}) {
for (auto has_bias : {false}) {
for (auto is_channel_bias : {true}) {
for (auto fuse_relu : {false}) {
for (auto depthwise : {false}) {
for (auto dilation : {1}) {
for (auto stride : {1}) {
for (auto kernel : {3}) {
// std::vector<int> paddings = {kernel / 2};
std::vector<int> paddings = {0};
if (kernel / 2 != 0) {
// paddings.push_back(0);
}
for (auto padding : paddings) {
VLOG(3) << "bs: " << bs << " ic: " << ic
<< " oc: " << oc << " ih: " << ih
<< " iw: " << iw
<< " has_bias: " << has_bias
<< " is_channel_bias: " << is_channel_bias
<< " fuse_relu: " << fuse_relu
<< " depthwise: " << depthwise
<< " dilation: " << dilation
<< " stride: " << stride
<< " padding: " << padding
<< " kernel: " << kernel;
test_conv(bs,
ic,
oc,
ih,
iw,
has_bias,
is_channel_bias,
fuse_relu,
depthwise,
dilation,
stride,
padding,
kernel);
}
}
}
}
}
}
}
}
}
}
}
}
}
#else
test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 1, 3);
test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 0, 3);
test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 2, 5);
test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 0, 5);
#endif
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(conv2d, kMLU);
USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
std::vector<int64_t> CvtYShape(const Tensor& x, Tensor* y, int axis) {
auto x_dims = x.dims();
CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only 4-D x is supported";
auto y_dims = y->dims();
CHECK_GE(x_dims.size(), y_dims.size());
if (axis < 0) {
axis += x_dims.size();
}
std::vector<int64_t> y_new_shape(y_dims.Vectorize());
if (y_new_shape.size() == 4UL) {
return y_new_shape;
}
for (int i = 0; i < axis; i++) {
y_new_shape.insert(y_new_shape.begin(), 1);
}
while (y_new_shape.size() < 4) {
y_new_shape.push_back(1);
}
CHECK_EQ(y_new_shape.size(), 4UL);
return y_new_shape;
}
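// Worked example for CvtYShape (a sketch, assuming a per-channel Y):
//   x dims = {n, c, h, w}, y dims = {c}, axis = 1
//   -> one leading 1 is inserted ({1, c}), then trailing 1s are appended,
//   -> y_new_shape = {1, c, 1, 1}, which broadcasts against x.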
int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
auto x_var_name = op_info->Input("X").front();
auto y_var_name = op_info->Input("Y").front();
auto out_var_name = op_info->Output("Out").front();
auto axis = op_info->GetAttr<int>("axis");
auto x_tensor = graph->GetNode(x_var_name);
auto x = scope->FindTensor(x_var_name);
std::shared_ptr<MLUTensor> y_tensor;
if (graph->HasNode(y_var_name)) {
y_tensor = graph->GetNode(y_var_name);
} else {
auto y = scope->FindMutableTensor(y_var_name);
auto y_new_shape = CvtYShape(*x, y, axis);
// All subgraph input tensors are built in advance, so a tensor that is not
// found in the graph here must be a const tensor.
y_tensor = graph->AddNode(
y_var_name, y_new_shape, CNML_CONST, CNML_NCHW, graph->FPType());
graph->BindConstData(y_var_name, y);
}
auto output_tensor = graph->AddNode(out_var_name,
x->dims().Vectorize(),
CNML_TENSOR,
CNML_NHWC,
graph->FPType());
cnmlBaseOp_t elementwise_op;
if (op_type == "elementwise_add") {
CNML_CALL(cnmlCreateBroadcastAddOp(&elementwise_op,
x_tensor->mlu_tensor(),
y_tensor->mlu_tensor(),
output_tensor->mlu_tensor()));
} else if (op_type == "fusion_elementwise_add_activation") {
auto mid_tensor = graph->AddNode(out_var_name + "_mid",
x->dims().Vectorize(),
CNML_TENSOR,
CNML_NHWC,
graph->FPType());
CNML_CALL(cnmlCreateBroadcastAddOp(&elementwise_op,
x_tensor->mlu_tensor(),
y_tensor->mlu_tensor(),
mid_tensor->mlu_tensor()));
} else if (op_type == "elementwise_sub") {
CNML_CALL(cnmlCreateBroadcastSubOp(&elementwise_op,
x_tensor->mlu_tensor(),
y_tensor->mlu_tensor(),
output_tensor->mlu_tensor()));
} else if (op_type == "elementwise_mul") {
CNML_CALL(cnmlCreateBroadcastMultOp(&elementwise_op,
x_tensor->mlu_tensor(),
y_tensor->mlu_tensor(),
output_tensor->mlu_tensor()));
} else if (op_type == "elementwise_div") {
CNML_CALL(cnmlCreateRealDivOp(&elementwise_op,
x_tensor->mlu_tensor(),
y_tensor->mlu_tensor(),
output_tensor->mlu_tensor()));
} else {
LOG(WARNING) << "[MLU] Unsupported op type: " << op_type;
return FAILED;
}
graph->FuseOp(elementwise_op);
cnmlBaseOp_t act_op;
if (op_type == "fusion_elementwise_add_activation") {
auto mid_tensor = graph->GetNode(out_var_name + "_mid");
auto type_string = op_info->GetAttr<std::string>("act_type");
cnmlActiveFunction_t act_type = OpTypeToCNMLActType(type_string);
CNML_CALL(cnmlCreateActiveOp(&act_op,
act_type,
mid_tensor->mlu_tensor(),
output_tensor->mlu_tensor()));
graph->FuseOp(act_op);
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(elementwise_add,
kMLU,
paddle::lite::subgraph::mlu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation,
kMLU,
paddle::lite::subgraph::mlu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(elementwise_sub,
kMLU,
paddle::lite::subgraph::mlu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(elementwise_mul,
kMLU,
paddle::lite::subgraph::mlu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(elementwise_div,
kMLU,
paddle::lite::subgraph::mlu::ElementwiseConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/elementwise_ops.h"
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel);
template <typename dtype>
void elementwise_add_ref(const std::shared_ptr<operators::ElementwiseOp> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto x = scope->FindTensor("x");
auto y = scope->FindTensor("y");
auto out = scope->FindMutableTensor("out_ref");
out->Resize(x->dims());
auto x_data = x->data<dtype>();
auto y_data = y->data<dtype>();
auto out_data = out->mutable_data<dtype>();
auto x_dims = x->dims();
auto y_dims = y->dims();
int axis = op_info->GetAttr<int>("axis");
if (axis < 0) {
axis += x_dims.size();
}
int batch = 1;
int channels = y->numel();
int num = x->numel() / channels / batch;
// do elementwise add/sub/max...
std::string op_type = op_info->Type();
if (op_type == "elementwise_add") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype* din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype* dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *din_ptr + diny_data;
dout_ptr++;
din_ptr++;
}
}
}
} else if (op_type == "elementwise_sub") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype* din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype* dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *din_ptr - diny_data;
dout_ptr++;
din_ptr++;
}
}
}
} else if (op_type == "elementwise_mul") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype* din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype* dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *din_ptr * diny_data;
dout_ptr++;
din_ptr++;
}
}
}
} else if (op_type == "elementwise_div") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype* din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype* dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *din_ptr / diny_data;
dout_ptr++;
din_ptr++;
}
}
}
} else if (op_type == "elementwise_max") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype* din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype* dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = std::max(*din_ptr, diny_data);
dout_ptr++;
din_ptr++;
}
}
}
} else {
LOG(FATAL) << "unsupported Elementwise type: " << op_type;
}
}
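// Note: the reference above assumes Y is broadcast along a single dimension
// (batch = 1, channels = y->numel(), num = x->numel() / channels). When x and
// y have identical shapes, channels == x->numel() and num == 1, i.e. a plain
// element-by-element operation, which is the case exercised in the test below.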
void test_elementwise_add(const std::vector<int64_t>& x_shape,
const std::vector<int64_t>& y_shape,
int axis,
std::string elt_type) {
// prepare input&output variables
Scope scope;
std::string x_var_name = "x";
std::string y_var_name = "y";
std::string out_var_name = "out";
std::string out_ref_var_name = "out_ref";
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* y = scope.Var(y_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
x->Resize(x_shape);
y->Resize(y_shape);
// initialize input&output data
FillTensor<float>(x, 1, 3);
FillTensor<float>(y, 1, 3);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("elementwise_" + elt_type);
opdesc.SetInput("X", {x_var_name});
opdesc.SetInput("Y", {y_var_name});
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("axis", axis);
// create and convert op to MLU model, then run it on MLU
auto op = CreateOp<operators::ElementwiseOp>(opdesc, &scope);
// execute reference implementation and save to output tensor
elementwise_add_ref<float>(op);
out_ref->CopyDataFrom(*out);
LaunchOp(op, {x_var_name, y_var_name}, {out_var_name});
// compare results
auto* out_data = out->mutable_data<float>();
auto* out_ref_data = out_ref->mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
}
}
TEST(MLUBridges, elementwise_add) {
for (auto elt_type : {"add", "sub", "mul", "div"}) {
// test_elementwise_add({1, 2, 3, 4}, {2}, 1, elt_type);
// test_elementwise_add({1, 2, 3, 4}, {1, 2, 1, 1}, 1, elt_type);
test_elementwise_add({1, 2, 3, 4}, {1, 2, 3, 4}, 3, elt_type);
}
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU);
USE_SUBGRAPH_BRIDGE(elementwise_sub, kMLU);
USE_SUBGRAPH_BRIDGE(elementwise_mul, kMLU);
USE_SUBGRAPH_BRIDGE(elementwise_div, kMLU);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
auto x_var_name = op_info->Input("Input").front();
auto w_var_name = op_info->Input("W").front();
auto output_var_name = op_info->Output("Out").front();
// int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
auto w = scope->FindVar(w_var_name)->GetMutable<Tensor>();
auto x_dims = x->dims();
auto w_dims = w->dims();
CHECK_GE(x_dims.size(), 2UL);
CHECK_EQ(w_dims.size(), 2UL);
// Create w node
std::vector<int64_t> w_shape{w_dims[1], w_dims[0]};
auto w_tensor = graph->AddNode(
w_var_name, w_shape, CNML_FILTER, CNML_NCHW, graph->FPType());
auto input_scale = op_info->GetAttr<float>("input_scale");
std::vector<int64_t> output_shape_nhwc({1, 1, 1, w_dims[1]});
auto output_tensor = graph->AddNode(output_var_name,
output_shape_nhwc,
CNML_TENSOR,
CNML_NHWC,
graph->FPType());
scope->FindVar(output_var_name)
->GetMutable<::paddle::lite::Tensor>()
->Resize(output_shape_nhwc);
std::string bias_var_name;
std::shared_ptr<MLUTensor> bias_tensor;
// Add bias node if bias tensor exists
if (HasInputArg(op_info, scope, "Bias")) {
bias_var_name = op_info->Input("Bias").front();
auto bias = scope->FindVar(bias_var_name)->GetMutable<lite::Tensor>();
auto bias_dims = bias->dims();
CHECK(!graph->HasNode(bias_var_name));
// CHECK_EQ(bias_dims.production(), n);
bias_tensor = graph->AddNode(bias_var_name,
bias_dims.Vectorize(),
CNML_CONST,
CNML_CNHW,
graph->FPType());
graph->BindConstData(bias_var_name, bias);
}
cnmlBaseOp_t fc_op;
CNML_CALL(cnmlCreateMlpOp(&fc_op,
graph->GetNode(x_var_name)->mlu_tensor(),
output_tensor->mlu_tensor(),
w_tensor->mlu_tensor(),
bias_tensor ? bias_tensor->mlu_tensor() : nullptr));
graph->SetComputingDataType(
fc_op, graph->GetNode(x_var_name)->mlu_tensor(), 1 / input_scale);
auto weight_scale = op_info->GetAttr<std::vector<float>>("weight_scale");
// LOG(INFO) << "W precision " << int(w->precision());
if (w->precision() == PrecisionType::kUnk ||
w->precision() == PrecisionType::kInt8) {
std::vector<float> w_dequant(w->data_size());
dequant(w_dequant.data(),
w->mutable_data<int8_t>(),
1,
w_dims[1],
w_dims[0],
weight_scale);
for (int i = 0; i < w_dims[1]; i++) {
for (int j = 0; j < w_dims[0]; j++) {
w->mutable_data<float>()[i * w_dims[0] + j] =
w_dequant[i + j * w_dims[1]];
}
}
w->set_precision(PrecisionType::kFloat);
} else if (w->precision() != PrecisionType::kFloat) {
LOG(FATAL) << "UnSupported weight precision!";
}
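// When the weights arrive as int8, the dequant call above converts them back
// to float and the nested loops transpose them from Paddle's (K, N) layout
// into the (N, K) layout of the MLU filter node created earlier: element
// (k, n) read from w_dequant[k * N + n] is written to w[n * K + k], with
// K = w_dims[0] and N = w_dims[1].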
// graph->BindConstData(w_var_name, w_dequant.data());
graph->BindConstData(w_var_name, w);
graph->SetComputingDataType(
fc_op,
w_tensor->mlu_tensor(),
1 / *min_element(weight_scale.begin(), weight_scale.end()));
graph->FuseOp(fc_op);
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(fc, kMLU, paddle::lite::subgraph::mlu::FCConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/fc_op.h"
#include <gtest/gtest.h>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int FCConverter(void* ctx, OpLite* op, KernelBase* kernel);
void fc_ref(const std::shared_ptr<operators::FcOpLite> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto input =
scope->FindVar(op_info->Input("Input").front())->GetMutable<Tensor>();
auto w = scope->FindVar(op_info->Input("W").front())->GetMutable<Tensor>();
auto out =
scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
int32_t in_num_col_dims = op_info->GetAttr<int32_t>("in_num_col_dims");
Tensor* bias = nullptr;
float* bias_data = nullptr;
if (op_info->HasInput("Bias")) {
auto bias_var_names = op_info->Input("Bias");
if (bias_var_names.size() > 0) {
auto bias_var_name = bias_var_names.front();
bias = scope->FindVar(bias_var_name)->GetMutable<lite::Tensor>();
bias_data = bias->mutable_data<float>();
}
}
auto input_data = input->data<float>();
auto w_data = w->mutable_data<float>();
auto out_data = out->mutable_data<float>();
auto in_mat_dims = input->dims().Flatten2D(in_num_col_dims);
int out_num_classes = w->dims()[1];
const int M = in_mat_dims[0];
const int K = in_mat_dims[1];
const int N = out_num_classes;
for (int m = 0; m < M; ++m) {
for (int n = 0; n < N; ++n) {
out_data[m * N + n] = 0;
for (int k = 0; k < K; ++k) {
out_data[m * N + n] += input_data[m * K + k] * w_data[k * N + n];
}
}
}
if (bias_data != nullptr) {
for (int m = 0; m < M; ++m) {
for (int n = 0; n < N; ++n) {
out_data[m * N + n] += bias_data[n];
}
}
}
}
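// fc_ref computes out[m][n] = sum_k input[m][k] * w[k][n] (+ bias[n]), with
// M = the rows of the input flattened at in_num_col_dims, K = in_mat_dims[1]
// and N = w->dims()[1], i.e. a plain row-major GEMM used as the reference.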
void test_fc(const std::vector<int64_t>& input_shape,
const std::vector<int64_t>& w_shape,
int in_num_col_dims,
bool has_bias) {
CHECK_EQ(w_shape.size(), 2UL);
Scope scope;
std::string input_var_name("Input");
std::string w_var_name("W");
std::string w_int_var_name("W_int");
std::string bias_var_name("Bias");
std::string out_var_name("Out");
std::string out_ref_var_name("out_ref");
auto* input = scope.Var(input_var_name)->GetMutable<Tensor>();
auto* w = scope.Var(w_var_name)->GetMutable<Tensor>();
auto* w_int = scope.Var(w_int_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
input->Resize(input_shape);
w->Resize(w_shape);
w_int->Resize(w_shape);
FillTensor<int8_t, int8_t>(w_int, -127, 127);
float w_scale = 1. / 1024;
float input_scale = 1. / 8;
Tensor input_int;
input_int.Resize(input_shape);
FillTensor<int8_t, int8_t>(&input_int, -127, 127);
for (int i = 0; i < input->data_size(); i++) {
input->mutable_data<float>()[i] = input_int.data<int8_t>()[i] * input_scale;
}
for (int i = 0; i < w->data_size(); i++) {
w->mutable_data<float>()[i] = w_int->data<int8_t>()[i] * w_scale;
}
// create fc op
cpp::OpDesc fc_op_desc;
fc_op_desc.SetType("fc");
fc_op_desc.SetInput("Input", {input_var_name});
fc_op_desc.SetInput("W", {w_var_name});
fc_op_desc.SetOutput("Out", {out_var_name});
fc_op_desc.SetAttr("in_num_col_dims", static_cast<int>(in_num_col_dims));
if (has_bias) {
auto* bias = scope.Var(bias_var_name)->GetMutable<Tensor>();
bias->Resize({w_shape[1]});
FillTensor<float, int>(bias);
fc_op_desc.SetInput("Bias", {bias_var_name});
}
auto fc_op = CreateOp<operators::FcOpLite>(fc_op_desc, &scope);
fc_ref(fc_op);
out_ref->CopyDataFrom(*out);
// create fc op for MLU
cpp::OpDesc fc_op_desc_mlu;
fc_op_desc_mlu.SetType("fc");
fc_op_desc_mlu.SetInput("Input", {input_var_name});
fc_op_desc_mlu.SetInput("W", {w_int_var_name});
fc_op_desc_mlu.SetOutput("Out", {out_var_name});
fc_op_desc_mlu.SetAttr("in_num_col_dims", static_cast<int>(in_num_col_dims));
fc_op_desc_mlu.SetAttr("weight_scale",
std::vector<float>(w_shape[1], w_scale));
fc_op_desc_mlu.SetAttr("input_scale", input_scale);
if (has_bias) {
fc_op_desc_mlu.SetInput("Bias", {bias_var_name});
}
auto fc_op_mlu = CreateOp<operators::FcOpLite>(fc_op_desc_mlu, &scope);
input->Resize({static_cast<int>(input_shape[0]),
static_cast<int>(input_shape[2]),
static_cast<int>(input_shape[3]),
static_cast<int>(input_shape[1])});
out->Resize({static_cast<int>(input_shape[0]), static_cast<int>(w_shape[1])});
LaunchOp(fc_op_mlu, {input_var_name}, {out_var_name});
// compare results
auto* out_data = out->mutable_data<float>();
auto* out_ref_data = out_ref->mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
}
}
TEST(MLUBridges, fc) {
for (bool use_bias : {true, false}) {
// test_fc({1, 8, 8, 1}, {64, 4}, 1, use_bias);
// test_fc({1, 5, 5, 1}, {25, 7}, 1, use_bias);
// test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias);
test_fc({1, 1024, 1, 1}, {1024, 32}, 1, use_bias);
}
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(fc, kMLU);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include <utility>
#include <vector>
#include "lite/kernels/mlu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
std::shared_ptr<MLUTensor> Graph::AddNode(const std::string& name,
std::vector<int64_t> shape,
cnmlTensorType_t tensor_type,
cnmlDataOrder_t data_order,
cnmlDataType_t mlu_dtype,
void* raw_ptr) {
CHECK(!HasNode(name));
auto node = std::shared_ptr<MLUTensor>(
new MLUTensor(shape, tensor_type, data_order, mlu_dtype));
node->set_mlu_ptr(raw_ptr);
nodes_.insert(std::make_pair(name, node));
return node;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cmath>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
#include "lite/kernels/mlu/bridges/tensor.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
// The context used by the converters to map the ops of a subgraph onto the
// MLU IR graph
class Graph {
public:
Graph() { CNML_CALL(cnmlCreateFusionOp(&fusion_op_)); }
~Graph() {
CNML_CALL(cnmlDestroyFusionOp(&fusion_op_));
for (auto op : ops_) {
CNML_CALL(cnmlDestroyBaseOp(&op));
}
}
// Data node
std::shared_ptr<MLUTensor> AddNode(
const std::string& name,
std::vector<int64_t> shape,
cnmlTensorType_t tensor_type = CNML_TENSOR,
cnmlDataOrder_t data_order = CNML_NCHW,
cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32,
void* raw_ptr = nullptr);
std::shared_ptr<MLUTensor> GetNode(const std::string& name) {
CHECK(HasNode(name)) << "[MLU] Node " << name << " not found.";
return nodes_.at(name);
}
bool HasNode(const std::string& name) {
return nodes_.find(name) != nodes_.end();
}
void AddInput(std::shared_ptr<MLUTensor> tensor) {
inputs_.push_back(tensor->mlu_tensor());
input_tensors_.push_back(tensor);
}
void AddOutput(std::shared_ptr<MLUTensor> tensor) {
outputs_.push_back(tensor->mlu_tensor());
output_tensors_.push_back(tensor);
}
void FuseOp(cnmlBaseOp_t op) { CNML_CALL(cnmlFuseOp(op, fusion_op_)); }
void Compile(cnmlCoreVersion_t core_version, int core_number) {
CNML_CALL(cnmlSetFusionIO(fusion_op_,
inputs_.data(),
inputs_.size(),
outputs_.data(),
outputs_.size()));
CNML_CALL(cnmlSetFusionOpCorenum(fusion_op_, core_number));
CNML_CALL(cnmlSetFusionOpCoreVersion(fusion_op_, core_version));
CNML_CALL(cnmlCompileFusionOp_V2(fusion_op_));
for (auto in : input_tensors_) {
input_addrs_.push_back(in->mlu_data());
}
for (auto out : output_tensors_) {
output_addrs_.push_back(out->mlu_data());
}
}
void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) {
CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_,
input_addrs_.data(),
input_addrs_.size(),
output_addrs_.data(),
output_addrs_.size(),
&forward_param,
que));
CNRT_CALL(cnrtSyncQueue(que));
}
void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) {
const float* data = tensor->data<float>();
size_t len = tensor->data_size();
if (fp_type_ == CNML_DATA_FLOAT32) {
CNML_CALL(cnmlBindConstData_V2(
nodes_[tensor_name]->mlu_tensor(),
const_cast<void*>(static_cast<const void*>(data)),
false));
} else if (fp_type_ == CNML_DATA_FLOAT16) {
auto* data_fp16 = tensor->mutable_data<::paddle::lite::fluid::float16>();
for (size_t i = 0; i < len; ++i) {
data_fp16[i] = static_cast<::paddle::lite::fluid::float16>(data[i]);
}
CNML_CALL(cnmlBindConstData_V2(nodes_[tensor_name]->mlu_tensor(),
static_cast<void*>(data_fp16),
false));
} else {
CHECK(0);
}
}
void SetComputingDataType(cnmlBaseOp_t op,
cnmlTensor_t tensor,
float scale,
cnmlDataType_t data_type = CNML_DATA_INT8) {
cnmlQuantizedParam_t quant_param;
CNML_CALL(
cnmlCreateQuantizedParam(&quant_param, scale2position(scale), 1, 0.0));
CNML_CALL(
cnmlSetOperationComputingDataType(op, tensor, data_type, quant_param));
CNML_CALL(cnmlDestroyQuantizedParam(&quant_param));
}
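// A rough sketch of the quantization above (assuming scale2position is the
// usual power-of-two position helper): the conv/fc bridges pass 1 / tensor
// scale, so an input_scale of 1/8 yields a value of 8 here, i.e. a fixed-point
// position of roughly log2(8) = 3 for the int8 computing data type.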
void SetFPType(::paddle::lite_api::PrecisionType type) {
switch (type) {
case ::paddle::lite_api::PrecisionType::kFP16:
fp_type_ = CNML_DATA_FLOAT16;
break;
case ::paddle::lite_api::PrecisionType::kFloat:
fp_type_ = CNML_DATA_FLOAT32;
break;
default:
CHECK(0);
}
}
cnmlDataType_t FPType() { return fp_type_; }
private:
cnmlDataType_t fp_type_{CNML_DATA_FLOAT32};
std::unordered_map<std::string, std::shared_ptr<MLUTensor>> nodes_;
std::vector<cnmlTensor_t> inputs_;
std::vector<cnmlTensor_t> outputs_;
std::vector<void*> input_addrs_;
std::vector<void*> output_addrs_;
std::vector<std::shared_ptr<MLUTensor>> input_tensors_;
std::vector<std::shared_ptr<MLUTensor>> output_tensors_;
std::vector<cnmlBaseOp_t> ops_;
cnmlFusionOp_t fusion_op_;
};
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
USE_SUBGRAPH_BRIDGE(relu, kMLU);
USE_SUBGRAPH_BRIDGE(conv2d, kMLU);
USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU);
USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU);
USE_SUBGRAPH_BRIDGE(pool2d, kMLU);
USE_SUBGRAPH_BRIDGE(softmax, kMLU);
USE_SUBGRAPH_BRIDGE(batch_norm, kMLU);
USE_SUBGRAPH_BRIDGE(fc, kMLU);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/pool_op.h"
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
inline cnmlPoolMode_t ToCnmlPoolMode(const std::string& pool_mode) {
cnmlPoolMode_t cnml_pool_mode;
if (pool_mode == "max") {
cnml_pool_mode = CNML_POOL_MAX;
} else if (pool_mode == "avg") {
cnml_pool_mode = CNML_POOL_AVG;
} else {
CHECK(false) << "Unexpected pool mode " << pool_mode;
}
return cnml_pool_mode;
}
int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
// Get input, and attributes
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindTensor(x_var_name);
auto input_dims_nhwc = x->dims();
const auto input_dims = DimNHWC2NCHW(input_dims_nhwc);
auto output_var_name = op_info->Output("Out").front();
auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
auto ceil_mode = op_info->GetAttr<bool>("ceil_mode");
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
auto global_pooling = op_info->GetAttr<bool>("global_pooling");
auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
auto strides = op_info->GetAttr<std::vector<int>>("strides");
if (paddings.size() == 2L) {
for (size_t i = 0; i < 2L; ++i) {
int copy_pad = *(paddings.begin() + 2 * i);
paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
}
}
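// Example: a 2-element paddings attribute {ph, pw} is expanded in place to the
// 4-element form {ph, ph, pw, pw} (top, bottom, left, right) expected below.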
int pad_height = paddings[0];
int pad_width = paddings[2];
std::string padding_algorithm("");
if (op_info->HasAttr("padding_algorithm")) {
padding_algorithm = op_info->GetAttr<std::string>("padding_algorithm");
}
bool adaptive = false;
if (op_info->HasAttr("adaptive")) {
adaptive = op_info->GetAttr<bool>("adaptive");
}
lite::operators::UpdatePadding(&paddings,
global_pooling,
adaptive,
padding_algorithm,
x->dims(),
strides,
ksize);
std::vector<int64_t> output_shape({input_dims[0], input_dims[1]});
for (size_t i = 0; i < 2; i++) {
output_shape.push_back(
        (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - ksize[i]) /
strides[i] +
1);
}
auto output_shape_nhwc = DimNCHW2NHWC(output_shape);
auto output_tensor = graph->AddNode(output_var_name,
output_shape_nhwc,
CNML_TENSOR,
CNML_NHWC,
graph->FPType());
scope->FindVar(output_var_name)
->GetMutable<::paddle::lite::Tensor>()
->Resize(output_shape_nhwc);
cnmlPoolOpParam_t pool_param;
CNML_CALL(
cnmlCreatePoolOpParam_V2(&pool_param,
ksize[0],
ksize[1],
strides[0],
strides[1],
pad_height,
pad_width,
1, // dilation
1,
ToCnmlPoolMode(pooling_type),
ceil_mode ? CNML_POOL_KVALID : CNML_POOL_KFULL,
true, /* real */
1 /* blend factor */));
cnmlBaseOp_t pool_op;
CNML_CALL(cnmlCreatePoolOp(&pool_op,
pool_param,
graph->GetNode(x_var_name)->mlu_tensor(),
output_tensor->mlu_tensor()));
CNML_CALL(cnmlDestroyPoolOpParam(&pool_param));
graph->FuseOp(pool_op);
return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(pool2d,
kMLU,
paddle::lite::subgraph::mlu::PoolConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/pool_op.h"
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel);
void pool_ref(const std::shared_ptr<operators::PoolOpLite> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
auto out =
scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
auto& in_dims = x->dims();
auto& out_dims = out->dims();
const float* src_ptr = x->data<const float>();
float* dst_ptr = out->mutable_data<float>();
std::vector<int> ksize = op_info->GetAttr<std::vector<int>>("ksize");
std::vector<int> strides = op_info->GetAttr<std::vector<int>>("strides");
std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
bool exclusive = op_info->GetAttr<bool>("exclusive");
std::string pooling_type = op_info->GetAttr<std::string>("pooling_type");
bool global_pooling = op_info->GetAttr<bool>("global_pooling");
int in_n = in_dims[0];
int in_c = in_dims[1];
int in_h = in_dims[2];
int in_w = in_dims[3];
int size_in_n = in_c * in_h * in_w;
int size_in_c = in_h * in_w;
int out_h = out_dims[2];
int out_w = out_dims[3];
int size_out_n = in_c * out_h * out_w;
int size_out_c = out_h * out_w;
int window_h = ksize[0];
int window_w = ksize[1];
int stride_h = strides[0];
int stride_w = strides[1];
int pad_h = paddings[0];
int pad_w = paddings[2];
if (global_pooling == true) {
for (int n = 0; n < in_n; ++n) {
for (int c = 0; c < in_c; ++c) {
const float* src = src_ptr + n * size_in_n + c * size_in_c;
float res = src[0];
if (pooling_type == "max") {
for (int i = 1; i < size_in_c; ++i) {
float cur_val = src[i];
res = cur_val > res ? cur_val : res;
}
} else if (pooling_type == "avg") {
for (int i = 1; i < size_in_c; ++i) {
float cur_val = src[i];
res += cur_val;
}
res /= size_in_c;
}
dst_ptr[n * size_out_n + c] = res;
}
}
} else {
for (int n = 0; n < in_n; ++n) {
for (int c = 0; c < in_c; ++c) {
for (int h = 0; h < out_h; ++h) {
int sh = h * stride_h;
int eh = sh + window_h;
sh = (sh - pad_h) < 0 ? 0 : sh - pad_h;
eh = (eh - pad_h) > in_h ? in_h : eh - pad_h;
for (int w = 0; w < out_w; ++w) {
int sw = w * stride_w;
int ew = sw + window_w;
sw = (sw - pad_w) < 0 ? 0 : sw - pad_w;
ew = (ew - pad_w) > in_w ? in_w : ew - pad_w;
int pooling_size = (ew - sw) * (eh - sh);
if (pooling_size == 0) continue;
float res = 0.f;
for (int kh = sh; kh < eh; ++kh) {
for (int kw = sw; kw < ew; ++kw) {
int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw;
if (kh == sh && kw == sw) {
res = src_ptr[src_idx];
} else {
if (pooling_type == "max") {
res = res >= src_ptr[src_idx] ? res : src_ptr[src_idx];
}
if (pooling_type == "avg") {
res += src_ptr[src_idx];
}
}
}
}
if (pooling_type == "avg") {
if (exclusive) {
res /= pooling_size;
} else {
res /= window_h * window_w;
}
}
dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res;
}
}
}
}
}
}
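// In pool_ref the window [sh, eh) x [sw, ew) is clipped to the input bounds
// before accumulation; for "avg" pooling the divisor is the clipped window
// size when exclusive is true, and the full window_h * window_w otherwise.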
void test_pool(int bs,
int ic,
int ih,
int iw,
std::string pooling_type,
bool ceil_mode,
bool global_pooling,
bool exclusive,
int ksize,
int stride,
int padding) {
// prepare input&output variables
Scope scope;
std::string x_var_name = "x";
std::string out_var_name = "out";
std::string out_ref_var_name = "out_ref";
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
x->Resize({bs, ic, ih, iw});
// initialize input&output data
FillTensor<float>(x);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("pool2d");
opdesc.SetInput("X", {x_var_name});
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("pooling_type", pooling_type);
opdesc.SetAttr("ksize", std::vector<int>({ksize, ksize}));
opdesc.SetAttr("global_pooling", global_pooling);
opdesc.SetAttr("exclusive", exclusive);
opdesc.SetAttr("ceil_mode", ceil_mode);
opdesc.SetAttr("strides", std::vector<int>({stride, stride}));
opdesc.SetAttr("paddings",
std::vector<int>({padding, padding, padding, padding}));
// create and convert op to MLU model, then run it on MLU
auto op = CreateOp<operators::PoolOpLite>(opdesc, &scope);
// execute reference implementation and save to output tensor
pool_ref(op);
out_ref->CopyDataFrom(*out);
Tensor input_trans;
input_trans.Resize({bs, ic, ih, iw});
transpose(x->mutable_data<float>(),
input_trans.mutable_data<float>(),
{bs, ic, ih, iw},
{0, 2, 3, 1});
auto os = out->dims();
out->Resize({static_cast<int>(os[0]),
static_cast<int>(os[2]),
static_cast<int>(os[3]),
static_cast<int>(os[1])});
x->CopyDataFrom(input_trans);
x->Resize({bs, ih, iw, ic});
LaunchOp(op, {x_var_name}, {out_var_name});
// compare results
auto* out_data = out->mutable_data<float>();
auto* out_ref_data = out_ref->mutable_data<float>();
Tensor output_trans;
output_trans.Resize(out->dims());
transpose(out_data,
output_trans.mutable_data<float>(),
{static_cast<int>(os[0]),
static_cast<int>(os[2]),
static_cast<int>(os[3]),
static_cast<int>(os[1])},
{0, 3, 1, 2});
out_data = output_trans.mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
}
}
TEST(MLUBridges, pool) {
// for (auto pooling_type : {"max", "avg"}) {
// for (auto ceil_mode : {true, false}) {
// for (auto global_pooling : {/*true, */ false}) {
// for (auto exclusive : {true /*, false*/}) {
// for (auto ksize : {2, 3}) {
// for (auto stride : {1, 2}) {
// for (auto padding : {0, 1}) {
// for (auto bs : {1, 3}) {
// for (auto ic : {1, 3}) {
// for (auto ih : {3, 7}) {
// for (auto iw : {3, 7}) {
// test_pool(bs,
// ic,
// ih,
// iw,
// pooling_type,
// ceil_mode,
// global_pooling,
// exclusive,
// ksize,
// stride,
// padding);
// }
// }
// }
// }
// }
// }
// }
// }
// }
// }
// }
for (auto pooling_type : {"max", "avg"}) {
for (auto ceil_mode : {true, false}) {
bool global_pooling = false;
bool exclusive = true;
int ksize = 2;
int stride = 1;
int padding = 0;
int bs = 6;
int ic = 6;
int ih = 6;
int iw = 6;
test_pool(bs,
ic,
ih,
iw,
pooling_type,
ceil_mode,
global_pooling,
exclusive,
ksize,
stride,
padding);
}
}
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(pool2d, kMLU);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
// Get op's attributes
auto x_var_name = op_info->Input("X").front();
auto out_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize();
// map an NCHW axis to the corresponding NHWC axis
int nchw_to_nhwc_axis_map[4] = {0, 3, 1, 2};
int axis = 1;
if (op_info->HasAttr("axis")) {
axis = op_info->GetAttr<int>("axis");
if (axis < 0) {
axis = output_dims.size() + axis;
}
}
int nhwc_axis = nchw_to_nhwc_axis_map[axis];
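// Example: softmax over the channel axis of an NCHW model (axis = 1) becomes
// axis 3 once the data is laid out as NHWC on the MLU.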
auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType());
cnmlBaseOp_t softmax_op;
CNML_CALL(cnmlCreateNdSoftmaxOp(&softmax_op,
nhwc_axis,
graph->GetNode(x_var_name)->mlu_tensor(),
output_tensor->mlu_tensor()));
graph->FuseOp(softmax_op);
return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(softmax,
kMLU,
paddle::lite::subgraph::mlu::SoftmaxConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/softmax_op.h"
#include <gtest/gtest.h>
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel);
template <typename dtype>
void softmax_ref(const std::shared_ptr<operators::SoftmaxOp> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
auto out =
scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
auto x_data = x->data<dtype>();
auto out_data = out->mutable_data<dtype>();
DDim x_dims = x->dims();
auto x_rank = x_dims.size();
int axis = op_info->GetAttr<int>("axis");
if (axis < 0) {
axis += x_rank;
}
int axis_size = x_dims[axis];
int outer_num = x_dims.Slice(0, axis).production();
int inner_num = x_dims.Slice(axis + 1, x_rank).production();
int compute_size = outer_num * inner_num;
for (int i = 0; i < compute_size; i++) {
int idx_inner = i % inner_num;
int idx_outer = (i / inner_num) * axis_size;
int start = idx_outer * inner_num + idx_inner;
int offset;
offset = start;
dtype max_data = std::numeric_limits<dtype>::lowest();
for (int j = 0; j < axis_size; j++) {
max_data = x_data[offset] > max_data ? x_data[offset] : max_data;
offset += inner_num;
}
offset = start;
dtype sum_data = (dtype)0;
for (int j = 0; j < axis_size; j++) {
out_data[offset] = exp(x_data[offset] - max_data);
sum_data += out_data[offset];
offset += inner_num;
}
offset = start;
for (int j = 0; j < axis_size; j++) {
out_data[offset] /= sum_data;
offset += inner_num;
}
}
}
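// softmax_ref runs three passes per (outer, inner) slice along the softmax
// axis: take the max for numerical stability, accumulate exp(x - max), then
// normalize by the sum; this is the reference the MLU output is checked
// against below.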
void test_softmax(const std::vector<int64_t>& input_shape, int axis) {
// prepare input&output variables
Scope scope;
std::string x_var_name = "x";
std::string out_var_name = "out";
std::string out_ref_var_name = "out_ref";
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
x->Resize(input_shape);
// initialize input&output data
FillTensor<float>(x);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("softmax");
opdesc.SetInput("X", {x_var_name});
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("axis", axis);
// create and convert op to MLU model, then run it on MLU
auto op = CreateOp<operators::SoftmaxOp>(opdesc, &scope);
// execute reference implementation and save to output tensor
softmax_ref<float>(op);
out_ref->CopyDataFrom(*out);
int bs = x->dims()[0];
int ic = x->dims()[1];
int ih = x->dims()[2];
int iw = x->dims()[3];
Tensor input_trans;
input_trans.Resize({bs, ic, ih, iw});
transpose(x->mutable_data<float>(),
input_trans.mutable_data<float>(),
{bs, ic, ih, iw},
{0, 2, 3, 1});
out->Resize({bs, ih, iw, ic});
x->CopyDataFrom(input_trans);
x->Resize({bs, ih, iw, ic});
LaunchOp(op, {x_var_name}, {out_var_name});
// compare results
auto* out_data = out->mutable_data<float>();
auto* out_ref_data = out_ref->mutable_data<float>();
Tensor output_trans;
output_trans.Resize({bs, ic, ih, iw});
transpose(out_data,
output_trans.mutable_data<float>(),
{bs, ih, iw, ic},
{0, 3, 1, 2});
out_data = output_trans.mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
}
}
TEST(MLUBridges, softmax) {
// test_softmax({1, 4}, -1);
// // Bug exists in HiAI DDK when the number of items > 16500
// test_softmax({1, 16500}, -1);
// test_softmax({1, 4}, 0);
// test_softmax({1, 4}, 1);
// test_softmax({3, 4}, -1);
// test_softmax({3, 4}, 0);
// test_softmax({3, 4}, 1);
// test_softmax({1, 4, 7}, -1);
// test_softmax({1, 4, 7}, 0);
// // Bug exists in HiAI DDK when axis is 1 and iw > 1
// // test_softmax({1, 4, 7}, 1);
// test_softmax({1, 4, 1}, 1);
// test_softmax({1, 4, 7}, 2);
// test_softmax({3, 4, 7}, -1);
// test_softmax({3, 4, 7}, 0);
// test_softmax({3, 4, 1}, 1);
// test_softmax({3, 4, 7}, 2);
test_softmax({1, 4, 7, 9}, -1);
test_softmax({1, 4, 7, 9}, 0);
test_softmax({1, 4, 7, 9}, 1);
// Bug exists in HiAI DDK when axis is 2 and iw > 1
// test_softmax({1, 4, 7, 9}, 2);
test_softmax({1, 4, 7, 1}, 2);
test_softmax({1, 4, 7, 9}, 3);
test_softmax({3, 4, 7, 9}, -1);
test_softmax({3, 4, 7, 9}, 0);
test_softmax({3, 4, 7, 9}, 1);
test_softmax({3, 4, 7, 1}, 2);
test_softmax({3, 4, 7, 9}, 3);
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(softmax, kMLU);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/tensor.h"
#include <glog/logging.h>
#include <algorithm>
#include <climits>
#include <vector>
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
MLUTensor::MLUTensor(const std::vector<int64_t>& shape,
cnmlTensorType_t tensor_type,
cnmlDataOrder_t data_order,
cnmlDataType_t mlu_dtype)
: mlu_tensor_(nullptr), tensor_type_(tensor_type), mlu_ptr_(nullptr) {
std::vector<int> int_shape;
for (auto i : shape) {
if (i <= INT_MAX) {
int_shape.push_back(i);
} else {
LOG(FATAL) << "Shape size is beyond the limitation of MLUTensor!";
}
}
remember(int_shape, tensor_type, mlu_dtype, data_order);
}
void MLUTensor::remember(const std::vector<int>& shape,
cnmlTensorType_t tensor_type,
cnmlDataType_t mlu_dtype,
cnmlDataOrder_t shape_order) {
tensor_type_ = tensor_type;
mlu_dtype_ = mlu_dtype;
int size = 4;
if (shape.size() > 4 || shape_order == CNML_ARRAY) {
size = shape.size();
}
shape_.resize(size);
if (shape.size() <= 4) {
switch (shape_order) {
case CNML_NCHW:
shape_[0] = shape.size() > 0 ? shape[0] : 1;
shape_[3] = shape.size() > 1 ? shape[1] : 1;
shape_[1] = shape.size() > 2 ? shape[2] : 1;
shape_[2] = shape.size() > 3 ? shape[3] : 1;
break;
case CNML_NCWH:
shape_[0] = shape.size() > 0 ? shape[0] : 1;
shape_[3] = shape.size() > 1 ? shape[1] : 1;
shape_[2] = shape.size() > 3 ? shape[3] : 1;
shape_[1] = shape.size() > 2 ? shape[2] : 1;
break;
case CNML_NHWC:
shape_[0] = shape.size() > 0 ? shape[0] : 1;
shape_[3] = shape.size() > 3 ? shape[3] : 1;
shape_[1] = shape.size() > 1 ? shape[1] : 1;
shape_[2] = shape.size() > 2 ? shape[2] : 1;
break;
case CNML_NHCW:
shape_[0] = shape.size() > 0 ? shape[0] : 1;
shape_[3] = shape.size() > 2 ? shape[2] : 1;
shape_[1] = shape.size() > 1 ? shape[1] : 1;
shape_[2] = shape.size() > 3 ? shape[3] : 1;
break;
case CNML_NWCH:
shape_[0] = shape.size() > 0 ? shape[0] : 1;
shape_[3] = shape.size() > 2 ? shape[2] : 1;
shape_[1] = shape.size() > 3 ? shape[3] : 1;
shape_[2] = shape.size() > 1 ? shape[1] : 1;
break;
case CNML_NWHC:
shape_[0] = shape.size() > 0 ? shape[0] : 1;
shape_[3] = shape.size() > 3 ? shape[3] : 1;
shape_[1] = shape.size() > 2 ? shape[2] : 1;
shape_[2] = shape.size() > 1 ? shape[1] : 1;
break;
case CNML_CNHW:
shape_[0] = shape.size() > 1 ? shape[1] : 1;
shape_[3] = shape.size() > 0 ? shape[0] : 1;
shape_[1] = shape.size() > 2 ? shape[2] : 1;
shape_[2] = shape.size() > 3 ? shape[3] : 1;
break;
case CNML_CNWH:
shape_[0] = shape.size() > 1 ? shape[1] : 1;
shape_[3] = shape.size() > 0 ? shape[0] : 1;
shape_[1] = shape.size() > 3 ? shape[3] : 1;
shape_[2] = shape.size() > 2 ? shape[2] : 1;
break;
case CNML_CHWN:
shape_[0] = shape.size() > 3 ? shape[3] : 1;
shape_[3] = shape.size() > 0 ? shape[0] : 1;
shape_[1] = shape.size() > 1 ? shape[1] : 1;
shape_[2] = shape.size() > 2 ? shape[2] : 1;
break;
case CNML_CHNW:
shape_[0] = shape.size() > 2 ? shape[2] : 1;
shape_[3] = shape.size() > 0 ? shape[0] : 1;
shape_[1] = shape.size() > 1 ? shape[1] : 1;
shape_[2] = shape.size() > 3 ? shape[3] : 1;
break;
case CNML_CWNH:
shape_[0] = shape.size() > 2 ? shape[2] : 1;
shape_[3] = shape.size() > 0 ? shape[0] : 1;
shape_[1] = shape.size() > 3 ? shape[3] : 1;
shape_[2] = shape.size() > 1 ? shape[1] : 1;
break;
case CNML_CWHN:
shape_[0] = shape.size() > 3 ? shape[3] : 1;
shape_[3] = shape.size() > 0 ? shape[0] : 1;
shape_[1] = shape.size() > 2 ? shape[2] : 1;
shape_[2] = shape.size() > 1 ? shape[1] : 1;
break;
case CNML_HNCW:
shape_[0] = shape.size() > 1 ? shape[1] : 1;
shape_[3] = shape.size() > 2 ? shape[2] : 1;
shape_[1] = shape.size() > 0 ? shape[0] : 1;
shape_[2] = shape.size() > 3 ? shape[3] : 1;
break;
case CNML_HNWC:
shape_[0] = shape.size() > 1 ? shape[1] : 1;
shape_[3] = shape.size() > 3 ? shape[3] : 1;
shape_[1] = shape.size() > 0 ? shape[0] : 1;
shape_[2] = shape.size() > 2 ? shape[2] : 1;
break;
case CNML_HCWN:
shape_[0] = shape.size() > 3 ? shape[3] : 1;
shape_[3] = shape.size() > 1 ? shape[1] : 1;
shape_[1] = shape.size() > 0 ? shape[0] : 1;
shape_[2] = shape.size() > 2 ? shape[2] : 1;
break;
case CNML_HCNW:
shape_[0] = shape.size() > 2 ? shape[2] : 1;
shape_[3] = shape.size() > 1 ? shape[1] : 1;
shape_[1] = shape.size() > 0 ? shape[0] : 1;
shape_[2] = shape.size() > 3 ? shape[3] : 1;
break;
case CNML_HWNC:
shape_[0] = shape.size() > 2 ? shape[2] : 1;
shape_[3] = shape.size() > 3 ? shape[3] : 1;
shape_[1] = shape.size() > 0 ? shape[0] : 1;
shape_[2] = shape.size() > 1 ? shape[1] : 1;
break;
case CNML_HWCN:
shape_[0] = shape.size() > 3 ? shape[3] : 1;
shape_[3] = shape.size() > 2 ? shape[2] : 1;
shape_[1] = shape.size() > 0 ? shape[0] : 1;
shape_[2] = shape.size() > 1 ? shape[1] : 1;
break;
case CNML_WNCH:
shape_[0] = shape.size() > 1 ? shape[1] : 1;
shape_[3] = shape.size() > 2 ? shape[2] : 1;
shape_[1] = shape.size() > 3 ? shape[3] : 1;
shape_[2] = shape.size() > 0 ? shape[0] : 1;
break;
case CNML_WNHC:
shape_[0] = shape.size() > 1 ? shape[1] : 1;
shape_[3] = shape.size() > 3 ? shape[3] : 1;
shape_[1] = shape.size() > 2 ? shape[2] : 1;
shape_[2] = shape.size() > 0 ? shape[0] : 1;
break;
case CNML_WCHN:
shape_[0] = shape.size() > 3 ? shape[3] : 1;
shape_[3] = shape.size() > 1 ? shape[1] : 1;
shape_[1] = shape.size() > 2 ? shape[2] : 1;
shape_[2] = shape.size() > 0 ? shape[0] : 1;
break;
case CNML_WCNH:
shape_[0] = shape.size() > 2 ? shape[2] : 1;
shape_[3] = shape.size() > 1 ? shape[1] : 1;
shape_[1] = shape.size() > 3 ? shape[3] : 1;
shape_[2] = shape.size() > 0 ? shape[0] : 1;
break;
case CNML_WHNC:
shape_[0] = shape.size() > 2 ? shape[2] : 1;
shape_[3] = shape.size() > 3 ? shape[3] : 1;
shape_[1] = shape.size() > 1 ? shape[1] : 1;
shape_[2] = shape.size() > 0 ? shape[0] : 1;
break;
case CNML_WHCN:
shape_[0] = shape.size() > 3 ? shape[3] : 1;
shape_[3] = shape.size() > 2 ? shape[2] : 1;
shape_[1] = shape.size() > 1 ? shape[1] : 1;
shape_[2] = shape.size() > 0 ? shape[0] : 1;
break;
case CNML_ARRAY:
shape_ = shape;
break;
default:
LOG(FATAL) << "Unsupported mluDataOrder! " << int(shape_order);
break;
}
} else {
switch (shape_order) {
case CNML_NCDHW:
shape_[0] = shape[0];
shape_[4] = shape[1];
shape_[1] = shape[2];
shape_[2] = shape[3];
shape_[3] = shape[4];
break;
case CNML_NDHWC:
shape_[0] = shape[0];
shape_[4] = shape[4];
shape_[1] = shape[1];
shape_[2] = shape[2];
shape_[3] = shape[3];
break;
case CNML_DHWCN:
shape_[0] = shape[4];
shape_[4] = shape[3];
shape_[1] = shape[0];
shape_[2] = shape[1];
shape_[3] = shape[2];
break;
case CNML_ARRAY:
shape_ = shape;
break;
default:
shape_[0] = shape[0];
shape_[4] = shape[1];
shape_[1] = shape[2];
shape_[2] = shape[3];
shape_[3] = shape[4];
break;
}
}
dim_ = shape_.size();
}
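// Worked example (added for illustration; not part of the original sources):
// remember() always normalizes the stored dims to NHWC order. For instance,
// with shape_order == CNML_NCHW and shape == {1, 3, 224, 224} the result is
//   shape_ == {1, 224, 224, 3}  // {N, H, W, C}
// Missing trailing axes of shapes shorter than four dims are padded with 1.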
void MLUTensor::Create() {
if (mlu_tensor_ == nullptr) {
CNML_CALL(cnmlCreateTensor_V2(&mlu_tensor_, tensor_type_));
std::vector<int> dim_shape(shape_);
int* dim_strides = nullptr;
CNML_CALL(cnmlSetTensorShape_V2(
mlu_tensor_, dim_, dim_shape.data(), dim_strides));
CNML_CALL(cnmlSetTensorDataType(mlu_tensor_, mlu_dtype_));
}
}
cnmlTensor_t MLUTensor::mlu_tensor() {
Create();
return mlu_tensor_;
}
MLUTensor::~MLUTensor() {
if (mlu_tensor_ != nullptr) {
CNML_CALL(cnmlDestroyTensor(&mlu_tensor_));
mlu_tensor_ = nullptr;
}
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "lite/kernels/mlu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
class MLUTensor {
public:
MLUTensor()
: mlu_tensor_(nullptr),
tensor_type_(CNML_TENSOR),
mlu_dtype_(CNML_DATA_FLOAT32) {}
void set_mlu_ptr(void* mlu_data) { mlu_ptr_ = mlu_data; }
MLUTensor(const std::vector<int64_t>& shape,
cnmlTensorType_t tensor_type = CNML_TENSOR,
cnmlDataOrder_t data_order = CNML_NCHW,
cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32);
void remember(const std::vector<int>& shape,
cnmlTensorType_t tensor_type,
cnmlDataType_t mlu_dtype,
cnmlDataOrder_t shape_order);
void Create();
cnmlTensor_t mlu_tensor();
void* mlu_data() {
CHECK(mlu_ptr_ != nullptr);
return mlu_ptr_;
}
~MLUTensor();
private:
cnmlTensor_t mlu_tensor_;
std::vector<int> shape_;
cnmlTensorType_t tensor_type_;
cnmlDataType_t mlu_dtype_;
int dim_{0};
cnmlDataOrder_t data_order_;
void* mlu_ptr_;
};
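// Usage sketch (illustrative only; the local variable names below are
// assumptions, not code from this patch): declare a 4-D float tensor, bind a
// device buffer, and fetch the lazily created CNML handle.
//   MLUTensor t({1, 3, 224, 224}, CNML_TENSOR, CNML_NCHW, CNML_DATA_FLOAT32);
//   void* dev_ptr = ...;  // e.g. from Tensor::mutable_data<float>(TARGET(kMLU))
//   t.set_mlu_ptr(dev_ptr);
//   cnmlTensor_t handle = t.mlu_tensor();  // Create() runs on first access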
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/test_helper.h"
#include <utility>
#include "lite/core/device_info.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/mlu/subgraph_compute.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
void LaunchOp(const std::shared_ptr<lite::OpLite> op,
const std::vector<std::string>& input_var_names,
const std::vector<std::string>& output_var_names) {
CNRT_CALL(cnrtInit(0));
SetMluDevice(0);
cnrtQueue_t queue_;
cnrtInvokeFuncParam_t forward_param;
u32_t affinity = 1;
int data_param = 1;
forward_param.data_parallelism = &data_param;
forward_param.affinity = &affinity;
forward_param.end = CNRT_PARAM_END;
CNRT_CALL(cnrtCreateQueue(&queue_));
cnrtDev_t dev_handle;
CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, 0));
CNRT_CALL(cnrtSetCurrentDevice(dev_handle));
auto scope = op->scope();
auto op_type = op->op_info()->Type();
paddle::lite::subgraph::mlu::Graph graph;
// convert op to IR graph
const auto& bridges = subgraph::Registry::Instance();
CHECK(bridges.Exists(op_type, TARGET(kMLU)));
// Convert all input data vars and add them into the MLU IR graph
for (auto& input_name : input_var_names) {
auto input_tensor = scope->FindMutableTensor(input_name);
CHECK(input_tensor);
Tensor temp_input;
temp_input.Resize(input_tensor->dims().Vectorize());
temp_input.CopyDataFrom(*input_tensor);
auto input_node =
graph.AddNode(input_name,
input_tensor->dims().Vectorize(),
CNML_TENSOR,
CNML_NHWC,
graph.FPType(),
reinterpret_cast<void*>(
input_tensor->mutable_data<float>(TARGET(kMLU))));
CHECK(input_node);
CNRT_CHECK(cnrtMemcpy(input_tensor->mutable_data<float>(),
temp_input.mutable_data<float>(),
sizeof(float) * input_tensor->dims().production(),
CNRT_MEM_TRANS_DIR_HOST2DEV));
}
bridges.Select(op_type, TARGET(kMLU))(
reinterpret_cast<void*>(&graph), const_cast<OpLite*>(op.get()), nullptr);
for (auto& output_name : output_var_names) {
if (graph.HasNode(output_name)) {
graph.AddOutput(graph.GetNode(output_name));
}
auto output_tensor = scope->FindMutableTensor(output_name);
void* p_data =
static_cast<void*>(output_tensor->mutable_data<float>(TARGET(kMLU)));
auto node = graph.GetNode(output_name);
CHECK(p_data);
node->set_mlu_ptr(p_data);
}
for (auto& input_name : input_var_names) {
graph.AddInput(graph.GetNode(input_name));
}
graph.Compile(CNML_MLU270, 1);
graph.Compute(forward_param, queue_);
for (auto& output_name : output_var_names) {
auto output_tensor = scope->FindMutableTensor(output_name);
Tensor temp_out;
temp_out.Resize(output_tensor->dims().Vectorize());
CNRT_CHECK(cnrtMemcpy(temp_out.mutable_data<float>(TARGET(kHost)),
output_tensor->mutable_data<float>(),
sizeof(float) * output_tensor->dims().production(),
CNRT_MEM_TRANS_DIR_DEV2HOST));
output_tensor->mutable_data<float>(TARGET(kHost));
output_tensor->CopyDataFrom(temp_out);
}
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// USE_LITE_OP(graph_op);
// USE_LITE_KERNEL(graph_op, kMLU, kFloat, kNHWC, def);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <random>
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/kernels/mlu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
template <typename T>
std::shared_ptr<T> CreateOp(const cpp::OpDesc& opdesc, lite::Scope* scope) {
auto op = std::make_shared<T>(opdesc.Type());
op->SetValidPlaces(
{Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)},
Place{TARGET(kMLU), PRECISION(kFloat), DATALAYOUT(kNHWC)}});
CHECK(op->Attach(opdesc, scope));
CHECK(op->CheckShape());
CHECK(op->InferShape());
return op;
}
// T is the target data type
// R is the range data type, e.g. int, half
template <typename T, typename R = float>
void FillTensor(Tensor* x,
T lower = static_cast<T>(-2),
T upper = static_cast<T>(2)) {
static unsigned int seed = 100;
std::mt19937 rng(seed++);
std::uniform_real_distribution<double> uniform_dist(0, 1);
T* x_data = x->mutable_data<T>();
for (int i = 0; i < x->dims().production(); ++i) {
auto r = uniform_dist(rng) * (upper - lower) + lower;
x_data[i] = static_cast<T>(static_cast<R>(r));
}
}
void LaunchOp(const std::shared_ptr<lite::OpLite> op,
const std::vector<std::string>& input_var_names,
const std::vector<std::string>& output_var_names);
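// Typical bridge-test flow (a sketch for illustration; the variable names and
// op desc fields below follow the usual bridge-test pattern and are not taken
// verbatim from this patch):
//   Scope scope;
//   auto* x = scope.Var("x")->GetMutable<Tensor>();
//   scope.Var("out")->GetMutable<Tensor>();
//   x->Resize({1, 4, 7, 9});
//   FillTensor<float>(x, -1.f, 1.f);
//   cpp::OpDesc opdesc;
//   opdesc.SetType("softmax");
//   opdesc.SetInput("X", {"x"});
//   opdesc.SetOutput("Out", {"out"});
//   auto op = CreateOp<operators::SoftmaxOp>(opdesc, &scope);
//   LaunchOp(op, {"x"}, {"out"});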
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/utility.h"
#include <algorithm>
#include <cmath>
#include <utility>
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
void transpose(float* input_data,
float* output_data,
std::vector<int> input_shape,
std::vector<int> axis) {
int old_index = -1;
int new_index = -1;
int dim[4] = {0};
std::vector<int> shape = input_shape;
for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) {
for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) {
for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) {
for (dim[3] = 0; dim[3] < input_shape[3]; dim[3]++) {
old_index = dim[0] * shape[1] * shape[2] * shape[3] +
dim[1] * shape[2] * shape[3] + dim[2] * shape[3] + dim[3];
new_index =
dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] +
dim[axis[1]] * shape[axis[2]] * shape[axis[3]] +
dim[axis[2]] * shape[axis[3]] + dim[axis[3]];
output_data[new_index] = input_data[old_index];
}
}
}
}
}
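// Worked example (illustration only): with input_shape == {N, C, H, W} and
// axis == {0, 2, 3, 1}, the loop above maps
//   old_index = ((n * C + c) * H + h) * W + w
// to
//   new_index = ((n * H + h) * W + w) * C + c
// i.e. an NCHW buffer is rewritten into NHWC layout.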
int scale2position(float scale) { return static_cast<int>(-std::log2(scale)); }
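// Worked example (illustration only): scale2position(0.125f) returns 3 because
// -log2(0.125) == 3; for scales that are not exact powers of two the cast
// truncates toward zero.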
void dequant(float* dst, int8_t* src, size_t size, float scale) {
for (size_t i = 0; i < size; ++i) {
dst[i] = static_cast<float>(src[i]) * scale;
}
}
void dequant(float* dst,
int8_t* src,
size_t size_o,
size_t size,
size_t size_in,
std::vector<float> scales) {
for (int out = 0; out < size_o; ++out) {
for (int s = 0; s < size; ++s) {
auto scale = scales[s];
for (int in = 0; in < size_in; ++in) {
int idx = in + s * size_in + out * size_in * size;
dst[idx] = static_cast<float>(src[idx]) * scale;
}
}
}
}
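// Worked example (illustration only): the per-channel overload above treats
// src as a [size_o, size, size_in] buffer and applies scales[s] to every
// element of channel s; e.g. src[idx] == 64 with scales[s] == 0.01f yields
// dst[idx] == 0.64f.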
cnmlActiveFunction_t OpTypeToCNMLActType(std::string op_type) {
if (op_type == "relu") {
return CNML_ACTIVE_RELU;
} else if (op_type == "sigmoid") {
return CNML_ACTIVE_SIGMOID;
} else if (op_type == "tanh") {
return CNML_ACTIVE_TANH;
} else if (op_type == "relu1") {
return CNML_ACTIVE_RELU1;
} else if (op_type == "relu6") {
return CNML_ACTIVE_RELU6;
} else if (op_type == "hard_sigmoid") {
return CNML_ACTIVE_HARD_SIGMOID;
}
LOG(FATAL) << "CNML Unspoorted op type " << op_type;
return CNML_ACTIVE_NONE;
}
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname) {
auto iarg_names = op_info->input_argnames();
if (std::find(iarg_names.begin(), iarg_names.end(), argname) !=
iarg_names.end()) {
auto inputs = op_info->Input(argname);
if (inputs.empty()) {
return false;
}
auto var_name = inputs.front();
auto var = scope->FindVar(var_name);
return var != nullptr;
} else {
return false;
}
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cnml.h>
#include <cnrt.h>
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/mlu/mlu_utils.h"
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
#include "lite/fluid/data_type.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
void transpose(float* input_data,
float* output_data,
std::vector<int> input_shape,
std::vector<int> axis);
int scale2position(float scale);
void dequant(float* dst, int8_t* src, size_t size, float scale);
void dequant(float* dst,
int8_t* src,
size_t size_o,
size_t size,
size_t size_in,
std::vector<float> scales);
template <typename T>
std::vector<T> recip(std::vector<T> x);
// Type/tensor converters for converting Paddle type/tensor to MLU type/tensor
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname);
cnmlActiveFunction_t OpTypeToCNMLActType(std::string op_type);
inline const ::paddle::lite::DDimLite DimNHWC2NCHW(
const ::paddle::lite::DDimLite& dim) {
return ::paddle::lite::DDimLite(
std::vector<int64_t>({dim[0], dim[3], dim[1], dim[2]}));
}
inline const ::paddle::lite::DDimLite DimNCHW2NHWC(
const ::paddle::lite::DDimLite& dim) {
return ::paddle::lite::DDimLite(
std::vector<int64_t>({dim[0], dim[2], dim[3], dim[1]}));
}
inline const std::vector<int64_t> DimNHWC2NCHW(
const std::vector<int64_t>& dim) {
return std::vector<int64_t>({dim[0], dim[3], dim[1], dim[2]});
}
inline const std::vector<int64_t> DimNCHW2NHWC(
const std::vector<int64_t>& dim) {
return std::vector<int64_t>({dim[0], dim[2], dim[3], dim[1]});
}
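// Worked example (illustration only): both helpers assume 4-D dims and only
// permute the axis order, e.g.
//   DimNHWC2NCHW({1, 224, 224, 3}) -> {1, 3, 224, 224}
//   DimNCHW2NHWC({1, 3, 224, 224}) -> {1, 224, 224, 3}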
template <paddle::lite_api::PrecisionType>
struct FPTypeTraits {};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kFloat> {
typedef float T;
};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kFP16> {
typedef ::paddle::lite::fluid::float16 T;
};
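// Usage sketch (illustration only): FPTypeTraits maps a compile-time
// PrecisionType onto the corresponding host element type, e.g.
//   FPTypeTraits<PRECISION(kFloat)>::T v = 0.f;  // v is a float
//   FPTypeTraits<PRECISION(kFP16)>::T h{};       // h is a fluid::float16
// The MLU subgraph engine uses this trait to pick the right mutable_data<T>()
// instantiation when binding output buffers.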
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/calib_compute.h"
#include <vector>
#include "lite/backends/arm/math/type_trans.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace mlu {
void CalibComputeFp32ToInt8::Run() {
// auto& param = this->Param<operators::CalibParam>();
// std::vector<float> scale = {param.scale};
// const auto* din = param.input->data<float>();
// auto* dout = param.output->mutable_data<signed char>();
// lite::arm::math::fp32_to_int8(
// din, dout, scale.data(), 1, 1, param.input->numel());
// return;
}
void CalibComputeInt8ToFp32::Run() {
// auto& param = this->Param<operators::CalibParam>();
// const auto* din = param.input->data<signed char>();
// std::vector<float> scale = {param.scale};
// auto* dout = param.output->mutable_data<float>();
// lite::arm::math::int8_to_fp32(
// din, dout, scale.data(), 1, 1, param.input->numel());
// return;
}
} // namespace mlu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(calib,
kMLU,
kInt8,
kNCHW,
paddle::lite::kernels::mlu::CalibComputeFp32ToInt8,
fp32_to_int8)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kFloat))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kInt8))})
.Finalize();
REGISTER_LITE_KERNEL(calib,
kMLU,
kInt8,
kNCHW,
paddle::lite::kernels::mlu::CalibComputeInt8ToFp32,
int8_to_fp32)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kInt8))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kFloat))})
.Finalize();
REGISTER_LITE_KERNEL(calib_once,
kMLU,
kInt8,
kNCHW,
paddle::lite::kernels::mlu::CalibComputeFp32ToInt8,
fp32_to_int8)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kFloat))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kInt8))})
.Finalize();
REGISTER_LITE_KERNEL(calib_once,
kMLU,
kInt8,
kNCHW,
paddle::lite::kernels::mlu::CalibComputeInt8ToFp32,
int8_to_fp32)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kInt8))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU), PRECISION(kFloat))})
.Finalize();
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/operators/calib_op.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace mlu {
class CalibComputeFp32ToInt8
: public KernelLite<TARGET(kMLU), PRECISION(kInt8)> {
public:
using param_t = operators::CalibParam;
void Run() override;
~CalibComputeFp32ToInt8() override {}
private:
};
class CalibComputeInt8ToFp32
: public KernelLite<TARGET(kMLU), PRECISION(kInt8)> {
public:
using param_t = operators::CalibParam;
void Run() override;
~CalibComputeInt8ToFp32() override {}
private:
};
} // namespace mlu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 Cambricon Authors. All Rights Reserved.
#include <Eigen/Core>
#include "lite/backends/mlu/target_wrapper.h"
#include "lite/core/kernel.h"
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace mlu {
using TargetW = TargetWrapper<TARGET(kMLU)>;
// Host to MLU memory.
void CopyFromHostSync(void* target, const void* source, size_t size) {
TargetW::MemcpySync(target, source, size, IoDirection::HtoD);
}
// MLU to Host memory.
void CopyToHostSync(void* target, const void* source, size_t size) {
TargetW::MemcpySync(target, source, size, IoDirection::DtoH);
}
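// Usage sketch (illustration only; the tensor names are placeholders): the two
// helpers above are thin wrappers over TargetW::MemcpySync, e.g. staging a
// host tensor on the device before a kernel launch:
//   auto bytes = host_tensor.memory_size();
//   void* dev = device_tensor.mutable_data(TARGET(kMLU), bytes);
//   CopyFromHostSync(dev, host_tensor.raw_data(), bytes);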
/*
* This kernel copies a tensor from host to MLU space.
*/
template <PrecisionType Precision>
class IoCopyHostToMluCompute
: public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
public:
using handler_t = KernelBase::type_infer_handler_t;
using param_t = operators::IoCopyParam;
void Run() override {
auto& param = this->template Param<param_t>();
CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kX86));
auto mem_size = param.x->memory_size();
// LOG(INFO) << "copy size " << mem_size;
auto* data = param.y->mutable_data(TARGET(kMLU), mem_size);
CopyFromHostSync(data, param.x->raw_data(), mem_size);
}
std::unique_ptr<handler_t> GetTypeInferHandler() override {
std::unique_ptr<handler_t> res(new handler_t);
*res = [](const std::map<std::string, const Type*>& inputs,
const std::string& out) -> const Type* {
CHECK(!inputs.empty());
auto* type = inputs.at("Input");
CHECK(type->target() == TARGET(kHost));
auto out_place = type->place();
out_place.target = TARGET(kMLU);
auto* out_type = Type::Get(type->id(),
out_place.target,
out_place.precision,
out_place.layout,
out_place.device);
return out_type;
};
return res;
}
std::string doc() const override { return "Copy IO from HOST to MLU"; }
};
/*
* This kernel copies a tensor from MLU to host space.
*/
template <PrecisionType Precision>
class IoCopyMluToHostCompute
: public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
public:
void Run() override {
auto& param = this->template Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kMLU));
auto mem_size = param.x->memory_size();
auto* data = param.y->mutable_data(TARGET(kHost), mem_size);
CopyToHostSync(data, param.x->raw_data(), mem_size);
}
std::string doc() const override { return "Copy IO from MLU to HOST"; }
};
} // namespace mlu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
io_copy,
kMLU,
kFloat,
kNHWC,
paddle::lite::kernels::mlu::IoCopyHostToMluCompute<PRECISION(kFloat)>,
host_to_device_kFloat)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
.Finalize();
REGISTER_LITE_KERNEL(
io_copy,
kMLU,
kFP16,
kNHWC,
paddle::lite::kernels::mlu::IoCopyHostToMluCompute<PRECISION(kFP16)>,
host_to_device_kFP16)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
.Finalize();
REGISTER_LITE_KERNEL(
io_copy,
kMLU,
kFloat,
kNHWC,
paddle::lite::kernels::mlu::IoCopyMluToHostCompute<PRECISION(kFloat)>,
device_to_host_kFloat)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
REGISTER_LITE_KERNEL(
io_copy,
kMLU,
kFP16,
kNHWC,
paddle::lite::kernels::mlu::IoCopyMluToHostCompute<PRECISION(kFP16)>,
device_to_host_kFP16)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
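// Note (added for clarity): each REGISTER_LITE_KERNEL entry above keys the
// kernel by (op_type, target, precision, layout, alias), so the runtime picks
// e.g. host_to_device_kFloat when io_copy is requested as kMLU/kFloat/kNHWC.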
// kMLU,
// kFloat,
// kNHWC,
// paddle::lite::kernels::mlu::IoCopyHostToMluCompute,
// host_to_device)
// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
// .Finalize();
//
//
// kMLU,
// kFloat,
// kNHWC,
// paddle::lite::kernels::mlu::IoCopyMluToHostCompute,
// device_to_host)
// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
// .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/subgraph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <utility>
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/kernels/mlu/bridges/paddle_use_bridges.h"
#include "lite/kernels/mlu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace mlu {} // namespace mlu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
subgraph,
kMLU,
kFloat,
kNHWC,
paddle::lite::kernels::mlu::SubgraphCompute<PRECISION(kFloat)>,
def_kFloat)
.BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))})
.Finalize();
REGISTER_LITE_KERNEL(
subgraph,
kMLU,
kFP16,
kNHWC,
paddle::lite::kernels::mlu::SubgraphCompute<PRECISION(kFP16)>,
def_FP16)
.BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/core/types.h"
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/npu/bridges/engine.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace mlu {
template <PrecisionType Precision>
class SubgraphEngine : public subgraph::Engine {
public:
SubgraphEngine(KernelContext* ctx,
int block_idx,
cpp::BlockDesc* block_desc,
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
Scope* scope,
::paddle::lite_api::PrecisionType type)
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {
graph_.SetFPType(type);
}
protected:
int BuildDeviceProgram() override {
int status = 0;
// Convert all input data vars and add them into the MLU IR graph
for (auto& input_name : input_names_) {
auto input_tensor = scope_->FindMutableTensor(input_name);
CHECK(input_tensor);
auto input_node =
graph_.AddNode(input_name,
input_tensor->dims().Vectorize(),
CNML_TENSOR,
CNML_NHWC,
graph_.FPType(),
const_cast<void*>(input_tensor->raw_data()));
CHECK(input_node);
// MLU doesn't support dynamic dimensions/shapes, so the program needs to
// be rebuilt whenever the shape of any input tensor changes.
status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
}
LOG(INFO) << "START TO CONVERT ";
// Convert all ops and their weights and add them into the MLU IR graph
const auto& bridges = subgraph::Registry::Instance();
for (auto& inst : origin_program_) {
auto op = inst.op();
CHECK(op);
op->CheckShape();
op->InferShape();
std::string op_type = op->op_info()->Type();
if (!bridges.Exists(op_type, TARGET(kMLU))) {
LOG(INFO) << "MLU bridges doesn't support op_type: " << op_type;
return subgraph::FAILED;
}
auto kernel = inst.kernel();
status |= bridges.Select(op_type, TARGET(kMLU))(
reinterpret_cast<void*>(&graph_),
const_cast<OpLite*>(op),
const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) {
return subgraph::FAILED;
}
}
// Obtain the output nodes of the MLU IR graph and build the graph to MLU
// runtime
std::vector<std::string> valid_output_names;
for (auto& output_name : output_names_) {
if (graph_.HasNode(output_name)) {
graph_.AddOutput(graph_.GetNode(output_name));
auto output_tensor = scope_->FindMutableTensor(output_name);
void* p_data = static_cast<void*>(
output_tensor->mutable_data<typename ::paddle::lite::subgraph::mlu::
FPTypeTraits<Precision>::T>(
TARGET(kMLU)));
auto node = graph_.GetNode(output_name);
CHECK(p_data);
node->set_mlu_ptr(p_data);
valid_output_names.push_back(output_name);
}
}
for (auto& input_name : input_names_) {
graph_.AddInput(graph_.GetNode(input_name));
}
CHECK(!valid_output_names.empty()) << "[MLU] no valid output names";
// auto& mlu_context = this->ctx_->template As<MLUContext>();
// auto core_version = mlu_context.MLUCoreVersion();
// auto core_number = mlu_context.MLUCoreNumber();
// graph_.Compile(core_version, core_number);
return status;
}
int LaunchDeviceProgram() override {
// auto& mlu_context = this->ctx_->template As<MLUContext>();
// auto exec_queue = mlu_context.exec_queue();
// u32_t affinity = mlu_context.affinity();
// cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
// int data_param = 1;
// forward_param.data_parallelism = &data_param;
// forward_param.affinity = &affinity;
// forward_param.end = CNRT_PARAM_END;
// graph_.Compute(forward_param, exec_queue);
return 0;
}
paddle::lite::subgraph::mlu::Graph graph_;
};
template <PrecisionType Precision>
class SubgraphCompute
: public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
public:
using param_t = operators::SubgraphParam;
void PrepareForRun() override {
auto& param = this->template Param<param_t>();
// LOG(INFO) << "SUBGRAP Prepare RUN index " << param.sub_block_idx;
engine_.reset(new SubgraphEngine<Precision>(this->ctx_.get(),
param.sub_block_idx,
param.sub_block_desc,
param.input_data_names,
param.output_data_names,
param.scope,
this->precision()));
CHECK(engine_);
engine_->Build();
}
void Run() override {
CHECK(engine_);
engine_->Launch();
}
virtual ~SubgraphCompute() = default;
private:
std::unique_ptr<SubgraphEngine<Precision>> engine_;
};
} // namespace mlu
} // namespace kernels
} // namespace lite
} // namespace paddle
#!/bin/bash
set -ex
# global variables with default value
NEUWARE_HOME="${NEUWARE_HOME}" # MLU Neuware SDK root
TARGET_NAME="all" # default target
BUILD_EXTRA=OFF # ON(with sequence ops)/OFF
WITH_TESTING=OFF # ON/OFF
function print_usage {
echo -e "\nUSAGE:"
echo
echo "----------------------------------------"
echo -e "--mlu_sdk_root=<mlu sdk directory>"
echo -e "--target_name=<target name>"
echo "----------------------------------------"
echo
}
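# Example invocations (illustrative only; the script path is a placeholder):
#   ./lite/tools/build_mlu.sh --neuware_home=/usr/local/neuware build
#   ./lite/tools/build_mlu.sh --neuware_home=/usr/local/neuware --target_name=all build
#   ./lite/tools/build_mlu.sh full_publish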
# readonly variables with default value
readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \
-DWITH_PYTHON=OFF \
-DLITE_WITH_ARM=OFF"
readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-1}
readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
readonly workspace=$(pwd)
function prepare_thirdparty {
if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then
rm -rf $workspace/third-party
if [ ! -f $workspace/third-party-05b862.tar.gz ]; then
wget $THIRDPARTY_TAR
fi
tar xzf third-party-05b862.tar.gz
else
# git submodule update --init --recursive
echo "third-party is in ready"
fi
}
# For code gen, a source file is generated after a test, but it is depended on by some targets in cmake.
# Here we fake an empty file to make cmake work.
function prepare_workspace {
# in build directory
# 1. Prepare gen_code file
GEN_CODE_PATH_PREFIX=lite/gen_code
mkdir -p ./${GEN_CODE_PATH_PREFIX}
touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
# 2.Prepare debug tool
DEBUG_TOOL_PATH_PREFIX=lite/tools/debug
mkdir -p ./${DEBUG_TOOL_PATH_PREFIX}
# cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/
# clone submodule
# git submodule update --init --recursive
prepare_thirdparty
}
function build_mlu {
build_dir=${workspace}/build.lite.mlu
mkdir -p $build_dir
cd $build_dir
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib"
prepare_workspace
cmake .. \
${CMAKE_COMMON_OPTIONS} \
-DWITH_GPU=OFF \
-DWITH_MKLDNN=OFF \
-DLITE_WITH_X86=ON \
-DWITH_MKL=ON \
-DLITE_WITH_MLU=ON \
-DLITE_BUILD_EXTRA=${BUILD_EXTRA} \
-DWITH_TESTING=${WITH_TESTING} \
-DNEUWARE_HOME=${NEUWARE_HOME}
make $TARGET_NAME -j$NUM_CORES_FOR_COMPILE
cd -
echo "Done"
}
function main {
# Parse command line.
for i in "$@"; do
case $i in
--target_name=*)
TARGET_NAME="${i#*=}"
shift
;;
--build_extra=*)
BUILD_EXTRA="${i#*=}"
shift
;;
--neuware_home=*)
NEUWARE_HOME="${i#*=}"
shift
;;
build)
build_mlu
shift
;;
full_publish)
TARGET_NAME=publish_inference
build_mlu
shift
;;
*)
# unknown option
print_usage
exit 1
;;
esac
done
}
main $@