From 80d35725ad2ae5b139dadcbd5ce8f1fa12912a17 Mon Sep 17 00:00:00 2001 From: Zhaolong Xing Date: Mon, 14 Oct 2019 18:24:09 +0800 Subject: [PATCH] align yolov3 cuda int8 (#2183) test=develop --- lite/backends/cuda/math/cudnn_conv.cc | 20 ++++++------ lite/core/mir/type_layout_cast_pass.cc | 40 +++++++++++------------ lite/core/mir/type_precision_cast_pass.cc | 10 +++--- lite/core/mir/type_target_cast_pass.cc | 10 +++--- lite/kernels/cuda/calib_compute.cu | 9 ++--- 5 files changed, 42 insertions(+), 47 deletions(-) diff --git a/lite/backends/cuda/math/cudnn_conv.cc b/lite/backends/cuda/math/cudnn_conv.cc index 1c4cbc74b0..72ed3951f6 100644 --- a/lite/backends/cuda/math/cudnn_conv.cc +++ b/lite/backends/cuda/math/cudnn_conv.cc @@ -537,16 +537,16 @@ bool CudnnConv2DInt8::run(const operators::ConvParam& param) { static_cast(scale), this->stream_); } else { - bias_int8_nhwc(num, - static_cast(temp_out), - static_cast(b_data), - static_cast(temp_out), - n, - c, - h, - w, - static_cast(scale), - this->stream_); + bias_int8_nhwc(num, + static_cast(temp_out), + static_cast(b_data), + static_cast(temp_out), + n, + c, + h, + w, + static_cast(scale), + this->stream_); } return true; } diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc index 346ae35687..81046fb186 100644 --- a/lite/core/mir/type_layout_cast_pass.cc +++ b/lite/core/mir/type_layout_cast_pass.cc @@ -30,17 +30,17 @@ void TypeLayoutTransformPass::Apply(const std::unique_ptr& graph) { // Start from inputs of the graph, those should have place set. VLOG(4) << "\n" << Visualize(graph.get()); std::list nodes; - for (auto& node : graph->mutable_nodes()) { - nodes.push_back(&node); + for (auto& node : graph->StmtTopologicalOrder()) { + nodes.push_back(node); } - LOG(INFO) << "nodes.size():" << nodes.size(); + VLOG(4) << "nodes.size():" << nodes.size(); for (auto& node : nodes) { - LOG(INFO) << "!node->IsStmt():" << !node->IsStmt(); + VLOG(4) << "!node->IsStmt():" << !node->IsStmt(); if (!node->IsStmt()) continue; auto inlinks = node->inlinks; - LOG(INFO) << "node->AsStmt().desc:" << node->AsStmt().desc - << " inlinks.size():" << inlinks.size(); + VLOG(4) << "node->AsStmt().desc:" << node->AsStmt().desc + << " inlinks.size():" << inlinks.size(); for (auto* in : inlinks) { ComplementInputs(graph.get(), node, in); } @@ -58,7 +58,7 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph, CHECK(inst_node->IsStmt()); auto& inst = inst_node->AsStmt(); - LOG(INFO) << "found Target tensor: " << in->AsArg().name; + VLOG(4) << "found Target tensor: " << in->AsArg().name; CHECK(in->IsRoleSet()); CHECK(in->IsArg()); auto in_arg_name = in->AsArg().name; @@ -66,15 +66,15 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph, CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp)); auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp); CHECK(in->AsArg().type); - LOG(INFO) << "\n tmp:" << tmp << "\n in->AsArg().name:" << in->AsArg().name - << "\n *in->AsArg().type:" << *in->AsArg().type - << "\n *decl_arg_type:" << *decl_arg_type - << "\n inst.op()->DebugString():" << inst.op()->DebugString(); + VLOG(4) << "\n tmp:" << tmp << "\n in->AsArg().name:" << in->AsArg().name + << "\n *in->AsArg().type:" << *in->AsArg().type + << "\n *decl_arg_type:" << *decl_arg_type + << "\n inst.op()->DebugString():" << inst.op()->DebugString(); if (!DataLayoutCompatible(*in->AsArg().type, *decl_arg_type)) { - LOG(INFO) << "found Layout unmatched tensor: " << in->AsArg().name - << " for kernel " << inst.op()->DebugString() << " " - << *in->AsArg().type << " -> " << *decl_arg_type; + VLOG(4) << "found Layout unmatched tensor: " << in->AsArg().name + << " for kernel " << inst.op()->DebugString() << " " + << *in->AsArg().type << " -> " << *decl_arg_type; AddLayoutInst(*in->AsArg().type, *decl_arg_type, in, @@ -94,9 +94,9 @@ void TypeLayoutTransformPass::AddLayoutInst( CHECK(!valid_places.empty()) << "valid_place should be set"; CHECK(in->IsArg()); - auto node_id = [&] { return graph->nodes().size(); }; + // auto node_id = [&] { return graph->nodes().size(); }; auto layout_output_name = - string_format("%s/layout_trans/%d", in->AsArg().name.c_str(), node_id()); + string_format("%s/layout_trans", in->AsArg().name.c_str()); auto* layout_output_arg = graph->NewArgumentNode(layout_output_name); layout_output_arg->AsArg().type = LiteType::GetTensorTy(from.target(), from.precision(), to.layout()); @@ -145,10 +145,10 @@ void TypeLayoutTransformPass::AddLayoutInst( CHECK(is_found) << "Can't find a layout kernel for layout op: " << from << ":" << in->AsArg().name << "->" << to << ":" << inst_node->AsStmt().op_info()->Type(); - LOG(INFO) << "========= final picked kernel [info]:" - << layout_inst->AsStmt().picked_kernel().name() - << " [summary]:" << layout_inst->AsStmt().picked_kernel().summary() - << "\n"; + VLOG(4) << "========= final picked kernel [info]:" + << layout_inst->AsStmt().picked_kernel().name() + << " [summary]:" << layout_inst->AsStmt().picked_kernel().summary() + << "\n"; // Remove the old link RemoveDirectedLink(in, inst_node); diff --git a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc index c44f4cd0ea..f24fa05c9f 100644 --- a/lite/core/mir/type_precision_cast_pass.cc +++ b/lite/core/mir/type_precision_cast_pass.cc @@ -28,8 +28,8 @@ namespace mir { void PrecisionCastPass::Apply(const std::unique_ptr& graph) { // Start from inputs of the graph, those should have place set. std::list nodes; - for (auto& node : graph->mutable_nodes()) { - nodes.push_back(&node); + for (auto& node : graph->StmtTopologicalOrder()) { + nodes.push_back(node); } for (auto& node : nodes) { @@ -86,9 +86,9 @@ void PrecisionCastPass::AddCastInst(const Type& from, // var -> new_transform_op -> new_var -> inst // So there will be a new Argument node and a new Cast Statement Node. CHECK(in->IsArg()); - auto node_id = [&] { return graph->nodes().size(); }; - auto cast_op_output_name = - in->AsArg().name + "/precision_trans/" + std::to_string(node_id()); + // auto node_id = [&] { return graph->nodes().size(); }; + auto cast_op_output_name = in->AsArg().name + "/precision_trans"; + // in->AsArg().name + "/precision_trans/" + std::to_string(node_id()); auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); cast_op_output_arg->AsArg().type = LiteType::GetTensorTy(from.target(), to.precision(), from.layout()); diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc index 909e9bc29f..3e4e2021c7 100644 --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -29,8 +29,8 @@ namespace mir { void TypeTargetTransformPass::Apply(const std::unique_ptr& graph) { // Start from inputs of the graph, those should have place set. std::list nodes; - for (auto& node : graph->mutable_nodes()) { - nodes.push_back(&node); + for (auto& node : graph->StmtTopologicalOrder()) { + nodes.push_back(node); } CHECK(!valid_places_.empty()); @@ -60,7 +60,6 @@ void TypeTargetTransformPass::ComplementInputs(SSAGraph* graph, auto in_arg_name = in->AsArg().name; std::string tmp; CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp)); - LOG(INFO) << "tmp:" << tmp; auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp); CHECK(in->AsArg().type); if (!TargetCompatibleTo(*in->AsArg().type, *decl_arg_type)) { @@ -85,9 +84,10 @@ void TypeTargetTransformPass::AddIoCopyInst( // So there will be a new Argument node and a new IoCopy Statement Node. CHECK(in->IsArg()); - auto node_id = [&] { return graph->nodes().size(); }; + // auto node_id = [&] { return graph->nodes().size(); }; auto io_copy_output_name = - string_format("%s/target_trans/%d", in->AsArg().name.c_str(), node_id()); + string_format("%s/target_trans", in->AsArg().name.c_str()); + // string_format("%s/target_trans/%d", in->AsArg().name.c_str(), node_id()); // TODO(MyPandaShaoxiang) should set same place with input? auto* io_copy_output_arg = graph->NewArgumentNode(io_copy_output_name); // Set the place for io_copy_output_arg node, the target should be equal to diff --git a/lite/kernels/cuda/calib_compute.cu b/lite/kernels/cuda/calib_compute.cu index e7f3e8b643..77f233e00e 100644 --- a/lite/kernels/cuda/calib_compute.cu +++ b/lite/kernels/cuda/calib_compute.cu @@ -13,6 +13,7 @@ // limitations under the License. #include +#include "lite/backends/cuda/math/utils.h" #include "lite/core/op_registry.h" #include "lite/core/type_system.h" #include "lite/kernels/cuda/calib_compute.h" @@ -22,19 +23,13 @@ namespace lite { namespace kernels { namespace cuda { -__device__ __forceinline__ int8_t float2int8(float x) { - x = fmaxf(x, INT8_MIN); - x = fminf(x, INT8_MAX); - return __float2int_rn(x); -} - __global__ void Fp32ToInt8Kernel(const int num, const float scale, const float* input, int8_t* output) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < num) { - output[index] = float2int8(input[index] / scale); + output[index] = lite::cuda::math::from_float(input[index] / scale); } } -- GitLab