From 80d35725ad2ae5b139dadcbd5ce8f1fa12912a17 Mon Sep 17 00:00:00 2001
From: Zhaolong Xing <nhzlx.dragon@gmail.com>
Date: Mon, 14 Oct 2019 18:24:09 +0800
Subject: [PATCH] align yolov3 cuda int8 (#2183)

test=develop
---
 lite/backends/cuda/math/cudnn_conv.cc     | 20 ++++++------
 lite/core/mir/type_layout_cast_pass.cc    | 40 +++++++++++------------
 lite/core/mir/type_precision_cast_pass.cc | 10 +++---
 lite/core/mir/type_target_cast_pass.cc    | 10 +++---
 lite/kernels/cuda/calib_compute.cu        |  9 ++---
 5 files changed, 42 insertions(+), 47 deletions(-)
diff --git a/lite/backends/cuda/math/cudnn_conv.cc b/lite/backends/cuda/math/cudnn_conv.cc
index 1c4cbc74b0..72ed3951f6 100644
--- a/lite/backends/cuda/math/cudnn_conv.cc
+++ b/lite/backends/cuda/math/cudnn_conv.cc
@@ -537,16 +537,16 @@ bool CudnnConv2DInt8<Ptype_out>::run(const operators::ConvParam& param) {
                                static_cast<const void*>(scale),
                                this->stream_);
       } else {
-        bias_int8_nhwc<int8_t>(num,
-                               static_cast<const void*>(temp_out),
-                               static_cast<const void*>(b_data),
-                               static_cast<void*>(temp_out),
-                               n,
-                               c,
-                               h,
-                               w,
-                               static_cast<const void*>(scale),
-                               this->stream_);
+        bias_int8_nhwc<float>(num,
+                              static_cast<const void*>(temp_out),
+                              static_cast<const void*>(b_data),
+                              static_cast<void*>(temp_out),
+                              n,
+                              c,
+                              h,
+                              w,
+                              static_cast<const void*>(scale),
+                              this->stream_);
       }
       return true;
     }
diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc
index 346ae35687..81046fb186 100644
--- a/lite/core/mir/type_layout_cast_pass.cc
+++ b/lite/core/mir/type_layout_cast_pass.cc
@@ -30,17 +30,17 @@ void TypeLayoutTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
   // Start from inputs of the graph, those should have place set.
   VLOG(4) << "\n" << Visualize(graph.get());
   std::list<Node*> nodes;
-  for (auto& node : graph->mutable_nodes()) {
-    nodes.push_back(&node);
+  for (auto& node : graph->StmtTopologicalOrder()) {
+    nodes.push_back(node);
   }
 
-  LOG(INFO) << "nodes.size():" << nodes.size();
+  VLOG(4) << "nodes.size():" << nodes.size();
   for (auto& node : nodes) {
-    LOG(INFO) << "!node->IsStmt():" << !node->IsStmt();
+    VLOG(4) << "!node->IsStmt():" << !node->IsStmt();
     if (!node->IsStmt()) continue;
     auto inlinks = node->inlinks;
-    LOG(INFO) << "node->AsStmt().desc:" << node->AsStmt().desc
-              << " inlinks.size():" << inlinks.size();
+    VLOG(4) << "node->AsStmt().desc:" << node->AsStmt().desc
+            << " inlinks.size():" << inlinks.size();
     for (auto* in : inlinks) {
       ComplementInputs(graph.get(), node, in);
     }
@@ -58,7 +58,7 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph,
 
   CHECK(inst_node->IsStmt());
   auto& inst = inst_node->AsStmt();
-  LOG(INFO) << "found Target tensor: " << in->AsArg().name;
+  VLOG(4) << "found Target tensor: " << in->AsArg().name;
   CHECK(in->IsRoleSet());
   CHECK(in->IsArg());
   auto in_arg_name = in->AsArg().name;
@@ -66,15 +66,15 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph,
   CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp));
   auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp);
   CHECK(in->AsArg().type);
-  LOG(INFO) << "\n tmp:" << tmp << "\n in->AsArg().name:" << in->AsArg().name
-            << "\n *in->AsArg().type:" << *in->AsArg().type
-            << "\n *decl_arg_type:" << *decl_arg_type
-            << "\n inst.op()->DebugString():" << inst.op()->DebugString();
+  VLOG(4) << "\n tmp:" << tmp << "\n in->AsArg().name:" << in->AsArg().name
+          << "\n *in->AsArg().type:" << *in->AsArg().type
+          << "\n *decl_arg_type:" << *decl_arg_type
+          << "\n inst.op()->DebugString():" << inst.op()->DebugString();
 
   if (!DataLayoutCompatible(*in->AsArg().type, *decl_arg_type)) {
-    LOG(INFO) << "found Layout unmatched tensor: " << in->AsArg().name
-              << " for kernel " << inst.op()->DebugString() << " "
-              << *in->AsArg().type << " -> " << *decl_arg_type;
+    VLOG(4) << "found Layout unmatched tensor: " << in->AsArg().name
+            << " for kernel " << inst.op()->DebugString() << " "
+            << *in->AsArg().type << " -> " << *decl_arg_type;
     AddLayoutInst(*in->AsArg().type,
                   *decl_arg_type,
                   in,
@@ -94,9 +94,9 @@ void TypeLayoutTransformPass::AddLayoutInst(
   CHECK(!valid_places.empty()) << "valid_place should be set";
 
   CHECK(in->IsArg());
-  auto node_id = [&] { return graph->nodes().size(); };
+  // auto node_id = [&] { return graph->nodes().size(); };
   auto layout_output_name =
-      string_format("%s/layout_trans/%d", in->AsArg().name.c_str(), node_id());
+      string_format("%s/layout_trans", in->AsArg().name.c_str());
   auto* layout_output_arg = graph->NewArgumentNode(layout_output_name);
   layout_output_arg->AsArg().type =
       LiteType::GetTensorTy(from.target(), from.precision(), to.layout());
@@ -145,10 +145,10 @@ void TypeLayoutTransformPass::AddLayoutInst(
   CHECK(is_found) << "Can't find a layout  kernel for layout op: " << from
                   << ":" << in->AsArg().name << "->" << to << ":"
                   << inst_node->AsStmt().op_info()->Type();
-  LOG(INFO) << "========= final picked kernel [info]:"
-            << layout_inst->AsStmt().picked_kernel().name()
-            << " [summary]:" << layout_inst->AsStmt().picked_kernel().summary()
-            << "\n";
+  VLOG(4) << "========= final picked kernel [info]:"
+          << layout_inst->AsStmt().picked_kernel().name()
+          << " [summary]:" << layout_inst->AsStmt().picked_kernel().summary()
+          << "\n";
 
   // Remove the old link
   RemoveDirectedLink(in, inst_node);
diff --git a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc
index c44f4cd0ea..f24fa05c9f 100644
--- a/lite/core/mir/type_precision_cast_pass.cc
+++ b/lite/core/mir/type_precision_cast_pass.cc
@@ -28,8 +28,8 @@ namespace mir {
 void PrecisionCastPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
   // Start from inputs of the graph, those should have place set.
   std::list<Node*> nodes;
-  for (auto& node : graph->mutable_nodes()) {
-    nodes.push_back(&node);
+  for (auto& node : graph->StmtTopologicalOrder()) {
+    nodes.push_back(node);
   }
 
   for (auto& node : nodes) {
@@ -86,9 +86,9 @@ void PrecisionCastPass::AddCastInst(const Type& from,
   // var -> new_transform_op -> new_var -> inst
   // So there will be a new Argument node and a new Cast Statement Node.
   CHECK(in->IsArg());
-  auto node_id = [&] { return graph->nodes().size(); };
-  auto cast_op_output_name =
-      in->AsArg().name + "/precision_trans/" + std::to_string(node_id());
+  // auto node_id = [&] { return graph->nodes().size(); };
+  auto cast_op_output_name = in->AsArg().name + "/precision_trans";
+  // in->AsArg().name + "/precision_trans/" + std::to_string(node_id());
   auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name);
   cast_op_output_arg->AsArg().type =
       LiteType::GetTensorTy(from.target(), to.precision(), from.layout());
diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc
index 909e9bc29f..3e4e2021c7 100644
--- a/lite/core/mir/type_target_cast_pass.cc
+++ b/lite/core/mir/type_target_cast_pass.cc
@@ -29,8 +29,8 @@ namespace mir {
 void TypeTargetTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
   // Start from inputs of the graph, those should have place set.
   std::list<Node*> nodes;
-  for (auto& node : graph->mutable_nodes()) {
-    nodes.push_back(&node);
+  for (auto& node : graph->StmtTopologicalOrder()) {
+    nodes.push_back(node);
   }
 
   CHECK(!valid_places_.empty());
@@ -60,7 +60,6 @@ void TypeTargetTransformPass::ComplementInputs(SSAGraph* graph,
   auto in_arg_name = in->AsArg().name;
   std::string tmp;
   CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp));
-  LOG(INFO) << "tmp:" << tmp;
   auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp);
   CHECK(in->AsArg().type);
   if (!TargetCompatibleTo(*in->AsArg().type, *decl_arg_type)) {
@@ -85,9 +84,10 @@ void TypeTargetTransformPass::AddIoCopyInst(
   // So there will be a new Argument node and a new IoCopy Statement Node.
 
   CHECK(in->IsArg());
-  auto node_id = [&] { return graph->nodes().size(); };
+  // auto node_id = [&] { return graph->nodes().size(); };
   auto io_copy_output_name =
-      string_format("%s/target_trans/%d", in->AsArg().name.c_str(), node_id());
+      string_format("%s/target_trans", in->AsArg().name.c_str());
+  // string_format("%s/target_trans/%d", in->AsArg().name.c_str(), node_id());
   // TODO(MyPandaShaoxiang) should set same place with input?
   auto* io_copy_output_arg = graph->NewArgumentNode(io_copy_output_name);
   // Set the place for io_copy_output_arg node, the target should be equal to
diff --git a/lite/kernels/cuda/calib_compute.cu b/lite/kernels/cuda/calib_compute.cu
index e7f3e8b643..77f233e00e 100644
--- a/lite/kernels/cuda/calib_compute.cu
+++ b/lite/kernels/cuda/calib_compute.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <vector>
+#include "lite/backends/cuda/math/utils.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/type_system.h"
 #include "lite/kernels/cuda/calib_compute.h"
@@ -22,19 +23,13 @@ namespace lite {
 namespace kernels {
 namespace cuda {
 
-__device__ __forceinline__ int8_t float2int8(float x) {
-  x = fmaxf(x, INT8_MIN);
-  x = fminf(x, INT8_MAX);
-  return __float2int_rn(x);
-}
-
 __global__ void Fp32ToInt8Kernel(const int num,
                                  const float scale,
                                  const float* input,
                                  int8_t* output) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < num) {
-    output[index] = float2int8(input[index] / scale);
+    output[index] = lite::cuda::math::from_float<int8_t>(input[index] / scale);
   }
 }
 
-- 
GitLab