Unverified commit 80d35725, authored by: Zhaolong Xing, committed by: GitHub

align yolov3 cuda int8 (#2183)

test=develop
Parent 56151776
@@ -537,16 +537,16 @@ bool CudnnConv2DInt8<Ptype_out>::run(const operators::ConvParam& param) {
static_cast<const void*>(scale),
this->stream_);
} else {
- bias_int8_nhwc<int8_t>(num,
- static_cast<const void*>(temp_out),
- static_cast<const void*>(b_data),
- static_cast<void*>(temp_out),
- n,
- c,
- h,
- w,
- static_cast<const void*>(scale),
- this->stream_);
+ bias_int8_nhwc<float>(num,
+ static_cast<const void*>(temp_out),
+ static_cast<const void*>(b_data),
+ static_cast<void*>(temp_out),
+ n,
+ c,
+ h,
+ w,
+ static_cast<const void*>(scale),
+ this->stream_);
}
return true;
}
@@ -30,17 +30,17 @@ void TypeLayoutTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// Start from inputs of the graph, those should have place set.
VLOG(4) << "\n" << Visualize(graph.get());
std::list<Node*> nodes;
- for (auto& node : graph->mutable_nodes()) {
- nodes.push_back(&node);
+ for (auto& node : graph->StmtTopologicalOrder()) {
+ nodes.push_back(node);
}
LOG(INFO) << "nodes.size():" << nodes.size();
VLOG(4) << "nodes.size():" << nodes.size();
for (auto& node : nodes) {
LOG(INFO) << "!node->IsStmt():" << !node->IsStmt();
VLOG(4) << "!node->IsStmt():" << !node->IsStmt();
if (!node->IsStmt()) continue;
auto inlinks = node->inlinks;
LOG(INFO) << "node->AsStmt().desc:" << node->AsStmt().desc
<< " inlinks.size():" << inlinks.size();
VLOG(4) << "node->AsStmt().desc:" << node->AsStmt().desc
<< " inlinks.size():" << inlinks.size();
for (auto* in : inlinks) {
ComplementInputs(graph.get(), node, in);
}
@@ -58,7 +58,7 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph,
CHECK(inst_node->IsStmt());
auto& inst = inst_node->AsStmt();
LOG(INFO) << "found Target tensor: " << in->AsArg().name;
VLOG(4) << "found Target tensor: " << in->AsArg().name;
CHECK(in->IsRoleSet());
CHECK(in->IsArg());
auto in_arg_name = in->AsArg().name;
@@ -66,15 +66,15 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph,
CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp));
auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp);
CHECK(in->AsArg().type);
LOG(INFO) << "\n tmp:" << tmp << "\n in->AsArg().name:" << in->AsArg().name
<< "\n *in->AsArg().type:" << *in->AsArg().type
<< "\n *decl_arg_type:" << *decl_arg_type
<< "\n inst.op()->DebugString():" << inst.op()->DebugString();
VLOG(4) << "\n tmp:" << tmp << "\n in->AsArg().name:" << in->AsArg().name
<< "\n *in->AsArg().type:" << *in->AsArg().type
<< "\n *decl_arg_type:" << *decl_arg_type
<< "\n inst.op()->DebugString():" << inst.op()->DebugString();
if (!DataLayoutCompatible(*in->AsArg().type, *decl_arg_type)) {
LOG(INFO) << "found Layout unmatched tensor: " << in->AsArg().name
<< " for kernel " << inst.op()->DebugString() << " "
<< *in->AsArg().type << " -> " << *decl_arg_type;
VLOG(4) << "found Layout unmatched tensor: " << in->AsArg().name
<< " for kernel " << inst.op()->DebugString() << " "
<< *in->AsArg().type << " -> " << *decl_arg_type;
AddLayoutInst(*in->AsArg().type,
*decl_arg_type,
in,
@@ -94,9 +94,9 @@ void TypeLayoutTransformPass::AddLayoutInst(
CHECK(!valid_places.empty()) << "valid_place should be set";
CHECK(in->IsArg());
- auto node_id = [&] { return graph->nodes().size(); };
+ // auto node_id = [&] { return graph->nodes().size(); };
auto layout_output_name =
string_format("%s/layout_trans/%d", in->AsArg().name.c_str(), node_id());
string_format("%s/layout_trans", in->AsArg().name.c_str());
auto* layout_output_arg = graph->NewArgumentNode(layout_output_name);
layout_output_arg->AsArg().type =
LiteType::GetTensorTy(from.target(), from.precision(), to.layout());
@@ -145,10 +145,10 @@ void TypeLayoutTransformPass::AddLayoutInst(
CHECK(is_found) << "Can't find a layout kernel for layout op: " << from
<< ":" << in->AsArg().name << "->" << to << ":"
<< inst_node->AsStmt().op_info()->Type();
LOG(INFO) << "========= final picked kernel [info]:"
<< layout_inst->AsStmt().picked_kernel().name()
<< " [summary]:" << layout_inst->AsStmt().picked_kernel().summary()
<< "\n";
VLOG(4) << "========= final picked kernel [info]:"
<< layout_inst->AsStmt().picked_kernel().name()
<< " [summary]:" << layout_inst->AsStmt().picked_kernel().summary()
<< "\n";
// Remove the old link
RemoveDirectedLink(in, inst_node);
@@ -28,8 +28,8 @@ namespace mir {
void PrecisionCastPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// Start from inputs of the graph, those should have place set.
std::list<Node*> nodes;
- for (auto& node : graph->mutable_nodes()) {
- nodes.push_back(&node);
+ for (auto& node : graph->StmtTopologicalOrder()) {
+ nodes.push_back(node);
}
for (auto& node : nodes) {
@@ -86,9 +86,9 @@ void PrecisionCastPass::AddCastInst(const Type& from,
// var -> new_transform_op -> new_var -> inst
// So there will be a new Argument node and a new Cast Statement Node.
CHECK(in->IsArg());
- auto node_id = [&] { return graph->nodes().size(); };
- auto cast_op_output_name =
- in->AsArg().name + "/precision_trans/" + std::to_string(node_id());
+ // auto node_id = [&] { return graph->nodes().size(); };
+ auto cast_op_output_name = in->AsArg().name + "/precision_trans";
+ // in->AsArg().name + "/precision_trans/" + std::to_string(node_id());
auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name);
cast_op_output_arg->AsArg().type =
LiteType::GetTensorTy(from.target(), to.precision(), from.layout());
@@ -29,8 +29,8 @@ namespace mir {
void TypeTargetTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// Start from inputs of the graph, those should have place set.
std::list<Node*> nodes;
- for (auto& node : graph->mutable_nodes()) {
- nodes.push_back(&node);
+ for (auto& node : graph->StmtTopologicalOrder()) {
+ nodes.push_back(node);
}
CHECK(!valid_places_.empty());
@@ -60,7 +60,6 @@ void TypeTargetTransformPass::ComplementInputs(SSAGraph* graph,
auto in_arg_name = in->AsArg().name;
std::string tmp;
CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp));
LOG(INFO) << "tmp:" << tmp;
auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp);
CHECK(in->AsArg().type);
if (!TargetCompatibleTo(*in->AsArg().type, *decl_arg_type)) {
@@ -85,9 +84,10 @@ void TypeTargetTransformPass::AddIoCopyInst(
// So there will be a new Argument node and a new IoCopy Statement Node.
CHECK(in->IsArg());
- auto node_id = [&] { return graph->nodes().size(); };
+ // auto node_id = [&] { return graph->nodes().size(); };
auto io_copy_output_name =
string_format("%s/target_trans/%d", in->AsArg().name.c_str(), node_id());
string_format("%s/target_trans", in->AsArg().name.c_str());
// string_format("%s/target_trans/%d", in->AsArg().name.c_str(), node_id());
// TODO(MyPandaShaoxiang) should set same place with input?
auto* io_copy_output_arg = graph->NewArgumentNode(io_copy_output_name);
// Set the place for io_copy_output_arg node, the target should be equal to
@@ -13,6 +13,7 @@
// limitations under the License.
#include <vector>
#include "lite/backends/cuda/math/utils.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/kernels/cuda/calib_compute.h"
@@ -22,19 +23,13 @@ namespace lite {
namespace kernels {
namespace cuda {
- __device__ __forceinline__ int8_t float2int8(float x) {
-   x = fmaxf(x, INT8_MIN);
-   x = fminf(x, INT8_MAX);
-   return __float2int_rn(x);
- }
__global__ void Fp32ToInt8Kernel(const int num,
const float scale,
const float* input,
int8_t* output) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < num) {
- output[index] = float2int8(input[index] / scale);
+ output[index] = lite::cuda::math::from_float<int8_t>(input[index] / scale);
}
}
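For reference, the helper removed in the last hunk rounds to nearest and saturates to the int8 range before converting. Below is a minimal standalone sketch of that clamp-and-round quantization, assuming lite::cuda::math::from_float<int8_t> keeps the same semantics as the deleted float2int8 (the commit does not show its body); Float2Int8 and QuantizeKernel are illustrative names, not part of the repository.

```cuda
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// Round-to-nearest with saturation to [-128, 127], mirroring the deleted
// float2int8 helper (illustrative; not the actual PaddleLite utility).
__device__ __forceinline__ int8_t Float2Int8(float x) {
  x = fmaxf(x, INT8_MIN);
  x = fminf(x, INT8_MAX);
  return static_cast<int8_t>(__float2int_rn(x));
}

// Quantize a float buffer with a single scale, like Fp32ToInt8Kernel above.
__global__ void QuantizeKernel(int num, float scale, const float* in,
                               int8_t* out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < num) out[i] = Float2Int8(in[i] / scale);
}

int main() {
  const int num = 4;
  float h_in[num] = {-300.f, -0.4f, 0.6f, 200.f};
  float* d_in = nullptr;
  int8_t* d_out = nullptr;
  int8_t h_out[num];
  cudaMalloc(&d_in, num * sizeof(float));
  cudaMalloc(&d_out, num * sizeof(int8_t));
  cudaMemcpy(d_in, h_in, num * sizeof(float), cudaMemcpyHostToDevice);
  QuantizeKernel<<<1, 32>>>(num, /*scale=*/1.f, d_in, d_out);
  cudaMemcpy(h_out, d_out, num * sizeof(int8_t), cudaMemcpyDeviceToHost);
  for (int i = 0; i < num; ++i) printf("%d ", static_cast<int>(h_out[i]));
  printf("\n");  // expected: -128 0 1 127
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}
```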