Unverified commit 698c7e76 authored by hong19860320, committed by GitHub

[APU] Adapting to the changing of the quantization parameters (#3863)

Parent cb138726
......@@ -38,12 +38,12 @@ void QuantizedOpAttributesInferencePass::Apply(
auto op_info = inst.op_info();
auto op_type = op_info->Type();
// Check only if all of the inputs of the op have scale value
bool has_input_scale = true;
// Check if any of the inputs of the op have scale value
bool has_input_scale = false;
for (auto in_var_node : op_node->inlinks) {
CHECK(in_var_node->IsArg());
auto in_var_node_name = in_var_node->arg()->name;
has_input_scale &= op_info->HasInputScale(in_var_node_name);
has_input_scale |= op_info->HasInputScale(in_var_node_name);
}
if (!has_input_scale) continue;
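The hunk above relaxes the check from requiring every input to carry a scale (`&=` starting from `true`) to requiring at least one (`|=` starting from `false`). A minimal standalone sketch of the same any-of test, using a hypothetical `scaled_vars` set in place of `OpInfo::HasInputScale`:

```cpp
#include <algorithm>
#include <string>
#include <unordered_set>
#include <vector>

// Returns true if ANY input variable carries a scale (the new |= behaviour),
// instead of requiring ALL of them (the old &= behaviour).
// `scaled_vars` is a hypothetical stand-in for op_info->HasInputScale(name).
bool AnyInputHasScale(const std::vector<std::string>& input_names,
                      const std::unordered_set<std::string>& scaled_vars) {
  return std::any_of(
      input_names.begin(), input_names.end(),
      [&](const std::string& n) { return scaled_vars.count(n) > 0; });
}
```

For example, with inputs {"X", "Bias"} where only "X" carries a scale, the old all-of test would skip the op, while the any-of test keeps processing it.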
......@@ -52,32 +52,31 @@ void QuantizedOpAttributesInferencePass::Apply(
bool is_quantized = true;
for (auto out_var_node : op_node->outlinks) {
CHECK(out_var_node->IsArg());
bool found = false;
float output_scale;
std::vector<float> output_scale;
bool has_output_scale = false;
auto out_var_node_name = out_var_node->arg()->name;
for (auto out_op_node : out_var_node->outlinks) {
CHECK(out_op_node->IsStmt());
auto& out_inst = out_op_node->AsStmt();
auto out_op_info = out_inst.op_info();
if (!out_op_info->HasInputScale(out_var_node_name)) continue;
auto input_scale = out_op_info->GetInputScale(out_var_node_name)[0];
if (!found) {
found = true;
auto input_scale = out_op_info->GetInputScale(out_var_node_name);
if (!has_output_scale) {
output_scale = input_scale;
has_output_scale = true;
} else {
CHECK_EQ(output_scale, input_scale);
CHECK_EQ(output_scale.size(), input_scale.size());
}
}
if (found) {
inst.mutable_op_info()->SetOutputScale(out_var_node_name,
{output_scale});
if (has_output_scale) {
inst.mutable_op_info()->SetOutputScale(out_var_node_name, output_scale);
} else if (op_info->HasAttr("out_threshold")) {
// Only consider one output since there is only one out_threshold
int bit_length = op_info->GetAttr<int>("bit_length");
int range = (1 << (bit_length - 1)) - 1;
output_scale = op_info->GetAttr<float>("out_threshold");
inst.mutable_op_info()->SetOutputScale(out_var_node_name,
{output_scale / range});
output_scale = std::vector<float>{
op_info->GetAttr<float>("out_threshold") / range};
inst.mutable_op_info()->SetOutputScale(out_var_node_name, output_scale);
} else {
is_quantized = false;
}
......
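If no consumer provides an input scale, the hunk above falls back to deriving the scale from the `out_threshold` attribute. A small worked sketch of that conversion, assuming `out_threshold` is the calibrated absolute maximum of the output tensor and the common `bit_length == 8` case (so `range == 127`):

```cpp
#include <cassert>
#include <vector>

// Mirrors the fallback branch: scale = out_threshold / ((1 << (bit_length - 1)) - 1).
std::vector<float> ThresholdToScale(float out_threshold, int bit_length) {
  const int range = (1 << (bit_length - 1)) - 1;  // 127 when bit_length == 8
  return std::vector<float>{out_threshold / range};
}

int main() {
  // A tensor calibrated to an absolute maximum of 6.0 (hypothetical value).
  const auto scale = ThresholdToScale(6.0f, 8);
  assert(scale.size() == 1);  // scales are now stored as vectors
  // scale[0] == 6.0f / 127 ≈ 0.0472
  return 0;
}
```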
......@@ -452,39 +452,6 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
subgraph_op_desc.SetAttr<std::vector<std::string>>("output_data_names",
output_var_names);
// Set input/output scale values of input/output var nodes for
// type_precision_cast_pass.
std::vector<float> input_data_scales;
std::vector<float> output_data_scales;
for (auto &var_node : input_var_nodes) {
auto var_node_name = var_node->arg()->name;
auto any_op_node = var_node->outlinks.front();
CHECK(any_op_node->IsStmt());
auto &any_inst = any_op_node->AsStmt();
if (any_inst.op_info()->HasInputScale(var_node_name)) {
input_data_scales.push_back(
any_inst.op_info()->GetInputScale(var_node_name)[0]);
}
}
for (auto &var_node : output_var_nodes) {
auto var_node_name = var_node->arg()->name;
auto any_op_node = var_node->inlinks.front();
CHECK(any_op_node->IsStmt());
auto &any_inst = any_op_node->AsStmt();
if (any_inst.op_info()->HasOutputScale(var_node_name)) {
output_data_scales.push_back(
any_inst.op_info()->GetOutputScale(var_node_name)[0]);
}
}
if (input_data_scales.size() > 0) {
subgraph_op_desc.SetAttr<std::vector<float>>("input_data_scales",
input_data_scales);
}
if (output_data_scales.size() > 0) {
subgraph_op_desc.SetAttr<std::vector<float>>("output_data_scales",
output_data_scales);
}
// Set all of the inputs and outputs to the target subgraph op
// To prevent vars from being removed in RuntimeProgram::UpdateVarsOfProgram()
for (auto &var_node : weight_var_nodes) {
......@@ -504,6 +471,29 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
auto any_op = (*subgraph_nodes.begin())->AsStmt().op();
subgraph_op->Attach(subgraph_op_desc, any_op->scope());
// Export the scale values of the input/output var nodes of the inner op nodes
// only for type_precision_cast_pass.
for (auto &var_node : input_var_nodes) {
auto var_node_name = var_node->arg()->name;
auto any_op_node = var_node->outlinks.front();
CHECK(any_op_node->IsStmt());
auto &any_inst = any_op_node->AsStmt();
if (any_inst.op_info()->HasInputScale(var_node_name)) {
subgraph_op->mutable_op_info()->SetInputScale(
var_node_name, any_inst.op_info()->GetInputScale(var_node_name));
}
}
for (auto &var_node : output_var_nodes) {
auto var_node_name = var_node->arg()->name;
auto any_op_node = var_node->inlinks.front();
CHECK(any_op_node->IsStmt());
auto &any_inst = any_op_node->AsStmt();
if (any_inst.op_info()->HasOutputScale(var_node_name)) {
subgraph_op->mutable_op_info()->SetOutputScale(
var_node_name, any_inst.op_info()->GetOutputScale(var_node_name));
}
}
// Create and add a new subgraph node into the graph
auto subgraph_op_node =
graph->GraphCreateInstructNode(subgraph_op, any_op->valid_places());
......
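Instead of the parallel `input_data_names` / `input_data_scales` attribute vectors removed above, the scales are now attached to the subgraph op itself, keyed by variable name, via `SetInputScale` / `SetOutputScale`. A toy sketch of that keyed storage (an illustrative stand-in, not the real `OpInfo` class):

```cpp
#include <map>
#include <string>
#include <utility>
#include <vector>

// Toy stand-in for the per-variable scale storage behind OpInfo::SetInputScale /
// HasInputScale / GetInputScale; the real class in Paddle-Lite carries much more.
class ToyOpInfo {
 public:
  void SetInputScale(const std::string& var, std::vector<float> scale) {
    input_scales_[var] = std::move(scale);
  }
  bool HasInputScale(const std::string& var) const {
    return input_scales_.count(var) > 0;
  }
  const std::vector<float>& GetInputScale(const std::string& var) const {
    return input_scales_.at(var);
  }

 private:
  std::map<std::string, std::vector<float>> input_scales_;
};
```

Keying by variable name lets later passes ask for a scale directly instead of scanning two parallel attribute vectors, which is what the `InferScaleFromSubgraph` helper removed further down in this commit had to do.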
......@@ -66,65 +66,30 @@ void UpdateInputs(OpLite* op, const std::string& from, const std::string& to) {
}
}
// Infer the scale value for the new calib op from the subgraph op
static bool InferScaleFromSubgraph(std::string var_name,
const OpInfo* op_info,
float* scale,
bool reverse = false) {
std::string attr_name = reverse ? "output_data_names" : "input_data_names";
if (!op_info->HasAttr(attr_name)) return false;
auto input_or_output_names =
op_info->GetAttr<std::vector<std::string>>(attr_name);
attr_name = reverse ? "output_data_scales" : "input_data_scales";
if (!op_info->HasAttr(attr_name)) return false;
auto input_or_output_scales = op_info->GetAttr<std::vector<float>>(attr_name);
auto size = input_or_output_names.size();
CHECK(size == input_or_output_scales.size());
for (size_t i = 0; i < size; i++) {
if (input_or_output_names[i] == var_name) {
*scale = input_or_output_scales[i];
return true;
}
}
return false;
}
// Infer the scale value for the new calib op from the input_scale of the
// current op and output_scale of the previous op.
// case 1: prev_op->var_node->op_node(int8->any op, with input_scale).
// case 2: prev_op->var_node->op_node(subgraph op, int8->any, with
// input_data_scales).
// case 3: prev_op(any->int8, with output_scale)->var_node->op_node(fp32->any,
// case 2: prev_op(any->int8, with output_scale)->var_node->op_node(fp32->any,
// without input_scale).
// case 4: prev_op(any->int8, subgraph_op, with
// output_data_scales)->var_node->op_node(fp32->any, without input_scale).
static bool InferScale(Node* var_node, Node* op_node, float* scale) {
bool found = false;
auto& inst = op_node->AsStmt();
auto op_info = inst.op_info();
auto op_type = op_info->Type();
auto var_name = var_node->AsArg().name;
if (op_type == "subgraph") {
found = InferScaleFromSubgraph(var_name, op_info, scale, false);
if (op_info->HasInputScale(var_name)) {
*scale = op_info->GetInputScale(var_name)[0];
found = true;
} else {
if (op_info->HasInputScale(var_name)) {
*scale = op_info->GetInputScale(var_name)[0];
// Obtain the output_scale from one of its previous Ops
auto prev_op_node = var_node->inlinks.front();
CHECK(prev_op_node->IsStmt());
auto& prev_inst = prev_op_node->AsStmt();
auto prev_op_info = prev_inst.op_info();
auto prev_op_type = prev_op_info->Type();
if (prev_op_info->HasOutputScale(var_name)) {
*scale = prev_op_info->GetOutputScale(var_name)[0];
found = true;
} else {
// Obtain the output_scale from one of its previous Ops
auto prev_op_node = var_node->inlinks.front();
CHECK(prev_op_node->IsStmt());
auto& prev_inst = prev_op_node->AsStmt();
auto prev_op_info = prev_inst.op_info();
auto prev_op_type = prev_op_info->Type();
if (prev_op_type == "subgraph") {
found = InferScaleFromSubgraph(var_name, prev_op_info, scale, true);
} else {
if (prev_op_info->HasOutputScale(var_name)) {
*scale = prev_op_info->GetOutputScale(var_name)[0];
found = true;
}
}
}
}
return found;
......
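With the subgraph-specific lookup gone, the remaining logic reduces to a two-step precedence: use the consumer op's own input scale if present, otherwise fall back to the producer op's output scale. A compact sketch of that precedence with the two candidate scales passed in as optionals (hypothetical helper, not the real signature):

```cpp
#include <optional>

// Hypothetical helper mirroring the simplified lookup: prefer the consumer
// op's own input scale (case 1); otherwise fall back to the producer op's
// output scale (case 2); return nothing when neither is available.
std::optional<float> InferScaleSketch(std::optional<float> consumer_input_scale,
                                      std::optional<float> producer_output_scale) {
  if (consumer_input_scale) return consumer_input_scale;
  if (producer_output_scale) return producer_output_scale;
  return std::nullopt;
}
```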
......@@ -35,6 +35,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
int neuron_errCode;
VLOG(3) << "[APU] Converting [" << op_type << "]";
CHECK(op_info->HasAttr("enable_int8") &&
op_info->GetAttr<bool>("enable_int8"));
// Get input and output vars and op attributes
auto input_name = op_info->Input("Input").front();
auto input = scope->FindMutableTensor(input_name);
......@@ -94,34 +97,18 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
input_dims,
filter_dims);
float input_scale;
float output_scale;
std::vector<float> weight_scale;
if (op_info->HasAttr("enable_int8")) {
if (op_info->GetAttr<bool>("enable_int8")) {
auto input_name = op_info->Input("Input").front();
auto filter_name = op_info->Input("Filter").front();
auto output_name = op_info->Output("Output").front();
if (op_info->HasInputScale(input_name))
input_scale = op_info->GetInputScale(input_name)[0];
if (op_info->HasInputScale(filter_name))
weight_scale = op_info->GetInputScale(filter_name);
if (op_info->HasOutputScale(output_name)) {
output_scale = op_info->GetOutputScale(output_name)[0];
}
VLOG(3) << "has output scale:" << output_scale;
} else {
return FAILED;
}
} else {
return FAILED;
}
CHECK(op_info->HasInputScale(input_name));
auto input_scale = op_info->GetInputScale(input_name)[0];
CHECK(op_info->HasInputScale(filter_name));
auto filter_scale = op_info->GetInputScale(filter_name);
CHECK(op_info->HasOutputScale(output_name));
auto output_scale = op_info->GetOutputScale(output_name)[0];
VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups
<< " ,dilations: " << dilations[0] << ":" << dilations[1];
VLOG(3) << "with_act: " << with_act << " ,act_type:" << act_type;
VLOG(3) << "input_dims: " << input_dims << " ,output_dims: " << output_dims
<< " ,weight_scale size: " << weight_scale.size();
<< " ,filter_scale size: " << filter_scale.size();
VLOG(3) << "filter_dims: " << filter_dims
<< " ,memory_size: " << filter->memory_size()
<< " ,data_size: " << filter->data_size();
......@@ -220,10 +207,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
NeuronOperandType filterType;
NeuronOperandType channelFilterType;
NeuronSymmPerChannelQuantParams symmPerChannelQuantParams;
if (1 == weight_scale.size()) {
if (1 == filter_scale.size()) {
// Per layer type
filterType.type = NEURON_TENSOR_QUANT8_ASYMM;
filterType.scale = weight_scale[0];
filterType.scale = filter_scale[0];
filterType.zeroPoint = 128;
filterType.dimensionCount = filter_dims.size();
filterType.dimensions = &dims_filter[0];
......@@ -241,17 +228,17 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
symmPerChannelQuantParams.channelDim = 3;
else
symmPerChannelQuantParams.channelDim = 0;
symmPerChannelQuantParams.scaleCount = weight_scale.size();
symmPerChannelQuantParams.scales = weight_scale.data();
symmPerChannelQuantParams.scaleCount = filter_scale.size();
symmPerChannelQuantParams.scales = filter_scale.data();
biasType.scale = 0;
}
std::shared_ptr<Node> filter_node = nullptr;
if (1 == weight_scale.size()) {
if (1 == filter_scale.size()) {
NeuronModel_addOperand(model, &filterType); // 1: filter
filter_node = graph->Add(filter_name, dims_filter);
VLOG(3) << "filter node idx: " << filter_node->index() << "w_scale[0]"
<< weight_scale[0] << ": filterType: " << filterType.dimensions[0]
VLOG(3) << "filter node idx: " << filter_node->index() << "filter_scale[0]"
<< filter_scale[0] << ": filterType: " << filterType.dimensions[0]
<< ":" << filterType.dimensions[1] << ":"
<< filterType.dimensions[2] << ":" << filterType.dimensions[3];
memcpy(filter->mutable_data<int8_t>(),
......@@ -267,8 +254,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
NeuronModel_addOperand(model, &channelFilterType); // 1: filter
filter_node = graph->Add(filter_name, dims_filter);
VLOG(3) << "chennel filter node idx: " << filter_node->index()
<< " ,scale_count:" << weight_scale.size()
<< " weight_scale[0]:" << weight_scale.data()[0]
<< " ,scale_count:" << filter_scale.size()
<< " filter_scale[0]:" << filter_scale.data()[0]
<< " ,channelFilterType: " << channelFilterType.dimensions[0] << ":"
<< channelFilterType.dimensions[1] << ":"
<< channelFilterType.dimensions[2] << ":"
......@@ -302,7 +289,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
std::shared_ptr<Node> bias_node = nullptr;
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
auto bias_type = kernel->GetInputDeclType("Bias");
auto bias = scope->FindMutableTensor(bias_name);
auto bias_dims = bias->dims();
......@@ -368,10 +354,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// Add output tensor type
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
if (graph->IsOutput(output_name))
outType.scale = output_scale / 127;
else
outType.scale = output_scale;
outType.scale = output_scale;
outType.zeroPoint = 128;
outType.dimensionCount = output_dims.size();
std::vector<uint32_t> dims_out = {(uint32_t)output_dims[0],
......@@ -405,7 +388,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
int32_t* int32_bias_data =
reinterpret_cast<int32_t*>(bias->mutable_data<float>());
float2int32(
bias->data<float>(), input_scale, weight_scale, int32_bias_data);
bias->data<float>(), input_scale, filter_scale, int32_bias_data);
VLOG(3) << "int32_bias_data: " << int32_bias_data[0] << " : "
<< int32_bias_data[1] << " : " << int32_bias_data[2] << " : "
......
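The conv converter now reads `input_scale`, `filter_scale`, and `output_scale` unconditionally and feeds the first two into `float2int32` when requantizing the bias. A hedged sketch of what such a bias conversion conventionally does, assuming the NNAPI-style rule that the int32 bias scale equals `input_scale * filter_scale` (per channel when `filter_scale` carries one entry per output channel, per layer when it carries a single entry):

```cpp
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Hedged sketch of a float2int32-style bias requantization: each float bias
// value is mapped to int32 with scale input_scale * filter_scale. For the
// per-channel case, bias.size() == filter_scale.size().
std::vector<int32_t> QuantizeBias(const std::vector<float>& bias,
                                  float input_scale,
                                  const std::vector<float>& filter_scale) {
  std::vector<int32_t> quantized(bias.size());
  for (std::size_t c = 0; c < bias.size(); ++c) {
    const float fs =
        filter_scale.size() == 1 ? filter_scale.front() : filter_scale[c];
    quantized[c] =
        static_cast<int32_t>(std::round(bias[c] / (input_scale * fs)));
  }
  return quantized;
}
```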
......@@ -31,6 +31,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto scope = op->scope();
VLOG(3) << "[APU] Converting [" + op_type + "]";
CHECK(op_info->HasAttr("enable_int8") &&
op_info->GetAttr<bool>("enable_int8"));
// Get input and output vars and op attributes
auto input_name = op_info->Input("Input").front();
auto input = scope->FindMutableTensor(input_name);
auto input_dims = input->dims();
......@@ -52,26 +56,12 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
<< " out_dims: " << out_dims << " m: " << m << " k: " << k
<< " n: " << n;
float input_scale = 1.0f;
float out_scale = 1.0f;
std::vector<float> w_scale;
if (op_info->HasAttr("enable_int8")) {
if (op_info->GetAttr<bool>("enable_int8")) {
auto input_name = op_info->Input("Input").front();
auto weight_name = op_info->Input("W").front();
auto out_name = op_info->Output("Out").front();
if (op_info->HasInputScale(input_name))
input_scale = op_info->GetInputScale(input_name)[0];
if (op_info->HasInputScale(weight_name))
w_scale = op_info->GetInputScale(weight_name);
if (op_info->HasOutputScale(out_name))
out_scale = op_info->GetOutputScale(out_name)[0];
} else {
return FAILED;
}
} else {
return FAILED;
}
CHECK(op_info->HasInputScale(input_name));
auto input_scale = op_info->GetInputScale(input_name)[0];
CHECK(op_info->HasInputScale(w_name));
auto w_scale = op_info->GetInputScale(w_name);
CHECK(op_info->HasOutputScale(out_name));
auto out_scale = op_info->GetOutputScale(out_name)[0];
// Add input tensor type
NeuronOperandType inType;
......
......@@ -32,6 +32,9 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto scope = op->scope();
VLOG(3) << "[APU] Converting [" + op_type + "] ";
CHECK(op_info->HasAttr("enable_int8") &&
op_info->GetAttr<bool>("enable_int8"));
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
......@@ -87,24 +90,10 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
ksize);
// Add x tensor type
float x_scale = 1.0f;
float out_scale = 1.0f;
if (op_info->HasAttr("enable_int8")) {
if (op_info->GetAttr<bool>("enable_int8")) {
auto x_name = op_info->Input("X").front();
auto out_name = op_info->Output("Out").front();
if (op_info->HasInputScale(x_name))
x_scale = op_info->GetInputScale(x_name)[0];
if (op_info->HasOutputScale(out_name))
out_scale = op_info->GetOutputScale(out_name)[0];
} else {
LOG(WARNING) << "Do not enable_int8";
return FAILED;
}
} else {
LOG(WARNING) << "Do not enable_int8";
return FAILED;
}
CHECK(op_info->HasInputScale(x_name));
auto x_scale = op_info->GetInputScale(x_name)[0];
CHECK(op_info->HasOutputScale(out_name));
auto out_scale = op_info->GetOutputScale(out_name)[0];
NeuronOperandType xType;
xType.type = NEURON_TENSOR_QUANT8_ASYMM;
......
......@@ -31,6 +31,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto scope = op->scope();
VLOG(3) << "[APU] Converting [" + op_type + "]";
CHECK(op_info->HasAttr("enable_int8") &&
op_info->GetAttr<bool>("enable_int8"));
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
......@@ -45,24 +48,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
axis += x_rank;
}
float input_scale = 1.0f;
float out_scale = 1.0f;
if (op_info->HasAttr("enable_int8")) {
if (op_info->GetAttr<bool>("enable_int8")) {
auto x_name = op_info->Input("X").front();
auto out_name = op_info->Output("Out").front();
if (op_info->HasInputScale(x_name))
input_scale = op_info->GetInputScale(x_name)[0];
if (op_info->HasOutputScale(out_name))
out_scale = op_info->GetOutputScale(out_name)[0];
} else {
LOG(WARNING) << "Do not enable_int8";
return FAILED;
}
} else {
LOG(WARNING) << "Do not enable_int8";
return FAILED;
}
CHECK(op_info->HasInputScale(x_name));
auto input_scale = op_info->GetInputScale(x_name)[0];
CHECK(op_info->HasOutputScale(out_name));
auto out_scale = op_info->GetOutputScale(out_name)[0];
// Check output scale
NeuronOperandType xType;
......@@ -106,14 +95,14 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// Add out operand
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = out_scale / 127;
outType.scale = out_scale;
outType.zeroPoint = 128;
outType.dimensionCount = x_dims.size();
outType.dimensions = &dims_x[0];
NeuronModel_addOperand(model, &outType); // 3: output
std::shared_ptr<Node> out_node = nullptr;
out_node = graph->Add(out_name, dims_x);
VLOG(3) << "output_scale: " << out_scale;
VLOG(3) << "out_scale: " << out_scale;
float beta_val[] = {1.0f};
NeuronModel_setOperandValue(
......
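After this change the output operand uses `out_scale` directly rather than `out_scale / 127`. For reference, a hedged sketch of the quantize/dequantize relation an asymmetric 8-bit operand with `zeroPoint = 128` conventionally follows (assuming Neuron mirrors the NNAPI semantics `real = scale * (q - zeroPoint)`):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Assumed NNAPI-style semantics for NEURON_TENSOR_QUANT8_ASYMM with
// zeroPoint = 128: real_value = scale * (quantized_value - zeroPoint).
uint8_t QuantizeAsymm(float real, float scale, int32_t zero_point = 128) {
  const int32_t q =
      static_cast<int32_t>(std::lround(real / scale)) + zero_point;
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

float DequantizeAsymm(uint8_t q, float scale, int32_t zero_point = 128) {
  return scale * static_cast<float>(static_cast<int32_t>(q) - zero_point);
}
```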
......@@ -153,18 +153,15 @@ int SubgraphEngine::LaunchDeviceProgram() {
}
// Set input buffer
Tensor input_temp;
for (size_t i = 0; i < origin_itensors_.size(); i++) {
input_temp.Resize({origin_idims_[i]});
uint8_t* input_data = input_temp.mutable_data<uint8_t>();
memcpy(input_data,
origin_itensors_[i]->raw_data(),
origin_itensors_[i]->memory_size());
auto origin_data = origin_itensors_[i]->mutable_data<int8_t>();
auto converted_data = reinterpret_cast<uint8_t*>(origin_data);
for (int j = 0; j < origin_itensors_[i]->data_size(); j++) {
input_data[j] += (uint8_t)128;
converted_data[j] =
static_cast<uint8_t>(static_cast<int16_t>(origin_data[j]) + 128);
}
NeuronExecution_setInput(
run, i, NULL, input_data, origin_itensors_[i]->memory_size());
run, i, NULL, converted_data, origin_itensors_[i]->memory_size());
}
// Set output buffer
......@@ -184,10 +181,11 @@ int SubgraphEngine::LaunchDeviceProgram() {
}
for (size_t i = 0; i < origin_otensors_.size(); i++) {
int8_t* output_data = origin_otensors_[i]->mutable_data<int8_t>();
VLOG(3) << "output size:" << origin_otensors_[i]->memory_size();
auto converted_data = origin_otensors_[i]->mutable_data<int8_t>();
auto origin_data = reinterpret_cast<uint8_t*>(converted_data);
for (int j = 0; j < origin_otensors_[i]->data_size(); j++) {
output_data[j] -= (int8_t)128;
converted_data[j] =
static_cast<int8_t>(static_cast<int16_t>(origin_data[j]) - 128);
}
}
NeuronExecution_free(run);
......
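The device-program launch above converts the framework's int8 buffers to the runtime's asymmetric uint8 layout in place (and back for outputs), exploiting the fact that the two representations differ only by a constant offset of 128. A self-contained sketch of that conversion, with hypothetical helper names:

```cpp
#include <cstddef>
#include <cstdint>

// The framework tensors hold int8 data, while the Neuron operands are
// asymmetric uint8 with zeroPoint = 128, so each element only needs a
// constant +/-128 shift and the buffer can be reused in place.
void Int8ToUint8InPlace(int8_t* data, std::size_t n) {
  auto* u = reinterpret_cast<uint8_t*>(data);
  for (std::size_t i = 0; i < n; ++i) {
    u[i] = static_cast<uint8_t>(static_cast<int16_t>(data[i]) + 128);
  }
}

void Uint8ToInt8InPlace(uint8_t* data, std::size_t n) {
  auto* s = reinterpret_cast<int8_t*>(data);
  for (std::size_t i = 0; i < n; ++i) {
    s[i] = static_cast<int8_t>(static_cast<int16_t>(data[i]) - 128);
  }
}
```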