Unverified commit 698c7e76 authored by hong19860320, committed by GitHub

[APU] Adapting to the changing of the quantization parameters (#3863)

Parent cb138726
......@@ -38,12 +38,12 @@ void QuantizedOpAttributesInferencePass::Apply(
auto op_info = inst.op_info();
auto op_type = op_info->Type();
// Check only if all of the inputs of the op have scale value
bool has_input_scale = true;
// Check if any of the inputs of the op have scale value
bool has_input_scale = false;
for (auto in_var_node : op_node->inlinks) {
CHECK(in_var_node->IsArg());
auto in_var_node_name = in_var_node->arg()->name;
has_input_scale &= op_info->HasInputScale(in_var_node_name);
has_input_scale |= op_info->HasInputScale(in_var_node_name);
}
if (!has_input_scale) continue;
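The hunk above relaxes the check from requiring every input to carry a scale (`&=` starting from `true`) to requiring at least one (`|=` starting from `false`). A minimal standalone sketch of the same any-of test, using a hypothetical `scaled_vars` set in place of `OpInfo::HasInputScale`:

```cpp
#include <algorithm>
#include <string>
#include <unordered_set>
#include <vector>

// Returns true if ANY input variable carries a scale (the new |= behaviour),
// instead of requiring ALL of them (the old &= behaviour).
// `scaled_vars` is a hypothetical stand-in for op_info->HasInputScale(name).
bool AnyInputHasScale(const std::vector<std::string>& input_names,
                      const std::unordered_set<std::string>& scaled_vars) {
  return std::any_of(
      input_names.begin(), input_names.end(),
      [&](const std::string& n) { return scaled_vars.count(n) > 0; });
}
```

For example, with inputs {"X", "Bias"} where only "X" carries a scale, the old all-of test would skip the op, while the any-of test keeps processing it.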
......@@ -52,32 +52,31 @@ void QuantizedOpAttributesInferencePass::Apply(
bool is_quantized = true;
for (auto out_var_node : op_node->outlinks) {
CHECK(out_var_node->IsArg());
bool found = false;
float output_scale;
std::vector<float> output_scale;
bool has_output_scale = false;
auto out_var_node_name = out_var_node->arg()->name;
for (auto out_op_node : out_var_node->outlinks) {
CHECK(out_op_node->IsStmt());
auto& out_inst = out_op_node->AsStmt();
auto out_op_info = out_inst.op_info();
if (!out_op_info->HasInputScale(out_var_node_name)) continue;
auto input_scale = out_op_info->GetInputScale(out_var_node_name)[0];
if (!found) {
found = true;
auto input_scale = out_op_info->GetInputScale(out_var_node_name);
if (!has_output_scale) {
output_scale = input_scale;
has_output_scale = true;
} else {
CHECK_EQ(output_scale, input_scale);
CHECK_EQ(output_scale.size(), input_scale.size());
}
}
if (found) {
inst.mutable_op_info()->SetOutputScale(out_var_node_name,
{output_scale});
if (has_output_scale) {
inst.mutable_op_info()->SetOutputScale(out_var_node_name, output_scale);
} else if (op_info->HasAttr("out_threshold")) {
// Only consider one output since there is only one out_threshold
int bit_length = op_info->GetAttr<int>("bit_length");
int range = (1 << (bit_length - 1)) - 1;
output_scale = op_info->GetAttr<float>("out_threshold");
inst.mutable_op_info()->SetOutputScale(out_var_node_name,
{output_scale / range});
output_scale = std::vector<float>{
op_info->GetAttr<float>("out_threshold") / range};
inst.mutable_op_info()->SetOutputScale(out_var_node_name, output_scale);
} else {
is_quantized = false;
}
......
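If no consumer provides an input scale, the hunk above falls back to deriving the scale from the `out_threshold` attribute. A small worked sketch of that conversion, assuming `out_threshold` is the calibrated absolute maximum of the output tensor and the common `bit_length == 8` case (so `range == 127`):

```cpp
#include <cassert>
#include <vector>

// Mirrors the fallback branch: scale = out_threshold / ((1 << (bit_length - 1)) - 1).
std::vector<float> ThresholdToScale(float out_threshold, int bit_length) {
  const int range = (1 << (bit_length - 1)) - 1;  // 127 when bit_length == 8
  return std::vector<float>{out_threshold / range};
}

int main() {
  // A tensor calibrated to an absolute maximum of 6.0 (hypothetical value).
  const auto scale = ThresholdToScale(6.0f, 8);
  assert(scale.size() == 1);  // scales are now stored as vectors
  // scale[0] == 6.0f / 127 ≈ 0.0472
  return 0;
}
```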
......@@ -452,39 +452,6 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
subgraph_op_desc.SetAttr<std::vector<std::string>>("output_data_names",
output_var_names);
// Set input/output scale values of input/output var nodes for
// type_precision_cast_pass.
std::vector<float> input_data_scales;
std::vector<float> output_data_scales;
for (auto &var_node : input_var_nodes) {
auto var_node_name = var_node->arg()->name;
auto any_op_node = var_node->outlinks.front();
CHECK(any_op_node->IsStmt());
auto &any_inst = any_op_node->AsStmt();
if (any_inst.op_info()->HasInputScale(var_node_name)) {
input_data_scales.push_back(
any_inst.op_info()->GetInputScale(var_node_name)[0]);
}
}
for (auto &var_node : output_var_nodes) {
auto var_node_name = var_node->arg()->name;
auto any_op_node = var_node->inlinks.front();
CHECK(any_op_node->IsStmt());
auto &any_inst = any_op_node->AsStmt();
if (any_inst.op_info()->HasOutputScale(var_node_name)) {
output_data_scales.push_back(
any_inst.op_info()->GetOutputScale(var_node_name)[0]);
}
}
if (input_data_scales.size() > 0) {
subgraph_op_desc.SetAttr<std::vector<float>>("input_data_scales",
input_data_scales);
}
if (output_data_scales.size() > 0) {
subgraph_op_desc.SetAttr<std::vector<float>>("output_data_scales",
output_data_scales);
}
// Set all of the inputs and outputs to the target subgraph op
// To prevent vars from being removed in RuntimeProgram::UpdateVarsOfProgram()
for (auto &var_node : weight_var_nodes) {
......@@ -504,6 +471,29 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
auto any_op = (*subgraph_nodes.begin())->AsStmt().op();
subgraph_op->Attach(subgraph_op_desc, any_op->scope());
// Export the scale values of the input/output var nodes of the inner op nodes
// only for type_precision_cast_pass.
for (auto &var_node : input_var_nodes) {
auto var_node_name = var_node->arg()->name;
auto any_op_node = var_node->outlinks.front();
CHECK(any_op_node->IsStmt());
auto &any_inst = any_op_node->AsStmt();
if (any_inst.op_info()->HasInputScale(var_node_name)) {
subgraph_op->mutable_op_info()->SetInputScale(
var_node_name, any_inst.op_info()->GetInputScale(var_node_name));
}
}
for (auto &var_node : output_var_nodes) {
auto var_node_name = var_node->arg()->name;
auto any_op_node = var_node->inlinks.front();
CHECK(any_op_node->IsStmt());
auto &any_inst = any_op_node->AsStmt();
if (any_inst.op_info()->HasOutputScale(var_node_name)) {
subgraph_op->mutable_op_info()->SetOutputScale(
var_node_name, any_inst.op_info()->GetOutputScale(var_node_name));
}
}
// Create and add a new subgraph node into the graph
auto subgraph_op_node =
graph->GraphCreateInstructNode(subgraph_op, any_op->valid_places());
......
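Instead of the parallel `input_data_names` / `input_data_scales` attribute vectors removed above, the scales are now attached to the subgraph op itself, keyed by variable name, via `SetInputScale` / `SetOutputScale`. A toy sketch of that keyed storage (an illustrative stand-in, not the real `OpInfo` class):

```cpp
#include <map>
#include <string>
#include <utility>
#include <vector>

// Toy stand-in for the per-variable scale storage behind OpInfo::SetInputScale /
// HasInputScale / GetInputScale; the real class in Paddle-Lite carries much more.
class ToyOpInfo {
 public:
  void SetInputScale(const std::string& var, std::vector<float> scale) {
    input_scales_[var] = std::move(scale);
  }
  bool HasInputScale(const std::string& var) const {
    return input_scales_.count(var) > 0;
  }
  const std::vector<float>& GetInputScale(const std::string& var) const {
    return input_scales_.at(var);
  }

 private:
  std::map<std::string, std::vector<float>> input_scales_;
};
```

Keying by variable name lets later passes ask for a scale directly instead of scanning two parallel attribute vectors, which is what the `InferScaleFromSubgraph` helper removed further down in this commit had to do.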
......@@ -66,65 +66,30 @@ void UpdateInputs(OpLite* op, const std::string& from, const std::string& to) {
}
}
// Infer the scale value for the new calib op from the subgraph op
static bool InferScaleFromSubgraph(std::string var_name,
const OpInfo* op_info,
float* scale,
bool reverse = false) {
std::string attr_name = reverse ? "output_data_names" : "input_data_names";
if (!op_info->HasAttr(attr_name)) return false;
auto input_or_output_names =
op_info->GetAttr<std::vector<std::string>>(attr_name);
attr_name = reverse ? "output_data_scales" : "input_data_scales";
if (!op_info->HasAttr(attr_name)) return false;
auto input_or_output_scales = op_info->GetAttr<std::vector<float>>(attr_name);
auto size = input_or_output_names.size();
CHECK(size == input_or_output_scales.size());
for (size_t i = 0; i < size; i++) {
if (input_or_output_names[i] == var_name) {
*scale = input_or_output_scales[i];
return true;
}
}
return false;
}
// Infer the scale value for the new calib op from the input_scale of the
// current op and output_scale of the previous op.
// case 1: prev_op->var_node->op_node(int8->any op, with input_scale).
// case 2: prev_op->var_node->op_node(subgraph op, int8->any, with
// input_data_scales).
// case 3: prev_op(any->int8, with output_scale)->var_node->op_node(fp32->any,
// case 2: prev_op(any->int8, with output_scale)->var_node->op_node(fp32->any,
// without input_scale).
// case 4: prev_op(any->int8, subgraph_op, with
// output_data_scales)->var_node->op_node(fp32->any, without input_scale).
static bool InferScale(Node* var_node, Node* op_node, float* scale) {
bool found = false;
auto& inst = op_node->AsStmt();
auto op_info = inst.op_info();
auto op_type = op_info->Type();
auto var_name = var_node->AsArg().name;
if (op_type == "subgraph") {
found = InferScaleFromSubgraph(var_name, op_info, scale, false);
if (op_info->HasInputScale(var_name)) {
*scale = op_info->GetInputScale(var_name)[0];
found = true;
} else {
if (op_info->HasInputScale(var_name)) {
*scale = op_info->GetInputScale(var_name)[0];
// Obtain the output_scale from one of its previous Ops
auto prev_op_node = var_node->inlinks.front();
CHECK(prev_op_node->IsStmt());
auto& prev_inst = prev_op_node->AsStmt();
auto prev_op_info = prev_inst.op_info();
auto prev_op_type = prev_op_info->Type();
if (prev_op_info->HasOutputScale(var_name)) {
*scale = prev_op_info->GetOutputScale(var_name)[0];
found = true;
} else {
// Obtain the output_scale from one of its previous Ops
auto prev_op_node = var_node->inlinks.front();
CHECK(prev_op_node->IsStmt());
auto& prev_inst = prev_op_node->AsStmt();
auto prev_op_info = prev_inst.op_info();
auto prev_op_type = prev_op_info->Type();
if (prev_op_type == "subgraph") {
found = InferScaleFromSubgraph(var_name, prev_op_info, scale, true);
} else {
if (prev_op_info->HasOutputScale(var_name)) {
*scale = prev_op_info->GetOutputScale(var_name)[0];
found = true;
}
}
}
}
return found;
......
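With the subgraph-specific lookup gone, the remaining logic reduces to a two-step precedence: use the consumer op's own input scale if present, otherwise fall back to the producer op's output scale. A compact sketch of that precedence with the two candidate scales passed in as optionals (hypothetical helper, not the real signature):

```cpp
#include <optional>

// Hypothetical helper mirroring the simplified lookup: prefer the consumer
// op's own input scale (case 1); otherwise fall back to the producer op's
// output scale (case 2); return nothing when neither is available.
std::optional<float> InferScaleSketch(std::optional<float> consumer_input_scale,
                                      std::optional<float> producer_output_scale) {
  if (consumer_input_scale) return consumer_input_scale;
  if (producer_output_scale) return producer_output_scale;
  return std::nullopt;
}
```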
......@@ -35,6 +35,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
int neuron_errCode;
VLOG(3) << "[APU] Converting [" << op_type << "]";
CHECK(op_info->HasAttr("enable_int8") &&
op_info->GetAttr<bool>("enable_int8"));
// Get input and output vars and op attributes
auto input_name = op_info->Input("Input").front();
auto input = scope->FindMutableTensor(input_name);
......@@ -94,34 +97,18 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
input_dims,
filter_dims);
float input_scale;
float output_scale;
std::vector<float> weight_scale;
if (op_info->HasAttr("enable_int8")) {
if (op_info->GetAttr<bool>("enable_int8")) {
auto input_name = op_info->Input("Input").front();
auto filter_name = op_info->Input("Filter").front();
auto output_name = op_info->Output("Output").front();
if (op_info->HasInputScale(input_name))
input_scale = op_info->GetInputScale(input_name)[0];
if (op_info->HasInputScale(filter_name))
weight_scale = op_info->GetInputScale(filter_name);
if (op_info->HasOutputScale(output_name)) {
output_scale = op_info->GetOutputScale(output_name)[0];
}
VLOG(3) << "has output scale:" << output_scale;
} else {
return FAILED;
}
} else {
return FAILED;
}
CHECK(op_info->HasInputScale(input_name));
auto input_scale = op_info->GetInputScale(input_name)[0];
CHECK(op_info->HasInputScale(filter_name));
auto filter_scale = op_info->GetInputScale(filter_name);
CHECK(op_info->HasOutputScale(output_name));
auto output_scale = op_info->GetOutputScale(output_name)[0];
VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups
<< " ,dilations: " << dilations[0] << ":" << dilations[1];
VLOG(3) << "with_act: " << with_act << " ,act_type:" << act_type;
VLOG(3) << "input_dims: " << input_dims << " ,output_dims: " << output_dims
<< " ,weight_scale size: " << weight_scale.size();
<< " ,filter_scale size: " << filter_scale.size();
VLOG(3) << "filter_dims: " << filter_dims
<< " ,memory_size: " << filter->memory_size()
<< " ,data_size: " << filter->data_size();
......@@ -220,10 +207,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
NeuronOperandType filterType;
NeuronOperandType channelFilterType;
NeuronSymmPerChannelQuantParams symmPerChannelQuantParams;
if (1 == weight_scale.size()) {
if (1 == filter_scale.size()) {
// Per layer type
filterType.type = NEURON_TENSOR_QUANT8_ASYMM;
filterType.scale = weight_scale[0];
filterType.scale = filter_scale[0];
filterType.zeroPoint = 128;
filterType.dimensionCount = filter_dims.size();
filterType.dimensions = &dims_filter[0];
......@@ -241,17 +228,17 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
symmPerChannelQuantParams.channelDim = 3;
else
symmPerChannelQuantParams.channelDim = 0;
symmPerChannelQuantParams.scaleCount = weight_scale.size();
symmPerChannelQuantParams.scales = weight_scale.data();
symmPerChannelQuantParams.scaleCount = filter_scale.size();
symmPerChannelQuantParams.scales = filter_scale.data();
biasType.scale = 0;
}
std::shared_ptr<Node> filter_node = nullptr;
if (1 == weight_scale.size()) {
if (1 == filter_scale.size()) {
NeuronModel_addOperand(model, &filterType); // 1: filter
filter_node = graph->Add(filter_name, dims_filter);
VLOG(3) << "filter node idx: " << filter_node->index() << "w_scale[0]"
<< weight_scale[0] << ": filterType: " << filterType.dimensions[0]
VLOG(3) << "filter node idx: " << filter_node->index() << "filter_scale[0]"
<< filter_scale[0] << ": filterType: " << filterType.dimensions[0]
<< ":" << filterType.dimensions[1] << ":"
<< filterType.dimensions[2] << ":" << filterType.dimensions[3];
memcpy(filter->mutable_data<int8_t>(),
......@@ -267,8 +254,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
NeuronModel_addOperand(model, &channelFilterType); // 1: filter
filter_node = graph->Add(filter_name, dims_filter);
VLOG(3) << "chennel filter node idx: " << filter_node->index()
<< " ,scale_count:" << weight_scale.size()
<< " weight_scale[0]:" << weight_scale.data()[0]
<< " ,scale_count:" << filter_scale.size()
<< " filter_scale[0]:" << filter_scale.data()[0]
<< " ,channelFilterType: " << channelFilterType.dimensions[0] << ":"
<< channelFilterType.dimensions[1] << ":"
<< channelFilterType.dimensions[2] << ":"
......@@ -302,7 +289,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
std::shared_ptr<Node> bias_node = nullptr;
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
auto bias_type = kernel->GetInputDeclType("Bias");
auto bias = scope->FindMutableTensor(bias_name);
auto bias_dims = bias->dims();
......@@ -368,10 +354,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// Add output tensor type
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
if (graph->IsOutput(output_name))
outType.scale = output_scale / 127;
else
outType.scale = output_scale;
outType.scale = output_scale;
outType.zeroPoint = 128;
outType.dimensionCount = output_dims.size();
std::vector<uint32_t> dims_out = {(uint32_t)output_dims[0],
......@@ -405,7 +388,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
int32_t* int32_bias_data =
reinterpret_cast<int32_t*>(bias->mutable_data<float>());
float2int32(
bias->data<float>(), input_scale, weight_scale, int32_bias_data);
bias->data<float>(), input_scale, filter_scale, int32_bias_data);
VLOG(3) << "int32_bias_data: " << int32_bias_data[0] << " : "
<< int32_bias_data[1] << " : " << int32_bias_data[2] << " : "
......
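The conv converter now reads `input_scale`, `filter_scale`, and `output_scale` unconditionally and feeds the first two into `float2int32` when requantizing the bias. A hedged sketch of what such a bias conversion conventionally does, assuming the NNAPI-style rule that the int32 bias scale equals `input_scale * filter_scale` (per channel when `filter_scale` carries one entry per output channel, per layer when it carries a single entry):

```cpp
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Hedged sketch of a float2int32-style bias requantization: each float bias
// value is mapped to int32 with scale input_scale * filter_scale. For the
// per-channel case, bias.size() == filter_scale.size().
std::vector<int32_t> QuantizeBias(const std::vector<float>& bias,
                                  float input_scale,
                                  const std::vector<float>& filter_scale) {
  std::vector<int32_t> quantized(bias.size());
  for (std::size_t c = 0; c < bias.size(); ++c) {
    const float fs =
        filter_scale.size() == 1 ? filter_scale.front() : filter_scale[c];
    quantized[c] =
        static_cast<int32_t>(std::round(bias[c] / (input_scale * fs)));
  }
  return quantized;
}
```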
......@@ -31,6 +31,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto scope = op->scope();
VLOG(3) << "[APU] Converting [" + op_type + "]";
CHECK(op_info->HasAttr("enable_int8") &&
op_info->GetAttr<bool>("enable_int8"));
// Get input and output vars and op attributes
auto input_name = op_info->Input("Input").front();
auto input = scope->FindMutableTensor(input_name);
auto input_dims = input->dims();
......@@ -52,26 +56,12 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
<< " out_dims: " << out_dims << " m: " << m << " k: " << k
<< " n: " << n;
float input_scale = 1.0f;
float out_scale = 1.0f;
std::vector<float> w_scale;
if (op_info->HasAttr("enable_int8")) {
if (op_info->GetAttr<bool>("enable_int8")) {
auto input_name = op_info->Input("Input").front();
auto weight_name = op_info->Input("W").front();
auto out_name = op_info->Output("Out").front();
if (op_info->HasInputScale(input_name))
input_scale = op_info->GetInputScale(input_name)[0];
if (op_info->HasInputScale(weight_name))
w_scale = op_info->GetInputScale(weight_name);
if (op_info->HasOutputScale(out_name))
out_scale = op_info->GetOutputScale(out_name)[0];
} else {
return FAILED;
}
} else {
return FAILED;
}
CHECK(op_info->HasInputScale(input_name));
auto input_scale = op_info->GetInputScale(input_name)[0];
CHECK(op_info->HasInputScale(w_name));
auto w_scale = op_info->GetInputScale(w_name);
CHECK(op_info->HasOutputScale(out_name));
auto out_scale = op_info->GetOutputScale(out_name)[0];
// Add input tensor type
NeuronOperandType inType;
......
......@@ -32,6 +32,9 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto scope = op->scope();
VLOG(3) << "[APU] Converting [" + op_type + "] ";
CHECK(op_info->HasAttr("enable_int8") &&
op_info->GetAttr<bool>("enable_int8"));
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
......@@ -87,24 +90,10 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
ksize);
// Add x tensor type
float x_scale = 1.0f;
float out_scale = 1.0f;
if (op_info->HasAttr("enable_int8")) {
if (op_info->GetAttr<bool>("enable_int8")) {
auto x_name = op_info->Input("X").front();
auto out_name = op_info->Output("Out").front();
if (op_info->HasInputScale(x_name))
x_scale = op_info->GetInputScale(x_name)[0];
if (op_info->HasOutputScale(out_name))
out_scale = op_info->GetOutputScale(out_name)[0];
} else {
LOG(WARNING) << "Do not enable_int8";
return FAILED;
}
} else {
LOG(WARNING) << "Do not enable_int8";
return FAILED;
}
CHECK(op_info->HasInputScale(x_name));
auto x_scale = op_info->GetInputScale(x_name)[0];
CHECK(op_info->HasOutputScale(out_name));
auto out_scale = op_info->GetOutputScale(out_name)[0];
NeuronOperandType xType;
xType.type = NEURON_TENSOR_QUANT8_ASYMM;
......
......@@ -31,6 +31,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto scope = op->scope();
VLOG(3) << "[APU] Converting [" + op_type + "]";
CHECK(op_info->HasAttr("enable_int8") &&
op_info->GetAttr<bool>("enable_int8"));
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
......@@ -45,24 +48,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
axis += x_rank;
}
float input_scale = 1.0f;
float out_scale = 1.0f;
if (op_info->HasAttr("enable_int8")) {
if (op_info->GetAttr<bool>("enable_int8")) {
auto x_name = op_info->Input("X").front();
auto out_name = op_info->Output("Out").front();
if (op_info->HasInputScale(x_name))
input_scale = op_info->GetInputScale(x_name)[0];
if (op_info->HasOutputScale(out_name))
out_scale = op_info->GetOutputScale(out_name)[0];
} else {
LOG(WARNING) << "Do not enable_int8";
return FAILED;
}
} else {
LOG(WARNING) << "Do not enable_int8";
return FAILED;
}
CHECK(op_info->HasInputScale(x_name));
auto input_scale = op_info->GetInputScale(x_name)[0];
CHECK(op_info->HasOutputScale(out_name));
auto out_scale = op_info->GetOutputScale(out_name)[0];
// Check output scale
NeuronOperandType xType;
......@@ -106,14 +95,14 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// Add out operand
NeuronOperandType outType;
outType.type = NEURON_TENSOR_QUANT8_ASYMM;
outType.scale = out_scale / 127;
outType.scale = out_scale;
outType.zeroPoint = 128;
outType.dimensionCount = x_dims.size();
outType.dimensions = &dims_x[0];
NeuronModel_addOperand(model, &outType); // 3: output
std::shared_ptr<Node> out_node = nullptr;
out_node = graph->Add(out_name, dims_x);
VLOG(3) << "output_scale: " << out_scale;
VLOG(3) << "out_scale: " << out_scale;
float beta_val[] = {1.0f};
NeuronModel_setOperandValue(
......
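After this change the output operand uses `out_scale` directly rather than `out_scale / 127`. For reference, a hedged sketch of the quantize/dequantize relation an asymmetric 8-bit operand with `zeroPoint = 128` conventionally follows (assuming Neuron mirrors the NNAPI semantics `real = scale * (q - zeroPoint)`):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Assumed NNAPI-style semantics for NEURON_TENSOR_QUANT8_ASYMM with
// zeroPoint = 128: real_value = scale * (quantized_value - zeroPoint).
uint8_t QuantizeAsymm(float real, float scale, int32_t zero_point = 128) {
  const int32_t q =
      static_cast<int32_t>(std::lround(real / scale)) + zero_point;
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

float DequantizeAsymm(uint8_t q, float scale, int32_t zero_point = 128) {
  return scale * static_cast<float>(static_cast<int32_t>(q) - zero_point);
}
```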
......@@ -153,18 +153,15 @@ int SubgraphEngine::LaunchDeviceProgram() {
}
// Set input buffer
Tensor input_temp;
for (size_t i = 0; i < origin_itensors_.size(); i++) {
input_temp.Resize({origin_idims_[i]});
uint8_t* input_data = input_temp.mutable_data<uint8_t>();
memcpy(input_data,
origin_itensors_[i]->raw_data(),
origin_itensors_[i]->memory_size());
auto origin_data = origin_itensors_[i]->mutable_data<int8_t>();
auto converted_data = reinterpret_cast<uint8_t*>(origin_data);
for (int j = 0; j < origin_itensors_[i]->data_size(); j++) {
input_data[j] += (uint8_t)128;
converted_data[j] =
static_cast<uint8_t>(static_cast<int16_t>(origin_data[j]) + 128);
}
NeuronExecution_setInput(
run, i, NULL, input_data, origin_itensors_[i]->memory_size());
run, i, NULL, converted_data, origin_itensors_[i]->memory_size());
}
// Set output buffer
......@@ -184,10 +181,11 @@ int SubgraphEngine::LaunchDeviceProgram() {
}
for (size_t i = 0; i < origin_otensors_.size(); i++) {
int8_t* output_data = origin_otensors_[i]->mutable_data<int8_t>();
VLOG(3) << "output size:" << origin_otensors_[i]->memory_size();
auto converted_data = origin_otensors_[i]->mutable_data<int8_t>();
auto origin_data = reinterpret_cast<uint8_t*>(converted_data);
for (int j = 0; j < origin_otensors_[i]->data_size(); j++) {
output_data[j] -= (int8_t)128;
converted_data[j] =
static_cast<int8_t>(static_cast<int16_t>(origin_data[j]) - 128);
}
}
NeuronExecution_free(run);
......
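The device-program launch above converts the framework's int8 buffers to the runtime's asymmetric uint8 layout in place (and back for outputs), exploiting the fact that the two representations differ only by a constant offset of 128. A self-contained sketch of that conversion, with hypothetical helper names:

```cpp
#include <cstddef>
#include <cstdint>

// The framework tensors hold int8 data, while the Neuron operands are
// asymmetric uint8 with zeroPoint = 128, so each element only needs a
// constant +/-128 shift and the buffer can be reused in place.
void Int8ToUint8InPlace(int8_t* data, std::size_t n) {
  auto* u = reinterpret_cast<uint8_t*>(data);
  for (std::size_t i = 0; i < n; ++i) {
    u[i] = static_cast<uint8_t>(static_cast<int16_t>(data[i]) + 128);
  }
}

void Uint8ToInt8InPlace(uint8_t* data, std::size_t n) {
  auto* s = reinterpret_cast<int8_t*>(data);
  for (std::size_t i = 0; i < n; ++i) {
    s[i] = static_cast<int8_t>(static_cast<int16_t>(data[i]) - 128);
  }
}
```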