// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "lite/operators/conv_op.h" #include #include #include "lite/kernels/apu/bridges/graph.h" #include "lite/kernels/apu/bridges/utility.h" #include "lite/kernels/npu/bridges/registry.h" namespace paddle { namespace lite { namespace subgraph { namespace apu { int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(ctx != nullptr); CHECK(op != nullptr); auto graph = static_cast(ctx); auto model = graph->model(); auto op_info = op->op_info(); auto op_type = op_info->Type(); auto scope = op->scope(); int neuron_errCode; VLOG(3) << "[APU] Converting [" << op_type << "]"; auto libHandle = graph->libHandle(); LOAD_FUNCTIONS(libHandle, NeuronModel_addOperand, neuron_model_addOperand) LOAD_FUNCTIONS( libHandle, NeuronModel_setOperandValue, neuron_model_setOperandValue) LOAD_FUNCTIONS(libHandle, NeuronModel_addOperation, neuron_model_addOperation) LOAD_FUNCTIONS(libHandle, NeuronModel_setOperandSymmPerChannelQuantParams, neuron_model_setOperandSymmPerChannelQuantParams) // Get input and output vars and op attributes auto input_name = op_info->Input("Input").front(); auto input = scope->FindMutableTensor(input_name); auto input_dims = input->dims(); auto filter_name = op_info->Input("Filter").front(); auto filter = scope->FindMutableTensor(filter_name); auto filter_dims = filter->dims(); auto output_name = op_info->Output("Output").front(); auto output = scope->FindMutableTensor(output_name); auto output_dims = output->dims(); auto bs = input_dims[0]; auto ic = input_dims[1]; auto oc = filter_dims[0]; CHECK_EQ(input_dims.size(), 4L); CHECK_EQ(output_dims.size(), 4L); CHECK_EQ(filter_dims.size(), 4L); CHECK_EQ(output_dims[0], bs); CHECK_EQ(output_dims[1], oc); auto strides = op_info->GetAttr>("strides"); auto paddings = op_info->GetAttr>("paddings"); auto groups = op_info->GetAttr("groups"); auto dilations = op_info->GetAttr>("dilations"); bool with_act = op_info->HasAttr("with_act") && op_info->GetAttr("with_act"); std::string act_type = with_act ? op_info->GetAttr("act_type") : ""; float leaky_relu_alpha = act_type == "leaky_relu" ? op_info->GetAttr("leaky_relu_alpha") : 0.f; CHECK_EQ(strides.size(), 2L); CHECK_EQ(dilations.size(), 2L); bool is_depthwise_mode = ic == groups && oc == groups; VLOG(3) << "is_depthwise_mode" << is_depthwise_mode; if (paddings.size() == 2L) { for (size_t i = 0; i < strides.size(); ++i) { int copy_pad = *(paddings.begin() + 2 * i); paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); } } CHECK_EQ(paddings.size(), 4L) << "[APU] Paddings size should be the same or twice as the input size." << paddings.size(); std::string padding_algorithm(""); if (op_info->HasAttr("padding_algorithm")) { padding_algorithm = op_info->GetAttr("padding_algorithm"); } operators::UpdatePaddingAndDilation(&paddings, &dilations, strides, padding_algorithm, input_dims, filter_dims); float input_scale; float output_scale; std::vector weight_scale; if (op_info->HasAttr("enable_int8")) { if (op_info->GetAttr("enable_int8")) { if (op_info->HasAttr("input_scale")) input_scale = op_info->GetAttr("input_scale"); if (op_info->HasAttr("weight_scale")) weight_scale = op_info->GetAttr>("weight_scale"); if (op_info->HasAttr("output_scale")) output_scale = op_info->GetAttr("output_scale"); VLOG(3) << "has output scale:" << output_scale; } else { return FAILED; } } else { return FAILED; } VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups << " ,dilations: " << dilations[0] << ":" << dilations[1]; VLOG(3) << "with_act: " << with_act << " ,act_type:" << act_type; VLOG(3) << "input_dims: " << input_dims << " ,output_dims: " << output_dims << " ,weight_scale size: " << weight_scale.size(); VLOG(3) << "filter_dims: " << filter_dims << " ,memory_size: " << filter->memory_size() << " ,data_size: " << filter->data_size(); // Add input tensor type NeuronOperandType inType; inType.type = NEURON_TENSOR_QUANT8_ASYMM; inType.scale = input_scale; inType.zeroPoint = 128; inType.dimensionCount = input_dims.size(); std::vector dims_in = {(uint32_t)input_dims[0], (uint32_t)input_dims[2], (uint32_t)input_dims[3], (uint32_t)input_dims[1]}; inType.dimensions = &dims_in[0]; std::shared_ptr input_node = nullptr; if (graph->Has(input_name)) { VLOG(3) << "Graph has " << input_name; // input operand already exist input_node = graph->Get(input_name); } else { // add input operand if (graph->IsInput(input_name)) { // Insert transpose for NCHW -> NHWC insert_transpose_node( ctx, input_name, "transpose_" + input_name, {input_dims[0], input_dims[1], input_dims[2], input_dims[3]}, dims_in, {0, 2, 3, 1}, inType.scale, inType.zeroPoint); // change input_name input_name = "transpose_" + input_name; input_node = graph->Get(input_name); if (input_node == nullptr) return subgraph::FAILED; } else { (*neuron_model_addOperand)(model, &inType); // input input_node = graph->Add(input_name, dims_in); } } VLOG(3) << "input node idx" << input_node->index() << ": input_scale: " << input_scale << ", inType: " << inType.dimensions[0] << ":" << inType.dimensions[1] << ":" << inType.dimensions[2] << ":" << inType.dimensions[3]; // Add bias type NeuronOperandType biasType; // Add filter type // filter NCHW -> NHWC Tensor transpose_filter; std::vector dims_filter; if (is_depthwise_mode) { transpose_filter.Resize({1, (uint32_t)filter_dims[2], (uint32_t)filter_dims[3], (uint32_t)filter_dims[0]}); dims_filter = {1, (uint32_t)filter_dims[0], (uint32_t)filter_dims[2], (uint32_t)filter_dims[3]}; transpose(filter->data(), transpose_filter.mutable_data(), dims_filter, {0, 2, 3, 1}); dims_filter = {(uint32_t)filter_dims[1], (uint32_t)filter_dims[2], (uint32_t)filter_dims[3], (uint32_t)filter_dims[0]}; } else { transpose_filter.Resize({(uint32_t)filter_dims[0], (uint32_t)filter_dims[2], (uint32_t)filter_dims[3], (uint32_t)filter_dims[1]}); dims_filter = {(uint32_t)filter_dims[0], (uint32_t)filter_dims[1], (uint32_t)filter_dims[2], (uint32_t)filter_dims[3]}; transpose(filter->data(), transpose_filter.mutable_data(), dims_filter, {0, 2, 3, 1}); dims_filter = {(uint32_t)filter_dims[0], (uint32_t)filter_dims[2], (uint32_t)filter_dims[3], (uint32_t)filter_dims[1]}; } NeuronOperandType filterType; NeuronOperandType channelFilterType; NeuronSymmPerChannelQuantParams symmPerChannelQuantParams; if (1 == weight_scale.size()) { // Per layer type filterType.type = NEURON_TENSOR_QUANT8_ASYMM; filterType.scale = weight_scale[0]; filterType.zeroPoint = 128; filterType.dimensionCount = filter_dims.size(); filterType.dimensions = &dims_filter[0]; biasType.scale = inType.scale * filterType.scale; } else { // Per channel type channelFilterType.type = NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL; channelFilterType.scale = 0.0f; channelFilterType.zeroPoint = 0; channelFilterType.dimensionCount = filter_dims.size(); channelFilterType.dimensions = &dims_filter[0]; // Per channel setting if (is_depthwise_mode) symmPerChannelQuantParams.channelDim = 3; else symmPerChannelQuantParams.channelDim = 0; symmPerChannelQuantParams.scaleCount = weight_scale.size(); symmPerChannelQuantParams.scales = weight_scale.data(); biasType.scale = 0; } std::shared_ptr filter_node = nullptr; if (1 == weight_scale.size()) { (*neuron_model_addOperand)(model, &filterType); // 1: filter filter_node = graph->Add(filter_name, dims_filter); VLOG(3) << "filter node idx: " << filter_node->index() << "w_scale[0]" << weight_scale[0] << ": filterType: " << filterType.dimensions[0] << ":" << filterType.dimensions[1] << ":" << filterType.dimensions[2] << ":" << filterType.dimensions[3]; memcpy(filter->mutable_data(), transpose_filter.mutable_data(), filter->memory_size()); neuron_errCode = (*neuron_model_setOperandValue)( model, filter_node->index(), filter->raw_data(), filter->memory_size()); if (NEURON_NO_ERROR != neuron_errCode) { LOG(WARNING) << "Set filter operand value fail:" << neuron_errCode; return subgraph::FAILED; } } else { (*neuron_model_addOperand)(model, &channelFilterType); // 1: filter filter_node = graph->Add(filter_name, dims_filter); VLOG(3) << "chennel filter node idx: " << filter_node->index() << " ,scale_count:" << weight_scale.size() << " weight_scale[0]:" << weight_scale.data()[0] << " ,channelFilterType: " << channelFilterType.dimensions[0] << ":" << channelFilterType.dimensions[1] << ":" << channelFilterType.dimensions[2] << ":" << channelFilterType.dimensions[3]; memcpy(filter->mutable_data(), transpose_filter.mutable_data(), filter->memory_size()); neuron_errCode = (*neuron_model_setOperandValue)( model, filter_node->index(), filter->raw_data(), filter->memory_size()); if (NEURON_NO_ERROR != neuron_errCode) { LOG(WARNING) << "Set filter operand value fail:" << neuron_errCode; return subgraph::FAILED; } neuron_errCode = (*neuron_model_setOperandSymmPerChannelQuantParams)( model, filter_node->index(), &symmPerChannelQuantParams); if (NEURON_NO_ERROR != neuron_errCode) { LOG(WARNING) << "Set per channel filter params fail:" << neuron_errCode; return subgraph::FAILED; } } // Add biasType node value // A 1-D tensor, of shape [depth_out], specifying the bias. // For filter tensor of NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL, the bias // should be of ANEURALNETWORKS_TENSOR_INT32, with zeroPoint of 0 // and bias_scale of 0. The actual scale of each value 'i' is equal // to bias_scale[i] = input_scale * filter_scale[i]. biasType.type = NEURON_TENSOR_INT32; biasType.zeroPoint = 0; std::vector dims_bias; std::shared_ptr bias_node = nullptr; if (HasInputArg(op_info, scope, "Bias")) { auto bias_name = op_info->Input("Bias").front(); auto bias_type = kernel->GetInputDeclType("Bias"); auto bias = scope->FindMutableTensor(bias_name); auto bias_dims = bias->dims(); biasType.dimensionCount = bias_dims.size(); for (int i = 0; i < bias_dims.size(); i++) dims_bias.push_back(bias_dims[i]); biasType.dimensions = &dims_bias[0]; (*neuron_model_addOperand)(model, &biasType); // 2: bias bias_node = graph->Add(bias_name, dims_bias); VLOG(3) << "node idx" << bias_node->index() << ": Bias name: " << bias_name << " ,bias scale: " << biasType.scale << " ,dimensions: " << bias_dims; } else { biasType.dimensionCount = 1; dims_bias = {(uint32_t)output_dims[1]}; biasType.dimensions = &dims_bias[0]; (*neuron_model_addOperand)(model, &biasType); // 2: bias bias_node = graph->Add(filter_name + "_default_bias", dims_bias); VLOG(3) << "node idx" << bias_node->index() << ": Bias name: default_bias " << " ,bias scale: " << biasType.scale << " ,dimensions: " << dims_bias.size(); } NeuronOperandType int32Type; int32Type.type = NEURON_INT32; int32Type.dimensionCount = 0; std::vector dims_int32 = {1}; std::shared_ptr paddingL_node = nullptr; (*neuron_model_addOperand)(model, &int32Type); // 3: padding left paddingL_node = graph->Add(filter_name + "_padding_left", dims_int32); std::shared_ptr paddingR_node = nullptr; (*neuron_model_addOperand)(model, &int32Type); // 4: padding right paddingR_node = graph->Add(filter_name + "_padding_right", dims_int32); std::shared_ptr paddingT_node = nullptr; (*neuron_model_addOperand)(model, &int32Type); // 5: padding top paddingT_node = graph->Add(filter_name + "_padding_top", dims_int32); std::shared_ptr paddingB_node = nullptr; (*neuron_model_addOperand)(model, &int32Type); // 6: padding bottom paddingB_node = graph->Add(filter_name + "_padding_bottom", dims_int32); std::shared_ptr strideW_node = nullptr; (*neuron_model_addOperand)(model, &int32Type); // 7: stride width strideW_node = graph->Add(filter_name + "_stride_width", dims_int32); std::shared_ptr strideH_node = nullptr; (*neuron_model_addOperand)(model, &int32Type); // 8: stride height strideH_node = graph->Add(filter_name + "_stride_height", dims_int32); std::shared_ptr dm_node = nullptr; if (is_depthwise_mode) { (*neuron_model_addOperand)(model, &int32Type); // 9: depthwise multiplier dm_node = graph->Add(filter_name + "_dm", dims_int32); } std::shared_ptr fuse_node = nullptr; (*neuron_model_addOperand)(model, &int32Type); // 9/10: fuse fuse_node = graph->Add(filter_name + "_fuse", dims_int32); // Add output tensor type NeuronOperandType outType; outType.type = NEURON_TENSOR_QUANT8_ASYMM; if (graph->IsOutput(output_name)) outType.scale = output_scale / 127; else outType.scale = output_scale; outType.zeroPoint = 128; outType.dimensionCount = output_dims.size(); std::vector dims_out = {(uint32_t)output_dims[0], (uint32_t)output_dims[2], (uint32_t)output_dims[3], (uint32_t)output_dims[1]}; outType.dimensions = &dims_out[0]; std::shared_ptr output_node = nullptr; if (graph->Has(output_name)) { output_node = graph->Get(output_name); } else { // add output operand if (graph->IsOutput(output_name)) { (*neuron_model_addOperand)(model, &outType); // output output_node = graph->Add("transpose_" + output_name, dims_out); } else { (*neuron_model_addOperand)(model, &outType); // output output_node = graph->Add(output_name, dims_out); } } VLOG(3) << "output node idx: " << output_node->index() << ": output_scale: " << outType.scale << ", outType: " << outType.dimensions[0] << ":" << outType.dimensions[1] << ":" << outType.dimensions[2] << ":" << outType.dimensions[3]; // Add bias value if (HasInputArg(op_info, scope, "Bias")) { auto bias_name = op_info->Input("Bias").front(); auto bias = scope->FindMutableTensor(bias_name); int32_t* int32_bias_data = reinterpret_cast(bias->mutable_data()); float2int32( bias->data(), input_scale, weight_scale, int32_bias_data); VLOG(3) << "int32_bias_data: " << int32_bias_data[0] << " : " << int32_bias_data[1] << " : " << int32_bias_data[2] << " : " << int32_bias_data[3]; neuron_errCode = (*neuron_model_setOperandValue)( model, bias_node->index(), bias->raw_data(), bias->memory_size()); } else { auto int32_bias = std::make_shared(); int32_bias->Resize({1, output_dims[1]}); int32_bias->mutable_data(); VLOG(3) << "bais_default: " << int32_bias->memory_size(); memset(int32_bias->mutable_data(), 0, int32_bias->memory_size()); neuron_errCode = (*neuron_model_setOperandValue)(model, bias_node->index(), int32_bias->raw_data(), int32_bias->memory_size()); bias_node->set_data(int32_bias); } if (NEURON_NO_ERROR != neuron_errCode) { LOG(WARNING) << "Set bias operand value fail:" << neuron_errCode; return subgraph::FAILED; } VLOG(3) << "paddings: " << paddings[0] << ":" << paddings[1] << ":" << paddings[2] << ":" << paddings[3]; // Add padding value int32_t padding_val[1]; padding_val[0] = paddings[2]; (*neuron_model_setOperandValue)( model, paddingL_node->index(), padding_val, sizeof(int32_t) * 1); padding_val[0] = paddings[3]; (*neuron_model_setOperandValue)( model, paddingR_node->index(), padding_val, sizeof(int32_t) * 1); padding_val[0] = paddings[0]; (*neuron_model_setOperandValue)( model, paddingT_node->index(), padding_val, sizeof(int32_t) * 1); padding_val[0] = paddings[1]; (*neuron_model_setOperandValue)( model, paddingB_node->index(), padding_val, sizeof(int32_t) * 1); VLOG(3) << " stride width:" << strides[1] << " height:" << strides[0]; // Add Stride int32_t stride_val[1]; stride_val[0] = strides[1]; // width (*neuron_model_setOperandValue)( model, strideW_node->index(), stride_val, sizeof(int32_t) * 1); stride_val[0] = strides[0]; // height (*neuron_model_setOperandValue)( model, strideH_node->index(), stride_val, sizeof(int32_t) * 1); // Add fuse int32_t fuse_val[1] = {0}; if (act_type == "relu") { fuse_val[0] = 1; } else if (act_type == "relu1") { fuse_val[0] = 2; } else if (act_type == "relu6") { fuse_val[0] = 3; } else if (!act_type.empty()) { fuse_val[0] = 0; LOG(WARNING) << "Support act_type: " << act_type; return FAILED; } if (is_depthwise_mode) { int32_t dm = oc / ic; (*neuron_model_setOperandValue)( model, dm_node->index(), &dm, sizeof(int32_t) * 1); VLOG(3) << "depthwise multiplier:" << dm; // Depthwise conv (*neuron_model_setOperandValue)( model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); std::vector addInIndex = { input_node->index(), // 0: input filter_node->index(), // 1: filter bias_node->index(), // 2: bias paddingL_node->index(), // 3: padding left paddingR_node->index(), // 4: padding right paddingT_node->index(), // 5: padding top paddingB_node->index(), // 6: padding bottom strideW_node->index(), // 7: stride width strideH_node->index(), // 8: stride height dm_node->index(), // 9: depthwise multiplier fuse_node->index()}; // 10 : fuse std::vector addOutIndex = {output_node->index()}; neuron_errCode = (*neuron_model_addOperation)(model, NEURON_DEPTHWISE_CONV_2D, addInIndex.size(), &addInIndex[0], addOutIndex.size(), &addOutIndex[0]); } else { (*neuron_model_setOperandValue)( model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); std::vector addInIndex = { input_node->index(), // 0: input filter_node->index(), // 1: filter bias_node->index(), // 2: bias paddingL_node->index(), // 3: padding left paddingR_node->index(), // 4: padding right paddingT_node->index(), // 5: padding top paddingB_node->index(), // 6: padding bottom strideW_node->index(), // 7: stride width strideH_node->index(), // 8: stride height fuse_node->index()}; // 9: fuse std::vector addOutIndex = {output_node->index()}; neuron_errCode = (*neuron_model_addOperation)(model, NEURON_CONV_2D, addInIndex.size(), &addInIndex[0], addOutIndex.size(), &addOutIndex[0]); } if (NEURON_NO_ERROR != neuron_errCode) { LOG(WARNING) << "Add op fail:" << op_type; return FAILED; } if (graph->IsOutput(output_name)) { // Insert transpose for NHWC -> NCHW insert_transpose_node( ctx, "transpose_" + output_name, output_name, dims_out, {output_dims[0], output_dims[1], output_dims[2], output_dims[3]}, {0, 3, 1, 2}, outType.scale, outType.zeroPoint); output_node = graph->Get(output_name); if (output_node == nullptr) return subgraph::FAILED; } return REBUILD_WHEN_SHAPE_CHANGED; } } // namespace apu } // namespace subgraph } // namespace lite } // namespace paddle REGISTER_SUBGRAPH_BRIDGE(conv2d, kAPU, paddle::lite::subgraph::apu::ConvConverter); REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d, kAPU, paddle::lite::subgraph::apu::ConvConverter);