Unverified · Commit 7987a905, authored by zhoutianzi666, committed by GitHub

[Paddle-TRT] support new quant format from slim (#46022)

Parent 6e4cba14
@@ -111,9 +111,6 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const {
   }
   */
   std::unordered_set<const Node*> nodes2rm = {};
-  int bit_length =
-      PADDLE_GET_CONST(int, quantize_linear_op->Op()->GetAttr("bit_length"));
-  int range = ((1 << (bit_length - 1)) - 1);
   // Get input scale from tensor
   const LoDTensor& input_scale_tensor =
@@ -124,7 +121,7 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const {
       platform::errors::InvalidArgument(
           "Input scale tensor's place should be CPU."));
   const float* input_scale_data = input_scale_tensor.data<float>();
-  float input_scale = input_scale_data[0] / range;
+  float input_scale = input_scale_data[0];
   int nums_any_ops = dequantize_linear_op_out->outputs.size();
   for (int i = 0; i < nums_any_ops; ++i) {
@@ -138,7 +135,8 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const {
     IR_NODE_LINK_TO(quantize_linear_op_x,
                     dequantize_linear_op_out->outputs[i]);
   }
+  // Forbid removing weight tensor when weight is shared between ops
+  if (quantize_linear_op_scale->outputs.size() <= 1UL)
     nodes2rm.insert(quantize_linear_op_scale);
   nodes2rm.insert(quantize_linear_op);
   nodes2rm.insert(quantize_linear_op_out);
...
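Note on the hunk above: the old pass divided the stored scale by the int8 range ((1 << (bit_length - 1)) - 1, i.e. 127 for 8 bits), which suggests the old format stored a max-abs style value; the new slim quant format stores the dequantization step directly, so the division is dropped. A minimal numpy sketch of the two conventions (variable names are illustrative, not Paddle APIs):

```python
import numpy as np

bit_length = 8
int8_range = (1 << (bit_length - 1)) - 1  # 127

x = np.array([0.5, -1.2, 0.9], dtype=np.float32)
max_abs = np.abs(x).max()

# Old format: the scale tensor held a max-abs style value, so the pass
# recovered the dequantization step as scale / range.
input_scale_old = max_abs / int8_range

# New slim format: the scale tensor already holds the dequantization step,
# and the pass now reads it verbatim.
stored_scale_new = max_abs / int8_range
input_scale_new = stored_scale_new

assert np.isclose(input_scale_old, input_scale_new)

# Round-trip through fake int8 quantization with the recovered step.
q = np.clip(np.round(x / input_scale_new), -int8_range, int8_range)
assert np.allclose(x, q * input_scale_new, atol=input_scale_new / 2 + 1e-7)
```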
@@ -84,7 +84,6 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
 void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
 const std::vector<std::string> kTRTSubgraphPasses({
-      "identity_scale_op_clean_pass",           //
       "adaptive_pool2d_convert_global_pass",    //
       "shuffle_channel_detect_pass",            //
       "quant_conv2d_dequant_fuse_pass",         //
@@ -93,6 +92,7 @@ const std::vector<std::string> kTRTSubgraphPasses({
       "delete_quant_dequant_filter_op_pass",    //
       "delete_weight_dequant_linear_op_pass",   //
       "delete_quant_dequant_linear_op_pass",    //
+      "identity_scale_op_clean_pass",           //
       "add_support_int8_pass",                  //
       // "fc_fuse_pass",                        //
       "simplify_with_basic_ops_pass",           //
...
@@ -37,9 +37,9 @@ class MatMulV2OpConverter : public OpConverter {
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope,
                   bool test_mode) override {
-    VLOG(3) << "convert a fluid matmul_v2 op to tensorrt matmul layer ";
+    VLOG(3) << "convert a matmul_v2 op to tensorrt IMatrixMultiplyLayer layer ";
     framework::OpDesc op_desc(op, nullptr);
-    nvinfer1::ILayer* layer = nullptr;
+    nvinfer1::IMatrixMultiplyLayer* layer = nullptr;
     // Declare inputs
     auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
@@ -61,8 +61,9 @@ class MatMulV2OpConverter : public OpConverter {
             : nvinfer1::MatrixOperation::kNONE;
     int one_num = 0;
+    bool all_matrix = dims_x.nbDims >= 2 && dims_y.nbDims >= 2;
     nvinfer1::ITensor* new_shape_tensor = nullptr;
-    if (dims_x.nbDims < dims_y.nbDims) {
+    if (dims_x.nbDims < dims_y.nbDims && all_matrix) {
       one_num = dims_y.nbDims - dims_x.nbDims;
       new_shape_tensor = Shape(input1);
       std::vector<int32_t> one_vec(one_num, 1);
@@ -80,7 +81,7 @@ class MatMulV2OpConverter : public OpConverter {
                                          *input2,
                                          matrix_operation_Y);
-    } else if (dims_x.nbDims > dims_y.nbDims) {
+    } else if (dims_x.nbDims > dims_y.nbDims && all_matrix) {
       one_num = dims_x.nbDims - dims_y.nbDims;
       new_shape_tensor = Shape(input2);
       std::vector<int32_t> one_vec(one_num, 1);
@@ -105,9 +106,26 @@ class MatMulV2OpConverter : public OpConverter {
                                          *input2,
                                          matrix_operation_Y);
     }
-    VLOG(3) << "Convert a fluid matmul_v2_op_float to TensorRT ";
+    if (dims_x.nbDims == 1)
+      layer->setOperation(0, nvinfer1::MatrixOperation::kVECTOR);
+    if (dims_y.nbDims == 1)
+      layer->setOperation(1, nvinfer1::MatrixOperation::kVECTOR);
+    nvinfer1::ILayer* final_layer = static_cast<nvinfer1::ILayer*>(layer);
+    // When vec * vec, trt produces a scalar, so to be consistent with paddle,
+    // we need add a reshape.
+    if (dims_x.nbDims == 1 && dims_y.nbDims == 1) {
+      auto reshape_layer =
+          TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *layer->getOutput(0));
+      nvinfer1::Dims reshape_dim;
+      reshape_dim.nbDims = 1;
+      reshape_dim.d[0] = 1;
+      reshape_layer->setReshapeDimensions(reshape_dim);
+      final_layer = static_cast<nvinfer1::ILayer*>(reshape_layer);
+    }
+    VLOG(3) << "Convert a matmul_v2_op to TensorRT ";
-    RreplenishLayerAndOutput(layer, "matmul_v2_op", {output_name}, test_mode);
+    RreplenishLayerAndOutput(
+        final_layer, "matmul_v2_op", {output_name}, test_mode);
   }
 };
...
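The converter hunk above marks any 1-D operand as nvinfer1::MatrixOperation::kVECTOR and, for the vector-vector case, appends a Shuffle layer because TensorRT emits a 0-D scalar where Paddle's matmul_v2 returns a shape-[1] tensor. A short numpy sketch of the expected shapes for the three new cases (numpy serves only as an analogy here; like TensorRT, it collapses a vector dot product to a scalar):

```python
import numpy as np

mat = np.random.rand(20, 50).astype(np.float32)  # matches the test shapes below
vec = np.random.rand(50).astype(np.float32)

print((mat @ vec).shape)    # (20,)  case 0: mat * vec, Y treated as kVECTOR
print((vec @ mat.T).shape)  # (20,)  case 1: vec * mat, X treated as kVECTOR
dot = vec @ vec             # case 2: vec * vec
print(np.shape(dot))        # () -- a 0-D scalar; Paddle yields shape [1],
                            # hence the reshape to a shape-[1] tensor above
```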
@@ -46,6 +46,7 @@ struct SimpleOpTypeSetTeller : public Teller {
 #if IS_TRT_VERSION_GE(7000)
     teller_set.insert("tile");
     teller_set.insert("flatten_contiguous_range");
+    int8_teller_set.insert("flatten_contiguous_range");
     teller_set.insert("rnn");
     int8_teller_set.insert("rnn");
     teller_set.insert("fill_constant_batch_size_like");
...
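For context, int8_teller_set gates which ops may enter a TensorRT subgraph when the engine runs in Int8 mode, so this one-liner lets quantized models keep flatten_contiguous_range inside the fused subgraph. A hedged sketch of enabling that mode through the public inference API (model paths are placeholders):

```python
import paddle.inference as paddle_infer

# Placeholder model files; any slim-quantized inference model would do.
config = paddle_infer.Config("model.pdmodel", "model.pdiparams")
config.enable_use_gpu(256, 0)
config.enable_tensorrt_engine(
    workspace_size=1 << 30,
    max_batch_size=1,
    min_subgraph_size=3,
    precision_mode=paddle_infer.PrecisionType.Int8,
    use_static=False,
    # Scales come from the quantized program itself (via the passes above),
    # so no TensorRT calibration pass is needed.
    use_calib_mode=False)
predictor = paddle_infer.create_predictor(config)
```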
@@ -193,5 +193,127 @@ class TrtConvertMatmulTest_dynamic2(TrtLayerAutoScanTest):
         self.run_test()
+class TrtConvertMatmulTest_dynamic3(TrtLayerAutoScanTest):
+
+    def sample_program_configs(self):
+
+        def generate_input(shape):
+            return np.random.random(shape).astype(np.float32)
+
+        # case0: mat * vec
+        # case1: vec * mat
+        # case2: vec * vec
+        for case in [0, 1, 2]:
+            for batch in range(20, 23):
+                for trans_x in [False, True]:
+                    for trans_y in [False, True]:
+                        self.case = case
+                        input1_shape = []
+                        input2_shape = []
+                        if case == 0:
+                            input1_shape = [batch, 50]
+                            input2_shape = [50]
+                        elif case == 1:
+                            input1_shape = [50]
+                            input2_shape = [50, batch]
+                        elif case == 2:
+                            input1_shape = [50]
+                            input2_shape = [50]
+                        if (case == 0 or case == 1):
+                            dics = [{
+                                "trans_x": False,
+                                "trans_y": False,
+                            }]
+                        elif (case == 2):
+                            dics = [{
+                                "trans_x": trans_x,
+                                "trans_y": trans_y,
+                            }]
+                        ops_config = [{
+                            "op_type": "matmul_v2",
+                            "op_inputs": {
+                                "X": ["input1_data"],
+                                "Y": ["input2_data"]
+                            },
+                            "op_outputs": {
+                                "Out": ["output_data"]
+                            },
+                            "op_attrs": dics[0]
+                        }]
+                        ops = self.generate_op_config(ops_config)
+
+                        program_config = ProgramConfig(
+                            ops=ops,
+                            weights={},
+                            inputs={
+                                "input1_data":
+                                TensorConfig(data_gen=partial(
+                                    generate_input, input1_shape)),
+                                "input2_data":
+                                TensorConfig(data_gen=partial(
+                                    generate_input, input2_shape))
+                            },
+                            outputs=["output_data"])
+
+                        yield program_config
+
+    def sample_predictor_configs(
+            self, program_config) -> (paddle_infer.Config, List[int], float):
+
+        def generate_dynamic_shape():
+            if (self.case == 0):
+                self.dynamic_shape.min_input_shape = {
+                    "input1_data": [20, 50],
+                    "input2_data": [50]
+                }
+                self.dynamic_shape.max_input_shape = {
+                    "input1_data": [30, 50],
+                    "input2_data": [50]
+                }
+                self.dynamic_shape.opt_input_shape = {
+                    "input1_data": [25, 50],
+                    "input2_data": [50]
+                }
+            elif (self.case == 1):
+                self.dynamic_shape.min_input_shape = {
+                    "input2_data": [50, 20],
+                    "input1_data": [50]
+                }
+                self.dynamic_shape.max_input_shape = {
+                    "input2_data": [50, 30],
+                    "input1_data": [50]
+                }
+                self.dynamic_shape.opt_input_shape = {
+                    "input2_data": [50, 25],
+                    "input1_data": [50]
+                }
+            elif (self.case == 2):
+                self.dynamic_shape.min_input_shape = {
+                    "input2_data": [30],
+                    "input1_data": [50]
+                }
+                self.dynamic_shape.max_input_shape = {
+                    "input2_data": [50],
+                    "input1_data": [50]
+                }
+                self.dynamic_shape.opt_input_shape = {
+                    "input2_data": [50],
+                    "input1_data": [50]
+                }
+
+        generate_dynamic_shape()
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), (1, 3), 1e-5
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), (1, 3), 1e-5
+
+    def add_skip_trt_case(self):
+        pass
+
+    def test(self):
+        self.add_skip_trt_case()
+        self.run_test()
 if __name__ == "__main__":
     unittest.main()
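The dynamic-shape dictionaries set by TrtConvertMatmulTest_dynamic3 correspond, in a hand-written deployment, to roughly the following configuration (a sketch assuming the case-0 shapes; the autoscan harness performs the equivalent wiring internally, and the model paths are placeholders):

```python
import paddle.inference as paddle_infer

config = paddle_infer.Config("model.pdmodel", "model.pdiparams")
config.enable_use_gpu(256, 0)
config.enable_tensorrt_engine(
    precision_mode=paddle_infer.PrecisionType.Float32)
# min / max / opt profiles mirroring case 0 (mat * vec) above.
config.set_trt_dynamic_shape_info(
    {"input1_data": [20, 50], "input2_data": [50]},
    {"input1_data": [30, 50], "input2_data": [50]},
    {"input1_data": [25, 50], "input2_data": [50]})
predictor = paddle_infer.create_predictor(config)
```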