Unverified commit 7987a905, authored by zhoutianzi666 and committed by GitHub

[Paddle-TRT] support new quant format from slim (#46022)

Parent 6e4cba14
@@ -111,9 +111,6 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const {
}
*/
std::unordered_set<const Node*> nodes2rm = {};
int bit_length =
PADDLE_GET_CONST(int, quantize_linear_op->Op()->GetAttr("bit_length"));
int range = ((1 << (bit_length - 1)) - 1);
// Get input scale from tensor
const LoDTensor& input_scale_tensor =
@@ -124,7 +121,7 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const {
platform::errors::InvalidArgument(
"Input scale tensor's place should be CPU."));
const float* input_scale_data = input_scale_tensor.data<float>();
float input_scale = input_scale_data[0] / range;
float input_scale = input_scale_data[0];
int nums_any_ops = dequantize_linear_op_out->outputs.size();
for (int i = 0; i < nums_any_ops; ++i) {
@@ -138,8 +135,9 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const {
IR_NODE_LINK_TO(quantize_linear_op_x,
dequantize_linear_op_out->outputs[i]);
}
nodes2rm.insert(quantize_linear_op_scale);
// Forbid removing weight tensor when weight is shared between ops
if (quantize_linear_op_scale->outputs.size() <= 1UL)
nodes2rm.insert(quantize_linear_op_scale);
nodes2rm.insert(quantize_linear_op);
nodes2rm.insert(quantize_linear_op_out);
nodes2rm.insert(dequantize_linear_op);
......
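For context on the scale handling above: under the previous quantization format the scale tensor stored `range * scale`, so the pass divided by `(1 << (bit_length - 1)) - 1`; with the new slim format the tensor already holds the float dequantization scale and is used directly. A minimal numpy sketch of the two conventions (the tensor values and the int8 `bit_length` of 8 are assumptions for illustration, not values from this commit):

```python
import numpy as np

bit_length = 8                        # assumed int8 quantization
range_ = (1 << (bit_length - 1)) - 1  # 127

# Old format: the stored value is range * scale, so the pass divided it out.
old_scale_tensor = np.array([12.7], dtype=np.float32)
input_scale_old = old_scale_tensor[0] / range_   # -> 0.1

# New slim format: the stored value is the dequantization scale itself.
new_scale_tensor = np.array([0.1], dtype=np.float32)
input_scale_new = new_scale_tensor[0]            # -> 0.1

assert np.isclose(input_scale_old, input_scale_new)
```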
@@ -84,8 +84,7 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
const std::vector<std::string> kTRTSubgraphPasses({
"identity_scale_op_clean_pass", //
"adaptive_pool2d_convert_global_pass", //
"adaptive_pool2d_convert_global_pass", //
"shuffle_channel_detect_pass", //
"quant_conv2d_dequant_fuse_pass", //
"delete_fill_constant_op_pass", //
@@ -93,6 +92,7 @@ const std::vector<std::string> kTRTSubgraphPasses({
"delete_quant_dequant_filter_op_pass", //
"delete_weight_dequant_linear_op_pass", //
"delete_quant_dequant_linear_op_pass", //
"identity_scale_op_clean_pass", //
"add_support_int8_pass", //
// "fc_fuse_pass", //
"simplify_with_basic_ops_pass", //
......
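The pass-builder change above moves `identity_scale_op_clean_pass` from the head of `kTRTSubgraphPasses` to just after `delete_quant_dequant_linear_op_pass`. The sketch below only restates the resulting order of the passes touched by this commit as a Python list for quick reference (unrelated passes elided):

```python
# Relevant slice of kTRTSubgraphPasses after this commit (other passes elided).
trt_subgraph_passes = [
    "adaptive_pool2d_convert_global_pass",
    "shuffle_channel_detect_pass",
    "quant_conv2d_dequant_fuse_pass",
    "delete_fill_constant_op_pass",
    # ...
    "delete_quant_dequant_filter_op_pass",
    "delete_weight_dequant_linear_op_pass",
    "delete_quant_dequant_linear_op_pass",
    "identity_scale_op_clean_pass",  # moved here from the head of the list
    "add_support_int8_pass",
    "simplify_with_basic_ops_pass",
]
```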
@@ -37,9 +37,9 @@ class MatMulV2OpConverter : public OpConverter {
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope,
bool test_mode) override {
VLOG(3) << "convert a fluid matmul_v2 op to tensorrt matmul layer ";
VLOG(3) << "convert a matmul_v2 op to tensorrt IMatrixMultiplyLayer layer ";
framework::OpDesc op_desc(op, nullptr);
nvinfer1::ILayer* layer = nullptr;
nvinfer1::IMatrixMultiplyLayer* layer = nullptr;
// Declare inputs
auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
@@ -61,8 +61,9 @@ class MatMulV2OpConverter : public OpConverter {
: nvinfer1::MatrixOperation::kNONE;
int one_num = 0;
bool all_matrix = dims_x.nbDims >= 2 && dims_y.nbDims >= 2;
nvinfer1::ITensor* new_shape_tensor = nullptr;
if (dims_x.nbDims < dims_y.nbDims) {
if (dims_x.nbDims < dims_y.nbDims && all_matrix) {
one_num = dims_y.nbDims - dims_x.nbDims;
new_shape_tensor = Shape(input1);
std::vector<int32_t> one_vec(one_num, 1);
@@ -80,7 +81,7 @@ class MatMulV2OpConverter : public OpConverter {
*input2,
matrix_operation_Y);
} else if (dims_x.nbDims > dims_y.nbDims) {
} else if (dims_x.nbDims > dims_y.nbDims && all_matrix) {
one_num = dims_x.nbDims - dims_y.nbDims;
new_shape_tensor = Shape(input2);
std::vector<int32_t> one_vec(one_num, 1);
@@ -105,9 +106,26 @@ class MatMulV2OpConverter : public OpConverter {
*input2,
matrix_operation_Y);
}
VLOG(3) << "Convert a fluid matmul_v2_op_float to TensorRT ";
if (dims_x.nbDims == 1)
layer->setOperation(0, nvinfer1::MatrixOperation::kVECTOR);
if (dims_y.nbDims == 1)
layer->setOperation(1, nvinfer1::MatrixOperation::kVECTOR);
nvinfer1::ILayer* final_layer = static_cast<nvinfer1::ILayer*>(layer);
// When doing vec * vec, TRT produces a scalar, so to be consistent with Paddle
// we need to add a reshape.
if (dims_x.nbDims == 1 && dims_y.nbDims == 1) {
auto reshape_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *layer->getOutput(0));
nvinfer1::Dims reshape_dim;
reshape_dim.nbDims = 1;
reshape_dim.d[0] = 1;
reshape_layer->setReshapeDimensions(reshape_dim);
final_layer = static_cast<nvinfer1::ILayer*>(reshape_layer);
}
VLOG(3) << "Convert a matmul_v2_op to TensorRT ";
RreplenishLayerAndOutput(layer, "matmul_v2_op", {output_name}, test_mode);
RreplenishLayerAndOutput(
final_layer, "matmul_v2_op", {output_name}, test_mode);
}
};
......
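On the converter change above: when an operand is 1-D, the layer's operation is switched to `nvinfer1::MatrixOperation::kVECTOR`, and for vec * vec TensorRT emits a scalar while Paddle's matmul_v2 yields a tensor of shape [1], hence the trailing Shuffle reshape. A small numpy sketch of the shape gap that reshape closes (numpy stands in here only to mimic the shapes; it is not the converter's code path):

```python
import numpy as np

x = np.random.random(50).astype(np.float32)  # 1-D operand, treated as a vector
y = np.random.random(50).astype(np.float32)  # 1-D operand, treated as a vector

out = np.matmul(x, y)                   # vec * vec collapses to a scalar, shape ()
out_like_paddle = np.reshape(out, [1])  # reshape to shape (1,), matching matmul_v2

print(np.shape(out), out_like_paddle.shape)  # () (1,)
```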
@@ -46,6 +46,7 @@ struct SimpleOpTypeSetTeller : public Teller {
#if IS_TRT_VERSION_GE(7000)
teller_set.insert("tile");
teller_set.insert("flatten_contiguous_range");
int8_teller_set.insert("flatten_contiguous_range");
teller_set.insert("rnn");
int8_teller_set.insert("rnn");
teller_set.insert("fill_constant_batch_size_like");
......
@@ -193,5 +193,127 @@ class TrtConvertMatmulTest_dynamic2(TrtLayerAutoScanTest):
        self.run_test()


class TrtConvertMatmulTest_dynamic3(TrtLayerAutoScanTest):

    def sample_program_configs(self):

        def generate_input(shape):
            return np.random.random(shape).astype(np.float32)

        # case0: mat * vec
        # case1: vec * mat
        # case2: vec * vec
        for case in [0, 1, 2]:
            for batch in range(20, 23):
                for trans_x in [False, True]:
                    for trans_y in [False, True]:
                        self.case = case
                        input1_shape = []
                        input2_shape = []
                        if case == 0:
                            input1_shape = [batch, 50]
                            input2_shape = [50]
                        elif case == 1:
                            input1_shape = [50]
                            input2_shape = [50, batch]
                        elif case == 2:
                            input1_shape = [50]
                            input2_shape = [50]
                        if (case == 0 or case == 1):
                            dics = [{
                                "trans_x": False,
                                "trans_y": False,
                            }]
                        elif (case == 2):
                            dics = [{
                                "trans_x": trans_x,
                                "trans_y": trans_y,
                            }]
                        ops_config = [{
                            "op_type": "matmul_v2",
                            "op_inputs": {
                                "X": ["input1_data"],
                                "Y": ["input2_data"]
                            },
                            "op_outputs": {
                                "Out": ["output_data"]
                            },
                            "op_attrs": dics[0]
                        }]
                        ops = self.generate_op_config(ops_config)

                        program_config = ProgramConfig(
                            ops=ops,
                            weights={},
                            inputs={
                                "input1_data":
                                TensorConfig(data_gen=partial(
                                    generate_input, input1_shape)),
                                "input2_data":
                                TensorConfig(data_gen=partial(
                                    generate_input, input2_shape))
                            },
                            outputs=["output_data"])

                        yield program_config

    def sample_predictor_configs(
            self, program_config) -> (paddle_infer.Config, List[int], float):

        def generate_dynamic_shape():
            if (self.case == 0):
                self.dynamic_shape.min_input_shape = {
                    "input1_data": [20, 50],
                    "input2_data": [50]
                }
                self.dynamic_shape.max_input_shape = {
                    "input1_data": [30, 50],
                    "input2_data": [50]
                }
                self.dynamic_shape.opt_input_shape = {
                    "input1_data": [25, 50],
                    "input2_data": [50]
                }
            elif (self.case == 1):
                self.dynamic_shape.min_input_shape = {
                    "input2_data": [50, 20],
                    "input1_data": [50]
                }
                self.dynamic_shape.max_input_shape = {
                    "input2_data": [50, 30],
                    "input1_data": [50]
                }
                self.dynamic_shape.opt_input_shape = {
                    "input2_data": [50, 25],
                    "input1_data": [50]
                }
            elif (self.case == 2):
                self.dynamic_shape.min_input_shape = {
                    "input2_data": [30],
                    "input1_data": [50]
                }
                self.dynamic_shape.max_input_shape = {
                    "input2_data": [50],
                    "input1_data": [50]
                }
                self.dynamic_shape.opt_input_shape = {
                    "input2_data": [50],
                    "input1_data": [50]
                }

        generate_dynamic_shape()
        self.trt_param.precision = paddle_infer.PrecisionType.Float32
        yield self.create_inference_config(), (1, 3), 1e-5
        self.trt_param.precision = paddle_infer.PrecisionType.Half
        yield self.create_inference_config(), (1, 3), 1e-5

    def add_skip_trt_case(self):
        pass

    def test(self):
        self.add_skip_trt_case()
        self.run_test()


if __name__ == "__main__":
    unittest.main()