From 5c0bfc1806a785921337d7492de4b81525033e81 Mon Sep 17 00:00:00 2001
From: zhoutianzi666 <39978853+zhoutianzi666@users.noreply.github.com>
Date: Tue, 18 Oct 2022 20:50:50 +0800
Subject: [PATCH] [Paddle-TRT] Rewrite strided_slice converter using shape
 tensor (#46819)

* Rewrite strided_slice converter using shape tensor

* clean code
---
 .../tensorrt/convert/strided_slice_op.cc      | 249 ++++++++++--------
 .../operators/tensorrt/tensorrt_engine_op.h   |  12 +
 .../test_trt_convert_strided_slice.py         |  85 +++++-
 3 files changed, 230 insertions(+), 116 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc b/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc
index 2302d96e235..deecb913891 100644
--- a/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc
@@ -14,33 +14,23 @@ limitations under the License. */

 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"

-namespace paddle {
-namespace framework {
-class Scope;
-namespace proto {
-class OpDesc;
-}  // namespace proto
-}  // namespace framework
-}  // namespace paddle
-
 namespace paddle {
 namespace inference {
 namespace tensorrt {

-/*
- * Stack converter from fluid to tensorRT.
- */
 class StridedSliceOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope,
                   bool test_mode) override {
-    VLOG(4) << "convert fluid StridedSlice op to tensorrt Slice layer";
-
+    VLOG(4) << "convert strided_slice op to tensorrt layer";
     framework::OpDesc op_desc(op, nullptr);
+
     // Declare inputs
     auto* input = engine_->GetITensor(op_desc.Input("Input")[0]);
-    nvinfer1::Dims input_dims = input->getDimensions();
     auto output_name = op_desc.Output("Out")[0];
+
+    // phi only allow axes[i] >= 0 && axes[i] < rank
     std::vector<int> axes =
         PADDLE_GET_CONST(std::vector<int>, op_desc.GetAttr("axes"));
     std::vector<int> starts =
@@ -49,119 +39,148 @@ class StridedSliceOpConverter : public OpConverter {
         PADDLE_GET_CONST(std::vector<int>, op_desc.GetAttr("ends"));
     std::vector<int> strides =
         PADDLE_GET_CONST(std::vector<int>, op_desc.GetAttr("strides"));
-    int axes_size = axes.size();
-    nvinfer1::Dims start;
-    nvinfer1::Dims stride;
-    nvinfer1::Dims size;
-    start.nbDims = input_dims.nbDims;
-    stride.nbDims = input_dims.nbDims;
-    size.nbDims = input_dims.nbDims;
-    for (int i = 0; i < input_dims.nbDims; i++) {
-      start.d[i] = 0;
-      stride.d[i] = 1;
-      size.d[i] = input_dims.d[i];
-    }
+    std::vector<int> decrease_axises =
+        PADDLE_GET_CONST(std::vector<int>, op_desc.GetAttr("decrease_axis"));
+    auto input_dims = input->getDimensions();

     if (!engine_->with_dynamic_shape()) {
-      for (int i = 0; i < axes_size; i++) {
-        start.d[axes[i] - 1] = starts[i];
+      // notice that input shape is [CHW] without batch axis when input has
+      // static shape
+      for (size_t i = input_dims.nbDims; i > 0; i--) {
+        input_dims.d[i] = input_dims.d[i - 1];
       }
-      for (int i = 0; i < axes_size; i++) {
-        stride.d[axes[i] - 1] = strides[i];
-      }
-      for (int i = 0; i < axes_size; ++i) {
-        int dim = size.d[axes[i] - 1];
-        if (dim > 0) {
-          int start = starts[i] < 0 ? (starts[i] + dim) : starts[i];
-          int end = ends[i] < 0 ? (ends[i] + dim) : ends[i];
-          int stride = std::abs(strides[i]);
-          start = std::max(start, 0);
-          end = std::max(end, 0);
-          end = std::min(end, dim);
-          size.d[axes[i] - 1] = (std::abs(end - start) + stride - 1) / stride;
+      input_dims.d[0] = 1;  // fake batchsize, not useful here
+      for (size_t i = 0; i < axes.size(); i++) {
+        if (starts[i] < 0) {
+          starts[i] = std::max(starts[i] + input_dims.d[axes[i]], 0);
+        }
+        if (ends[i] < 0) {
+          ends[i] = std::max(ends[i] + input_dims.d[axes[i]], 0);
         }
+        ends[i] = std::min(ends[i], input_dims.d[axes[i]]);
+        PADDLE_ENFORCE_GT(
+            ends[i],
+            starts[i],
+            platform::errors::InvalidArgument(
+                "Attr(ends) should be greater than attr(starts) in "
+                "slice op. But received ends = %d, starts = %d.",
+                ends[i],
+                starts[i]));
       }
-      auto* layer =
-          TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, start, size, stride);
-      RreplenishLayerAndOutput(
-          layer, "strided_slice", {output_name}, test_mode);
-    } else {
-      for (int i = 0; i < axes_size; i++) {
-        start.d[axes[i]] = starts[i];
+    }
+
+    nvinfer1::ILayer* layer = nullptr;
+    if (engine_->with_dynamic_shape()) {
+      auto nchw_input_dims = input->getDimensions();
+      nvinfer1::Dims trt_start_dims;
+      trt_start_dims.nbDims = nchw_input_dims.nbDims;
+      memset(trt_start_dims.d, 0, sizeof(int32_t) * nchw_input_dims.nbDims);
+      nvinfer1::Dims trt_size_dims = trt_start_dims;
+      nvinfer1::Dims trt_end_dims = trt_start_dims;
+      nvinfer1::Dims trt_step_dims = trt_start_dims;
+      for (int i = 0; i < trt_step_dims.nbDims; i++) trt_step_dims.d[i] = 1;
+      // input : [N,C,H,W]
+      bool has_neg_indices = false;
+      for (size_t i = 0; i < axes.size(); i++) {
+        int trt_axis = axes[i];
+        trt_start_dims.d[trt_axis] = starts[i];
+        trt_end_dims.d[trt_axis] = ends[i];
+        trt_step_dims.d[axes[i]] = strides[i];
+        if (starts[i] < 0 || ends[i] < 0) has_neg_indices = true;
       }
-      for (int i = 0; i < axes_size; i++) {
-        stride.d[axes[i]] = strides[i];
+      auto* shape_tensor = Shape(input);
+      auto* start_tensor = Add1DConstantLayer(trt_start_dims);
+      if (has_neg_indices) {
+        start_tensor = FixNegIndices(shape_tensor, start_tensor);
       }
-      for (int i = 0; i < axes_size; ++i) {
-        int dim = size.d[axes[i]];
-        if (dim > 0) {
-          int start = starts[i] < 0 ? (starts[i] + dim) : starts[i];
-          int end = ends[i] < 0 ? (ends[i] + dim) : ends[i];
-          int stride = std::abs(strides[i]);
-          start = std::max(start, 0);
-          end = std::max(end, 0);
-          end = std::min(end, dim);
-          size.d[axes[i]] = (std::abs(end - start) + stride - 1) / stride;
-        }
+
+      std::vector<nvinfer1::ITensor*> end_vec_tensor;
+      for (int i = 0; i < trt_end_dims.nbDims; i++) {
+        end_vec_tensor.push_back(GetEleTensorOfShape(shape_tensor, i));
       }
-      auto create_weights = [&](const std::vector<int>& data,
-                                const std::string& type) -> int* {
-        std::unique_ptr<phi::DenseTensor> tmp_tensor(new phi::DenseTensor());
-        int data_size = data.size();
-        tmp_tensor->Resize({data_size});
-        auto* tmp_data = tmp_tensor->mutable_data<int>(platform::CPUPlace());
-        for (int i = 0; i < data_size; i++) {
-          tmp_data[i] = data[i];
+      for (size_t i = 0; i < axes.size(); i++) {
+        int trt_axis = axes[i];
+        if (ends[i] >= 0) {
+          end_vec_tensor[trt_axis] = Add1DConstantLayer(ends[i]);
+        } else {
+          end_vec_tensor[trt_axis] =
+              Sum(end_vec_tensor[trt_axis], Add1DConstantLayer(ends[i]));
         }
-
-        engine_->SetWeights(output_name + "_add_slice_op_" + type,
-                            std::move(tmp_tensor));
-        return tmp_data;
-      };
-
-      std::vector<int> const_weight(input_dims.nbDims, 0);
-      for (int i = 0; i < axes_size; i++) {
-        int dim = input_dims.d[axes[i]];
-        int start = starts[i] < 0 ? (starts[i] + dim) : starts[i];
-        int end = ends[i] < 0 ? (ends[i] + dim) : ends[i];
-        int stride = std::abs(strides[i]);
-        start = std::max(start, 0);
-        end = std::max(end, 0);
-        end = std::min(end, dim);
-        const_weight[axes[i]] =
-            dim - ((std::abs(end - start) + stride - 1) / stride);
       }
-      int* weight_data = create_weights(const_weight, "size");
-
-      TensorRTEngine::Weight weight{nvinfer1::DataType::kINT32,
-                                    static_cast<void*>(weight_data),
-                                    static_cast<size_t>(input_dims.nbDims)};
-
-      int input_dim_size = input_dims.nbDims;
-      nvinfer1::Dims input_shape;
-      input_shape.nbDims = 1;
-      input_shape.d[0] = input_dim_size;
-
-      auto const_layer =
-          TRT_ENGINE_ADD_LAYER(engine_, Constant, input_shape, weight.get());
-
-      auto shape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shape, *input);
-      // slice layer
-      auto* layer =
-          TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, start, size, stride);
-      // elementwise layer for get size tensor
-      auto size_layer =
-          TRT_ENGINE_ADD_LAYER(engine_,
-                               ElementWise,
-                               *shape_layer->getOutput(0),
-                               *const_layer->getOutput(0),
-                               nvinfer1::ElementWiseOperation::kSUB);
-      layer->setInput(2, *size_layer->getOutput(0));
-      RreplenishLayerAndOutput(
-          layer, "strided_slice", {output_name}, test_mode);
+      auto* size_tensor =
+          Sub(start_tensor, Min(Concat(end_vec_tensor), shape_tensor));
+      auto zero_t =
+          Add1DConstantLayer(std::vector<int>(nchw_input_dims.nbDims, 0));
+      auto step_tensor = Add1DConstantLayer(trt_step_dims);
+      size_tensor = Sub(zero_t, FloorDiv(size_tensor, step_tensor));
+
+      layer = TRT_ENGINE_ADD_LAYER(
+          engine_, Slice, *input, trt_start_dims, trt_size_dims, trt_step_dims);
+      layer->setInput(1, *start_tensor);
+      layer->setInput(2, *size_tensor);
+      layer->setInput(3, *step_tensor);
+
+      if (decrease_axises.size() > 0) {
+        std::vector<int32_t> gather_indices;
+        for (int i = 0; i < trt_size_dims.nbDims; i++) {
+          if (decrease_axises.end() !=
+              std::find(decrease_axises.begin(), decrease_axises.end(), i))
+            continue;
+          gather_indices.push_back(i);
+        }
+        if (gather_indices.empty())
+          gather_indices.push_back(decrease_axises[0]);
+        auto real_size_tensor = Gather(size_tensor, gather_indices);
+        layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *layer->getOutput(0));
+        layer->setInput(1, *real_size_tensor);
+      }
+    } else {
+      auto chw_input_dims = input->getDimensions();
+      nvinfer1::Dims trt_start_dims;
+      trt_start_dims.nbDims = chw_input_dims.nbDims;
+      memset(trt_start_dims.d, 0, sizeof(int32_t) * chw_input_dims.nbDims);
+      nvinfer1::Dims trt_size_dims = chw_input_dims;
+      nvinfer1::Dims trt_step_dims;
+      trt_step_dims.nbDims = chw_input_dims.nbDims;
+      for (int i = 0; i < trt_step_dims.nbDims; i++) trt_step_dims.d[i] = 1;
+
+      // input : [C,H,W]
+      for (size_t i = 0; i < axes.size(); i++) {
+        int trt_axis = axes[i] - 1;
+        trt_start_dims.d[trt_axis] = starts[i];
+        trt_size_dims.d[trt_axis] =
+            (ends[i] - starts[i] + strides[i] - 1) / strides[i];
+        trt_step_dims.d[trt_axis] = strides[i];
+      }
+      layer = TRT_ENGINE_ADD_LAYER(
+          engine_, Slice, *input, trt_start_dims, trt_size_dims, trt_step_dims);
+      nvinfer1::Dims real_trt_size_dims;
+      real_trt_size_dims.nbDims = 0;
+
+      if (decrease_axises.size() > 0) {
+        for (size_t i = 0; i < decrease_axises.size(); i++) {
+          decrease_axises[i]--;
+        }
+        for (int i = 0; i < trt_size_dims.nbDims; i++) {
+          if (decrease_axises.end() !=
+              std::find(decrease_axises.begin(), decrease_axises.end(), i))
+            continue;
+          real_trt_size_dims.d[real_trt_size_dims.nbDims] = trt_size_dims.d[i];
+          real_trt_size_dims.nbDims++;
+        }
+        if (real_trt_size_dims.nbDims == 0) {
+          real_trt_size_dims.nbDims = 1;
+          real_trt_size_dims.d[0] = 1;
+        }
+        auto reshape_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *layer->getOutput(0));
+        reshape_layer->setReshapeDimensions(real_trt_size_dims);
+        layer = static_cast<nvinfer1::ILayer*>(reshape_layer);
+      }
+    }
+    RreplenishLayerAndOutput(layer, "strided_slice", {output_name}, test_mode);
   }
 };

diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 178c0fc22a5..eea337d93fb 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -495,6 +495,18 @@ class TensorRTEngineOp : public framework::OperatorBase {
       // convert input and copy to TRT engine's buffer
       auto &t =
           inference::analysis::GetFromScope<phi::DenseTensor>(scope, x);
+      PADDLE_ENFORCE_GT(
+          t.numel(),
+          0,
+          phi::errors::InvalidArgument(
+              "The input tensor named %s of trt-subgraph must "
+              "have >0 elements, but now have %d elements. "
+              "It's likely that this tensor is connected to a Concat op "
+              "inside a trt-subgraph; "
+              "try to use the API to forbid this op from entering the "
+              "trt-subgraph.",
+              x,
+              t.numel()));
+
       // check the input_tensor
       if (!platform::is_gpu_place(t.place())) {
         phi::DenseTensor out;
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py
index aea3f6e55d4..4f517b447a1 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py
@@ -34,7 +34,7 @@ class TrtConvertStridedSliceTest(TrtLayerAutoScanTest):
     def sample_program_configs(self):

         def generate_input1(attrs: List[Dict[str, Any]]):
-            return np.ones([1, 56, 56, 192]).astype(np.float32)
+            return np.random.random([1, 56, 56, 192]).astype(np.float32)

         for axes in [[1, 2]]:
             for starts in [[1, 1]]:
@@ -130,5 +130,88 @@ class TrtConvertStridedSliceTest(TrtLayerAutoScanTest):
         self.run_test()


+class TrtConvertStridedSliceTest2(TrtLayerAutoScanTest):
+
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        return True
+
+    def sample_program_configs(self):
+
+        def generate_input1(attrs: List[Dict[str, Any]]):
+            return np.random.random([1, 56, 56, 192]).astype(np.float32)
+
+        for axes in [[1, 2], [2, 3], [1, 3]]:
+            for starts in [[-10, 1], [-10, 20], [-10, 15], [-10, 16],
+                           [-10, 20]]:
+                for ends in [[-9, 10000], [-9, -1], [-9, 40]]:
+                    for decrease_axis in [[]]:
+                        for infer_flags in [[1, 1]]:
+                            for strides in [[2, 2]]:
+                                dics = [{
+                                    "axes": axes,
+                                    "starts": starts,
+                                    "ends": ends,
+                                    "decrease_axis": [axes[0]],
+                                    "infer_flags": infer_flags,
+                                    "strides": strides
+                                }]
+
+                                ops_config = [{
+                                    "op_type": "strided_slice",
+                                    "op_inputs": {
+                                        "Input": ["input_data"]
+                                    },
+                                    "op_outputs": {
+                                        "Out": ["slice_output_data"]
+                                    },
+                                    "op_attrs": dics[0]
+                                }]
+                                ops = self.generate_op_config(ops_config)
+
+                                program_config = ProgramConfig(
+                                    ops=ops,
+                                    weights={},
+                                    inputs={
+                                        "input_data":
+                                        TensorConfig(data_gen=partial(
+                                            generate_input1, dics))
+                                    },
+                                    outputs=["slice_output_data"])
+
+                                yield program_config
+
+    def sample_predictor_configs(
+            self, program_config) -> (paddle_infer.Config, List[int], float):
+
+        def generate_dynamic_shape():
+            self.dynamic_shape.min_input_shape = {
+                "input_data": [1, 56, 56, 192]
+            }
+            self.dynamic_shape.max_input_shape = {
+                "input_data": [8, 100, 100, 200]
+            }
+            self.dynamic_shape.opt_input_shape = {
+                "input_data": [4, 56, 56, 192]
+            }
+
+        def clear_dynamic_shape():
+            self.dynamic_shape.min_input_shape = {}
+            self.dynamic_shape.max_input_shape = {}
+            self.dynamic_shape.opt_input_shape = {}
+
+        # for static_shape
+        clear_dynamic_shape()
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), (1, 2), 1e-5
+
+        # for dynamic_shape
+        generate_dynamic_shape()
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), (1, 2), 1e-5
+
+    def test(self):
+        self.run_test()
+
+
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab
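
Note on the dynamic-shape branch: the converter never computes the output
size on the host. It builds it from shape tensors as
size = -floor((start - min(end, shape)) / stride), the usual trick for ceil
division using TensorRT's kFLOOR_DIV. Below is a minimal Python sketch of
that arithmetic, illustrative only and not part of the patch; the helper
name slice_size is made up here.

    def slice_size(start, end, stride, dim):
        # Normalize negative indices the way the converter does
        # (FixNegIndices for start, Sum with the shape element for end),
        # then clamp end to the axis extent.
        if start < 0:
            start += dim
        if end < 0:
            end += dim
        end = min(end, dim)
        # ceil((end - start) / stride) computed as -floor((start - end) / stride);
        # Python's // floors toward -inf, matching kFLOOR_DIV.
        return -((start - end) // stride)

    assert slice_size(1, 10000, 2, 56) == 28  # end clamped to the axis extent
    assert slice_size(-10, -1, 2, 56) == 5    # negative start and end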
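The static-shape branch instead computes the size on the host with the
familiar integer ceil-division formula from the patch,
(ends[i] - starts[i] + strides[i] - 1) / strides[i]. A quick sanity check
against NumPy slicing semantics, with illustrative values and assuming
starts/ends have already been normalized as the converter does:

    import numpy as np

    start, end, stride, dim = 1, 28, 2, 56
    size = (end - start + stride - 1) // stride          # formula in the patch
    assert size == len(np.arange(dim)[start:end:stride]) == 14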