Commit 2050f31c authored by minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into transfer_gru_unit

test=develop
@@ -75,8 +75,9 @@ RUN curl -s -q https://glide.sh/get | sh
# and its size is only one-third of the official one.
# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
tar -xz -C /usr/local && \
RUN wget -q https://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz --no-check-certificate && \
tar -zxf TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz -C /usr/local && \
cp -rf /usr/local/TensorRT/include /usr && \
cp -rf /usr/local/TensorRT/lib /usr
......
@@ -31,9 +31,17 @@ IF(APPLE)
return()
ENDIF()
MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
# Introduce variables:
# * CMAKE_INSTALL_LIBDIR
INCLUDE(GNUInstallDirs)
SET(LIBDIR "lib")
if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$")
SET(LIBDIR "lib64")
endif()
MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/l${LIBDIR} to runtime path")
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/${LIBDIR}")
INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
@@ -58,7 +66,7 @@ ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/intel/mkl-dnn.git"
GIT_TAG "830a10059a018cd2634d94195140cf2d8790a75a"
GIT_TAG "863ff6e7042cec7d2e29897fe9f0872e0888b0fc"
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@@ -79,9 +87,9 @@ ExternalProject_Add(
-DMKLROOT:PATH=${MKLML_ROOT}
)
if(WIN32)
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE)
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE)
else(WIN32)
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
endif(WIN32)
ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL)
@@ -101,7 +109,7 @@ ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
# copy the real so.0 lib to install dir
# it can be directly contained in wheel or capi
if(WIN32)
SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/lib/mkldnn.dll)
SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll)
else(WIN32)
SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
......
@@ -5,13 +5,13 @@ Kexin Zhao <zhaokexin01@baidu.com>
## Introduction
Deep learning is usually a two-stage work: training and inference. The training stage estimates model parameters (weights) from data. The inference stage loads the weights and uses them to interpret inputs. Typically, weights are 32-bit float values (float32). Some new devices, including NVIDIA Volta GPUs, support higher speed computation using 16-bit float values (float16).
This article explains our efforts with PaddlePaddle to train using float32 and to inference using float16. We describe a [*transpiler*](https://github.com/PaddlePaddle/Paddle/blob/a4d3de0071e1f3912230c3ab3f9ac74cf06b093a/doc/fluid/design/motivation/fluid_compiler.md), which converts a PaddlePaddle Fluid model, which, to be precise, should be called a [Fluid *program*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), into the inference program, and converts the weights from float32 into float16.
This article explains our efforts with PaddlePaddle to train using float32 and to inference using float16. We describe a [*transpiler*](https://github.com/PaddlePaddle/Paddle/blob/a4d3de0071e1f3912230c3ab3f9ac74cf06b093a/doc/fluid/design/motivation/fluid_compiler.md), which converts a PaddlePaddle Fluid model, which, to be precise, should be called a [Fluid *program*](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/program.md), into the inference program, and converts the weights from float32 into float16.
## What is float16?
float16 (or FP16) is a half-precision floating-point format that uses 16 bits in memory to represent a value. The advantage over 32-bit single-precision floating-point format (commonly known as float or float32 data type) is that it requires half the storage and bandwidth at the expense of precision and range. Fortunately, DNN inference has a high tolerance for the loss of precision and range when using float16 to represent the weights, and the inference accuracy will only be minimally affected in most cases, which gives us the opportunity to use float16 data type to speed up the inference.
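As a quick, stand-alone illustration of the storage and precision trade-off described above (not part of the original design doc), the NumPy sketch below casts a float32 array to float16; the array contents are arbitrary:

```python
import numpy as np

x32 = np.random.rand(1024).astype(np.float32)
x16 = x32.astype(np.float16)   # same values stored in half-precision

print(x32.nbytes, x16.nbytes)  # 4096 vs 2048 bytes: half the storage and bandwidth
# Rounding error stays small (on the order of 1e-4 for values in [0, 1)),
# which is the precision loss DNN inference can usually tolerate.
print(np.max(np.abs(x32 - x16.astype(np.float32))))
```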
Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type.
Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type.
## Why float16?
The trend in today's deep learning community is to use bigger and deeper model, which translates to larger memory footprint, higher computation demands, and as a result higher energy consumption on computing devices. The advantages of float16 over float32 are correspondingly three-fold:
@@ -24,12 +24,12 @@ The trend in today's deep learning community is to use bigger and deeper model,
## Fluid implementation of float16 inference
### Overview
Fluid use [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of computation graph to describe a neural network model and the optimization procedure. Fluid program is a python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block.
Fluid use [Program](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of computation graph to describe a neural network model and the optimization procedure. Fluid program is a python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block.
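To make the Program/block/operator/executor structure concrete, here is a minimal sketch against the paddle.fluid 1.x Python API. It is illustrative only and not taken from this commit; `fluid.layers.data` appears in the API list later in this diff, while `fluid.layers.fc`, `fluid.Executor`, and `fluid.CPUPlace` are assumed to be available in the same release:

```python
import numpy as np
import paddle.fluid as fluid

# Layer calls append variables and operators to the default main program.
image = fluid.layers.data(name='image', shape=[784], dtype='float32')
logits = fluid.layers.fc(input=image, size=10, act='softmax')

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())   # initialize the float32 weights

out, = exe.run(fluid.default_main_program(),
               feed={'image': np.random.rand(8, 784).astype('float32')},
               fetch_list=[logits])
print(out.dtype)  # float32: each operator dispatched its float32 kernel
```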
### Basic requirement
When an executor runs an operator, it uses a kernel to perform computations on tensors contained in the input variables, and then writes the results to the tensors in the output variables. Each operator has multiple kernels for different combinations of data types, devices, and library types, respectively. The operator will select the appropriate kernel to run based on, among other things, the data type of the input tensors. By default, every Fluid operator has a kernel for float data type that takes float inputs and generates float outputs.
If we provide float input to the first operator in a program, then each operator will use float kernel to compute float output and send it as input to the next operator to trigger its float kernel. This chain effect will make the program run in float mode and gives us a final output of float data type.
The same principle applies if we want a program to run in float16 mode. We provide input variable of the float16 data type to the first operator, and every subsequent operator will invoke the float16 kernel until we get the final output in float16. So the preliminary requirements for float16 inference are to add float16 kernels to operators that are needed in a specific kind of neural networks. Our current focus is on Convolutional Neural Networks (CNN) and hence we have added float16 kernels to the following operators: convolution, pooling, GEMM, elementwise addition, batch norm, dropout, various activations including relu and tanh, and softmax.
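The kernel-selection rule and the resulting "chain effect" can be modeled in a few lines of plain Python; this is a conceptual sketch, not Paddle's actual kernel registry:

```python
import numpy as np

# Toy registry: one kernel per (operator type, input dtype) combination.
KERNELS = {
    ('relu', np.float32): lambda x: np.maximum(x, np.float32(0)),
    ('relu', np.float16): lambda x: np.maximum(x, np.float16(0)),
    ('scale', np.float32): lambda x: x * np.float32(2),
    ('scale', np.float16): lambda x: x * np.float16(2),
}

def run_op(op_type, x):
    # The kernel is picked from the input dtype, and the output keeps that
    # dtype, so a float16 input keeps every downstream operator in float16.
    return KERNELS[(op_type, x.dtype.type)](x)

x = np.random.rand(4).astype(np.float16)
y = run_op('scale', run_op('relu', x))
print(y.dtype)  # float16
```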
@@ -75,7 +75,7 @@ In this scenario, we already have a float32 inference program and some associate
We can then run various inference experiments in float16 mode and save the float16 program and weights on disk for future deployment. To enhance the code usability, we maintain a consistent API so that user can use the same float32 input data to run inference program in either float32 and float16 mode and obtain output data both of float32 data type. Consequently, we need to add cast operators in the float16 inference program for conversions between the float16 tensor and float32 tensor.
The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference).
The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference).
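The boundary conversions mentioned above map directly onto the `cast` layer whose signature (`fluid.layers.cast(x, dtype)`) appears in the API file later in this diff. The sketch below is illustrative only and stands in for the casts the float16 transpiler inserts automatically; the layer in the middle is a placeholder:

```python
import paddle.fluid as fluid

# The caller keeps feeding float32 data ...
x32 = fluid.layers.data(name='x', shape=[3, 224, 224], dtype='float32')
# ... a cast at the program entrance switches the chain to float16 ...
x16 = fluid.layers.cast(x=x32, dtype='float16')

y16 = x16  # placeholder for the float16 conv/batch_norm/relu/... operators

# ... and a cast at the exit returns float32 output to the caller.
y32 = fluid.layers.cast(x=y16, dtype='float32')
```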
### Experiment results
Simply running the following commands to reproduce the experiment results presented in this section:
@@ -113,7 +113,7 @@ We repeat the test ten times and get the following results:
| #10 | 62.53% | 62.48% |
| average| 62.63% | 62.62% |
We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over ten tests.
#### Performance benchmark
Currently, Fluid only supports float16 inference on NVIDIA GPUs. There is no motivation to support float16 inference on non-ARM CPUs where float16 is not natively supported, and float16 calculation will only be slower than its float32 counterpart.
@@ -132,7 +132,7 @@ Average inference time for one mini-batch on Vgg16 model tested on ImageNet data
|float16| 3.32 | 4.11 | 5.88 | 9.41 | 16.54 | 30.47 | 60.23 |
|Speedup| 4.22 | 2.36 | 3.91 | 3.00 | 3.26 | 2.77 | 2.97 |
We can see that float16 inference provides **2x ~ 4x** speedup on different batch sizes.
Convolution operation is usually the computational bottleneck of CNN, so we also check the average time spent on the Fluid convolution operators for one mini-batch as follows:
@@ -162,7 +162,7 @@ We find that the speedup provided by float16 inference starts relatively small a
We also did the same benchmark on a single NVIDIA GeForce GTX 1080 Ti GPU that does not support Tensor Core. The results show that for Vgg16, float16 inference provides consistent small speedup (around 1.15x) for all mini-batch sizes, while for Resnet50, float16 inference is slower than its float32 counterpart in small batch sizes (mb = 1 and 2) and then delivers around 1.15x speedup for all larger batch sizes. By comparing the benchmarks on 1080 Ti and V100, we find that Tensor Core, which is specialized for float16 computations, is a critical component of high performance float16 inference.
Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/contrib/float16/float16_benchmark.md) for complete benchmark results.
Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/float16/float16_benchmark.md) for complete benchmark results.
### Summary
1. Fluid is now able to run inference in float16 mode via a float16 transpiler. We currently support CNN programs, including Vgg and Resnet, to run in float16 inference mode.
......
@@ -144,7 +144,7 @@ paddle.fluid.layers.label_smooth (ArgSpec(args=['label', 'prior_dist', 'epsilon'
paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', 'c317aa595deb31649083c8faa91cdb97'))
paddle.fluid.layers.roi_align (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)), ('document', '12c5bbb8b38c42e623fbc47611d766e1'))
paddle.fluid.layers.dice_loss (ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)), ('document', '1ba0508d573f65feecf3564dce22aa1d'))
paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)), ('document', 'b3ecb819454832885c1f0f3ab9a5b938'))
paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)), ('document', '7a1966d7c3a48f1fc0881cdaf5d83b0b'))
paddle.fluid.layers.image_resize_short (ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)), ('document', '06211aefc50c5a3e940d7204d859cdf7'))
paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)), ('document', 'e4fb4ed511b2293b8f04f7e872afbfd7'))
paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)), ('document', '735fa9758a6d7ff3b47d7b827f961c1d'))
@@ -221,6 +221,7 @@ paddle.fluid.layers.psroi_pool (ArgSpec(args=['input', 'rois', 'output_channels'
paddle.fluid.layers.teacher_student_sigmoid_loss (ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)), ('document', '2f6ff96864054a31aa4bb659c6722c99'))
paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None), ('document', '431a4301c35032166ec029f7432c80a7'))
paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607'))
paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329'))
paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139'))
paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc'))
paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', 'b0a1c2fc51c27a106da28f3308c41f5e'))
@@ -238,7 +239,7 @@ paddle.fluid.layers.load (ArgSpec(args=['out', 'file_path', 'load_as_fp16'], var
paddle.fluid.layers.create_tensor (ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False)), ('document', 'c0c3d0194f83fff8ea99ce0820657dae'))
paddle.fluid.layers.create_parameter (ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', 'd62b866c899bc1fedb5385f95b88e1f8'))
paddle.fluid.layers.create_global_var (ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)), ('document', 'ab914fac893607e29ac6e52bbdbea1a4'))
paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '60cb8f843d625abf33f8bf12455b8f99'))
paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '992eb42590fc1c380841a6db72ce78b3'))
paddle.fluid.layers.tensor_array_to_tensor (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'b12717d3d4567e6119589f7f655b0cbb'))
paddle.fluid.layers.concat (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b19b79be4f05e85d1d6cec642c9fb535'))
paddle.fluid.layers.sums (ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', '42912092418620b4be07f36af31e7816'))
@@ -329,6 +330,7 @@ paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varar
paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691'))
paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e'))
paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0'))
paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d'))
paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '005a5ae47d6c8fff721931d69d072b9f'))
paddle.fluid.layers.accuracy (ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)), ('document', '9808534c12c5e739a10f73ebb0b4eafd'))
paddle.fluid.layers.auc (ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)), ('document', 'e0e95334fce92d16c2d9db6e7caffc47'))
......
@@ -13,6 +13,8 @@
// limitations under the License.
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
#include <memory>
#include <utility>
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle { namespace paddle {
...@@ -29,6 +31,11 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) { ...@@ -29,6 +31,11 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) {
auto &g = graphs.back(); auto &g = graphs.back();
g->Set(kGraphVars, new GraphVars(1UL)); g->Set(kGraphVars, new GraphVars(1UL));
g->Set(kGraphDepVars, new GraphDepVars); g->Set(kGraphDepVars, new GraphDepVars);
auto &stale_ops =
graph->Get<const std::vector<OpDesc *>>(details::kStaleProgramOpDescs);
g->Erase(details::kStaleProgramOpDescs);
g->Set<const std::vector<OpDesc *>>(details::kStaleProgramOpDescs,
new std::vector<OpDesc *>(stale_ops));
}
auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
......
@@ -14,6 +14,7 @@
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
...@@ -24,6 +25,10 @@ namespace ir { ...@@ -24,6 +25,10 @@ namespace ir {
static const char kParamScopeAttr[] = "__param_scope__"; static const char kParamScopeAttr[] = "__param_scope__";
static const char kFuseStatisAttr[] = "__fuse_statis__"; static const char kFuseStatisAttr[] = "__fuse_statis__";
// When we use trt or other third_party lib, the parameters are managed by
// the lib, but not the fluid. So we need to record them to avoid duplicate
// allocation.
static const char kRepetitiveParamAttr[] = "__repetitive_param__";
enum FuseOptions {
DO_NOT_FUSE,  // fusing will not be done
......
@@ -130,15 +130,21 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
if (adj_list.find(n) == adj_list.end()) {
adj_list[n] = std::unordered_set<ir::Node *>();
}
std::vector<ir::Node *> nodes;
for (auto &var : n->inputs) {
for (auto &adj_n : var->inputs) {
PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
<< " -> " << n->Name() << reinterpret_cast<void *>(n)
<< " via " << var->Name() << reinterpret_cast<void *>(var);
adj_list[n].insert(adj_n);
nodes.push_back(adj_n);
}
}
std::sort(nodes.begin(), nodes.end(), [](ir::Node *node1, ir::Node *node2) {
return node1->id() > node2->id();
});
adj_list[n].insert(std::make_move_iterator(nodes.begin()),
std::make_move_iterator(nodes.end()));
}
return adj_list;
}
......
@@ -23,8 +23,12 @@
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
@@ -133,6 +137,8 @@ struct Argument {
DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode,
AnalysisConfig::Precision);
DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
bool);
// Memory optimized related.
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
......
@@ -17,10 +17,12 @@ limitations under the License. */
#include <sys/stat.h>
#include <cstdio>
#include <fstream>
#include <memory>
#include <set>
#include <string>
#include <typeindex>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/framework.pb.h"
@@ -217,6 +219,35 @@ static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir,
return "";
}
static std::string GetTrtEngineSerializedPath(const std::string &model_root,
const std::string &engine_key) {
return model_root + "/trt_serialized_" + engine_key;
}
static std::string GetTrtEngineSerializedData(
const std::string &model_opt_cache_dir, const std::string &engine_key) {
std::string trt_serialized_path =
GetTrtEngineSerializedPath(model_opt_cache_dir, engine_key);
if (FileExists(trt_serialized_path)) {
VLOG(3) << "Trt serialized file: " << trt_serialized_path
<< "is found here";
std::ifstream infile(trt_serialized_path, std::ios::in);
std::stringstream buffer;
buffer << infile.rdbuf();
std::string trt_engine_serialized_data(buffer.str());
return trt_engine_serialized_data;
}
return "";
}
static void SaveTrtEngineSerializedDataToFile(
const std::string &trt_serialized_path,
const std::string &engine_serialized_data) {
std::ofstream outfile(trt_serialized_path);
outfile << engine_serialized_data;
outfile.close();
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......
@@ -81,6 +81,9 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set(
"model_opt_cache_dir",
new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
pass->Set("use_static_engine",
new bool(argument->tensorrt_use_static_engine()));
}
pre_pass = pass_name;
......
@@ -22,7 +22,10 @@
#pragma once
#include <memory>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
......
@@ -14,13 +14,13 @@
#include <algorithm>
#include <set>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h" #include "paddle/fluid/inference/tensorrt/op_teller.h"
#include "paddle/fluid/string/pretty_log.h" #include "paddle/fluid/string/pretty_log.h"
...@@ -33,8 +33,15 @@ using framework::ir::Node; ...@@ -33,8 +33,15 @@ using framework::ir::Node;
std::vector<std::string> ExtractParameters( std::vector<std::string> ExtractParameters(
const std::unordered_set<Node *> &nodes); const std::unordered_set<Node *> &nodes);
std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl( void RenameAndGetOutputs(
const std::vector<framework::ir::Node *> &subgraph_nodes,
framework::BlockDesc *block_desc,
const std::set<std::string> &input_names_with_id,
std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map);
std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
std::unique_ptr<framework::ir::Graph> graph) const {
framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
@@ -47,9 +54,16 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
Get<int>("min_subgraph_size") /*min subgraph size*/);
fuser();
std::vector<std::string> graph_param_names =
ExtractParameters(graph->Nodes());
// those parameter already exist in trt, and should not have another copy in
// fluid.
std::vector<std::string> repetitive_params;
for (auto *node : graph->Nodes()) {
if (node->IsOp() && !Agent(node).subgraph()->empty()) {
CreateTensorRTOp(node, graph.get());
CreateTensorRTOp(node, graph.get(), graph_param_names,
&repetitive_params);
std::unordered_set<const Node *> nodes2remove(
Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
@@ -64,12 +78,15 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
}
}
framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
graph->Set(framework::ir::kRepetitiveParamAttr,
new std::vector<std::string>(repetitive_params));
return graph;
}
std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
const std::set<std::string> &engine_outputs) {
const std::set<std::string> &engine_outputs,
const std::string &predictor_id) {
std::string engine_hash_key = ""; std::string engine_hash_key = "";
for (auto name : engine_inputs) { for (auto name : engine_inputs) {
engine_hash_key += name; engine_hash_key += name;
...@@ -77,12 +94,15 @@ std::string GenerateEngineKey(const std::set<std::string> &engine_inputs, ...@@ -77,12 +94,15 @@ std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
for (auto name : engine_outputs) { for (auto name : engine_outputs) {
engine_hash_key += name; engine_hash_key += name;
} }
engine_hash_key += predictor_id;
auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
return engine_key;
}
void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
Graph *graph) const {
void TensorRtSubgraphPass::CreateTensorRTOp(
framework::ir::Node *node, Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const {
auto *op_desc = node->Op();
auto &subgraph = *Agent(node).subgraph();
PADDLE_ENFORCE(!subgraph.empty());
@@ -116,12 +136,16 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
// is unique.
std::set<std::string> input_names;
std::set<std::string> input_names_with_id;
std::vector<std::string> params;
// The node->inputs containes input tensors and parameters.
for (auto *x : node->inputs) {
input_names.insert(x->Name());
input_names_with_id.insert(x->Name() + std::to_string(x->id()));
if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
params.push_back(x->Name());
}
}
op_desc->SetInput(
"Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
std::set<std::string> output_names;
std::set<std::string> output_names_with_id;
@@ -130,11 +154,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
output_names_with_id.insert(x->Name() + std::to_string(x->id()));
}
op_desc->SetOutput(
"Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
op_desc->SetType("tensorrt_engine");
std::unordered_map<std::string, std::string> output_name_map;
auto &subgraph_nodes = *Agent(node).subgraph();
// The following procedure is used to rename all the intermediate
// variables and the output variables of the subgraph.
@@ -148,61 +169,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
// input of a OP, but also the output of a Op, there will be problems.
// So we have to rename the variable in the subgraph to make sure
// it is either an OP's input or an OP's output.
RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
&output_names_with_id, &output_names, &output_name_map);
auto &subgraph_nodes = *Agent(node).subgraph();
for (size_t index = 0; index < block_desc.OpSize(); ++index) {
framework::proto::OpDesc *op = block_desc.Op(index)->Proto();
auto correspond_node = subgraph_nodes[index];
PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
std::unordered_map<std::string, size_t> var2id;
for (auto *in_var : correspond_node->inputs) {
var2id[in_var->Name()] = in_var->id();
}
// rename for the input variables of op inside subgraph
for (int i = 0; i < op->inputs_size(); i++) {
// one input
auto *in_var = op->mutable_inputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments
std::string arg_value = in_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (input_names_with_id.count(arg_value_with_id)) {
replaced_names.push_back(arg_value);
} else {
replaced_names.push_back(arg_value_with_id);
}
}
in_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
in_var->add_arguments(replaced_names[k]);
}
}
var2id.clear();
for (auto out_var : correspond_node->outputs) {
var2id[out_var->Name()] = out_var->id();
}
// rename for the output variables of op inside subgraph
for (int i = 0; i < op->outputs_size(); i++) {
framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (output_names_with_id.count(arg_value_with_id)) {
output_name_map[arg_value] = arg_value_with_id;
}
replaced_names.push_back(arg_value_with_id);
}
out_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
out_var->add_arguments(replaced_names[k]);
}
}
}
// When tensorrt engine runs at the end of the operation,
// output_mapping help us copy the data from the renamed ITensor
@@ -212,6 +180,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
PADDLE_ENFORCE(output_name_map.count(name) != 0);
output_mapping.push_back(output_name_map[name]);
}
PADDLE_ENFORCE(!output_mapping.empty());
auto *vars = block_desc.Proto()->mutable_vars();
for (framework::ir::Node *node : graph->Nodes()) {
@@ -222,26 +191,83 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
"the block has no var-desc");
PADDLE_ENFORCE(!output_mapping.empty());
// Set attrs
op_desc->SetType("tensorrt_engine");
op_desc->SetInput(
"Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
op_desc->SetOutput(
"Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
op_desc->SetBlockAttr("sub_block", new_block); op_desc->SetBlockAttr("sub_block", new_block);
SetAttr(op_desc->Proto(), "subgraph", SetAttr(op_desc->Proto(), "subgraph",
block_desc.Proto()->SerializeAsString()); block_desc.Proto()->SerializeAsString());
// Set attrs
SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size")); SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size")); SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
SetAttr(op_desc->Proto(), "parameters", params);
auto enable_int8 = Get<bool>("enable_int8"); auto enable_int8 = Get<bool>("enable_int8");
auto engine_key = auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id,
GenerateEngineKey(input_names_with_id, output_names_with_id); std::to_string(0));
// Get "" when there is no cached calibration table data.
std::string calibration_data = GetTrtCalibTableData(
Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
SetAttr(op_desc->Proto(), "engine_key", engine_key);
SetAttr(op_desc->Proto(), "engine_serialized_data", std::string(""));
std::unique_ptr<tensorrt::TRTInt8Calibrator> calibrator;
if (enable_int8 && calibration_data.size() != 0) {
calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data));
}
bool use_static_engine = Get<bool>("use_static_engine");
// When in int8 mode and calibration_mode, the program just produce the
// calibration table data.
bool calibration_mode = (enable_int8 && calibration_data.size() == 0);
if (!calibration_mode && use_static_engine) {
std::copy(params.begin(), params.end(),
std::back_inserter(*repetitive_params));
std::string trt_engine_serialized_data = GetTrtEngineSerializedData(
Get<std::string>("model_opt_cache_dir"), engine_key);
if (trt_engine_serialized_data.empty()) {
LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
"kernel etc). This process may cost a lot of time.";
std::unique_ptr<tensorrt::TensorRTEngine> trt_engine(
new tensorrt::TensorRTEngine(
Get<int>("max_batch_size"), Get<int>("workspace_size"),
enable_int8, calibrator.get(), Get<int>("gpu_device_id")));
auto *scope = param_scope();
framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
std::unordered_set<std::string> param_set(params.begin(), params.end());
inference::Singleton<inference::tensorrt::OpConverter>::Global()
.ConvertBlockToTRTEngine(
&block_desc_temp, *scope,
std::vector<std::string>(input_names.begin(), input_names.end()),
param_set, output_mapping, trt_engine.get());
nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
trt_engine_serialized_data =
std::string((const char *)serialized_engine_data->data(),
serialized_engine_data->size());
SaveTrtEngineSerializedDataToFile(
GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
engine_key),
trt_engine_serialized_data);
} else {
LOG(INFO) << "Load TRT Optimized Info from "
<< GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key);
}
SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data);
}
}
std::vector<std::string> ExtractParameters(
@@ -253,7 +279,7 @@ std::vector<std::string> ExtractParameters(
for (const auto &node : nodes) {
if (!node->IsOp()) continue;
std::string op_type = node->Op()->Type();
if (op_type == "feed") {
if (op_type == "feed" || op_type == "fetch") {
std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
std::copy(output_names.begin(), output_names.end(),
std::back_inserter(feed_outputs));
@@ -272,6 +298,99 @@ std::vector<std::string> ExtractParameters(
return parameters;
}
void RenameAndGetOutputs(
const std::vector<framework::ir::Node *> &subgraph_nodes,
framework::BlockDesc *block_desc,
const std::set<std::string> &input_names_with_id,
std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map) {
  // In the normal case, Paddle-TRT has a bug when running GoogLeNet: when
  // there are more than two 1 * 1 convolutions with the same input,
  // Paddle-TensorRT performs a merging optimization that fuses those convs
  // into one conv, which then triggers the bug. So we use a strategy to avoid
  // this optimization for the time being. The bug will be fixed in the future.
std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
same_hierarchy_conv2d_num_map;
for (size_t index = 0; index < block_desc->OpSize(); ++index) {
framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
framework::OpDesc op_desc(*op, nullptr);
auto correspond_node = subgraph_nodes[index];
PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
std::unordered_map<std::string, size_t> var2id;
std::unordered_map<std::string, framework::ir::Node *> in_vars;
for (auto *in_var : correspond_node->inputs) {
var2id[in_var->Name()] = in_var->id();
in_vars[in_var->Name()] = in_var;
}
    // Rename the input variables of the op inside the subgraph.
for (int i = 0; i < op->inputs_size(); i++) {
// one input
auto *in_var = op->mutable_inputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments
std::string arg_value = in_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (input_names_with_id.count(arg_value_with_id)) {
replaced_names.push_back(arg_value);
} else {
replaced_names.push_back(arg_value_with_id);
}
}
in_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
in_var->add_arguments(replaced_names[k]);
}
}
var2id.clear();
for (auto out_var : correspond_node->outputs) {
var2id[out_var->Name()] = out_var->id();
}
if (op_desc.Type() == "conv2d") {
auto input_var_name = op_desc.Input("Input").front();
auto filter_var_name = op_desc.Input("Filter").front();
auto out_var_name = op_desc.Output("Output").front();
auto filter_shape = in_vars[filter_var_name]->Var()->GetShape();
const std::vector<int> strides =
boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
const std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
if (same_hierarchy_conv2d_num_map[input_var_name] > 0) {
(*output_names_with_id)
.insert(out_var_name + std::to_string(var2id[out_var_name]));
(*output_names).insert(out_var_name);
} else if (filter_shape[2] == 1 && filter_shape[3] == 1 &&
strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 &&
paddings[1] == 0) {
same_hierarchy_conv2d_num_map[input_var_name] += 1;
}
}
    // Rename the output variables of the op inside the subgraph.
for (int i = 0; i < op->outputs_size(); i++) {
framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (output_names_with_id->count(arg_value_with_id)) {
(*output_name_map)[arg_value] = arg_value_with_id;
}
replaced_names.push_back(arg_value_with_id);
}
out_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
out_var->add_arguments(replaced_names[k]);
}
}
}
}
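
A minimal sketch of the naming convention used above (the variable name and node id are made up): each argument inside the subgraph is suffixed with its graph node id, so nodes that share a Fluid variable name stay unique inside the TensorRT engine, while output_name_map lets the enclosing tensorrt_engine op map the original names back to the renamed ones.

// Sketch only, not part of the pass; illustrates "name + node id" renaming.
#include <iostream>
#include <string>
#include <unordered_map>

int main() {
  std::unordered_map<std::string, int> var2id{{"conv2d_0.tmp_0", 12}};
  std::unordered_map<std::string, std::string> output_name_map;

  std::string arg = "conv2d_0.tmp_0";
  std::string arg_with_id = arg + std::to_string(var2id[arg]);
  output_name_map[arg] = arg_with_id;  // "conv2d_0.tmp_0" -> "conv2d_0.tmp_012"

  std::cout << output_name_map[arg] << std::endl;
  return 0;
}
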
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
......
...@@ -13,7 +13,12 @@ ...@@ -13,7 +13,12 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <paddle/fluid/framework/ir/fuse_pass_base.h> #include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass.h"
namespace paddle { namespace paddle {
...@@ -26,8 +31,9 @@ class TensorRtSubgraphPass : public framework::ir::FusePassBase { ...@@ -26,8 +31,9 @@ class TensorRtSubgraphPass : public framework::ir::FusePassBase {
std::unique_ptr<framework::ir::Graph> graph) const override; std::unique_ptr<framework::ir::Graph> graph) const override;
private: private:
void CreateTensorRTOp(framework::ir::Node *x, void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph,
framework::ir::Graph *graph) const; const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const;
void CleanIntermediateOutputs(framework::ir::Node *node); void CleanIntermediateOutputs(framework::ir::Node *node);
}; };
......
...@@ -31,6 +31,13 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { ...@@ -31,6 +31,13 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
// The parameters are on the cpu, therefore, synchronization is not necessary. // The parameters are on the cpu, therefore, synchronization is not necessary.
if (!argument->use_gpu()) return; if (!argument->use_gpu()) return;
auto &graph = argument->main_graph();
std::vector<std::string> repetitive_params;
if (graph.Has(framework::ir::kRepetitiveParamAttr))
repetitive_params = graph.Get<std::vector<std::string>>(
framework::ir::kRepetitiveParamAttr);
LOG(INFO) << "Sync params from CPU to GPU"; LOG(INFO) << "Sync params from CPU to GPU";
PADDLE_ENFORCE(argument->gpu_device_id_valid()); PADDLE_ENFORCE(argument->gpu_device_id_valid());
...@@ -43,6 +50,10 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { ...@@ -43,6 +50,10 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
// Because there exists the case that new parameter variables are not added to // Because there exists the case that new parameter variables are not added to
// the program in the analysis pass. // the program in the analysis pass.
for (auto &var_name : all_vars) { for (auto &var_name : all_vars) {
if (std::count(repetitive_params.begin(), repetitive_params.end(),
var_name)) {
continue;
}
auto *var = scope->FindLocalVar(var_name); auto *var = scope->FindLocalVar(var_name);
PADDLE_ENFORCE(var != nullptr); PADDLE_ENFORCE(var != nullptr);
if (var->IsType<framework::LoDTensor>() || if (var->IsType<framework::LoDTensor>() ||
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
......
...@@ -103,6 +103,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { ...@@ -103,6 +103,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(tensorrt_max_batchsize_); CP_MEMBER(tensorrt_max_batchsize_);
CP_MEMBER(tensorrt_min_subgraph_size_); CP_MEMBER(tensorrt_min_subgraph_size_);
CP_MEMBER(tensorrt_precision_mode_); CP_MEMBER(tensorrt_precision_mode_);
CP_MEMBER(trt_use_static_engine_);
// MKLDNN related. // MKLDNN related.
CP_MEMBER(use_mkldnn_); CP_MEMBER(use_mkldnn_);
CP_MEMBER(mkldnn_enabled_op_types_); CP_MEMBER(mkldnn_enabled_op_types_);
...@@ -144,7 +145,7 @@ void AnalysisConfig::EnableMKLDNN() { ...@@ -144,7 +145,7 @@ void AnalysisConfig::EnableMKLDNN() {
void AnalysisConfig::EnableTensorRtEngine( void AnalysisConfig::EnableTensorRtEngine(
int workspace_size, int max_batch_size, int min_subgraph_size, int workspace_size, int max_batch_size, int min_subgraph_size,
AnalysisConfig::Precision precision_mode) { AnalysisConfig::Precision precision_mode, bool use_static) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (!use_gpu()) { if (!use_gpu()) {
LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first"; LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
...@@ -156,6 +157,7 @@ void AnalysisConfig::EnableTensorRtEngine( ...@@ -156,6 +157,7 @@ void AnalysisConfig::EnableTensorRtEngine(
tensorrt_max_batchsize_ = max_batch_size; tensorrt_max_batchsize_ = max_batch_size;
tensorrt_min_subgraph_size_ = min_subgraph_size; tensorrt_min_subgraph_size_ = min_subgraph_size;
tensorrt_precision_mode_ = precision_mode; tensorrt_precision_mode_ = precision_mode;
trt_use_static_engine_ = use_static;
Update(); Update();
#else #else
......
...@@ -243,6 +243,8 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -243,6 +243,8 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
input_ptr = input.mutable_data<int64_t>(ddim, place_); input_ptr = input.mutable_data<int64_t>(ddim, place_);
} else if (inputs[i].dtype == PaddleDType::FLOAT32) { } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
input_ptr = input.mutable_data<float>(ddim, place_); input_ptr = input.mutable_data<float>(ddim, place_);
} else if (inputs[i].dtype == PaddleDType::INT32) {
input_ptr = input.mutable_data<int32_t>(ddim, place_);
} else { } else {
LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
return false; return false;
...@@ -326,8 +328,11 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs, ...@@ -326,8 +328,11 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
} else if (type == framework::proto::VarType::INT64) { } else if (type == framework::proto::VarType::INT64) {
GetFetchOne<int64_t>(fetch, output); GetFetchOne<int64_t>(fetch, output);
output->dtype = PaddleDType::INT64; output->dtype = PaddleDType::INT64;
} else if (type == framework::proto::VarType::INT32) {
GetFetchOne<int32_t>(fetch, output);
output->dtype = PaddleDType::INT32;
} else { } else {
LOG(ERROR) << "unknown type, only support float32 and int64 now."; LOG(ERROR) << "unknown type, only support float32, int64 and int32 now.";
} }
} }
return true; return true;
...@@ -365,6 +370,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -365,6 +370,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
} }
if (config_.use_mkldnn_) { if (config_.use_mkldnn_) {
...@@ -438,12 +444,14 @@ void AnalysisPredictor::PrepareFeedFetch() { ...@@ -438,12 +444,14 @@ void AnalysisPredictor::PrepareFeedFetch() {
} }
feeds_[idx] = op; feeds_[idx] = op;
feed_names_[op->Output("Out")[0]] = idx; feed_names_[op->Output("Out")[0]] = idx;
idx2feeds_[idx] = op->Output("Out")[0];
} else if (op->Type() == "fetch") { } else if (op->Type() == "fetch") {
int idx = boost::get<int>(op->GetAttr("col")); int idx = boost::get<int>(op->GetAttr("col"));
if (fetches_.size() <= static_cast<size_t>(idx)) { if (fetches_.size() <= static_cast<size_t>(idx)) {
fetches_.resize(idx + 1); fetches_.resize(idx + 1);
} }
fetches_[idx] = op; fetches_[idx] = op;
idx2fetches_[idx] = op->Input("X")[0];
} }
} }
} }
...@@ -456,6 +464,22 @@ void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) { ...@@ -456,6 +464,22 @@ void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
var->GetMutable<framework::FeedFetchList>(); var->GetMutable<framework::FeedFetchList>();
} }
std::vector<std::string> AnalysisPredictor::GetInputNames() {
std::vector<std::string> input_names;
for (auto &item : idx2feeds_) {
input_names.push_back(item.second);
}
return input_names;
}
std::vector<std::string> AnalysisPredictor::GetOutputNames() {
std::vector<std::string> output_names;
for (auto &item : idx2fetches_) {
output_names.push_back(item.second);
}
return output_names;
}
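
For reference, a minimal usage sketch of the two accessors added here, assuming `predictor` is a PaddlePredictor created elsewhere (the function name is hypothetical):

// Sketch: list the feed/fetch names discovered by PrepareFeedFetch().
#include <iostream>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void ShowIONames(paddle::PaddlePredictor* predictor) {
  for (const auto& name : predictor->GetInputNames()) {
    std::cout << "input: " << name << std::endl;
  }
  for (const auto& name : predictor->GetOutputNames()) {
    std::cout << "output: " << name << std::endl;
  }
}
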
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor( std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
const std::string &name) { const std::string &name) {
PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
...@@ -463,6 +487,13 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor( ...@@ -463,6 +487,13 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
new ZeroCopyTensor(static_cast<void *>(executor_->scope()))); new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
res->input_or_output_ = true; res->input_or_output_ = true;
res->SetName(name); res->SetName(name);
if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU);
} else {
auto gpu_place = boost::get<platform::CUDAPlace>(place_);
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
}
return res; return res;
} }
...@@ -473,6 +504,12 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor( ...@@ -473,6 +504,12 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
new ZeroCopyTensor(static_cast<void *>(executor_->scope()))); new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
res->input_or_output_ = false; res->input_or_output_ = false;
res->SetName(name); res->SetName(name);
if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU);
} else {
auto gpu_place = boost::get<platform::CUDAPlace>(place_);
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
}
return res; return res;
} }
......
...@@ -15,12 +15,14 @@ ...@@ -15,12 +15,14 @@
#pragma once #pragma once
#include <algorithm> #include <algorithm>
#include <map> #include <map>
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h" #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#ifdef PADDLE_WITH_TESTING #ifdef PADDLE_WITH_TESTING
...@@ -53,6 +55,9 @@ class AnalysisPredictor : public PaddlePredictor { ...@@ -53,6 +55,9 @@ class AnalysisPredictor : public PaddlePredictor {
std::vector<PaddleTensor> *output_data, std::vector<PaddleTensor> *output_data,
int batch_size = -1) override; int batch_size = -1) override;
std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames();
std::unique_ptr<ZeroCopyTensor> GetInputTensor( std::unique_ptr<ZeroCopyTensor> GetInputTensor(
const std::string &name) override; const std::string &name) override;
std::unique_ptr<ZeroCopyTensor> GetOutputTensor( std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
...@@ -131,7 +136,11 @@ class AnalysisPredictor : public PaddlePredictor { ...@@ -131,7 +136,11 @@ class AnalysisPredictor : public PaddlePredictor {
std::shared_ptr<framework::ProgramDesc> inference_program_; std::shared_ptr<framework::ProgramDesc> inference_program_;
std::vector<framework::OpDesc *> feeds_; std::vector<framework::OpDesc *> feeds_;
std::map<std::string, size_t> feed_names_; std::map<std::string, size_t> feed_names_;
// Sorted according to the idx.
std::map<size_t, std::string> idx2feeds_;
std::vector<framework::OpDesc *> fetches_; std::vector<framework::OpDesc *> fetches_;
std::map<size_t, std::string> idx2fetches_;
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
// concurrency problems, wrong results and memory leak, so cache them. // concurrency problems, wrong results and memory leak, so cache them.
std::vector<framework::LoDTensor> feed_tensors_; std::vector<framework::LoDTensor> feed_tensors_;
......
...@@ -28,6 +28,8 @@ int PaddleDtypeSize(PaddleDType dtype) { ...@@ -28,6 +28,8 @@ int PaddleDtypeSize(PaddleDType dtype) {
return sizeof(float); return sizeof(float);
case PaddleDType::INT64: case PaddleDType::INT64:
return sizeof(int64_t); return sizeof(int64_t);
case PaddleDType::INT32:
return sizeof(int32_t);
default: default:
assert(false); assert(false);
return -1; return -1;
......
...@@ -203,6 +203,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -203,6 +203,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
input_ptr = input.mutable_data<int64_t>(ddim, place_); input_ptr = input.mutable_data<int64_t>(ddim, place_);
} else if (inputs[i].dtype == PaddleDType::FLOAT32) { } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
input_ptr = input.mutable_data<float>(ddim, place_); input_ptr = input.mutable_data<float>(ddim, place_);
} else if (inputs[i].dtype == PaddleDType::INT32) {
input_ptr = input.mutable_data<int32_t>(ddim, place_);
} else { } else {
LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
return false; return false;
...@@ -281,8 +283,11 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs, ...@@ -281,8 +283,11 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
} else if (type == framework::DataTypeTrait<int64_t>::DataType) { } else if (type == framework::DataTypeTrait<int64_t>::DataType) {
GetFetchOne<int64_t>(fetch, output); GetFetchOne<int64_t>(fetch, output);
output->dtype = PaddleDType::INT64; output->dtype = PaddleDType::INT64;
} else if (type == framework::DataTypeTrait<int32_t>::DataType) {
GetFetchOne<int32_t>(fetch, output);
output->dtype = PaddleDType::INT32;
} else { } else {
LOG(ERROR) << "unknown type, only support float32 and int64 now."; LOG(ERROR) << "unknown type, only support float32, int64 and int32 now.";
} }
} }
return true; return true;
......
...@@ -42,6 +42,9 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { ...@@ -42,6 +42,9 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
} else if (t->type() == framework::proto::VarType::FP32) { } else if (t->type() == framework::proto::VarType::FP32) {
pt.data.Reset(t->data<void>(), t->numel() * sizeof(float)); pt.data.Reset(t->data<void>(), t->numel() * sizeof(float));
pt.dtype = PaddleDType::FLOAT32; pt.dtype = PaddleDType::FLOAT32;
} else if (t->type() == framework::proto::VarType::INT32) {
pt.data.Reset(t->data<void>(), t->numel() * sizeof(int32_t));
pt.dtype = PaddleDType::INT32;
} else { } else {
LOG(FATAL) << "unsupported type."; LOG(FATAL) << "unsupported type.";
} }
......
...@@ -88,13 +88,20 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) { ...@@ -88,13 +88,20 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
} }
break; break;
} }
case PaddleDType::FLOAT32: case PaddleDType::FLOAT32: {
for (size_t i = 0; i < numel; ++i) { for (size_t i = 0; i < numel; ++i) {
CHECK_LT( CHECK_LT(
fabs(static_cast<float*>(output.data.data())[i] - refer.data[i]), fabs(static_cast<float*>(output.data.data())[i] - refer.data[i]),
1e-5); 1e-5);
} }
break; break;
}
case PaddleDType::INT32: {
for (size_t i = 0; i < numel; ++i) {
CHECK_EQ(static_cast<int32_t*>(output.data.data())[i], refer.data[i]);
}
break;
}
} }
} }
...@@ -113,11 +120,18 @@ static std::string SummaryTensor(const PaddleTensor& tensor) { ...@@ -113,11 +120,18 @@ static std::string SummaryTensor(const PaddleTensor& tensor) {
} }
break; break;
} }
case PaddleDType::FLOAT32: case PaddleDType::FLOAT32: {
for (int i = 0; i < std::min(num_elems, 10); i++) { for (int i = 0; i < std::min(num_elems, 10); i++) {
ss << static_cast<float*>(tensor.data.data())[i] << " "; ss << static_cast<float*>(tensor.data.data())[i] << " ";
} }
break; break;
}
case PaddleDType::INT32: {
for (int i = 0; i < std::min(num_elems, 10); i++) {
ss << static_cast<int32_t*>(tensor.data.data())[i] << " ";
}
break;
}
} }
return ss.str(); return ss.str();
} }
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
...@@ -73,6 +74,61 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const { ...@@ -73,6 +74,61 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const {
return res; return res;
} }
template <typename T>
void ZeroCopyTensor::copy_from_cpu(const T *data) {
EAGER_GET_TENSOR;
PADDLE_ENFORCE_GE(
tensor->numel(), 0,
"You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
"function before copy data from cpu.");
size_t ele_size = tensor->numel() * sizeof(T);
if (place_ == PaddlePlace::kCPU) {
auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
std::memcpy(static_cast<void *>(t_data), data, ele_size);
} else {
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
platform::CUDAPlace gpu_place(device_);
auto *t_data = tensor->mutable_data<T>(gpu_place);
auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
data, ele_size, dev_ctx->stream());
#else
PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif
}
}
template <typename T>
void ZeroCopyTensor::copy_to_cpu(T *data) {
EAGER_GET_TENSOR;
auto ele_num = tensor->numel();
auto *t_data = tensor->data<T>();
auto t_place = tensor->place();
if (platform::is_cpu_place(t_place)) {
std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
} else {
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto gpu_place = boost::get<platform::CUDAPlace>(t_place);
auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
t_data, ele_num * sizeof(T), dev_ctx->stream());
#else
PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif
}
}
template void ZeroCopyTensor::copy_from_cpu<float>(const float *data);
template void ZeroCopyTensor::copy_from_cpu<int64_t>(const int64_t *data);
template void ZeroCopyTensor::copy_to_cpu<float>(float *data);
template void ZeroCopyTensor::copy_to_cpu<int64_t>(int64_t *data);
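
A minimal end-to-end sketch of the new copy helpers (the 1x3x224x224 shape and the function name are illustrative only; the predictor is assumed to be configured for the zero-copy path, i.e. with feed/fetch ops switched off so ZeroCopyRun() is usable):

// Sketch: one zero-copy inference round trip with copy_from_cpu/copy_to_cpu.
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void RunOnce(paddle::PaddlePredictor* predictor) {
  std::vector<float> input(1 * 3 * 224 * 224, 1.0f);
  auto in_t = predictor->GetInputTensor(predictor->GetInputNames()[0]);
  in_t->Reshape({1, 3, 224, 224});
  in_t->copy_from_cpu(input.data());  // host -> tensor (CPU or GPU place)

  predictor->ZeroCopyRun();

  auto out_t = predictor->GetOutputTensor(predictor->GetOutputNames()[0]);
  int numel = 1;
  for (int d : out_t->shape()) numel *= d;
  std::vector<float> output(numel);
  out_t->copy_to_cpu(output.data());  // tensor -> host
}
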
template float *ZeroCopyTensor::data<float>(PaddlePlace *place, template float *ZeroCopyTensor::data<float>(PaddlePlace *place,
int *size) const; int *size) const;
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place,
...@@ -92,10 +148,10 @@ void *ZeroCopyTensor::FindTensor() const { ...@@ -92,10 +148,10 @@ void *ZeroCopyTensor::FindTensor() const {
return tensor; return tensor;
} }
std::vector<int64_t> ZeroCopyTensor::shape() const { std::vector<int> ZeroCopyTensor::shape() const {
EAGER_GET_TENSOR; EAGER_GET_TENSOR;
PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_); PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_);
return framework::vectorize(tensor->dims()); return framework::vectorize2int(tensor->dims());
} }
void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) { void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
......
...@@ -37,7 +37,7 @@ template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); ...@@ -37,7 +37,7 @@ template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place);
void *ZeroCopyTensor::FindTensor() const { return nullptr; } void *ZeroCopyTensor::FindTensor() const { return nullptr; }
std::vector<int64_t> ZeroCopyTensor::shape() const { return {}; } std::vector<int> ZeroCopyTensor::shape() const { return {}; }
void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {} void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {}
......
...@@ -50,6 +50,11 @@ class Timer { ...@@ -50,6 +50,11 @@ class Timer {
} }
}; };
static int GetUniqueId() {
static int id = 0;
return id++;
}
static void split(const std::string &str, char sep, static void split(const std::string &str, char sep,
std::vector<std::string> *pieces) { std::vector<std::string> *pieces) {
pieces->clear(); pieces->clear();
...@@ -81,6 +86,13 @@ static void split_to_int64(const std::string &str, char sep, ...@@ -81,6 +86,13 @@ static void split_to_int64(const std::string &str, char sep,
std::transform(pieces.begin(), pieces.end(), std::back_inserter(*is), std::transform(pieces.begin(), pieces.end(), std::back_inserter(*is),
[](const std::string &v) { return std::stoi(v); }); [](const std::string &v) { return std::stoi(v); });
} }
static void split_to_int(const std::string &str, char sep,
std::vector<int> *is) {
std::vector<std::string> pieces;
split(str, sep, &pieces);
std::transform(pieces.begin(), pieces.end(), std::back_inserter(*is),
[](const std::string &v) { return std::stoi(v); });
}
template <typename T> template <typename T>
std::string to_string(const std::vector<T> &vec) { std::string to_string(const std::vector<T> &vec) {
std::stringstream ss; std::stringstream ss;
...@@ -197,6 +209,9 @@ static std::string DescribeTensor(const PaddleTensor &tensor, ...@@ -197,6 +209,9 @@ static std::string DescribeTensor(const PaddleTensor &tensor,
case PaddleDType::INT64: case PaddleDType::INT64:
os << "int64"; os << "int64";
break; break;
case PaddleDType::INT32:
os << "int32";
break;
default: default:
os << "unset"; os << "unset";
} }
......
...@@ -135,7 +135,8 @@ struct AnalysisConfig { ...@@ -135,7 +135,8 @@ struct AnalysisConfig {
*/ */
void EnableTensorRtEngine(int workspace_size = 1 << 20, void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1, int min_subgraph_size = 3, int max_batch_size = 1, int min_subgraph_size = 3,
Precision precision = Precision::kFloat32); Precision precision = Precision::kFloat32,
bool use_static = true);
/** A boolean state telling whether the TensorRT engine is used. /** A boolean state telling whether the TensorRT engine is used.
*/ */
bool tensorrt_engine_enabled() const { return use_tensorrt_; } bool tensorrt_engine_enabled() const { return use_tensorrt_; }
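
A configuration sketch for the new use_static parameter above (the model path and helper name are placeholders): when use_static is true, the serialized TensorRT engine is cached under the model optimization cache directory on the first run and reloaded afterwards, skipping the engine build step.

// Sketch: enable the TensorRT subgraph engine with engine caching.
#include <memory>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

std::unique_ptr<paddle::PaddlePredictor> MakeTrtPredictor() {
  paddle::AnalysisConfig config;
  config.SetModel("model_dir");  // placeholder path
  config.EnableUseGpu(100 /*initial pool, MB*/, 0 /*gpu id*/);
  config.EnableTensorRtEngine(1 << 20 /*workspace*/, 1 /*max batch*/,
                              3 /*min subgraph size*/,
                              paddle::AnalysisConfig::Precision::kFloat32,
                              true /*use_static: cache serialized engine*/);
  return paddle::CreatePaddlePredictor(config);
}
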
...@@ -233,6 +234,7 @@ struct AnalysisConfig { ...@@ -233,6 +234,7 @@ struct AnalysisConfig {
// subgraph, 3 as default value. // subgraph, 3 as default value.
int tensorrt_min_subgraph_size_{3}; int tensorrt_min_subgraph_size_{3};
Precision tensorrt_precision_mode_; Precision tensorrt_precision_mode_;
bool trt_use_static_engine_;
// memory reuse related. // memory reuse related.
bool enable_memory_optim_{false}; bool enable_memory_optim_{false};
......
...@@ -36,6 +36,7 @@ namespace paddle { ...@@ -36,6 +36,7 @@ namespace paddle {
enum PaddleDType { enum PaddleDType {
FLOAT32, FLOAT32,
INT64, INT64,
INT32,
// TODO(Superjomn) support more data types if needed. // TODO(Superjomn) support more data types if needed.
}; };
...@@ -160,11 +161,21 @@ class ZeroCopyTensor { ...@@ -160,11 +161,21 @@ class ZeroCopyTensor {
template <typename T> template <typename T>
T* data(PaddlePlace* place, int* size) const; T* data(PaddlePlace* place, int* size) const;
std::vector<int64_t> shape() const; template <typename T>
void copy_from_cpu(const T* data);
template <typename T>
void copy_to_cpu(T* data);
std::vector<int> shape() const;
void SetLoD(const std::vector<std::vector<size_t>>& x); void SetLoD(const std::vector<std::vector<size_t>>& x);
std::vector<std::vector<size_t>> lod() const; std::vector<std::vector<size_t>> lod() const;
const std::string& name() const { return name_; } const std::string& name() const { return name_; }
void SetPlace(PaddlePlace place, int device = -1) {
place_ = place;
device_ = device;
}
protected: protected:
explicit ZeroCopyTensor(void* scope) : scope_{scope} {} explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
...@@ -179,6 +190,8 @@ class ZeroCopyTensor { ...@@ -179,6 +190,8 @@ class ZeroCopyTensor {
// The corresponding tensor pointer inside Paddle workspace is cached for // The corresponding tensor pointer inside Paddle workspace is cached for
// performance. // performance.
mutable void* tensor_{nullptr}; mutable void* tensor_{nullptr};
PaddlePlace place_;
int device_;
}; };
/** A simple Inference API for Paddle. /** A simple Inference API for Paddle.
...@@ -200,6 +213,14 @@ class PaddlePredictor { ...@@ -200,6 +213,14 @@ class PaddlePredictor {
std::vector<PaddleTensor>* output_data, std::vector<PaddleTensor>* output_data,
int batch_size = -1) = 0; int batch_size = -1) = 0;
/** \brief Get input names of the model
*/
virtual std::vector<std::string> GetInputNames() { return {}; }
/** \brief Get output names of the model
*/
virtual std::vector<std::string> GetOutputNames() { return {}; }
/** \brief Get a mutable tensor directly. /** \brief Get a mutable tensor directly.
* *
* NOTE Only works in AnalysisPredictor. * NOTE Only works in AnalysisPredictor.
......
...@@ -49,11 +49,6 @@ class EngineBase { ...@@ -49,11 +49,6 @@ class EngineBase {
// Execute the engine, that will run the inference network. // Execute the engine, that will run the inference network.
virtual void Execute(int batch_size) = 0; virtual void Execute(int batch_size) = 0;
// Return the IO buffer that allocated in engine. One can read/write directly
// on the buffer. If the buffer's buffer is nullptr, one can also allocate
// memory and maintain it outside the engine.
virtual Buffer& buffer(const std::string& name) = 0;
virtual ~EngineBase() {} virtual ~EngineBase() {}
}; // class EngineBase }; // class EngineBase
......
...@@ -18,21 +18,6 @@ namespace paddle { ...@@ -18,21 +18,6 @@ namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
bool to_skip_merging_optimize(TensorRTEngine* engine,
const std::vector<int>& filters,
const std::vector<int>& strides,
const std::vector<int>& paddings,
std::string input_name) {
if (engine->itensor_quote_num[input_name] > 0) {
return true;
}
if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
engine->itensor_quote_num[input_name] += 1;
return false;
}
template <typename RegistFunc, typename SetDilationFunc> template <typename RegistFunc, typename SetDilationFunc>
void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode, const framework::Scope& scope, bool test_mode,
...@@ -59,7 +44,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, ...@@ -59,7 +44,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
weight_tensor->Resize(Y_t->dims()); weight_tensor->Resize(Y_t->dims());
TensorCopySync((*Y_t), cpu_place, weight_tensor.get()); TensorCopySync((*Y_t), cpu_place, weight_tensor.get());
auto* weight_data = weight_tensor->mutable_data<float>(platform::CPUPlace()); auto* weight_data = weight_tensor->mutable_data<float>(cpu_place);
PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
const int n_output = weight_tensor->dims()[0]; const int n_output = weight_tensor->dims()[0];
...@@ -100,9 +85,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, ...@@ -100,9 +85,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
layer->getOutput(0)->setName(output_name.c_str()); layer->getOutput(0)->setName(output_name.c_str());
engine->SetITensor(output_name, layer->getOutput(0)); engine->SetITensor(output_name, layer->getOutput(0));
if (test_mode || if (test_mode) {
to_skip_merging_optimize(engine, {filter_h, filter_w}, strides, paddings,
op_desc.Input("Input").front())) {
engine->DeclareOutput(output_name); engine->DeclareOutput(output_name);
} }
} }
......
...@@ -153,7 +153,6 @@ class ElementwiseTensorOpConverter : public OpConverter { ...@@ -153,7 +153,6 @@ class ElementwiseTensorOpConverter : public OpConverter {
if (CheckDims(dims_x, dims_y)) { if (CheckDims(dims_x, dims_y)) {
// The two input tensor should have the same dims // The two input tensor should have the same dims
VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer"; VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER( nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X), engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
*const_cast<nvinfer1::ITensor*>(Y), op_pair->second); *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
...@@ -166,7 +165,7 @@ class ElementwiseTensorOpConverter : public OpConverter { ...@@ -166,7 +165,7 @@ class ElementwiseTensorOpConverter : public OpConverter {
"ElementWisePluginLayer"; "ElementWisePluginLayer";
plugin::ElementWisePlugin* plugin = plugin::ElementWisePlugin* plugin =
new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis); new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis);
plugin->AddInput(X); plugin->AddInput(X);
plugin->AddInput(Y); plugin->AddInput(Y);
nvinfer1::IPluginLayer* layer = engine_->AddPlugin( nvinfer1::IPluginLayer* layer = engine_->AddPlugin(
......
...@@ -85,10 +85,10 @@ class FcOpConverter : public OpConverter { ...@@ -85,10 +85,10 @@ class FcOpConverter : public OpConverter {
Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float)); Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data), static_cast<void*>(weight_data),
Y_t->memory_size() / sizeof(float)}; static_cast<size_t>(Y_t->numel())};
TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT, TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
static_cast<void*>(tmp->data<float>()), static_cast<void*>(tmp->data<float>()),
Y_t->memory_size() / sizeof(float)); static_cast<size_t>(Y_t->numel()));
weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]}); weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
tmp_weight.dims = weight.dims; tmp_weight.dims = weight.dims;
......
...@@ -16,9 +16,12 @@ limitations under the License. */ ...@@ -16,9 +16,12 @@ limitations under the License. */
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
...@@ -26,6 +29,37 @@ namespace paddle { ...@@ -26,6 +29,37 @@ namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType;
namespace { // NOLINT
TRT_DT FluidDataType2TRT(FluidDT type) {
switch (type) {
case FluidDT::VarType_Type_FP32:
return TRT_DT::kFLOAT;
case FluidDT::VarType_Type_INT32:
return TRT_DT::kINT32;
default:
return TRT_DT::kINT32;
}
PADDLE_THROW("unkown type");
return TRT_DT::kINT32;
}
nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
PADDLE_ENFORCE_GT(shape.size(), 1UL,
"TensorRT' tensor input requires at least 2 dimensions");
PADDLE_ENFORCE_LE(shape.size(), 4UL,
"TensorRT' tensor input requires at most 4 dimensions");
PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
if (shape.size() == 4UL)
return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
return nvinfer1::DimsCHW(shape[1], 1, 1);
}
} // namespace // NOLINT
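
To make the two helpers above concrete, an illustration only (the shapes are examples and this fragment is not meant to live in the header): the batch dimension (-1) is dropped because TensorRT tracks the batch size separately.

nvinfer1::DataType dt = FluidDataType2TRT(FluidDT::VarType_Type_FP32);
// dt == nvinfer1::DataType::kFLOAT
nvinfer1::Dims chw = Vec2TRT_Dims({-1, 3, 224, 224});  // DimsCHW(3, 224, 224)
nvinfer1::Dims vec = Vec2TRT_Dims({-1, 128});          // DimsCHW(128, 1, 1)
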
/* /*
* Convert Op from Fluid to TensorRT Engine. * Convert Op from Fluid to TensorRT Engine.
*/ */
...@@ -110,6 +144,34 @@ class OpConverter { ...@@ -110,6 +144,34 @@ class OpConverter {
} }
} }
  // The scope here should be initialized with the parameter variables.
void ConvertBlockToTRTEngine(
framework::BlockDesc* block_desc, const framework::Scope& scope,
const std::vector<std::string>& inputs,
const std::unordered_set<std::string>& parameters,
const std::vector<std::string>& outputs, TensorRTEngine* engine) {
engine->InitNetwork();
for (auto& input : inputs) {
if (parameters.count(input)) continue;
auto* var = block_desc->FindVar(input);
PADDLE_ENFORCE(var, "no variable called %s", input);
PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
"TensorRT engine only takes LoDTensor as input");
auto var_shape = var->GetShape();
engine->DeclareInput(
input, FluidDataType2TRT(
var->Proto()->type().lod_tensor().tensor().data_type()),
Vec2TRT_Dims(var_shape));
}
framework::proto::BlockDesc* block_proto = block_desc->Proto();
ConvertBlock(*block_proto, parameters, scope, engine);
for (auto& output : outputs) {
engine->DeclareOutput(output);
}
engine->FreezeNetwork();
}
void SetEngine(TensorRTEngine* engine) { engine_ = engine; } void SetEngine(TensorRTEngine* engine) { engine_ = engine; }
virtual ~OpConverter() {} virtual ~OpConverter() {}
......
...@@ -43,23 +43,20 @@ class PReluOpConverter : public OpConverter { ...@@ -43,23 +43,20 @@ class PReluOpConverter : public OpConverter {
PADDLE_ENFORCE_NOT_NULL(alpha_var); PADDLE_ENFORCE_NOT_NULL(alpha_var);
auto* alpha_tensor = alpha_var->GetMutable<framework::LoDTensor>(); auto* alpha_tensor = alpha_var->GetMutable<framework::LoDTensor>();
platform::CUDAPlace place; platform::CPUPlace cpu_place;
std::unique_ptr<framework::LoDTensor> alpha_tensor_device( std::unique_ptr<framework::LoDTensor> alpha_tensor_temp(
new framework::LoDTensor()); new framework::LoDTensor());
alpha_tensor_device->Resize(alpha_tensor->dims()); alpha_tensor_temp->Resize(alpha_tensor->dims());
TensorCopySync(*alpha_tensor, place, alpha_tensor_device.get()); TensorCopySync(*alpha_tensor, cpu_place, alpha_tensor_temp.get());
float* alpha_data = alpha_tensor_device->mutable_data<float>(place); float* alpha_data = alpha_tensor_temp->mutable_data<float>(cpu_place);
// Transform alpha to TensorRTEngine::Weight plugin::PReluPlugin* plugin =
TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT, new plugin::PReluPlugin(alpha_data, alpha_tensor_temp->numel(), mode);
static_cast<void*>(alpha_data),
alpha_tensor_device->numel());
plugin::PReluPlugin* plugin = new plugin::PReluPlugin(alpha_rt, mode);
nvinfer1::IPluginLayer* layer = nvinfer1::IPluginLayer* layer =
engine_->AddPlugin(&input, input_num, plugin); engine_->AddPlugin(&input, input_num, plugin);
// keep alpha tensor to avoid release it's memory // keep alpha tensor to avoid release it's memory
engine_->weight_map[op_desc.Input("Alpha")[0]] = engine_->weight_map[op_desc.Input("Alpha")[0]] =
std::move(alpha_tensor_device); std::move(alpha_tensor_temp);
std::string layer_name = "prelu (Output: "; std::string layer_name = "prelu (Output: ";
auto output_name = op_desc.Output("Out")[0]; auto output_name = op_desc.Output("Out")[0];
......
...@@ -19,7 +19,9 @@ limitations under the License. */ ...@@ -19,7 +19,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
...@@ -79,7 +81,8 @@ class TRTConvertValidation { ...@@ -79,7 +81,8 @@ class TRTConvertValidation {
if_add_batch_(if_add_batch), if_add_batch_(if_add_batch),
max_batch_size_(max_batch_size) { max_batch_size_(max_batch_size) {
PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_)); engine_.reset(
new TensorRTEngine(max_batch_size, workspace_size, false, nullptr, 0));
engine_->InitNetwork(); engine_->InitNetwork();
} }
...@@ -114,13 +117,12 @@ class TRTConvertValidation { ...@@ -114,13 +117,12 @@ class TRTConvertValidation {
} }
void DeclVar(const std::string& name, const std::vector<int> dim_vec) { void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
platform::CUDAPlace place; platform::CUDADeviceContext ctx(place_);
platform::CUDADeviceContext ctx(place);
auto* x = scope_.Var(name); auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>(); auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec)); x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place, ctx); RandomizeTensor(x_tensor, place_, ctx);
} }
// Declare a variable in a fluid Scope. // Declare a variable in a fluid Scope.
void DeclVar(const std::string& name, const nvinfer1::Dims& dims, void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
...@@ -146,19 +148,6 @@ class TRTConvertValidation { ...@@ -146,19 +148,6 @@ class TRTConvertValidation {
// Declare outputs. // Declare outputs.
op_desc_.reset(new framework::OpDesc(desc, nullptr)); op_desc_.reset(new framework::OpDesc(desc, nullptr));
// Set Inputs.
for (const auto& input : op_desc_->InputArgumentNames()) {
if (parameters_.count(input)) continue;
auto* var = scope_.FindVar(input);
PADDLE_ENFORCE(var);
auto tensor = var->GetMutable<framework::LoDTensor>();
engine_->SetInputFromGPU(
input, static_cast<void*>(tensor->data<void>()),
sizeof(float) *
analysis::AccuDims(tensor->dims(), tensor->dims().size()));
}
} }
// We use the set 'neglected_output' here, because some Ops like batch norm, // We use the set 'neglected_output' here, because some Ops like batch norm,
...@@ -168,43 +157,71 @@ class TRTConvertValidation { ...@@ -168,43 +157,71 @@ class TRTConvertValidation {
std::unordered_set<std::string> neglected_output = {}) { std::unordered_set<std::string> neglected_output = {}) {
// Execute Fluid Op // Execute Fluid Op
PADDLE_ENFORCE_LE(batch_size, max_batch_size_); PADDLE_ENFORCE_LE(batch_size, max_batch_size_);
platform::CUDAPlace place; platform::CUDADeviceContext ctx(place_);
platform::CUDADeviceContext ctx(place); op_->Run(scope_, place_);
op_->Run(scope_, place);
// Execute TRT.
engine_->Execute(batch_size);
cudaStreamSynchronize(engine_->stream());
ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); std::vector<std::string> input_output_names;
const size_t output_space_size = 3000;
    // Note: we need to filter out the parameters.
for (const auto& input : op_desc_->InputArgumentNames()) {
if (parameters_.count(input)) continue;
input_output_names.push_back(input);
}
// Collect the fluid outputs.
std::vector<std::vector<float>> fluid_outs;
for (const auto& output : op_desc_->OutputArgumentNames()) { for (const auto& output : op_desc_->OutputArgumentNames()) {
if (neglected_output.count(output)) continue; if (neglected_output.count(output)) continue;
input_output_names.push_back(output);
std::vector<float> fluid_out; std::vector<float> fluid_out;
std::vector<float> trt_out(output_space_size);
engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
cudaStreamSynchronize(engine_->stream());
auto* var = scope_.FindVar(output); auto* var = scope_.FindVar(output);
auto tensor = var->GetMutable<framework::LoDTensor>(); auto* tensor = var->GetMutable<framework::LoDTensor>();
framework::TensorToVector(*tensor, ctx, &fluid_out); framework::TensorToVector(*tensor, ctx, &fluid_out);
fluid_outs.push_back(fluid_out);
}
// Bind input and output for TRT.
const int num_bindings = input_output_names.size();
std::vector<void*> buffers(num_bindings);
for (const std::string& name : input_output_names) {
auto* var = scope_.FindVar(name);
auto* tensor = var->GetMutable<framework::LoDTensor>();
const int bind_index = engine_->engine()->getBindingIndex(name.c_str());
buffers[bind_index] =
static_cast<void*>(tensor->mutable_data<float>(place_));
}
// Execute TRT.
engine_->Execute(batch_size, &buffers, stream_);
size_t fluid_out_size = fluid_out.size(); ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
int index = 0;
for (const auto& output : op_desc_->OutputArgumentNames()) {
if (neglected_output.count(output)) continue;
std::vector<float> trt_out;
auto* var = scope_.FindVar(output);
auto* tensor = var->GetMutable<framework::LoDTensor>();
framework::TensorToVector(*tensor, ctx, &trt_out);
size_t fluid_out_size = fluid_outs[index].size();
if (if_add_batch_ == true) { if (if_add_batch_ == true) {
fluid_out_size = fluid_out_size =
batch_size * (framework::product(tensor->dims()) / max_batch_size_); batch_size * (framework::product(tensor->dims()) / max_batch_size_);
} }
// Compare two output
ASSERT_FALSE(fluid_out.empty());
for (size_t i = 0; i < fluid_out_size; i++) { for (size_t i = 0; i < fluid_out_size; i++) {
// Loose the threshold for CI in different machine model. // Loose the threshold for CI in different machine model.
EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5); EXPECT_LT(std::abs(fluid_outs[index][i] - trt_out[i]), 2e-5);
} }
index += 1;
} }
} }
framework::Scope& scope() { return scope_; } framework::Scope& scope() { return scope_; }
private: private:
platform::CUDAPlace place_;
std::unique_ptr<TensorRTEngine> engine_; std::unique_ptr<TensorRTEngine> engine_;
cudaStream_t stream_; cudaStream_t stream_;
std::unique_ptr<framework::OperatorBase> op_; std::unique_ptr<framework::OperatorBase> op_;
......
...@@ -32,36 +32,18 @@ void TensorRTEngine::Build(const DescType &paddle_model) { ...@@ -32,36 +32,18 @@ void TensorRTEngine::Build(const DescType &paddle_model) {
PADDLE_ENFORCE(false, "not implemented"); PADDLE_ENFORCE(false, "not implemented");
} }
void TensorRTEngine::Execute(int batch_size) { void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
cudaStream_t stream) {
freshDeviceId(); freshDeviceId();
batch_size_ = batch_size; batch_size_ = batch_size;
std::vector<void *> buffers; infer_context_->enqueue(batch_size, buffers->data(), stream, nullptr);
for (auto &buf : buffers_) { cudaStreamSynchronize(stream);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
PADDLE_ENFORCE_GT(buf.max_size, 0);
PADDLE_ENFORCE(buf.device == DeviceType::GPU);
buffers.push_back(buf.buffer);
}
infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr);
cudaStreamSynchronize(stream_);
SetRuntimeBatch(batch_size); SetRuntimeBatch(batch_size);
} }
TensorRTEngine::~TensorRTEngine() {
cudaStreamSynchronize(stream_);
// clean buffer
for (auto &buf : buffers_) {
if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
buf.buffer = nullptr;
buf.max_size = 0;
}
}
}
void TensorRTEngine::FreezeNetwork() { void TensorRTEngine::FreezeNetwork() {
VLOG(3) << "TRT to freeze network";
freshDeviceId(); freshDeviceId();
VLOG(3) << "TRT to freeze network";
PADDLE_ENFORCE(infer_builder_ != nullptr, PADDLE_ENFORCE(infer_builder_ != nullptr,
"Call InitNetwork first to initialize network."); "Call InitNetwork first to initialize network.");
PADDLE_ENFORCE(infer_network_ != nullptr, PADDLE_ENFORCE(infer_network_ != nullptr,
...@@ -81,30 +63,6 @@ void TensorRTEngine::FreezeNetwork() { ...@@ -81,30 +63,6 @@ void TensorRTEngine::FreezeNetwork() {
PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!"); PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");
infer_context_.reset(infer_engine_->createExecutionContext()); infer_context_.reset(infer_engine_->createExecutionContext());
// allocate GPU buffers.
buffers_.resize(buffer_sizes_.size());
for (auto &item : buffer_sizes_) {
// The output buffers are not set in the network building phrase, need to
// infer from the TesorRT network.
if (item.second == 0) {
auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
auto dims = infer_engine_->getBindingDimensions(slot_offset);
item.second = kDataTypeSize[static_cast<int>(
infer_engine_->getBindingDataType(slot_offset))] *
analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
PADDLE_ENFORCE_GT(item.second, 0);
}
auto &buf = buffer(item.first);
buf.max_size = item.second * max_batch_;
CHECK(buf.buffer == nullptr); // buffer should be allocated only once.
PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
buf.size = 0;
PADDLE_ENFORCE_LE(buf.max_size, 1 << 30); // 10G
buf.device = DeviceType::GPU;
}
} }
nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
...@@ -158,83 +116,6 @@ void TensorRTEngine::DeclareOutput(const std::string &name) { ...@@ -158,83 +116,6 @@ void TensorRTEngine::DeclareOutput(const std::string &name) {
buffer_sizes_[name] = 0; buffer_sizes_[name] = 0;
} }
void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
return buffer(name).buffer;
}
void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
size_t max_size) {
// determine data size
auto *output = TensorRTEngine::GetITensor(name);
nvinfer1::Dims dims = output->getDimensions();
auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
size_t dst_size = dim_size * runtime_batch_ *
kDataTypeSize[static_cast<int>(output->getType())];
auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end());
PADDLE_ENFORCE_GT(it->second, 0);
PADDLE_ENFORCE_LE(dst_size, it->second);
PADDLE_ENFORCE_GE(max_size, dst_size);
auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
cudaMemcpyDeviceToDevice, stream_),
0);
}
void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
size_t max_size) {
// determine data size
auto *output = TensorRTEngine::GetITensor(name);
nvinfer1::Dims dims = output->getDimensions();
auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
size_t dst_size = dim_size * runtime_batch_ *
kDataTypeSize[static_cast<int>(output->getType())];
auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end());
PADDLE_ENFORCE_GT(it->second, 0);
PADDLE_ENFORCE_LE(dst_size, it->second);
PADDLE_ENFORCE_GE(max_size, dst_size);
auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
cudaMemcpyDeviceToHost, stream_));
}
Buffer &TensorRTEngine::buffer(const std::string &name) {
PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end(), "tried to access buffer named %s",
name);
auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
return buffers_[slot_offset];
}
void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
size_t size) {
auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer);
PADDLE_ENFORCE_NOT_NULL(data);
PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
PADDLE_ENFORCE(buf.device == DeviceType::GPU);
buf.size = size;
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
cudaMemcpyHostToDevice, stream_));
}
void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
size_t size) {
auto &buf = buffer(name);
buf.size = size;
PADDLE_ENFORCE_NOT_NULL(buf.buffer);
PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
PADDLE_ENFORCE(buf.device == DeviceType::GPU);
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
cudaMemcpyDeviceToDevice, stream_));
}
void TensorRTEngine::SetITensor(const std::string &name, void TensorRTEngine::SetITensor(const std::string &name,
nvinfer1::ITensor *tensor) { nvinfer1::ITensor *tensor) {
PADDLE_ENFORCE(tensor != nullptr); PADDLE_ENFORCE(tensor != nullptr);
...@@ -254,13 +135,6 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { ...@@ -254,13 +135,6 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; } int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }
void TensorRTEngine::freshDeviceId() {
int count;
cudaGetDeviceCount(&count);
PADDLE_ENFORCE_LT(device_, count);
cudaSetDevice(device_);
}
nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
nvinfer1::ITensor *const *inputs, int num_inputs, nvinfer1::ITensor *const *inputs, int num_inputs,
plugin::PluginTensorRT *plugin) { plugin::PluginTensorRT *plugin) {
...@@ -268,6 +142,13 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( ...@@ -268,6 +142,13 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
return infer_network_.get()->addPluginExt(inputs, num_inputs, *plugin); return infer_network_.get()->addPluginExt(inputs, num_inputs, *plugin);
} }
void TensorRTEngine::freshDeviceId() {
int count;
cudaGetDeviceCount(&count);
PADDLE_ENFORCE_LT(device_id_, count);
cudaSetDevice(device_id_);
}
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -23,6 +23,7 @@ limitations under the License. */ ...@@ -23,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/engine.h"
#include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
...@@ -37,7 +38,9 @@ class TRTInt8Calibrator; ...@@ -37,7 +38,9 @@ class TRTInt8Calibrator;
* There are two alternative ways to use it, one is to build from a paddle * There are two alternative ways to use it, one is to build from a paddle
* protobuf model, another way is to manully construct the network. * protobuf model, another way is to manully construct the network.
*/ */
class TensorRTEngine : public EngineBase { class TensorRTEngine {
using DescType = ::paddle::framework::proto::BlockDesc;
public: public:
// Weight is model parameter. // Weight is model parameter.
class Weight { class Weight {
...@@ -56,28 +59,28 @@ class TensorRTEngine : public EngineBase { ...@@ -56,28 +59,28 @@ class TensorRTEngine : public EngineBase {
nvinfer1::Weights w_; nvinfer1::Weights w_;
}; };
TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream, TensorRTEngine(int max_batch, int max_workspace, bool enable_int8 = false,
int device = 0, bool enable_int8 = false, TRTInt8Calibrator* calibrator = nullptr, int device_id = 0,
TRTInt8Calibrator* calibrator = nullptr,
nvinfer1::ILogger& logger = NaiveLogger::Global()) nvinfer1::ILogger& logger = NaiveLogger::Global())
: max_batch_(max_batch), : max_batch_(max_batch),
max_workspace_(max_workspace), max_workspace_(max_workspace),
stream_(stream),
device_(device),
enable_int8_(enable_int8), enable_int8_(enable_int8),
calibrator_(calibrator), calibrator_(calibrator),
device_id_(device_id),
logger_(logger) {} logger_(logger) {}
virtual ~TensorRTEngine(); ~TensorRTEngine() {}
// TODO(Superjomn) implement it later when graph segmentation is supported. // TODO(Superjomn) implement it later when graph segmentation is supported.
void Build(const DescType& paddle_model) override; void Build(const DescType& paddle_model);
void Execute(int batch_size) override; void Execute(int batch_size, std::vector<void*>* buffers,
cudaStream_t stream);
// Initialize the inference network, so that TensorRT layers can add to this // Initialize the inference network, so that TensorRT layers can add to this
// network. // network.
void InitNetwork() { void InitNetwork() {
freshDeviceId();
infer_builder_.reset(createInferBuilder(&logger_)); infer_builder_.reset(createInferBuilder(&logger_));
infer_network_.reset(infer_builder_->createNetwork()); infer_network_.reset(infer_builder_->createNetwork());
} }
...@@ -98,37 +101,34 @@ class TensorRTEngine : public EngineBase { ...@@ -98,37 +101,34 @@ class TensorRTEngine : public EngineBase {
// Check if the ITensor has been declared // Check if the ITensor has been declared
bool HasDeclared(const std::string& name); bool HasDeclared(const std::string& name);
// GPU memory address for an ITensor with specific name. One can operate on
// these memory directly for acceleration, for example, output the converted
// data directly to the buffer to save data copy overhead.
// NOTE this should be used after calling `FreezeNetwork`.
Buffer& buffer(const std::string& name) override;
cudaStream_t stream() { return stream_; }
// Fill an input from CPU memory with name and size.
void SetInputFromCPU(const std::string& name, const void* data, size_t size);
// TODO(Superjomn) is this method necessary given that buffer(xxx) can be
// accessed directly. Fill an input from GPU memory with name and size.
void SetInputFromGPU(const std::string& name, const void* data, size_t size);
// Get an output called name, the output of tensorrt is in GPU, so this method
// Return the output's GPU memory address without copy.
void* GetOutputInGPU(const std::string& name);
// Copy data into dst inside the GPU device.
void GetOutputInGPU(const std::string& name, void* dst, size_t max_size);
// LOW EFFICENCY! Get output to CPU, this will trigger a memory copy from GPU
// to CPU.
void GetOutputInCPU(const std::string& name, void* dst, size_t max_size);
// Fill an ITensor into map itensor_map_.
void SetITensor(const std::string& name, nvinfer1::ITensor* tensor); void SetITensor(const std::string& name, nvinfer1::ITensor* tensor);
// Get an ITensor called name. // Get an ITensor called name.
nvinfer1::ITensor* GetITensor(const std::string& name); nvinfer1::ITensor* GetITensor(const std::string& name);
nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); } nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
nvinfer1::INetworkDefinition* network() { return infer_network_.get(); } nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
nvinfer1::IHostMemory* Serialize() {
PADDLE_ENFORCE(infer_engine_ != nullptr,
"You should build engine first and then serialize");
ihost_memory_.reset(infer_engine_->serialize());
return ihost_memory_.get();
}
void Deserialize(const std::string& engine_serialized_data) {
freshDeviceId();
infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
infer_engine_.reset(runtime->deserializeCudaEngine(
engine_serialized_data.c_str(), engine_serialized_data.size(),
&inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
PADDLE_ENFORCE(infer_engine_ != nullptr,
"build cuda engine failed when deserialize engine info.!");
infer_context_.reset(infer_engine_->createExecutionContext());
}
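A caller-side sketch of the new serialization path (my illustration, not part of this patch; the helper name SaveAndRestore and the std::string round-trip are assumptions):

  // Minimal sketch, assuming `built` has already gone through FreezeNetwork().
  inline void SaveAndRestore(TensorRTEngine* built, TensorRTEngine* fresh) {
    nvinfer1::IHostMemory* blob = built->Serialize();  // owned via ihost_memory_
    std::string bytes(static_cast<const char*>(blob->data()), blob->size());
    // Deserialize() rebuilds infer_engine_ and infer_context_ through the
    // registered plugin factory, so serialized custom plugins come back too.
    fresh->Deserialize(bytes);
  }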
void SetRuntimeBatch(size_t batch_size); void SetRuntimeBatch(size_t batch_size);
int GetRuntimeBatch(); int GetRuntimeBatch();
int GetDevice() { return device_; } int GetDeviceId() { return device_id_; }
nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs, nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
int num_inputs, plugin::PluginTensorRT*); int num_inputs, plugin::PluginTensorRT*);
...@@ -140,17 +140,12 @@ class TensorRTEngine : public EngineBase { ...@@ -140,17 +140,12 @@ class TensorRTEngine : public EngineBase {
std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>> std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
weight_map; weight_map;
// TODO(NHZLX)
// In the normal case, the paddle-trt exists bug when runing the googlenet.
// When there are more than two convolutions of 1 * 1 with the same input, the
// paddle-tensorrt will do the merging optimization, which fuse those conv
// into one conv, and then trigger bug. So, We should use strategy to avoid
// this
// optimization for the time being. This bug will be fixed in the future.
std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
itensor_quote_num;
private: private:
// Each ICudaEngine object is bound to a specific GPU when it is instantiated,
// ensure that the thread is associated with the correct device by calling
// freshDeviceId().
void freshDeviceId();
// the max batch size // the max batch size
int max_batch_; int max_batch_;
// the runtime batch size // the runtime batch size
...@@ -158,18 +153,14 @@ class TensorRTEngine : public EngineBase { ...@@ -158,18 +153,14 @@ class TensorRTEngine : public EngineBase {
// the max memory size the engine uses // the max memory size the engine uses
int max_workspace_; int max_workspace_;
cudaStream_t stream_;
// The specific GPU id that the TensorRTEngine bounded to.
int device_;
bool enable_int8_; bool enable_int8_;
TRTInt8Calibrator* calibrator_; TRTInt8Calibrator* calibrator_;
  // batch size of the current data, will be updated on each execution.  // batch size of the current data, will be updated on each execution.
int batch_size_{-1}; int batch_size_{-1};
int device_id_;
nvinfer1::ILogger& logger_; nvinfer1::ILogger& logger_;
std::vector<Buffer> buffers_;
// max data size for the buffers. // max data size for the buffers.
std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_; std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;
std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/> std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
...@@ -192,15 +183,11 @@ class TensorRTEngine : public EngineBase { ...@@ -192,15 +183,11 @@ class TensorRTEngine : public EngineBase {
infer_ptr<nvinfer1::INetworkDefinition> infer_network_; infer_ptr<nvinfer1::INetworkDefinition> infer_network_;
infer_ptr<nvinfer1::ICudaEngine> infer_engine_; infer_ptr<nvinfer1::ICudaEngine> infer_engine_;
infer_ptr<nvinfer1::IExecutionContext> infer_context_; infer_ptr<nvinfer1::IExecutionContext> infer_context_;
// Each ICudaEngine object is bound to a specific GPU when it is instantiated, infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
// ensure that the thread is associated with the correct device by calling
// freshDeviceId().
void freshDeviceId();
}; // class TensorRTEngine }; // class TensorRTEngine
// Add a layer__ into engine__ with args ARGS. // Add a layer__ into engine__ with args ARGS.
// For example: // For example:
// TRT_ENGINE_ADD_LAYER(xxx, FullyConnected, input, dim, weights, bias)
// //
// Reference // Reference
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#charRNN_define_network // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#charRNN_define_network
......
...@@ -17,6 +17,9 @@ ...@@ -17,6 +17,9 @@
#include <NvInfer.h> #include <NvInfer.h>
#include <cuda.h> #include <cuda.h>
#include <glog/logging.h> #include <glog/logging.h>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/platform/dynload/tensorrt.h" #include "paddle/fluid/platform/dynload/tensorrt.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -74,6 +77,32 @@ class NaiveLogger : public nvinfer1::ILogger { ...@@ -74,6 +77,32 @@ class NaiveLogger : public nvinfer1::ILogger {
~NaiveLogger() override {} ~NaiveLogger() override {}
}; };
class NaiveProfiler : public nvinfer1::IProfiler {
public:
typedef std::pair<std::string, float> Record;
std::vector<Record> mProfile;
virtual void reportLayerTime(const char* layerName, float ms) {
auto record =
std::find_if(mProfile.begin(), mProfile.end(),
[&](const Record& r) { return r.first == layerName; });
if (record == mProfile.end())
mProfile.push_back(std::make_pair(layerName, ms));
else
record->second += ms;
}
void printLayerTimes() {
float totalTime = 0;
for (size_t i = 0; i < mProfile.size(); i++) {
printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(),
mProfile[i].second);
totalTime += mProfile[i].second;
}
printf("Time over all layers: %4.3f\n", totalTime);
}
};
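A hedged usage sketch for NaiveProfiler; `context`, `batch_size`, and `buffers` are assumed names standing for an existing nvinfer1::IExecutionContext*, the batch size, and the bound device pointers:

  NaiveProfiler profiler;
  context->setProfiler(&profiler);               // TRT invokes reportLayerTime() per layer
  context->execute(batch_size, buffers.data());  // synchronous execute(), so all layer
                                                 // timings are reported before printing
  profiler.printLayerTimes();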
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
nv_library(tensorrt_plugin nv_library(tensorrt_plugin
SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu
prelu_op_plugin.cu trt_plugin_factory.cc
avg_pool_op_plugin.cu avg_pool_op_plugin.cu
DEPS enforce tensorrt_engine prelu) DEPS enforce tensorrt_engine prelu)
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/operators/math/pooling.h"
namespace paddle { namespace paddle {
...@@ -20,6 +21,12 @@ namespace inference { ...@@ -20,6 +21,12 @@ namespace inference {
namespace tensorrt { namespace tensorrt {
namespace plugin { namespace plugin {
AvgPoolPlugin* CreateAvgPoolPluginDeserialize(const void* buffer,
size_t length) {
return new AvgPoolPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("avg_pool_plugin", CreateAvgPoolPluginDeserialize);
nvinfer1::Dims AvgPoolPlugin::getOutputDimensions( nvinfer1::Dims AvgPoolPlugin::getOutputDimensions(
int index, const nvinfer1::Dims* inputDims, int nbInputs) { int index, const nvinfer1::Dims* inputDims, int nbInputs) {
assert(nbInputs == 1); assert(nbInputs == 1);
......
...@@ -33,24 +33,27 @@ class AvgPoolPlugin : public PluginTensorRT { ...@@ -33,24 +33,27 @@ class AvgPoolPlugin : public PluginTensorRT {
protected: protected:
size_t getSerializationSize() override { size_t getSerializationSize() override {
return SerializedSize(ceil_mode_) + SerializedSize(ksize_) + return SerializedSize(getPluginType()) + SerializedSize(ceil_mode_) +
SerializedSize(strides_) + SerializedSize(paddings_) + SerializedSize(ksize_) + SerializedSize(strides_) +
SerializedSize(input_shape_) + getBaseSerializationSize(); SerializedSize(paddings_) + SerializedSize(input_shape_) +
SerializedSize(output_shape_) + getBaseSerializationSize();
} }
  // TRT will call this function when it needs to serialize the plugin  // TRT will call this function when it needs to serialize the plugin
  // configuration.  // configuration.
// It should not be called by users.
void serialize(void *buffer) override { void serialize(void *buffer) override {
SerializeValue(&buffer, getPluginType());
serializeBase(buffer); serializeBase(buffer);
SerializeValue(&buffer, ceil_mode_); SerializeValue(&buffer, ceil_mode_);
SerializeValue(&buffer, ksize_); SerializeValue(&buffer, ksize_);
SerializeValue(&buffer, strides_); SerializeValue(&buffer, strides_);
SerializeValue(&buffer, paddings_); SerializeValue(&buffer, paddings_);
SerializeValue(&buffer, input_shape_); SerializeValue(&buffer, input_shape_);
SerializeValue(&buffer, output_shape_);
} }
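A note on the contract here (my gloss, not text from the patch):

  // getSerializationSize() must stay byte-exact with what serialize() writes,
  // including the plugin type string that is now emitted first; that leading
  // string is what PluginFactoryTensorRT strips off before handing the rest of
  // the buffer to the deserialization constructor.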
public: public:
AvgPoolPlugin() {}
AvgPoolPlugin(bool ceil_mode, std::vector<int> ksize, AvgPoolPlugin(bool ceil_mode, std::vector<int> ksize,
std::vector<int> strides, std::vector<int> paddings, std::vector<int> strides, std::vector<int> paddings,
std::vector<int> input_shape) std::vector<int> input_shape)
...@@ -89,6 +92,7 @@ class AvgPoolPlugin : public PluginTensorRT { ...@@ -89,6 +92,7 @@ class AvgPoolPlugin : public PluginTensorRT {
DeserializeValue(&serialData, &serialLength, &strides_); DeserializeValue(&serialData, &serialLength, &strides_);
DeserializeValue(&serialData, &serialLength, &paddings_); DeserializeValue(&serialData, &serialLength, &paddings_);
DeserializeValue(&serialData, &serialLength, &input_shape_); DeserializeValue(&serialData, &serialLength, &input_shape_);
DeserializeValue(&serialData, &serialLength, &output_shape_);
} }
AvgPoolPlugin *clone() const override { AvgPoolPlugin *clone() const override {
...@@ -96,7 +100,7 @@ class AvgPoolPlugin : public PluginTensorRT { ...@@ -96,7 +100,7 @@ class AvgPoolPlugin : public PluginTensorRT {
input_shape_); input_shape_);
} }
const char *getPluginType() const override { return "avg_pool"; } const char *getPluginType() const override { return "avg_pool_plugin"; }
int getNbOutputs() const override { return 1; } int getNbOutputs() const override { return 1; }
nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs, nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
int nbInputDims) override; int nbInputDims) override;
......
...@@ -14,12 +14,19 @@ limitations under the License. */ ...@@ -14,12 +14,19 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
namespace plugin { namespace plugin {
ElementWisePlugin* CreateElementWisePluginDeserialize(const void* buffer,
size_t length) {
return new ElementWisePlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("elementwise_plugin", CreateElementWisePluginDeserialize);
namespace details { namespace details {
template <typename T> template <typename T>
...@@ -119,10 +126,10 @@ int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs, ...@@ -119,10 +126,10 @@ int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs,
const float* y = reinterpret_cast<const float*>(inputs[1]); const float* y = reinterpret_cast<const float*>(inputs[1]);
float* out = reinterpret_cast<float*>(outputs[0]); float* out = reinterpret_cast<float*>(outputs[0]);
if (type_ == nvinfer1::ElementWiseOperation::kSUM) { if (type_ == "add") {
details::ElementWise(details::Add<float>(), x, y, out, batch_size, details::ElementWise(details::Add<float>(), x, y, out, batch_size,
prev_size_, midd_size_, post_size_, stream); prev_size_, midd_size_, post_size_, stream);
} else if (type_ == nvinfer1::ElementWiseOperation::kPROD) { } else if (type_ == "mul") {
details::ElementWise(details::Mul<float>(), x, y, out, batch_size, details::ElementWise(details::Mul<float>(), x, y, out, batch_size,
prev_size_, midd_size_, post_size_, stream); prev_size_, midd_size_, post_size_, stream);
} else { } else {
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector> #include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
...@@ -24,9 +25,8 @@ namespace plugin { ...@@ -24,9 +25,8 @@ namespace plugin {
class ElementWisePlugin : public PluginTensorRT { class ElementWisePlugin : public PluginTensorRT {
public: public:
ElementWisePlugin(nvinfer1::ElementWiseOperation type, ElementWisePlugin(std::string type, nvinfer1::Dims const &dims_x,
nvinfer1::Dims const &dims_x, nvinfer1::Dims const &dims_y, nvinfer1::Dims const &dims_y, int axis)
int axis)
: type_(type), : type_(type),
dims_x_(dims_x), dims_x_(dims_x),
dims_y_(dims_y), dims_y_(dims_y),
...@@ -37,6 +37,9 @@ class ElementWisePlugin : public PluginTensorRT { ...@@ -37,6 +37,9 @@ class ElementWisePlugin : public PluginTensorRT {
ElementWisePlugin(void const *serial_data, size_t serial_length) { ElementWisePlugin(void const *serial_data, size_t serial_length) {
deserializeBase(serial_data, serial_length); deserializeBase(serial_data, serial_length);
const char *elementwise_type;
DeserializeValue(&serial_data, &serial_length, &elementwise_type);
type_ = std::string(elementwise_type);
DeserializeValue(&serial_data, &serial_length, &axis_); DeserializeValue(&serial_data, &serial_length, &axis_);
DeserializeValue(&serial_data, &serial_length, &dims_x_); DeserializeValue(&serial_data, &serial_length, &dims_x_);
DeserializeValue(&serial_data, &serial_length, &dims_y_); DeserializeValue(&serial_data, &serial_length, &dims_y_);
...@@ -47,7 +50,7 @@ class ElementWisePlugin : public PluginTensorRT { ...@@ -47,7 +50,7 @@ class ElementWisePlugin : public PluginTensorRT {
return nullptr; return nullptr;
} }
const char *getPluginType() const override { return "elementwise"; } const char *getPluginType() const override { return "elementwise_plugin"; }
nvinfer1::Dims getOutputDimensions(int index, nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims *input_dims, const nvinfer1::Dims *input_dims,
...@@ -61,18 +64,21 @@ class ElementWisePlugin : public PluginTensorRT { ...@@ -61,18 +64,21 @@ class ElementWisePlugin : public PluginTensorRT {
protected: protected:
size_t getSerializationSize() override { size_t getSerializationSize() override {
return SerializedSize(axis_) + SerializedSize(dims_x_) + return SerializedSize(getPluginType()) + SerializedSize(axis_) +
SerializedSize(dims_y_) + getBaseSerializationSize(); SerializedSize(dims_x_) + SerializedSize(dims_y_) +
getBaseSerializationSize();
} }
void serialize(void *buffer) override { void serialize(void *buffer) override {
SerializeValue(&buffer, getPluginType());
serializeBase(buffer); serializeBase(buffer);
SerializeValue(&buffer, type_.c_str());
SerializeValue(&buffer, axis_); SerializeValue(&buffer, axis_);
SerializeValue(&buffer, dims_x_); SerializeValue(&buffer, dims_x_);
SerializeValue(&buffer, dims_y_); SerializeValue(&buffer, dims_y_);
} }
nvinfer1::ElementWiseOperation type_; std::string type_;
nvinfer1::Dims dims_x_; nvinfer1::Dims dims_x_;
nvinfer1::Dims dims_y_; nvinfer1::Dims dims_y_;
int axis_; int axis_;
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <vector> #include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/operators/math/prelu.h" #include "paddle/fluid/operators/math/prelu.h"
namespace paddle { namespace paddle {
...@@ -24,6 +25,17 @@ namespace inference { ...@@ -24,6 +25,17 @@ namespace inference {
namespace tensorrt { namespace tensorrt {
namespace plugin { namespace plugin {
PReluPlugin *CreatePreluPluginDeserialize(const void *buffer, size_t length) {
return new PReluPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("prelu_plugin", CreatePreluPluginDeserialize);
int PReluPlugin::initialize() {
cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
cudaMemcpyHostToDevice);
  return 0;
}
nvinfer1::Dims PReluPlugin::getOutputDimensions(int index, nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
const nvinfer1::Dims *inputDims, const nvinfer1::Dims *inputDims,
int nbInputs) { int nbInputs) {
...@@ -39,7 +51,8 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs, ...@@ -39,7 +51,8 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
// input dims is CHW. // input dims is CHW.
const auto &input_dims = this->getInputDims(0); const auto &input_dims = this->getInputDims(0);
const float *input = reinterpret_cast<const float *>(inputs[0]); const float *input = reinterpret_cast<const float *>(inputs[0]);
const float *alpha = reinterpret_cast<const float *>(alpha_.get().values); // const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
const float *alpha = p_gpu_weight_;
float *output = reinterpret_cast<float **>(outputs)[0]; float *output = reinterpret_cast<float **>(outputs)[0];
std::vector<int> input_shape; std::vector<int> input_shape;
......
...@@ -14,7 +14,12 @@ ...@@ -14,7 +14,12 @@
#pragma once #pragma once
#include <algorithm>
#include <string> #include <string>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
...@@ -24,39 +29,51 @@ namespace tensorrt { ...@@ -24,39 +29,51 @@ namespace tensorrt {
namespace plugin { namespace plugin {
class PReluPlugin : public PluginTensorRT { class PReluPlugin : public PluginTensorRT {
TensorRTEngine::Weight alpha_; std::vector<float> weight_;
float *p_gpu_weight_;
std::string mode_; std::string mode_;
protected: protected:
size_t getSerializationSize() override { size_t getSerializationSize() override {
// return getBaseSerializationSize(alpha_) + SerializedSize(mode_); return getBaseSerializationSize() + SerializedSize(mode_.c_str()) +
return 0; SerializedSize(weight_) + SerializedSize(getPluginType());
} }
  // TRT will call this function when it needs to serialize the plugin  // TRT will call this function when it needs to serialize the plugin
  // configuration.  // configuration.
// It should not be called by users. // It should not be called by users.
void serialize(void *buffer) override { void serialize(void *buffer) override {
// serializeBase(buffer); SerializeValue(&buffer, getPluginType());
// SerializeValue(&buffer, alpha_); serializeBase(buffer);
// SerializeValue(&buffer, mode_); SerializeValue(&buffer, weight_);
SerializeValue(&buffer, mode_.c_str());
} }
public: public:
PReluPlugin(TensorRTEngine::Weight const &alpha, std::string const &mode) PReluPlugin(const float *weight, const int weight_num,
: alpha_(alpha), mode_(mode) {} std::string const &mode)
: mode_(mode) {
weight_.resize(weight_num);
std::copy(weight, weight + weight_num, weight_.data());
}
// It was used for tensorrt deserialization. // It was used for tensorrt deserialization.
// It should not be called by users. // It should not be called by users.
PReluPlugin(void const *serialData, size_t serialLength) { PReluPlugin(void const *serialData, size_t serialLength) {
// deserializeBase(serialData, serialLength); deserializeBase(serialData, serialLength);
// DeserializeValue(&serialData, &serialLength, &alpha_); DeserializeValue(&serialData, &serialLength, &weight_);
// DeserializeValue(&serialData, &serialLength, &mode_); const char *prelu_mode;
DeserializeValue(&serialData, &serialLength, &prelu_mode);
mode_ = std::string(prelu_mode);
} }
~PReluPlugin() { cudaFree(p_gpu_weight_); }
int initialize() override;
PReluPlugin *clone() const override { return new PReluPlugin(alpha_, mode_); } PReluPlugin *clone() const override {
return new PReluPlugin(weight_.data(), weight_.size(), mode_);
}
const char *getPluginType() const override { return "prelu"; } const char *getPluginType() const override { return "prelu_plugin"; }
int getNbOutputs() const override { return 1; } int getNbOutputs() const override { return 1; }
nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs, nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
int nbInputDims) override; int nbInputDims) override;
......
...@@ -15,12 +15,18 @@ ...@@ -15,12 +15,18 @@
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <algorithm> #include <algorithm>
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
namespace plugin { namespace plugin {
SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) {
return new SplitPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize);
// copied from operators::math::SplitFunctor // copied from operators::math::SplitFunctor
template <typename T> template <typename T>
__global__ void SplitKernel(const T* input_data, const int in_row, __global__ void SplitKernel(const T* input_data, const int in_row,
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <thrust/device_vector.h> #include <thrust/device_vector.h>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
...@@ -25,6 +26,7 @@ namespace plugin { ...@@ -25,6 +26,7 @@ namespace plugin {
class SplitPlugin : public PluginTensorRT { class SplitPlugin : public PluginTensorRT {
public: public:
SplitPlugin() {}
SplitPlugin(int axis, std::vector<int> const &output_lengths) SplitPlugin(int axis, std::vector<int> const &output_lengths)
: axis_(axis), same_shape_(true), output_length_(output_lengths) {} : axis_(axis), same_shape_(true), output_length_(output_lengths) {}
...@@ -38,7 +40,7 @@ class SplitPlugin : public PluginTensorRT { ...@@ -38,7 +40,7 @@ class SplitPlugin : public PluginTensorRT {
return new SplitPlugin(axis_, output_length_); return new SplitPlugin(axis_, output_length_);
} }
const char *getPluginType() const override { return "split"; } const char *getPluginType() const override { return "split_plugin"; }
int getNbOutputs() const override { return output_length_.size(); } int getNbOutputs() const override { return output_length_.size(); }
nvinfer1::Dims getOutputDimensions(int index, nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims *input_dims, const nvinfer1::Dims *input_dims,
...@@ -50,11 +52,12 @@ class SplitPlugin : public PluginTensorRT { ...@@ -50,11 +52,12 @@ class SplitPlugin : public PluginTensorRT {
protected: protected:
size_t getSerializationSize() override { size_t getSerializationSize() override {
return SerializedSize(axis_) + SerializedSize(output_length_) + return SerializedSize(getPluginType()) + SerializedSize(axis_) +
getBaseSerializationSize(); SerializedSize(output_length_) + getBaseSerializationSize();
} }
void serialize(void *buffer) override { void serialize(void *buffer) override {
SerializeValue(&buffer, getPluginType());
serializeBase(buffer); serializeBase(buffer);
SerializeValue(&buffer, axis_); SerializeValue(&buffer, axis_);
SerializeValue(&buffer, output_length_); SerializeValue(&buffer, output_length_);
......
...@@ -17,9 +17,10 @@ ...@@ -17,9 +17,10 @@
#include <NvInfer.h> #include <NvInfer.h>
#include <cstring> #include <cstring>
#include <unordered_map> #include <unordered_map>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/serialize.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -30,6 +31,13 @@ namespace inference { ...@@ -30,6 +31,13 @@ namespace inference {
namespace tensorrt { namespace tensorrt {
namespace plugin { namespace plugin {
class PluginTensorRT;
typedef std::function<PluginTensorRT*(const void*, size_t)>
PluginDeserializeFunc;
typedef std::function<PluginTensorRT*(void)> PluginConstructFunc;
class PluginTensorRT : public nvinfer1::IPluginExt { class PluginTensorRT : public nvinfer1::IPluginExt {
public: public:
PluginTensorRT() {} PluginTensorRT() {}
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
const void* serial_data,
size_t serial_length) {
const char* plugin_type;
DeserializeValue(&serial_data, &serial_length, &plugin_type);
PADDLE_ENFORCE(Has(plugin_type),
"trt plugin type %s does not exists, check it.", plugin_type);
auto plugin = plugin_registry_[plugin_type](serial_data, serial_length);
owned_plugins_.emplace_back(plugin);
return plugin;
}
bool PluginFactoryTensorRT::RegisterPlugin(
const std::string& op_name, PluginDeserializeFunc deserialize_func) {
if (Has(op_name)) return false;
auto ret = plugin_registry_.emplace(op_name, deserialize_func);
return ret.second;
}
void PluginFactoryTensorRT::DestroyPlugins() { owned_plugins_.clear(); }
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
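For orientation, the serialized layout this factory relies on, as I read the code (not an official format spec):

  //   [plugin type c-string][base fields written by serializeBase()][plugin-specific fields]
  //
  // createPlugin() pops the leading type string off the blob and dispatches to
  // the deserialize function registered under that name, so each registered
  // function receives the buffer positioned just past the type string, which is
  // exactly what the plugins' (const void*, size_t) constructors expect.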
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <NvInfer.h>
#include <cstring>
#include <list>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
class PluginFactoryTensorRT : public nvinfer1::IPluginFactory,
public DeleteHelper {
public:
// Deserialization method
PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data,
size_t serial_length) override;
bool RegisterPlugin(const std::string& op_name,
PluginDeserializeFunc deserialize_func);
bool Has(const std::string& op_name) {
return plugin_registry_.find(op_name) != plugin_registry_.end();
}
void DestroyPlugins();
protected:
std::unordered_map<std::string, PluginDeserializeFunc> plugin_registry_;
std::list<std::unique_ptr<PluginTensorRT>> owned_plugins_;
};
class TrtPluginRegistrar {
public:
TrtPluginRegistrar(const std::string& name,
PluginDeserializeFunc deserialize_func) {
inference::Singleton<PluginFactoryTensorRT>::Global().RegisterPlugin(
name, deserialize_func);
}
};
#define REGISTER_TRT_PLUGIN(name, deserialize_func) \
REGISTER_TRT_PLUGIN_UNIQ(__COUNTER__, name, deserialize_func)
#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func) \
static paddle::inference::tensorrt::plugin::TrtPluginRegistrar \
trt_plugin_registrar##ctr __attribute__((unused)) = \
paddle::inference::tensorrt::plugin::TrtPluginRegistrar( \
name, deserialize_func)
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
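A hypothetical registration mirroring the pattern the avg_pool/elementwise/prelu/split plugins follow in this change; MyPlugin and "my_plugin" are illustrative names only:

  // MyPlugin must derive from PluginTensorRT and provide the
  // (const void*, size_t) deserialization constructor.
  PluginTensorRT* CreateMyPluginDeserialize(const void* buffer, size_t length) {
    return new MyPlugin(buffer, length);
  }
  // The registered name must match MyPlugin::getPluginType(), since
  // PluginFactoryTensorRT::createPlugin() reads that string from the serialized
  // blob to look up this function.
  REGISTER_TRT_PLUGIN("my_plugin", CreateMyPluginDeserialize);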
...@@ -13,8 +13,8 @@ ...@@ -13,8 +13,8 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <cstring> #include <cstring>
#include <string>
#include <type_traits> #include <type_traits>
#include <vector> #include <vector>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -24,6 +24,13 @@ namespace inference { ...@@ -24,6 +24,13 @@ namespace inference {
namespace tensorrt { namespace tensorrt {
namespace plugin { namespace plugin {
// Some TRT base classes lack a virtual destructor.
// We use this helper class to work around that.
struct DeleteHelper {
protected:
virtual ~DeleteHelper() {}
};
template <typename T> template <typename T>
inline void SerializeValue(void** buffer, T const& value); inline void SerializeValue(void** buffer, T const& value);
......
...@@ -17,6 +17,8 @@ limitations under the License. */ ...@@ -17,6 +17,8 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -27,19 +29,34 @@ namespace tensorrt { ...@@ -27,19 +29,34 @@ namespace tensorrt {
class TensorRTEngineTest : public ::testing::Test { class TensorRTEngineTest : public ::testing::Test {
protected: protected:
void SetUp() override { void SetUp() override {
ASSERT_EQ(0, cudaStreamCreate(&stream_)); ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
engine_ = new TensorRTEngine(10, 1 << 10, stream_);
engine_ = new TensorRTEngine(10, 1 << 10);
engine_->InitNetwork(); engine_->InitNetwork();
} }
void TearDown() override { void TearDown() override {
delete engine_; if (engine_) {
cudaStreamDestroy(stream_); delete engine_;
engine_ = nullptr;
}
}
void PrepareInputOutput(const std::vector<float> &input,
std::vector<int> output_shape) {
TensorFromVector(input, *ctx_, &input_);
output_.Resize(framework::make_ddim(output_shape));
}
void GetOutput(std::vector<float> *output) {
TensorToVector(output_, *ctx_, output);
} }
protected: protected:
TensorRTEngine* engine_; framework::Tensor input_;
cudaStream_t stream_; framework::Tensor output_;
TensorRTEngine *engine_;
platform::CUDADeviceContext *ctx_;
}; };
TEST_F(TensorRTEngineTest, add_layer) { TEST_F(TensorRTEngineTest, add_layer) {
...@@ -48,12 +65,14 @@ TEST_F(TensorRTEngineTest, add_layer) { ...@@ -48,12 +65,14 @@ TEST_F(TensorRTEngineTest, add_layer) {
float raw_weight[size] = {2.}; // Weight in CPU memory. float raw_weight[size] = {2.}; // Weight in CPU memory.
float raw_bias[size] = {3.}; float raw_bias[size] = {3.};
std::vector<void *> buffers(2); // TRT binded inputs
LOG(INFO) << "create weights"; LOG(INFO) << "create weights";
TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, size); TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, size);
TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, size); TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, size);
auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
nvinfer1::DimsCHW{1, 1, 1}); nvinfer1::DimsCHW{1, 1, 1});
auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size, auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
weight.get(), bias.get()); weight.get(), bias.get());
PADDLE_ENFORCE(fc_layer != nullptr); PADDLE_ENFORCE(fc_layer != nullptr);
...@@ -63,18 +82,24 @@ TEST_F(TensorRTEngineTest, add_layer) { ...@@ -63,18 +82,24 @@ TEST_F(TensorRTEngineTest, add_layer) {
ASSERT_EQ(engine_->engine()->getNbBindings(), 2); ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
// fill in real data // fill in real data
float x_v = 1234; std::vector<float> x_v = {1234};
engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v), std::vector<float> y_cpu;
1 * sizeof(float)); PrepareInputOutput(x_v, {1});
auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
buffers[1] = reinterpret_cast<void *>(y_gpu_data);
LOG(INFO) << "to execute"; LOG(INFO) << "to execute";
engine_->Execute(1); engine_->Execute(1, &buffers, ctx_->stream());
LOG(INFO) << "to get output"; LOG(INFO) << "to get output";
float y_cpu; GetOutput(&y_cpu);
engine_->GetOutputInCPU("y", &y_cpu, 1 * sizeof(float));
LOG(INFO) << "to checkout output"; LOG(INFO) << "to checkout output";
ASSERT_EQ(y_cpu, x_v * 2 + 3); ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3);
} }
TEST_F(TensorRTEngineTest, add_layer_multi_dim) { TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
...@@ -83,12 +108,13 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { ...@@ -83,12 +108,13 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
// instead of row-major, which is [[1.0, 1.1], [3.3, 4.4]] // instead of row-major, which is [[1.0, 1.1], [3.3, 4.4]]
float raw_weight[4] = {1.0, 1.1, 3.3, 4.4}; float raw_weight[4] = {1.0, 1.1, 3.3, 4.4};
float raw_bias[2] = {1.3, 2.4}; float raw_bias[2] = {1.3, 2.4};
std::vector<void *> buffers(2); // TRT binded inputs
TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 4); TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 4);
TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 2); TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 2);
auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
nvinfer1::DimsCHW{1, 2, 1}); nvinfer1::DimsCHW{1, 2, 1});
auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2, auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
weight.get(), bias.get()); weight.get(), bias.get());
PADDLE_ENFORCE(fc_layer != nullptr); PADDLE_ENFORCE(fc_layer != nullptr);
...@@ -96,19 +122,27 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { ...@@ -96,19 +122,27 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
engine_->FreezeNetwork(); engine_->FreezeNetwork();
ASSERT_EQ(engine_->engine()->getNbBindings(), 2); ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
float x_v[2] = {1.0, 2.0}; // fill in real data
engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v), std::vector<float> x_v = {1.0, 2.0};
2 * sizeof(float)); std::vector<float> y_cpu;
engine_->Execute(1); PrepareInputOutput(x_v, {2});
auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
buffers[1] = reinterpret_cast<void *>(y_gpu_data);
engine_->Execute(1, &buffers, ctx_->stream());
LOG(INFO) << "to get output"; LOG(INFO) << "to get output";
float y_cpu[2] = {-1., -1.}; GetOutput(&y_cpu);
auto dims = engine_->GetITensor("y")->getDimensions(); auto dims = engine_->GetITensor("y")->getDimensions();
ASSERT_EQ(dims.nbDims, 3); ASSERT_EQ(dims.nbDims, 3);
ASSERT_EQ(dims.d[0], 2); ASSERT_EQ(dims.d[0], 2);
ASSERT_EQ(dims.d[1], 1); ASSERT_EQ(dims.d[1], 1);
engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
ASSERT_EQ(y_cpu[0], 4.5); ASSERT_EQ(y_cpu[0], 4.5);
ASSERT_EQ(y_cpu[1], 14.5); ASSERT_EQ(y_cpu[1], 14.5);
} }
...@@ -117,12 +151,13 @@ TEST_F(TensorRTEngineTest, test_conv2d) { ...@@ -117,12 +151,13 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
// Weight in CPU memory. // Weight in CPU memory.
float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
float raw_bias[1] = {0}; float raw_bias[1] = {0};
std::vector<void *> buffers(2); // TRT binded inputs
TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9); TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9);
TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1); TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1);
auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
nvinfer1::Dims3{1, 3, 3}); nvinfer1::Dims3{1, 3, 3});
auto* conv_layer = auto *conv_layer =
TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3}, TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
weight.get(), bias.get()); weight.get(), bias.get());
PADDLE_ENFORCE(conv_layer != nullptr); PADDLE_ENFORCE(conv_layer != nullptr);
...@@ -133,28 +168,36 @@ TEST_F(TensorRTEngineTest, test_conv2d) { ...@@ -133,28 +168,36 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
engine_->FreezeNetwork(); engine_->FreezeNetwork();
ASSERT_EQ(engine_->engine()->getNbBindings(), 2); ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // fill in real data
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; std::vector<float> x_v = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v), 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
18 * sizeof(float)); std::vector<float> y_cpu;
engine_->Execute(2); PrepareInputOutput(x_v, {18});
auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
buffers[1] = reinterpret_cast<void *>(y_gpu_data);
engine_->Execute(2, &buffers, ctx_->stream());
LOG(INFO) << "to get output"; LOG(INFO) << "to get output";
float* y_cpu = new float[18]; GetOutput(&y_cpu);
engine_->GetOutputInCPU("y", &y_cpu[0], 18 * sizeof(float));
ASSERT_EQ(y_cpu[0], 4.0); ASSERT_EQ(y_cpu[0], 4.0);
ASSERT_EQ(y_cpu[1], 6.0); ASSERT_EQ(y_cpu[1], 6.0);
} }
TEST_F(TensorRTEngineTest, test_pool2d) { TEST_F(TensorRTEngineTest, test_pool2d) {
// Weight in CPU memory. // Weight in CPU memory.
auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
nvinfer1::Dims3{1, 2, 2}); nvinfer1::Dims3{1, 2, 2});
std::vector<void *> buffers(2); // TRT binded inputs
nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE; nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE;
auto* pool_layer = auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t,
TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast<nvinfer1::ITensor*>(x), nvinfer1::DimsHW{2, 2});
pool_t, nvinfer1::DimsHW{2, 2});
PADDLE_ENFORCE(pool_layer != nullptr); PADDLE_ENFORCE(pool_layer != nullptr);
pool_layer->setStride(nvinfer1::DimsHW{1, 1}); pool_layer->setStride(nvinfer1::DimsHW{1, 1});
...@@ -164,14 +207,21 @@ TEST_F(TensorRTEngineTest, test_pool2d) { ...@@ -164,14 +207,21 @@ TEST_F(TensorRTEngineTest, test_pool2d) {
engine_->FreezeNetwork(); engine_->FreezeNetwork();
ASSERT_EQ(engine_->engine()->getNbBindings(), 2); ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
float x_v[8] = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0}; // fill in real data
engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v), std::vector<float> x_v = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0};
8 * sizeof(float)); std::vector<float> y_cpu;
engine_->Execute(2); PrepareInputOutput(x_v, {2});
auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
buffers[1] = reinterpret_cast<void *>(y_gpu_data);
engine_->Execute(2, &buffers, ctx_->stream());
LOG(INFO) << "to get output"; LOG(INFO) << "to get output";
float* y_cpu = new float[2]; GetOutput(&y_cpu);
engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
ASSERT_EQ(y_cpu[0], 2.0); ASSERT_EQ(y_cpu[0], 2.0);
ASSERT_EQ(y_cpu[1], 5.0); ASSERT_EQ(y_cpu[1], 5.0);
......
...@@ -105,6 +105,13 @@ set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1") ...@@ -105,6 +105,13 @@ set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1")
download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz") download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc) inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc)
# transformer, the dataset currently only works with batch_size=8
set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer")
download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp%2Ftransformer_model.tar.gz" "temp%2Ftransformer_data.txt.tar.gz")
inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8)
# ocr # ocr
set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
if (NOT EXISTS ${OCR_INSTALL_DIR}) if (NOT EXISTS ${OCR_INSTALL_DIR})
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
struct DataRecord {
std::vector<std::vector<int64_t>> src_word, src_pos, trg_word, init_idx;
std::vector<std::vector<float>> src_slf_attn_bias, init_score,
trg_src_attn_bias;
std::vector<std::vector<int32_t>> batch_data_shape;
std::vector<std::vector<size_t>> lod;
  size_t batch_iter{0}, batch_size{1}, num_samples{0};  // num_samples: total number of samples
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
Load(path);
}
DataRecord NextBatch() {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
    // NOTE: skip the final batch if not enough data is provided.
if (batch_end <= src_word.size()) {
data.src_word.assign(src_word.begin() + batch_iter,
src_word.begin() + batch_end);
data.src_pos.assign(src_pos.begin() + batch_iter,
src_pos.begin() + batch_end);
data.src_slf_attn_bias.assign(src_slf_attn_bias.begin() + batch_iter,
src_slf_attn_bias.begin() + batch_end);
data.trg_word.assign(trg_word.begin() + batch_iter,
trg_word.begin() + batch_end);
data.init_score.assign(init_score.begin() + batch_iter,
init_score.begin() + batch_end);
data.init_idx.assign(init_idx.begin() + batch_iter,
init_idx.begin() + batch_end);
data.trg_src_attn_bias.assign(trg_src_attn_bias.begin() + batch_iter,
trg_src_attn_bias.begin() + batch_end);
std::vector<int32_t> batch_shape =
*(batch_data_shape.begin() + batch_iter);
data.batch_data_shape.push_back(batch_shape);
data.lod.resize(2);
for (int i = 0; i < batch_shape[0] + 1; i++) {
data.lod[0].push_back(i);
data.lod[1].push_back(i);
}
}
batch_iter += batch_size;
return data;
}
void Load(const std::string &path) {
std::ifstream file(path);
std::string line;
size_t num_lines = 0;
while (std::getline(file, line)) {
num_lines++;
std::vector<std::string> data;
split(line, ',', &data);
CHECK_EQ(data.size(), static_cast<size_t>(8));
// load src_word
std::vector<int64_t> src_word_data;
split_to_int64(data[0], ' ', &src_word_data);
src_word.push_back(std::move(src_word_data));
// load src_pos
std::vector<int64_t> src_pos_data;
split_to_int64(data[1], ' ', &src_pos_data);
src_pos.push_back(std::move(src_pos_data));
// load src_slf_attn_bias
std::vector<float> src_slf_attn_bias_data;
split_to_float(data[2], ' ', &src_slf_attn_bias_data);
src_slf_attn_bias.push_back(std::move(src_slf_attn_bias_data));
// load trg_word
std::vector<int64_t> trg_word_data;
split_to_int64(data[3], ' ', &trg_word_data);
trg_word.push_back(std::move(trg_word_data));
// load init_score
std::vector<float> init_score_data;
split_to_float(data[4], ' ', &init_score_data);
init_score.push_back(std::move(init_score_data));
// load init_idx
std::vector<int64_t> init_idx_data;
split_to_int64(data[5], ' ', &init_idx_data);
init_idx.push_back(std::move(init_idx_data));
// load trg_src_attn_bias
std::vector<float> trg_src_attn_bias_data;
split_to_float(data[6], ' ', &trg_src_attn_bias_data);
trg_src_attn_bias.push_back(std::move(trg_src_attn_bias_data));
// load shape for variant data shape
std::vector<int> batch_data_shape_data;
split_to_int(data[7], ' ', &batch_data_shape_data);
batch_data_shape.push_back(std::move(batch_data_shape_data));
}
num_samples = num_lines;
}
};
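For reference, my reading of the line format Load() expects (the values below are invented and the attention-bias fields are elided):

  // Eight comma-separated fields, each itself a space-separated list:
  //   src_word , src_pos , src_slf_attn_bias , trg_word , init_score ,
  //   init_idx , trg_src_attn_bias , batch_data_shape
  // e.g. "5 28 397,0 1 2,-1e9 ...,0,0.0,0,-1e9 ...,1 8 1 3"
  // where the final field is {batch, n_head, trg_seq_len, src_seq_len}, the
  // shape tuple that PrepareInputs() below unpacks.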
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
auto one_batch = data->NextBatch();
batch_size = one_batch.batch_data_shape[0][0];
auto n_head = one_batch.batch_data_shape[0][1];
auto trg_seq_len = one_batch.batch_data_shape[0][2]; // 1 for inference
auto src_seq_len = one_batch.batch_data_shape[0][3];
PaddleTensor src_word, src_pos, src_slf_attn_bias, trg_word, init_score,
init_idx, trg_src_attn_bias;
src_word.name = "src_word";
src_word.shape.assign({batch_size, src_seq_len, 1});
src_word.dtype = PaddleDType::INT64;
TensorAssignData<int64_t>(&src_word, one_batch.src_word);
src_pos.name = "src_pos";
src_pos.shape.assign({batch_size, src_seq_len, 1});
src_pos.dtype = PaddleDType::INT64;
TensorAssignData<int64_t>(&src_pos, one_batch.src_pos);
src_slf_attn_bias.name = "src_slf_attn_bias";
src_slf_attn_bias.shape.assign(
{batch_size, n_head, src_seq_len, src_seq_len});
src_slf_attn_bias.dtype = PaddleDType::FLOAT32;
TensorAssignData<float>(&src_slf_attn_bias, one_batch.src_slf_attn_bias);
trg_word.name = "trg_word";
trg_word.shape.assign({batch_size, 1});
trg_word.dtype = PaddleDType::INT64;
trg_word.lod.assign(one_batch.lod.begin(), one_batch.lod.end());
TensorAssignData<int64_t>(&trg_word, one_batch.trg_word);
init_score.name = "init_score";
init_score.shape.assign({batch_size, 1});
init_score.dtype = PaddleDType::FLOAT32;
init_score.lod.assign(one_batch.lod.begin(), one_batch.lod.end());
TensorAssignData<float>(&init_score, one_batch.init_score);
init_idx.name = "init_idx";
init_idx.shape.assign({batch_size});
init_idx.dtype = PaddleDType::INT32;
TensorAssignData<int64_t>(&init_idx, one_batch.init_idx);
trg_src_attn_bias.name = "trg_src_attn_bias";
trg_src_attn_bias.shape.assign(
{batch_size, n_head, trg_seq_len, src_seq_len});
trg_src_attn_bias.dtype = PaddleDType::FLOAT32;
TensorAssignData<float>(&trg_src_attn_bias, one_batch.trg_src_attn_bias);
input_slots->assign({src_word, src_pos, src_slf_attn_bias, trg_word,
init_score, init_idx, trg_src_attn_bias});
}
void SetConfig(AnalysisConfig *cfg) {
cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
cfg->DisableGpu();
cfg->SwitchSpecifyInputNames();
cfg->SwitchIrOptim();
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
std::vector<PaddleTensor> input_slots;
int test_batch_num =
FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
LOG(INFO) << "The number of samples to be test: "
<< test_batch_num * FLAGS_batch_size;
for (int bid = 0; bid < test_batch_num; ++bid) {
input_slots.clear();
PrepareInputs(&input_slots, &data, FLAGS_batch_size);
(*inputs).emplace_back(input_slots);
}
}
// Convenient for profiling independently.
TEST(Analyzer_Transformer, profile) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
}
// Check the fuse status
TEST(Analyzer_Transformer, fuse_statis) {
AnalysisConfig cfg;
SetConfig(&cfg);
int num_ops;
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
}
// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_Transformer, compare) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
} // namespace inference
} // namespace paddle
...@@ -25,7 +25,6 @@ ...@@ -25,7 +25,6 @@
#ifdef WITH_GPERFTOOLS #ifdef WITH_GPERFTOOLS
#include <gperftools/profiler.h> #include <gperftools/profiler.h>
#endif #endif
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
...@@ -97,6 +96,14 @@ void CompareResult(const std::vector<PaddleTensor> &outputs, ...@@ -97,6 +96,14 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
} }
break; break;
} }
case PaddleDType::INT32: {
int32_t *pdata = static_cast<int32_t *>(out.data.data());
int32_t *pdata_ref = static_cast<int32_t *>(ref_out.data.data());
for (size_t j = 0; j < size; ++j) {
EXPECT_EQ(pdata_ref[j], pdata[j]);
}
break;
}
} }
} }
} }
......
...@@ -54,7 +54,8 @@ void SetConfig<AnalysisConfig>(AnalysisConfig* config, std::string model_dir, ...@@ -54,7 +54,8 @@ void SetConfig<AnalysisConfig>(AnalysisConfig* config, std::string model_dir,
if (use_gpu) { if (use_gpu) {
config->EnableUseGpu(100, 0); config->EnableUseGpu(100, 0);
if (use_tensorrt) { if (use_tensorrt) {
config->EnableTensorRtEngine(1 << 10, batch_size); config->EnableTensorRtEngine(1 << 10, batch_size, 3,
AnalysisConfig::Precision::kFloat32, false);
config->pass_builder()->DeletePass("conv_bn_fuse_pass"); config->pass_builder()->DeletePass("conv_bn_fuse_pass");
config->pass_builder()->DeletePass("fc_fuse_pass"); config->pass_builder()->DeletePass("fc_fuse_pass");
config->pass_builder()->TurnOnDebug(); config->pass_builder()->TurnOnDebug();
......
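For readability, the expanded call annotated with the parameter meanings as I infer them from this change (the inline names are my annotation, not taken from the header):

  config->EnableTensorRtEngine(1 << 10 /* workspace_size */,
                               batch_size /* max_batch_size */,
                               3 /* min_subgraph_size, assumed */,
                               AnalysisConfig::Precision::kFloat32,
                               false /* use_static: skip the serialized-engine cache */);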
...@@ -51,9 +51,9 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -51,9 +51,9 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("selected_scores", AddOutput("selected_scores",
"A LoDTensor containing the accumulated scores corresponding to " "A LoDTensor containing the accumulated scores corresponding to "
"Output(selected_ids)."); "Output(selected_ids).");
AddOutput( AddOutput("parent_idx",
"parent_idx", "A Tensor preserving the selected_ids' parent indice in pre_ids.")
"A Tensor preserving the selected_ids' parent indice in pre_ids."); .AsDispensable();
// Attributes stored in AttributeMap // Attributes stored in AttributeMap
AddAttr<int>("level", "the level of LoDTensor"); AddAttr<int>("level", "the level of LoDTensor");
......
...@@ -44,7 +44,6 @@ class BeamSearchOpKernel : public framework::OpKernel<T> { ...@@ -44,7 +44,6 @@ class BeamSearchOpKernel : public framework::OpKernel<T> {
auto* parent_idx = context.Output<framework::Tensor>("parent_idx"); auto* parent_idx = context.Output<framework::Tensor>("parent_idx");
PADDLE_ENFORCE_NOT_NULL(selected_ids); PADDLE_ENFORCE_NOT_NULL(selected_ids);
PADDLE_ENFORCE_NOT_NULL(selected_scores); PADDLE_ENFORCE_NOT_NULL(selected_scores);
PADDLE_ENFORCE_NOT_NULL(parent_idx);
math::BeamSearchFunctor<DeviceContext, T> alg; math::BeamSearchFunctor<DeviceContext, T> alg;
alg(context.template device_context<DeviceContext>(), pre_ids, pre_scores, alg(context.template device_context<DeviceContext>(), pre_ids, pre_scores,
......
...@@ -42,8 +42,8 @@ void OpTester::Init(const OpTesterConfig &config) { ...@@ -42,8 +42,8 @@ void OpTester::Init(const OpTesterConfig &config) {
// Initialize the OpDesc // Initialize the OpDesc
if (op_desc_info.Has(config_.op_type)) { if (op_desc_info.Has(config_.op_type)) {
type_ = config_.op_type; type_ = config_.op_type;
op_desc_.SetType(config_.op_type);
CreateOpDesc();
CreateInputVarDesc(); CreateInputVarDesc();
CreateOutputVarDesc(); CreateOutputVarDesc();
} else { } else {
...@@ -131,6 +131,40 @@ std::vector<std::string> OpTester::GetOpProtoOutputNames() { ...@@ -131,6 +131,40 @@ std::vector<std::string> OpTester::GetOpProtoOutputNames() {
return output_names; return output_names;
} }
std::unordered_map<std::string, framework::proto::AttrType>
OpTester::GetOpProtoAttrNames() {
std::unordered_map<std::string, framework::proto::AttrType> attr_types;
const framework::proto::OpProto &proto =
framework::OpInfoMap::Instance().Get(type_).Proto();
const std::vector<std::string> skipped_attrs = {
framework::OpProtoAndCheckerMaker::OpRoleAttrName(),
framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(),
framework::OpProtoAndCheckerMaker::OpNamescopeAttrName(),
framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName()};
for (int i = 0; i != proto.attrs_size(); ++i) {
const auto &attr = proto.attrs(i);
if (!Has(skipped_attrs, attr.name())) {
VLOG(4) << "attr: " << attr.name() << ", type: " << attr.type();
attr_types[attr.name()] = attr.type();
}
}
return attr_types;
}
framework::proto::VarType::Type OpTester::TransToVarType(std::string str) {
if (str == "int32") {
return framework::proto::VarType::INT32;
} else if (str == "int64") {
return framework::proto::VarType::INT64;
} else if (str == "fp32") {
return framework::proto::VarType::FP32;
} else if (str == "fp64") {
return framework::proto::VarType::FP64;
} else {
PADDLE_THROW("Unsupported dtype %s.", str.c_str());
}
}
void OpTester::CreateInputVarDesc() { void OpTester::CreateInputVarDesc() {
std::vector<std::string> input_names = GetOpProtoInputNames(); std::vector<std::string> input_names = GetOpProtoInputNames();
for (auto &name : input_names) { for (auto &name : input_names) {
...@@ -145,11 +179,11 @@ void OpTester::CreateInputVarDesc() { ...@@ -145,11 +179,11 @@ void OpTester::CreateInputVarDesc() {
// Need to support more type // Need to support more type
var->SetType(framework::proto::VarType::LOD_TENSOR); var->SetType(framework::proto::VarType::LOD_TENSOR);
var->SetPersistable(false); var->SetPersistable(false);
var->SetDataType(framework::proto::VarType::FP32); var->SetDataType(TransToVarType(input->dtype));
var->SetShape(input->dims); var->SetShape(input->dims);
op_desc_.SetInput(name, {var_name}); op_desc_.SetInput(name, {var_name});
input_lods_[var_name] = input->lod; inputs_[var_name] = *input;
} }
} }
...@@ -167,6 +201,49 @@ void OpTester::CreateOutputVarDesc() { ...@@ -167,6 +201,49 @@ void OpTester::CreateOutputVarDesc() {
} }
} }
void OpTester::CreateOpDesc() {
op_desc_.SetType(config_.op_type);
std::unordered_map<std::string, framework::proto::AttrType> attr_types =
GetOpProtoAttrNames();
for (auto item : config_.attrs) {
const std::string &name = item.first;
if (attr_types.find(name) == attr_types.end()) {
LOG(FATAL) << "Operator " << type_ << " do not have attr " << name;
}
const std::string &value_str = item.second;
const framework::proto::AttrType &type = attr_types[name];
switch (type) {
case framework::proto::AttrType::BOOLEAN:
break;
case framework::proto::AttrType::INT: {
int value = StringTo<int>(value_str);
op_desc_.SetAttr(name, {value});
} break;
case framework::proto::AttrType::FLOAT: {
float value = StringTo<float>(value_str);
op_desc_.SetAttr(name, {value});
} break;
case framework::proto::AttrType::STRING: {
op_desc_.SetAttr(name, {value_str});
} break;
case framework::proto::AttrType::BOOLEANS:
case framework::proto::AttrType::INTS:
case framework::proto::AttrType::FLOATS:
case framework::proto::AttrType::STRINGS:
LOG(FATAL) << "Not supported yet.";
break;
case framework::proto::AttrType::LONG: {
int64_t value = StringTo<int64_t>(value_str);
op_desc_.SetAttr(name, value);
} break;
case framework::proto::AttrType::LONGS:
default:
PADDLE_THROW("Unsupport attr type %d", type);
}
}
}
framework::VarDesc *OpTester::Var(const std::string &name) { framework::VarDesc *OpTester::Var(const std::string &name) {
auto it = vars_.find(name); auto it = vars_.find(name);
if (it != vars_.end()) { if (it != vars_.end()) {
...@@ -179,24 +256,41 @@ framework::VarDesc *OpTester::Var(const std::string &name) { ...@@ -179,24 +256,41 @@ framework::VarDesc *OpTester::Var(const std::string &name) {
template <typename T> template <typename T>
void OpTester::SetupTensor(framework::LoDTensor *tensor, void OpTester::SetupTensor(framework::LoDTensor *tensor,
const std::vector<int64_t> &shape, T lower, const std::vector<int64_t> &shape, T lower, T upper,
T upper) { const std::string &initializer) {
static unsigned int seed = 100; static unsigned int seed = 100;
std::mt19937 rng(seed++); std::mt19937 rng(seed++);
std::uniform_real_distribution<double> uniform_dist(0, 1); std::uniform_real_distribution<double> uniform_dist(0, 1);
T *ptr = tensor->mutable_data<T>(framework::make_ddim(shape), place_); T *ptr = tensor->mutable_data<T>(framework::make_ddim(shape), place_);
if (platform::is_cpu_place(place_)) {
for (int i = 0; i < tensor->numel(); ++i) { framework::LoDTensor cpu_tensor;
ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower); T *cpu_ptr = nullptr;
}
if (!platform::is_cpu_place(place_)) {
cpu_ptr = cpu_tensor.mutable_data<T>(framework::make_ddim(shape),
platform::CPUPlace());
} else { } else {
framework::LoDTensor cpu_tensor; cpu_ptr = ptr;
T *cpu_ptr = cpu_tensor.mutable_data<T>(framework::make_ddim(shape), }
platform::CPUPlace());
if (initializer == "random") {
for (int i = 0; i < cpu_tensor.numel(); ++i) { for (int i = 0; i < cpu_tensor.numel(); ++i) {
cpu_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower); cpu_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
} }
} else if (initializer == "natural") {
for (int i = 0; i < cpu_tensor.numel(); ++i) {
cpu_ptr[i] = lower + i;
}
} else if (initializer == "zeros") {
for (int i = 0; i < cpu_tensor.numel(); ++i) {
cpu_ptr[i] = 0;
}
} else {
PADDLE_THROW("Unsupported initializer %s.", initializer.c_str());
}
if (!platform::is_cpu_place(place_)) {
TensorCopySync(cpu_tensor, place_, tensor); TensorCopySync(cpu_tensor, place_, tensor);
} }
} }
...@@ -219,7 +313,7 @@ void OpTester::CreateVariables(framework::Scope *scope) { ...@@ -219,7 +313,7 @@ void OpTester::CreateVariables(framework::Scope *scope) {
} }
} }
for (auto &item : input_lods_) { for (auto &item : inputs_) {
// Allocate memory for input tensor // Allocate memory for input tensor
auto &var_name = item.first; auto &var_name = item.first;
VLOG(3) << "Allocate memory for tensor " << var_name; VLOG(3) << "Allocate memory for tensor " << var_name;
...@@ -229,11 +323,23 @@ void OpTester::CreateVariables(framework::Scope *scope) { ...@@ -229,11 +323,23 @@ void OpTester::CreateVariables(framework::Scope *scope) {
auto *var = scope->Var(var_name); auto *var = scope->Var(var_name);
auto *tensor = var->GetMutable<framework::LoDTensor>(); auto *tensor = var->GetMutable<framework::LoDTensor>();
SetupTensor<float>(tensor, shape, static_cast<float>(0.0), const auto &data_type = var_desc->GetDataType();
static_cast<float>(1.0)); if (data_type == framework::proto::VarType::INT32) {
SetupTensor<int>(tensor, shape, 0, 1, item.second.initializer);
} else if (data_type == framework::proto::VarType::INT64) {
SetupTensor<int64_t>(tensor, shape, 0, 1, item.second.initializer);
} else if (data_type == framework::proto::VarType::FP32) {
SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
static_cast<float>(1.0), item.second.initializer);
} else if (data_type == framework::proto::VarType::FP64) {
SetupTensor<double>(tensor, shape, static_cast<double>(0.0),
static_cast<double>(1.0), item.second.initializer);
} else {
PADDLE_THROW("Unsupported dtype %d.", data_type);
}
VLOG(3) << "Set lod for tensor " << var_name; VLOG(3) << "Set lod for tensor " << var_name;
std::vector<std::vector<size_t>> &lod_vec = item.second; std::vector<std::vector<size_t>> &lod_vec = item.second.lod;
framework::LoD lod; framework::LoD lod;
for (size_t i = 0; i < lod_vec.size(); ++i) { for (size_t i = 0; i < lod_vec.size(); ++i) {
lod.push_back(lod_vec[i]); lod.push_back(lod_vec[i]);
...@@ -261,7 +367,16 @@ std::string OpTester::DebugString() { ...@@ -261,7 +367,16 @@ std::string OpTester::DebugString() {
ss << GenSpaces(count) << "type: LOD_TENSOR\n"; ss << GenSpaces(count) << "type: LOD_TENSOR\n";
ss << GenSpaces(count++) << "lod_tensor {\n"; ss << GenSpaces(count++) << "lod_tensor {\n";
ss << GenSpaces(count++) << "tensor {\n"; ss << GenSpaces(count++) << "tensor {\n";
ss << GenSpaces(count) << "data_type: FP32\n"; const auto &data_type = var->GetDataType();
if (data_type == framework::proto::VarType::INT32) {
ss << GenSpaces(count) << "data_type: INT32\n";
} else if (data_type == framework::proto::VarType::INT64) {
ss << GenSpaces(count) << "data_type: INT64\n";
} else if (data_type == framework::proto::VarType::FP32) {
ss << GenSpaces(count) << "data_type: FP32\n";
} else if (data_type == framework::proto::VarType::FP64) {
ss << GenSpaces(count) << "data_type: FP64\n";
}
std::vector<int64_t> shape = var->GetShape(); std::vector<int64_t> shape = var->GetShape();
for (auto d : shape) { for (auto d : shape) {
ss << GenSpaces(count) << "dims: " << d << "\n"; ss << GenSpaces(count) << "dims: " << d << "\n";
...@@ -288,6 +403,63 @@ std::string OpTester::DebugString() { ...@@ -288,6 +403,63 @@ std::string OpTester::DebugString() {
ss << GenSpaces(--count) << "}\n"; ss << GenSpaces(--count) << "}\n";
} }
ss << GenSpaces(count) << "type: " << op_desc_.Type() << "\n"; ss << GenSpaces(count) << "type: " << op_desc_.Type() << "\n";
for (auto &name : op_desc_.AttrNames()) {
ss << GenSpaces(count++) << "attrs {\n";
const auto &attr_type = op_desc_.GetAttrType(name);
const auto &attr = op_desc_.GetAttr(name);
ss << GenSpaces(count) << "name: \"" << name << "\"\n";
switch (attr_type) {
case framework::proto::AttrType::BOOLEAN: {
ss << GenSpaces(count) << "type: BOOLEAN\n";
ss << GenSpaces(count) << "b: " << boost::get<bool>(attr) << "\n";
} break;
case framework::proto::AttrType::INT: {
ss << GenSpaces(count) << "type: INT\n";
ss << GenSpaces(count) << "i: " << boost::get<int>(attr) << "\n";
} break;
case framework::proto::AttrType::FLOAT: {
ss << GenSpaces(count) << "type: FLOAT\n";
ss << GenSpaces(count) << "f: " << boost::get<float>(attr) << "\n";
} break;
case framework::proto::AttrType::STRING: {
ss << GenSpaces(count) << "type: STRING\n";
ss << GenSpaces(count) << "s: \"" << boost::get<std::string>(attr)
<< "\"\n";
} break;
case framework::proto::AttrType::BOOLEANS: {
ss << GenSpaces(count) << "type: BOOLEANS\n";
ss << GenSpaces(count) << "bools: "
<< "\n";
} break;
case framework::proto::AttrType::INTS: {
ss << GenSpaces(count) << "type: INTS\n";
ss << GenSpaces(count) << "ints: "
<< "\n";
} break;
case framework::proto::AttrType::FLOATS: {
ss << GenSpaces(count) << "type: FLOATS\n";
ss << GenSpaces(count) << "floats: "
<< "\n";
} break;
case framework::proto::AttrType::STRINGS: {
ss << GenSpaces(count) << "type: STRINGS\n";
ss << GenSpaces(count) << "strings: "
<< "\n";
} break;
case framework::proto::AttrType::LONG: {
ss << GenSpaces(count) << "type: LONG\n";
ss << GenSpaces(count) << "l: " << boost::get<int64_t>(attr) << "\n";
} break;
case framework::proto::AttrType::LONGS: {
ss << GenSpaces(count) << "type: LONGS\n";
ss << GenSpaces(count) << "longs: "
<< "\n";
} break;
default:
PADDLE_THROW("Unsupport attr type %d", attr_type);
}
ss << GenSpaces(--count) << "}\n";
}
ss << GenSpaces(--count) << "}\n"; ss << GenSpaces(--count) << "}\n";
return ss.str(); return ss.str();
} }
...@@ -299,6 +471,7 @@ TEST(op_tester, base) { ...@@ -299,6 +471,7 @@ TEST(op_tester, base) {
FLAGS_op_config_list.c_str()); FLAGS_op_config_list.c_str());
std::vector<OpTesterConfig> op_configs; std::vector<OpTesterConfig> op_configs;
while (!fin.eof()) { while (!fin.eof()) {
VLOG(4) << "Reading config " << op_configs.size() << "...";
OpTesterConfig config; OpTesterConfig config;
bool result = config.Init(fin); bool result = config.Init(fin);
if (result) { if (result) {
......
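The new SetupTensor path above fills a host buffer according to the requested initializer ("random", "natural" or "zeros") and only copies it to the device when place_ is not a CPU place. A minimal standalone sketch of that fill dispatch (illustration only, with hypothetical names outside the Paddle tree):

// fill_sketch.cc -- mirrors the initializer dispatch in OpTester::SetupTensor.
#include <iostream>
#include <random>
#include <stdexcept>
#include <string>
#include <vector>

template <typename T>
std::vector<T> Fill(size_t n, T lower, T upper, const std::string &initializer) {
  static unsigned int seed = 100;
  std::mt19937 rng(seed++);
  std::uniform_real_distribution<double> uniform_dist(0, 1);
  std::vector<T> data(n);
  if (initializer == "random") {
    for (size_t i = 0; i < n; ++i) {
      data[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
    }
  } else if (initializer == "natural") {  // lower, lower + 1, lower + 2, ...
    for (size_t i = 0; i < n; ++i) {
      data[i] = lower + static_cast<T>(i);
    }
  } else if (initializer == "zeros") {
    for (size_t i = 0; i < n; ++i) {
      data[i] = static_cast<T>(0);
    }
  } else {
    throw std::invalid_argument("Unsupported initializer " + initializer);
  }
  return data;
}

int main() {
  for (float x : Fill<float>(4, 0.f, 1.f, "natural")) {
    std::cout << x << " ";  // prints: 0 1 2 3
  }
  std::cout << "\n";
}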
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
...@@ -39,16 +41,21 @@ class OpTester { ...@@ -39,16 +41,21 @@ class OpTester {
private: private:
std::vector<std::string> GetOpProtoInputNames(); std::vector<std::string> GetOpProtoInputNames();
std::vector<std::string> GetOpProtoOutputNames(); std::vector<std::string> GetOpProtoOutputNames();
std::unordered_map<std::string, framework::proto::AttrType>
GetOpProtoAttrNames();
framework::proto::VarType::Type TransToVarType(std::string str);
void CreateInputVarDesc(); void CreateInputVarDesc();
void CreateOutputVarDesc(); void CreateOutputVarDesc();
void CreateOpDesc();
framework::VarDesc *Var(const std::string &name); framework::VarDesc *Var(const std::string &name);
void CreateVariables(framework::Scope *scope); void CreateVariables(framework::Scope *scope);
template <typename T> template <typename T>
void SetupTensor(framework::LoDTensor *input, void SetupTensor(framework::LoDTensor *input,
const std::vector<int64_t> &shape, T lower, T upper); const std::vector<int64_t> &shape, T lower, T upper,
const std::string &initializer);
void RunImpl(); void RunImpl();
...@@ -57,7 +64,7 @@ class OpTester { ...@@ -57,7 +64,7 @@ class OpTester {
std::string type_; std::string type_;
framework::OpDesc op_desc_; framework::OpDesc op_desc_;
std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_; std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_;
std::unordered_map<std::string, std::vector<std::vector<size_t>>> input_lods_; std::unordered_map<std::string, OpInputConfig> inputs_;
std::unique_ptr<framework::OperatorBase> op_; std::unique_ptr<framework::OperatorBase> op_;
platform::Place place_; platform::Place place_;
std::unique_ptr<framework::Scope> scope_; std::unique_ptr<framework::Scope> scope_;
......
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/benchmark/op_tester_config.h" #include "paddle/fluid/operators/benchmark/op_tester_config.h"
#include <fstream> #include <fstream>
#include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
...@@ -40,6 +39,62 @@ static void EraseEndSep(std::string* str, ...@@ -40,6 +39,62 @@ static void EraseEndSep(std::string* str,
} }
} }
OpInputConfig::OpInputConfig(std::istream& is) {
std::string sep;
is >> sep;
if (sep == kStartSeparator) {
while (sep != kEndSeparator) {
is >> sep;
if (sep == "name" || sep == "name:") {
is >> name;
EraseEndSep(&name);
} else if (sep == "dtype" || sep == "dtype:") {
ParseDType(is);
} else if (sep == "initializer" || sep == "initializer:") {
ParseInitializer(is);
} else if (sep == "dims" || sep == "dims:") {
ParseDims(is);
} else if (sep == "lod" || sep == "lod:") {
ParseLoD(is);
}
}
}
}
void OpInputConfig::ParseDType(std::istream& is) {
std::string dtype_str;
is >> dtype_str;
EraseEndSep(&dtype_str);
if (dtype_str == "int32" || dtype_str == "int") {
dtype = "int32";
} else if (dtype_str == "int64" || dtype_str == "long") {
dtype = "int64";
} else if (dtype_str == "fp32" || dtype_str == "float") {
dtype = "fp32";
} else if (dtype_str == "fp64" || dtype_str == "double") {
dtype = "fp64";
} else {
PADDLE_THROW("Unsupported dtype %s", dtype_str.c_str());
}
VLOG(4) << "dtype of input " << name << " is: " << dtype;
}
void OpInputConfig::ParseInitializer(std::istream& is) {
std::string initializer_str;
is >> initializer_str;
EraseEndSep(&initializer_str);
const std::vector<std::string> supported_initializers = {"random", "natural",
"zeros"};
if (!Has(supported_initializers, initializer_str)) {
PADDLE_THROW("Unsupported initializer %s", initializer_str.c_str());
}
initializer = initializer_str;
VLOG(4) << "initializer of input " << name << " is: " << initializer;
}
void OpInputConfig::ParseDims(std::istream& is) { void OpInputConfig::ParseDims(std::istream& is) {
std::string dims_str; std::string dims_str;
is >> dims_str; is >> dims_str;
...@@ -84,7 +139,7 @@ void OpInputConfig::ParseLoD(std::istream& is) { ...@@ -84,7 +139,7 @@ void OpInputConfig::ParseLoD(std::istream& is) {
number += lod_str[i]; number += lod_str[i];
++i; ++i;
} }
level.push_back(atoi(number.c_str())); level.push_back(StringTo<size_t>(number));
} }
lod.push_back(level); lod.push_back(level);
} else if (lod_str[i] == '}') { } else if (lod_str[i] == '}') {
...@@ -93,24 +148,6 @@ void OpInputConfig::ParseLoD(std::istream& is) { ...@@ -93,24 +148,6 @@ void OpInputConfig::ParseLoD(std::istream& is) {
} }
} }
OpInputConfig::OpInputConfig(std::istream& is) {
std::string sep;
is >> sep;
if (sep == kStartSeparator) {
while (sep != kEndSeparator) {
is >> sep;
if (sep == "name" || sep == "name:") {
is >> name;
EraseEndSep(&name);
} else if (sep == "dims" || sep == "dims:") {
ParseDims(is);
} else if (sep == "lod" || sep == "lod:") {
ParseLoD(is);
}
}
}
}
OpTesterConfig::OpTesterConfig(const std::string& filename) { OpTesterConfig::OpTesterConfig(const std::string& filename) {
std::ifstream fin(filename, std::ios::in | std::ios::binary); std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
...@@ -167,6 +204,7 @@ bool OpTesterConfig::ParseAttrs(std::istream& is) { ...@@ -167,6 +204,7 @@ bool OpTesterConfig::ParseAttrs(std::istream& is) {
is >> value; is >> value;
EraseEndSep(&key, ":"); EraseEndSep(&key, ":");
EraseEndSep(&value); EraseEndSep(&value);
VLOG(4) << "attrs: " << key << ", " << value;
attrs[key] = value; attrs[key] = value;
} }
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <istream> #include <istream>
#include <sstream>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
...@@ -27,10 +28,14 @@ struct OpInputConfig { ...@@ -27,10 +28,14 @@ struct OpInputConfig {
OpInputConfig() {} OpInputConfig() {}
explicit OpInputConfig(std::istream& is); explicit OpInputConfig(std::istream& is);
void ParseDType(std::istream& is);
void ParseInitializer(std::istream& is);
void ParseDims(std::istream& is); void ParseDims(std::istream& is);
void ParseLoD(std::istream& is); void ParseLoD(std::istream& is);
std::string name; std::string name;
std::string dtype{"fp32"}; // int32/int, int64/long, fp32/float, fp64/double
std::string initializer{"random"}; // random, natural
std::vector<int64_t> dims; std::vector<int64_t> dims;
std::vector<std::vector<size_t>> lod; std::vector<std::vector<size_t>> lod;
}; };
...@@ -55,6 +60,23 @@ struct OpTesterConfig { ...@@ -55,6 +60,23 @@ struct OpTesterConfig {
double runtime{0.0}; double runtime{0.0};
}; };
static bool Has(const std::vector<std::string>& vec, const std::string& item) {
for (size_t i = 0; i < vec.size(); ++i) {
if (vec[i] == item) {
return true;
}
}
return false;
}
template <typename T>
T StringTo(const std::string& str) {
std::istringstream is(str);
T value;
is >> value;
return value;
}
} // namespace benchmark } // namespace benchmark
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
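StringTo<T> above simply round-trips the token through a std::istringstream, so it works for any type with an operator>> (int, int64_t, size_t, float, ...). A standalone check of that behaviour (illustration only; the one difference is value-initializing the result so a failed parse is still well defined):

// string_to_sketch.cc -- behaviour of the StringTo<T> helper.
#include <cassert>
#include <cstdint>
#include <sstream>
#include <string>

template <typename T>
T StringTo(const std::string &str) {
  std::istringstream is(str);
  T value{};
  is >> value;
  return value;
}

int main() {
  assert(StringTo<int64_t>("42") == 42);   // LONG attrs in CreateOpDesc
  assert(StringTo<size_t>("7") == 7u);     // ParseLoD, replacing atoi
  assert(StringTo<float>("0.5") == 0.5f);  // FLOAT attrs in CreateOpDesc
  return 0;
}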
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/operators/cast_op.h"
#include <memory>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
...@@ -30,7 +31,8 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker { ...@@ -30,7 +31,8 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
Cast Operator. Cast Operator.
This Operator casts the input tensor to another data type and This Operator casts the input tensor to another data type and
returns tha Output Tensor. returns the Output Tensor. Casting to the same dtype as the input is
allowed, but it has no effect.
)DOC"); )DOC");
} }
......
...@@ -37,8 +37,10 @@ detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc bo ...@@ -37,8 +37,10 @@ detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc bo
if(WITH_GPU) if(WITH_GPU)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub) detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub)
else() else()
detection_library(generate_proposals_op SRCS generate_proposals_op.cc) detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc)
endif() endif()
detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu) detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
namespace paddle {
namespace operators {
class DistributeFpnProposalsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("FpnRois"),
"Input(FpnRois) shouldn't be null");
PADDLE_ENFORCE_GE(
ctx->Outputs("MultiFpnRois").size(), 1UL,
"Outputs(MultiFpnRois) of DistributeOp should not be empty");
size_t min_level = static_cast<size_t>(ctx->Attrs().Get<int>("min_level"));
size_t max_level = static_cast<size_t>(ctx->Attrs().Get<int>("max_level"));
PADDLE_ENFORCE_GE(max_level, min_level,
"max_level must not lower than min_level");
// Set the output shape
size_t num_out_rois = max_level - min_level + 1;
std::vector<framework::DDim> outs_dims;
outs_dims.reserve(num_out_rois);
for (size_t i = 0; i < num_out_rois; ++i) {
framework::DDim out_dim = {-1, 4};
outs_dims.push_back(out_dim);
}
ctx->SetOutputsDim("MultiFpnRois", outs_dims);
ctx->SetOutputDim("RestoreIndex", {1, -1});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("FpnRois"));
return framework::OpKernelType(data_type, platform::CPUPlace());
}
};
class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("FpnRois", "(LoDTensor) The rois at all levels in shape (-1, 4)");
AddOutput("MultiFpnRois", "(LoDTensor) Output with distribute operator")
.AsDuplicable();
AddOutput("RestoreIndex",
"(Tensor) An array of positive number which is "
"used to restore the order of FpnRois");
AddAttr<int>("min_level",
"The lowest level of FPN layer where the"
" proposals come from");
AddAttr<int>("max_level",
"The highest level of FPN layer where the"
" proposals come from");
AddAttr<int>("refer_level",
"The referring level of FPN layer with"
" specified scale");
AddAttr<int>("refer_scale",
"The referring scale of FPN layer with"
" specified level");
AddComment(R"DOC(
This operator distributes all proposals into different FPN levels
according to the scale of the proposals, the referring scale and
the referring level. Besides, to restore the order of the proposals,
we return an array which indicates the original index of the rois
in the current proposals.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(distribute_fpn_proposals, ops::DistributeFpnProposalsOp,
ops::DistributeFpnProposalsOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(distribute_fpn_proposals,
ops::DistributeFpnProposalsOpKernel<float>,
ops::DistributeFpnProposalsOpKernel<double>);
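Both the CPU and CUDA kernels below assign each RoI to a pyramid level with the usual FPN heuristic, tgt_lvl = floor(refer_level + log2(sqrt(area) / refer_scale)), clamped to [min_level, max_level]. A standalone worked example using the settings commonly seen with FPN (min_level=2, max_level=5, refer_level=4, refer_scale=224 -- typical values, not defaults enforced by this op):

// fpn_level_sketch.cc -- worked example of the RoI-to-level assignment.
#include <algorithm>
#include <cmath>
#include <iostream>

int TargetLevel(const float box[4], int min_level, int max_level,
                int refer_level, int refer_scale) {
  const float w = box[2] - box[0] + 1;  // pixel-coordinate convention,
  const float h = box[3] - box[1] + 1;  // matching BBoxArea(..., false)
  const float scale = std::sqrt(w * h);
  int lvl = static_cast<int>(
      std::floor(std::log2(scale / refer_scale) + refer_level));
  return std::min(max_level, std::max(lvl, min_level));
}

int main() {
  const float small_box[4] = {0, 0, 55, 55};     // ~56x56   -> level 2
  const float medium_box[4] = {0, 0, 223, 223};  // ~224x224 -> level 4
  const float large_box[4] = {0, 0, 899, 899};   // ~900x900 -> clamped to 5
  std::cout << TargetLevel(small_box, 2, 5, 4, 224) << " "
            << TargetLevel(medium_box, 2, 5, 4, 224) << " "
            << TargetLevel(large_box, 2, 5, 4, 224) << "\n";  // prints: 2 4 5
}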
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/fluid/memory/allocation/allocator.h>
#include "cub/cub.cuh"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096;
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
int const BBoxSize = 4;
struct RangeInitFunctor {
int start_;
int delta_;
int* out_;
__device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; }
};
static inline int NumBlocks(const int N) {
return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
kNumMaxinumNumBlocks);
}
static inline void TransLoD(const int* length_lod, const int lod_size,
int* offset_lod) {
int offset = 0;
for (int i = 0; i < lod_size; ++i) {
offset_lod[i] = offset;
offset += length_lod[i];
}
}
template <typename T>
static __device__ inline T RoIArea(const T* box, bool normalized) {
if (box[2] < box[0] || box[3] < box[1]) {
// If the coordinate values are invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
return static_cast<T>(0.);
} else {
const T w = box[2] - box[0];
const T h = box[3] - box[1];
if (normalized) {
return w * h;
} else {
// If coordinate values are not within range [0, 1].
return (w + 1) * (h + 1);
}
}
}
template <class T>
static __global__ void GPUDistFpnProposalsHelper(
const int nthreads, const T* rois, const int lod_size,
const int refer_level, const int refer_scale, const int max_level,
const int min_level, int* roi_batch_id_data, int* sub_lod_list,
int* target_lvls) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const T* offset_roi = rois + i * BBoxSize;
int roi_batch_ind = roi_batch_id_data[i];
// get the target level of current rois
T roi_area = RoIArea(offset_roi, false);
T roi_scale = sqrt(roi_area);
int tgt_lvl = floor(log2(roi_scale / refer_scale) + refer_level);
tgt_lvl = min(max_level, max(tgt_lvl, min_level));
target_lvls[i] = tgt_lvl;
// compute number of rois in the same batch and same target level
platform::CudaAtomicAdd(sub_lod_list + tgt_lvl * lod_size + roi_batch_ind,
1);
}
}
template <typename DeviceContext, typename T>
class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* fpn_rois = ctx.Input<paddle::framework::LoDTensor>("FpnRois");
auto multi_fpn_rois = ctx.MultiOutput<LoDTensor>("MultiFpnRois");
auto* restore_index = ctx.Output<Tensor>("RestoreIndex");
const int min_level = ctx.Attr<int>("min_level");
const int max_level = ctx.Attr<int>("max_level");
const int refer_level = ctx.Attr<int>("refer_level");
const int refer_scale = ctx.Attr<int>("refer_scale");
int num_level = max_level - min_level + 1;
// check that the fpn_rois is not empty
PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL,
"DistributeFpnProposalsOp need 1 level of LoD");
auto fpn_rois_lod = fpn_rois->lod().back();
int lod_size = fpn_rois_lod.size() - 1;
int roi_num = fpn_rois_lod[lod_size];
auto& dev_ctx = ctx.template device_context<DeviceContext>();
// get batch id by lod in CPU
Tensor roi_batch_id_list;
roi_batch_id_list.Resize({roi_num});
int* roi_batch_id_data =
roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
for (int n = 0; n < lod_size; ++n) {
for (size_t i = fpn_rois_lod[n]; i < fpn_rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
// copy batch id list to GPU
Tensor roi_batch_id_list_gpu;
framework::TensorCopySync(roi_batch_id_list, dev_ctx.GetPlace(),
&roi_batch_id_list_gpu);
Tensor sub_lod_list;
sub_lod_list.Resize({num_level, lod_size});
int* sub_lod_list_data = sub_lod_list.mutable_data<int>(dev_ctx.GetPlace());
Tensor target_lvls;
target_lvls.Resize({roi_num});
int* target_lvls_data = target_lvls.mutable_data<int>(dev_ctx.GetPlace());
int blocks = NumBlocks(roi_num);
int threads = kNumCUDAThreads;
// get target levels and sub_lod list
GPUDistFpnProposalsHelper<T><<<blocks, threads>>>(
roi_num, fpn_rois->data<T>(), lod_size, refer_level, refer_scale,
max_level, min_level, roi_batch_id_list_gpu.data<int>(),
sub_lod_list_data, target_lvls_data);
Tensor index_in_t;
int* idx_in = index_in_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, roi_num);
for_range(RangeInitFunctor{0, 1, idx_in});
Tensor keys_out_t;
int* keys_out = keys_out_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
Tensor index_out_t;
int* idx_out = index_out_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
// Determine temporary device storage requirements
size_t temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending<int, int>(
nullptr, temp_storage_bytes, target_lvls_data, keys_out, idx_in,
idx_out, roi_num);
// Allocate temporary storage
auto place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
auto d_temp_storage = memory::Alloc(place, temp_storage_bytes,
memory::Allocator::kScratchpad);
// Run sorting operation
// sort target level to get corresponding index
cub::DeviceRadixSort::SortPairsDescending<int, int>(
d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
idx_in, idx_out, roi_num);
int* restore_idx_data =
restore_index->mutable_data<int>({roi_num, 1}, dev_ctx.GetPlace());
// sort current index to get restore index
cub::DeviceRadixSort::SortPairsDescending<int, int>(
d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
restore_idx_data, roi_num);
Tensor offset_lod;
int* offset_lod_data =
offset_lod.mutable_data<int>({lod_size + 1}, dev_ctx.GetPlace());
for (int i = 0; i < num_level; ++i) {
Tensor sub_lod = sub_lod_list.Slice(i, i + 1);
int* sub_lod_data = sub_lod.data<int>();
// transfer length-based lod to offset-based lod
TransLoD(sub_lod_data, lod_size + 1, offset_lod_data);
int sub_rois_num = offset_lod_data[lod_size];
Tensor sub_idx = index_out_t.Slice(0, sub_rois_num);
multi_fpn_rois[i]->mutable_data<T>({sub_rois_num, kBoxDim},
dev_ctx.GetPlace());
GPUGather<T>(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]);
framework::LoD lod;
std::vector<size_t> offset;
memory::Copy(platform::CPUPlace(), offset.data(), place, offset_lod_data,
sizeof(int) * (lod_size + 1), 0);
lod.emplace_back(offset);
multi_fpn_rois[i]->set_lod(lod);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
distribute_fpn_proposals,
ops::GPUDistributeFpnProposalsOpKernel<paddle::platform::CUDADeviceContext,
float>,
ops::GPUDistributeFpnProposalsOpKernel<paddle::platform::CUDADeviceContext,
double>);
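TransLoD in the CUDA kernel converts a length-based lod (number of rois per image at one level) into the offset-based form that a LoDTensor expects; the kernel passes lod_size + 1 entries so the final offset equals the total roi count at that level. A host-side sketch with a worked example (illustration only):

// trans_lod_sketch.cc -- length-based lod to offset-based lod.
#include <iostream>
#include <vector>

std::vector<int> LengthToOffset(const std::vector<int>& length_lod) {
  std::vector<int> offset_lod(length_lod.size());
  int offset = 0;
  for (size_t i = 0; i < length_lod.size(); ++i) {
    offset_lod[i] = offset;  // start of segment i
    offset += length_lod[i];
  }
  return offset_lod;
}

int main() {
  // One level, 3 images holding 2, 3 and 1 rois; the extra trailing entry is
  // read like the kernel's lod_size + 1 call but its value is never stored.
  std::vector<int> lengths = {2, 3, 1, 0};
  for (int o : LengthToOffset(lengths)) std::cout << o << " ";  // 0 2 5 6
  std::cout << "\n";
}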
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <cmath>
#include <cstring>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
const int kBoxDim = 4;
template <typename T>
static inline T BBoxArea(const T* box, bool normalized) {
if (box[2] < box[0] || box[3] < box[1]) {
// If the coordinate values are invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
return static_cast<T>(0.);
} else {
const T w = box[2] - box[0];
const T h = box[3] - box[1];
if (normalized) {
return w * h;
} else {
// If coordinate values are not within range [0, 1].
return (w + 1) * (h + 1);
}
}
}
template <typename T>
class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* fpn_rois = context.Input<paddle::framework::LoDTensor>("FpnRois");
auto multi_fpn_rois =
context.MultiOutput<paddle::framework::LoDTensor>("MultiFpnRois");
auto* restore_index =
context.Output<paddle::framework::Tensor>("RestoreIndex");
const int min_level = context.Attr<int>("min_level");
const int max_level = context.Attr<int>("max_level");
const int refer_level = context.Attr<int>("refer_level");
const int refer_scale = context.Attr<int>("refer_scale");
const int num_level = max_level - min_level + 1;
// check that the fpn_rois is not empty
PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL,
"DistributeFpnProposalsOp need 1 level of LoD");
auto fpn_rois_lod = fpn_rois->lod().back();
int fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1];
std::vector<int> target_level;
// std::vector<int> target_level(fpn_rois_num, -1);
// record the number of rois in each level
std::vector<int> num_rois_level(num_level, 0);
std::vector<int> num_rois_level_integral(num_level + 1, 0);
for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) {
Tensor fpn_rois_slice =
fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
const T* rois_data = fpn_rois_slice.data<T>();
for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
// get the target level of current rois
T roi_scale = std::sqrt(BBoxArea(rois_data, false));
int tgt_lvl =
std::floor(std::log2(roi_scale / refer_scale) + refer_level);
tgt_lvl = std::min(max_level, std::max(tgt_lvl, min_level));
target_level.push_back(tgt_lvl);
num_rois_level[tgt_lvl - min_level]++;
rois_data += kBoxDim;
}
}
// define the output rois
// pointers to each level's fpn rois
std::vector<T*> multi_fpn_rois_data(num_level);
// lod0 which will record the offset information of each level rois
std::vector<std::vector<size_t>> multi_fpn_rois_lod0;
for (int i = 0; i < num_level; ++i) {
// allocate memory for each level rois
multi_fpn_rois[i]->mutable_data<T>({num_rois_level[i], kBoxDim},
context.GetPlace());
multi_fpn_rois_data[i] = multi_fpn_rois[i]->data<T>();
std::vector<size_t> lod0(1, 0);
multi_fpn_rois_lod0.push_back(lod0);
// prefix sum: the start offset of each level's rois
num_rois_level_integral[i + 1] =
num_rois_level_integral[i] + num_rois_level[i];
}
restore_index->mutable_data<int>({1, fpn_rois_num}, context.GetPlace());
int* restore_index_data = restore_index->data<int>();
std::vector<int> restore_index_inter(fpn_rois_num, -1);
// distribute the rois into different fpn level by target level
for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) {
Tensor fpn_rois_slice =
fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
const T* rois_data = fpn_rois_slice.data<T>();
size_t cur_offset = fpn_rois_lod[i];
// std::vector<size_t > lod_offset[num_level];
for (int j = 0; j < num_level; j++) {
multi_fpn_rois_lod0[j].push_back(multi_fpn_rois_lod0[j][i]);
}
for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
int lvl = target_level[cur_offset + j];
memcpy(multi_fpn_rois_data[lvl - min_level], rois_data,
kBoxDim * sizeof(T));
multi_fpn_rois_data[lvl - min_level] += kBoxDim;
int index_in_shuffle = num_rois_level_integral[lvl - min_level] +
multi_fpn_rois_lod0[lvl - min_level][i + 1];
restore_index_inter[index_in_shuffle] = cur_offset + j;
multi_fpn_rois_lod0[lvl - min_level][i + 1]++;
rois_data += kBoxDim;
}
}
for (int i = 0; i < fpn_rois_num; ++i) {
restore_index_data[restore_index_inter[i]] = i;
}
// merge lod information into LoDTensor
for (int i = 0; i < num_level; ++i) {
framework::LoD lod;
lod.emplace_back(multi_fpn_rois_lod0[i]);
multi_fpn_rois[i]->set_lod(lod);
}
}
};
} // namespace operators
} // namespace paddle
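The trickiest bookkeeping in the CPU kernel above is RestoreIndex: rois are scattered into per-level outputs, and for every original position the kernel records where that roi landed in the concatenated multi-level order (restore_index_data[restore_index_inter[i]] = i). The same logic on plain indices (hypothetical sketch, no LoDTensors):

// restore_index_sketch.cc -- scatter by level, then invert the permutation.
#include <iostream>
#include <vector>

int main() {
  const int num_level = 2;
  // Target level (0-based here) for each of 5 input rois.
  std::vector<int> target_level = {1, 0, 1, 1, 0};

  // Scatter original indices into per-level buckets (the "shuffle").
  std::vector<std::vector<int>> buckets(num_level);
  for (int i = 0; i < static_cast<int>(target_level.size()); ++i) {
    buckets[target_level[i]].push_back(i);
  }

  // Concatenate the buckets: this is the order of the MultiFpnRois outputs.
  std::vector<int> shuffled;
  for (const auto& b : buckets) shuffled.insert(shuffled.end(), b.begin(), b.end());

  // restore_index[original_position] = position in the shuffled order.
  std::vector<int> restore_index(shuffled.size());
  for (int pos = 0; pos < static_cast<int>(shuffled.size()); ++pos) {
    restore_index[shuffled[pos]] = pos;
  }

  for (int r : restore_index) std::cout << r << " ";  // prints: 2 0 3 4 1
  std::cout << "\n";
}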
...@@ -56,15 +56,15 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> { ...@@ -56,15 +56,15 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
// the output tensor shape should be [num_instances, 1] // the output tensor shape should be [num_instances, 1]
auto dims = framework::make_ddim( auto dims = framework::make_ddim(
std::vector<int64_t>({static_cast<int>(num_instances), 1})); std::vector<int64_t>({static_cast<int>(num_instances), 1}));
selected_ids->Resize(dims);
selected_scores->Resize(dims);
parent_idx->Resize({static_cast<int64_t>(num_instances)});
auto *selected_ids_data = auto *selected_ids_data =
selected_ids->mutable_data<int64_t>(platform::CPUPlace()); selected_ids->mutable_data<int64_t>(dims, platform::CPUPlace());
auto *selected_scores_data = auto *selected_scores_data =
selected_scores->mutable_data<float>(platform::CPUPlace()); selected_scores->mutable_data<float>(dims, platform::CPUPlace());
auto *parent_idx_data = parent_idx->mutable_data<int>(platform::CPUPlace()); auto *parent_idx_data =
parent_idx
? parent_idx->mutable_data<int>(
{static_cast<int64_t>(num_instances)}, platform::CPUPlace())
: nullptr;
// fill in data // fill in data
std::vector<size_t> low_level; std::vector<size_t> low_level;
...@@ -72,7 +72,9 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> { ...@@ -72,7 +72,9 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
for (auto &items : selected_items) { for (auto &items : selected_items) {
low_level.push_back(low_offset); low_level.push_back(low_offset);
for (auto &item : items) { for (auto &item : items) {
parent_idx_data[low_offset] = static_cast<int>(low_level.size() - 1); if (parent_idx) {
parent_idx_data[low_offset] = static_cast<int>(low_level.size() - 1);
}
selected_ids_data[low_offset] = item.id; selected_ids_data[low_offset] = item.id;
selected_scores_data[low_offset] = item.score; selected_scores_data[low_offset] = item.score;
low_offset++; low_offset++;
......
...@@ -168,6 +168,7 @@ __device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local, ...@@ -168,6 +168,7 @@ __device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local,
return finish_flag; return finish_flag;
} }
template <bool ReturnParentIdx = false>
__device__ __forceinline__ void WriteBack( __device__ __forceinline__ void WriteBack(
int64_t* selected_ids, float* selected_scores, int* parent_idx, int64_t* selected_ids, float* selected_scores, int* parent_idx,
size_t* selected_offsets, Triple* top_beam_local, size_t* selected_offsets, Triple* top_beam_local,
...@@ -183,7 +184,9 @@ __device__ __forceinline__ void WriteBack( ...@@ -183,7 +184,9 @@ __device__ __forceinline__ void WriteBack(
selected_ids[global_index] = selected_ids[global_index] =
static_cast<int64_t>(top_beam_local[local_index].id); static_cast<int64_t>(top_beam_local[local_index].id);
selected_scores[global_index] = top_beam_local[local_index].score; selected_scores[global_index] = top_beam_local[local_index].score;
parent_idx[global_index] = static_cast<int>(global_offset); if (ReturnParentIdx) {
parent_idx[global_index] = static_cast<int>(global_offset);
}
global_index++; global_index++;
} }
} }
...@@ -241,9 +244,15 @@ __device__ void BeamSearchDetails( ...@@ -241,9 +244,15 @@ __device__ void BeamSearchDetails(
selected_offsets[0] = 0; selected_offsets[0] = 0;
} }
WriteBack(selected_ids, selected_scores, parent_idx, selected_offsets, if (parent_idx) {
top_beam_local, seq_offset_start, seq_offset_end, WriteBack<true>(selected_ids, selected_scores, parent_idx,
selected_seq_start, selected_seq_length); selected_offsets, top_beam_local, seq_offset_start,
seq_offset_end, selected_seq_start, selected_seq_length);
} else {
WriteBack<false>(selected_ids, selected_scores, parent_idx,
selected_offsets, top_beam_local, seq_offset_start,
seq_offset_end, selected_seq_start, selected_seq_length);
}
} }
} }
...@@ -337,8 +346,12 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> { ...@@ -337,8 +346,12 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
selected_ids->mutable_data<int64_t>(selected_dims, context.GetPlace()); selected_ids->mutable_data<int64_t>(selected_dims, context.GetPlace());
float* selected_scores_data = float* selected_scores_data =
selected_scores->mutable_data<float>(selected_dims, context.GetPlace()); selected_scores->mutable_data<float>(selected_dims, context.GetPlace());
int* parent_idx_data = parent_idx->mutable_data<int>( int* parent_idx_data =
{static_cast<int64_t>(num_seqs * beam_size)}, context.GetPlace()); parent_idx
? parent_idx->mutable_data<int>(
{static_cast<int64_t>(num_seqs * beam_size)},
context.GetPlace())
: nullptr;
framework::LoD selected_lod(2); framework::LoD selected_lod(2);
selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end()); selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end());
...@@ -396,7 +409,9 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> { ...@@ -396,7 +409,9 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
{static_cast<int64_t>(selected_lod[1].back()), 1}); {static_cast<int64_t>(selected_lod[1].back()), 1});
selected_ids->Resize(final_selected_dims); selected_ids->Resize(final_selected_dims);
selected_scores->Resize(final_selected_dims); selected_scores->Resize(final_selected_dims);
parent_idx->Resize({static_cast<int64_t>(selected_lod[1].back())}); if (parent_idx) {
parent_idx->Resize({static_cast<int64_t>(selected_lod[1].back())});
}
} }
} }
}; };
......
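Turning ReturnParentIdx into a template parameter of WriteBack fixes the parent_idx branch at compile time for each instantiation, so the inner write-back loop does no per-element work when parent_idx is dispensable; BeamSearchDetails just picks WriteBack<true> or WriteBack<false> once. The same pattern in a minimal host-side sketch (illustration only):

// template_flag_sketch.cc -- branch hoisted into a compile-time flag.
#include <iostream>
#include <vector>

template <bool ReturnParentIdx>
void WriteBack(const std::vector<int>& ids, std::vector<int>* out_ids,
               std::vector<int>* parent_idx) {
  for (size_t i = 0; i < ids.size(); ++i) {
    (*out_ids)[i] = ids[i];
    if (ReturnParentIdx) {  // constant per instantiation; never taken when false
      (*parent_idx)[i] = static_cast<int>(i);
    }
  }
}

int main() {
  std::vector<int> ids = {7, 8, 9}, out(3), parents(3);
  WriteBack<true>(ids, &out, &parents);  // parent_idx requested
  WriteBack<false>(ids, &out, nullptr);  // parent_idx dispensable
  std::cout << parents[2] << "\n";       // prints: 2
}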
...@@ -30,6 +30,9 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -30,6 +30,9 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Ys", "A list of outputs").AsDuplicable(); AddOutput("Ys", "A list of outputs").AsDuplicable();
AddAttr<std::string>("subgraph", "the subgraph."); AddAttr<std::string>("subgraph", "the subgraph.");
AddAttr<std::string>("calibration_data", "the calibration data for int8"); AddAttr<std::string>("calibration_data", "the calibration data for int8");
AddAttr<std::string>(
"engine_serialized_data",
"the serialized data contains the all info of the ICUDAEngine");
AddAttr<std::string>( AddAttr<std::string>(
"engine_key", "engine_key",
"The engine_key here is used to distinguish different TRT Engines"); "The engine_key here is used to distinguish different TRT Engines");
......
...@@ -16,8 +16,10 @@ ...@@ -16,8 +16,10 @@
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
...@@ -31,37 +33,6 @@ namespace paddle { ...@@ -31,37 +33,6 @@ namespace paddle {
namespace operators { namespace operators {
using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType;
namespace { // NOLINT
TRT_DT FluidDataType2TRT(FluidDT type) {
switch (type) {
case FluidDT::VarType_Type_FP32:
return TRT_DT::kFLOAT;
case FluidDT::VarType_Type_INT32:
return TRT_DT::kINT32;
default:
return TRT_DT::kINT32;
}
PADDLE_THROW("unkown type");
return TRT_DT::kINT32;
}
nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
PADDLE_ENFORCE_GT(shape.size(), 1UL,
"TensorRT' tensor input requires at least 2 dimensions");
PADDLE_ENFORCE_LE(shape.size(), 4UL,
"TensorRT' tensor input requires at most 4 dimensions");
PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
if (shape.size() == 4UL)
return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
return nvinfer1::DimsCHW(shape[1], 1, 1);
}
} // namespace // NOLINT
using inference::Singleton; using inference::Singleton;
using inference::tensorrt::TensorRTEngine; using inference::tensorrt::TensorRTEngine;
using inference::tensorrt::TRTInt8Calibrator; using inference::tensorrt::TRTInt8Calibrator;
...@@ -79,6 +50,7 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -79,6 +50,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
bool enable_int8_; bool enable_int8_;
std::string calibration_data_; std::string calibration_data_;
std::string engine_key_; std::string engine_key_;
std::string engine_serialized_data_;
bool calibration_mode_; bool calibration_mode_;
public: public:
...@@ -93,6 +65,7 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -93,6 +65,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
enable_int8_ = Attr<bool>("enable_int8"); enable_int8_ = Attr<bool>("enable_int8");
calibration_data_ = Attr<std::string>("calibration_data"); calibration_data_ = Attr<std::string>("calibration_data");
engine_key_ = Attr<std::string>("engine_key"); engine_key_ = Attr<std::string>("engine_key");
engine_serialized_data_ = Attr<std::string>("engine_serialized_data");
auto params = Attr<std::vector<std::string>>("parameters"); auto params = Attr<std::vector<std::string>>("parameters");
for (const auto &param : params) { for (const auto &param : params) {
...@@ -125,7 +98,8 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -125,7 +98,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
RunCalibration(scope, dev_place); RunCalibration(scope, dev_place);
return; return;
} }
RunTrt(scope, dev_place); auto *trt_engine = GetEngine(scope, dev_place);
RunTrt(scope, dev_place, trt_engine);
} }
void RunCalibration(const framework::Scope &scope, void RunCalibration(const framework::Scope &scope,
...@@ -136,10 +110,6 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -136,10 +110,6 @@ class TensorRTEngineOp : public framework::OperatorBase {
LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_ LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_
<< " is running calibration trt int8... "; << " is running calibration trt int8... ";
int runtime_batch = 1; int runtime_batch = 1;
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_key_)) { if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_key_)) {
TRTCalibratorEngine *calib_res = TRTCalibratorEngine *calib_res =
Singleton<TRTCalibratorEngineManager>::Global().Create(engine_key_); Singleton<TRTCalibratorEngineManager>::Global().Create(engine_key_);
...@@ -156,11 +126,11 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -156,11 +126,11 @@ class TensorRTEngineOp : public framework::OperatorBase {
calib_buffers, runtime_batch, engine_key_, dev_place)); calib_buffers, runtime_batch, engine_key_, dev_place));
calib_res->thr_.reset(new std::thread([&]() { calib_res->thr_.reset(new std::thread([&]() {
calib_res->engine_.reset(new TensorRTEngine( calib_res->engine_.reset(new TensorRTEngine(
max_batch_size_, workspace_size_, stream, max_batch_size_, workspace_size_, enable_int8_,
boost::get<platform::CUDAPlace>(dev_place).device, enable_int8_, calib_res->calib_.get(),
calib_res->calib_.get())); boost::get<platform::CUDAPlace>(dev_place).device));
VLOG(3) << "start the calib trt engine thread"; VLOG(3) << "start the calib trt engine thread";
Prepare(scope, dev_place, calib_res->engine_.get()); PrepareTRTEngine(scope, calib_res->engine_.get());
})); }));
} }
...@@ -180,28 +150,29 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -180,28 +150,29 @@ class TensorRTEngineOp : public framework::OperatorBase {
RunNativeImpl(scope, dev_place); RunNativeImpl(scope, dev_place);
} }
void RunTrt(const framework::Scope &scope, void RunTrt(const framework::Scope &scope, const platform::Place &dev_place,
const platform::Place &dev_place) const { TensorRTEngine *engine) const {
int runtime_batch = 1; int runtime_batch = 1;
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place); auto &dev_ctx = *pool.Get(dev_place);
auto stream = auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream(); reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
if (trt_engine_.get() == nullptr) {
trt_engine_.reset(
new TensorRTEngine(max_batch_size_, workspace_size_, stream,
boost::get<platform::CUDAPlace>(dev_place).device,
enable_int8_, calibrator_.get()));
Prepare(scope, dev_place, trt_engine_.get());
}
auto *engine = trt_engine_.get();
PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs"); PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");
std::vector<std::string> output_maps = std::vector<std::string> output_maps =
Attr<std::vector<std::string>>("output_name_mapping"); Attr<std::vector<std::string>>("output_name_mapping");
// Convert input tensor from fluid to engine. int num_inputs = 0;
for (const auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue;
num_inputs += 1;
}
const int num_bindings = num_inputs + Outputs("Ys").size();
std::vector<void *> buffers(num_bindings);
// Bind input tensor to TRT.
for (const auto &x : Inputs("Xs")) { for (const auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue; if (param_names_.count(x)) continue;
// convert input and copy to TRT engine's buffer // convert input and copy to TRT engine's buffer
...@@ -209,28 +180,20 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -209,28 +180,20 @@ class TensorRTEngineOp : public framework::OperatorBase {
inference::analysis::GetFromScope<framework::LoDTensor>(scope, x); inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
auto t_shape = framework::vectorize(t.dims()); auto t_shape = framework::vectorize(t.dims());
runtime_batch = t_shape[0]; runtime_batch = t_shape[0];
if (platform::is_cpu_place(t.place())) {
engine->SetInputFromCPU(x, static_cast<const void *>(t.data<void>()),
t.memory_size());
} else {
engine->SetInputFromGPU(x, static_cast<const void *>(t.data<void>()),
t.memory_size());
}
}
cudaStreamSynchronize(stream); const int bind_index = engine->engine()->getBindingIndex(x.c_str());
PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_); PADDLE_ENFORCE(bind_index < num_bindings,
// Execute the engine. "The bind index should be less than num_bindings");
engine->Execute(runtime_batch); buffers[bind_index] = static_cast<void *>(t.data<float>());
}
// Convert output tensor from engine to fluid // Bind output tensor to TRT.
int output_index = 0; int output_index = 0;
VLOG(4) << "TensorRT Engine Op Outputs:"; VLOG(4) << "TensorRT Engine Op Outputs:";
for (const auto &y : Outputs("Ys")) { for (const auto &y : Outputs("Ys")) {
VLOG(4) << y; const int bind_index =
// convert output and copy to fluid. engine->engine()->getBindingIndex(output_maps[output_index].c_str());
nvinfer1::ITensor *trt_t = engine->GetITensor(output_maps[output_index]); auto dims = engine->engine()->getBindingDimensions(bind_index);
auto dims = trt_t->getDimensions();
// Use the output ITensor's dims to reshape the Fluid Tensor. // Use the output ITensor's dims to reshape the Fluid Tensor.
// The ITensor doesn't contain the batch size dim. // The ITensor doesn't contain the batch size dim.
std::vector<int> ddim; std::vector<int> ddim;
...@@ -238,71 +201,55 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -238,71 +201,55 @@ class TensorRTEngineOp : public framework::OperatorBase {
for (int i = 0; i < dims.nbDims; i++) { for (int i = 0; i < dims.nbDims; i++) {
ddim.push_back(dims.d[i]); ddim.push_back(dims.d[i]);
} }
auto *fluid_v = scope.FindVar(y); auto *fluid_v = scope.FindVar(y);
PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>(); auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
fluid_t->Resize(framework::make_ddim(ddim)); fluid_t->Resize(framework::make_ddim(ddim));
// TODO(Superjomn) change this float to dtype size. PADDLE_ENFORCE(bind_index < num_bindings,
auto size = "The bind index should be less than num_bindings");
inference::analysis::AccuDims(dims.d, dims.nbDims) * runtime_batch; buffers[bind_index] = static_cast<void *>(fluid_t->mutable_data<float>(
engine->GetOutputInGPU( boost::get<platform::CUDAPlace>(dev_place)));
output_maps[output_index],
fluid_t->mutable_data<float>(platform::CUDAPlace(
boost::get<platform::CUDAPlace>(dev_place).device)),
size * sizeof(float));
output_index += 1; output_index += 1;
} }
PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
// Execute the engine.
engine->Execute(runtime_batch, &buffers, stream);
cudaStreamSynchronize(stream); cudaStreamSynchronize(stream);
} }
void Prepare(const framework::Scope &scope, const platform::Place &dev_place, TensorRTEngine *GetEngine(const framework::Scope &scope,
TensorRTEngine *engine) const { const platform::Place &dev_place) const {
if (!trt_engine_) {
trt_engine_.reset(new inference::tensorrt::TensorRTEngine(
max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(),
boost::get<platform::CUDAPlace>(dev_place).device));
if (!engine_serialized_data_.empty()) {
trt_engine_->Deserialize(engine_serialized_data_);
} else {
PrepareTRTEngine(scope, trt_engine_.get());
}
}
return trt_engine_.get();
}
void PrepareTRTEngine(const framework::Scope &scope,
TensorRTEngine *engine) const {
LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
"kernel etc). This process may cost a lot of time."; "kernel etc). This process may cost a lot of time.";
framework::proto::BlockDesc block_desc; framework::proto::BlockDesc block_proto;
block_desc.ParseFromString(Attr<std::string>("subgraph")); block_proto.ParseFromString(Attr<std::string>("subgraph"));
framework::BlockDesc block_desc(nullptr, &block_proto);
std::vector<std::string> output_maps = std::vector<std::string> inputs = Inputs("Xs");
std::vector<std::string> outputs =
Attr<std::vector<std::string>>("output_name_mapping"); Attr<std::vector<std::string>>("output_name_mapping");
engine->InitNetwork();
framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
VLOG(4) << "parsed var size " << block.AllVars().size();
// Add inputs
VLOG(4) << "declare inputs";
for (auto &input : Inputs("Xs")) {
if (param_names_.count(input)) continue;
VLOG(4) << "declare input " << input;
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
auto t_shape = framework::vectorize(t.dims());
auto *var = block.FindVar(input);
// TensorRT engine need to create parameters. The parameter's description
// should be set in
PADDLE_ENFORCE(var, "no variable called %s", input);
PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
"TensorRT engine only takes LoDTensor as input");
engine->DeclareInput(
input, FluidDataType2TRT(
var->Proto()->type().lod_tensor().tensor().data_type()),
Vec2TRT_Dims(t_shape));
}
inference::Singleton<inference::tensorrt::OpConverter>::Global() inference::Singleton<inference::tensorrt::OpConverter>::Global()
.ConvertBlock(block_desc, param_names_, scope, engine); .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_,
outputs, engine);
// Add outputs
for (auto &output : output_maps) {
engine->DeclareOutput(output);
}
engine->FreezeNetwork();
} }
}; };
......
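The new engine_serialized_data attribute carries a serialized ICudaEngine so GetEngine can call Deserialize() and skip rebuilding the network instead of converting the subgraph on every first run. A rough sketch of that round trip against the raw TensorRT API (assumed TensorRT 4/5-era signatures; Paddle's TensorRTEngine wraps the equivalent calls):

// trt_serialize_sketch.cc -- illustration of the serialize/deserialize round
// trip behind engine_serialized_data; not Paddle code.
#include <string>
#include <NvInfer.h>

class SilentLogger : public nvinfer1::ILogger {
  void log(Severity, const char*) override {}  // drop TensorRT messages
};

// Serialize a built engine into a string that could be stored in the op attr.
std::string SerializeEngine(nvinfer1::ICudaEngine* engine) {
  nvinfer1::IHostMemory* blob = engine->serialize();
  std::string data(static_cast<const char*>(blob->data()), blob->size());
  blob->destroy();
  return data;
}

// Rebuild the engine from the stored string, skipping network construction.
nvinfer1::ICudaEngine* DeserializeEngine(const std::string& data,
                                         SilentLogger* logger) {
  nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(*logger);
  return runtime->deserializeCudaEngine(data.data(), data.size(), nullptr);
}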
...@@ -107,6 +107,7 @@ TEST(TensorRTEngineOp, manual) { ...@@ -107,6 +107,7 @@ TEST(TensorRTEngineOp, manual) {
engine_op_desc.SetAttr("output_name_mapping", engine_op_desc.SetAttr("output_name_mapping",
std::vector<std::string>({"z0"})); std::vector<std::string>({"z0"}));
engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
LOG(INFO) << "create engine op"; LOG(INFO) << "create engine op";
auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
...@@ -202,6 +203,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { ...@@ -202,6 +203,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
engine_op_desc.SetAttr("output_name_mapping", engine_op_desc.SetAttr("output_name_mapping",
std::vector<std::string>({"z3"})); std::vector<std::string>({"z3"}));
engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
......
...@@ -65,7 +65,8 @@ void BindInferenceApi(py::module *m) { ...@@ -65,7 +65,8 @@ void BindInferenceApi(py::module *m) {
void BindPaddleDType(py::module *m) { void BindPaddleDType(py::module *m) {
py::enum_<PaddleDType>(*m, "PaddleDType") py::enum_<PaddleDType>(*m, "PaddleDType")
.value("FLOAT32", PaddleDType::FLOAT32) .value("FLOAT32", PaddleDType::FLOAT32)
.value("INT64", PaddleDType::INT64); .value("INT64", PaddleDType::INT64)
.value("INT32", PaddleDType::INT32);
} }
void BindPaddleBuf(py::module *m) { void BindPaddleBuf(py::module *m) {
...@@ -103,6 +104,11 @@ void BindPaddleBuf(py::module *m) { ...@@ -103,6 +104,11 @@ void BindPaddleBuf(py::module *m) {
int64_t *data = static_cast<int64_t *>(self.data()); int64_t *data = static_cast<int64_t *>(self.data());
return {data, data + self.length() / sizeof(*data)}; return {data, data + self.length() / sizeof(*data)};
}) })
.def("int32_data",
[](PaddleBuf &self) -> std::vector<int32_t> {
int32_t *data = static_cast<int32_t *>(self.data());
return {data, data + self.length() / sizeof(*data)};
})
.def("length", &PaddleBuf::length); .def("length", &PaddleBuf::length);
} }
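With INT32 added to PaddleDType and int32_data added to PaddleBuf above, Python-side inference code can read int32 outputs the same way it already reads float and int64 buffers. A minimal sketch, assuming the compiled paddle.fluid.core module and a PaddleTensor output returned by a predictor (the float_data/int64_data accessors and the PaddleTensor fields come from the existing bindings, not from this diff):

import numpy as np
import paddle.fluid.core as core

def tensor_to_ndarray(tensor):
    # Pick the buffer accessor that matches the tensor's dtype.
    if tensor.dtype == core.PaddleDType.INT32:
        data = tensor.data.int32_data()   # accessor bound in this diff
    elif tensor.dtype == core.PaddleDType.INT64:
        data = tensor.data.int64_data()
    else:
        data = tensor.data.float_data()
    return np.array(data).reshape(tensor.shape)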
...@@ -221,7 +227,8 @@ void BindAnalysisConfig(py::module *m) { ...@@ -221,7 +227,8 @@ void BindAnalysisConfig(py::module *m) {
.def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine, .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine,
py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1, py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1,
py::arg("min_subgraph_size") = 3, py::arg("min_subgraph_size") = 3,
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32) py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
py::arg("use_static") = true)
.def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
.def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug, .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug,
py::arg("x") = true) py::arg("x") = true)
......
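The binding above also exposes the new use_static argument of EnableTensorRtEngine to Python; together with the engine_serialized_data attribute added to the engine op earlier in this diff, it lets a built TensorRT engine be serialized and reused instead of rebuilt on every run. A hedged usage sketch (the AnalysisConfig constructor taking a model directory and the values shown are assumptions based on the surrounding bindings, not confirmed by this diff):

from paddle.fluid.core import AnalysisConfig

config = AnalysisConfig("./mobilenet_model")        # hypothetical model directory
config.enable_tensorrt_engine(workspace_size=1 << 30,
                              max_batch_size=1,
                              min_subgraph_size=3,
                              use_static=True)      # new keyword argument
assert config.tensorrt_engine_enabled()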
...@@ -31,7 +31,7 @@ class RecordIOWriter { ...@@ -31,7 +31,7 @@ class RecordIOWriter {
RecordIOWriter(const std::string& filename, recordio::Compressor compressor, RecordIOWriter(const std::string& filename, recordio::Compressor compressor,
size_t max_num_record) size_t max_num_record)
: closed_(false), : closed_(false),
stream_(filename), stream_(filename, std::ios::binary),
writer_(&stream_, compressor, max_num_record) {} writer_(&stream_, compressor, max_num_record) {}
void AppendTensor(const framework::LoDTensor& tensor) { void AppendTensor(const framework::LoDTensor& tensor) {
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/recordio/scanner.h" #include "paddle/fluid/recordio/scanner.h"
#include <string> #include <string>
#include <utility>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -27,7 +28,8 @@ Scanner::Scanner(std::unique_ptr<std::istream> &&stream) ...@@ -27,7 +28,8 @@ Scanner::Scanner(std::unique_ptr<std::istream> &&stream)
} }
Scanner::Scanner(const std::string &filename) Scanner::Scanner(const std::string &filename)
: stream_(new std::ifstream(filename)), parser_(*stream_) { : stream_(new std::ifstream(filename, std::ios::in | std::ios::binary)),
parser_(*stream_) {
PADDLE_ENFORCE(static_cast<bool>(*stream_), "Cannot open file %s", filename); PADDLE_ENFORCE(static_cast<bool>(*stream_), "Cannot open file %s", filename);
Reset(); Reset();
} }
......
...@@ -37,7 +37,7 @@ def _place_obj(place): ...@@ -37,7 +37,7 @@ def _place_obj(place):
def _is_pserver_mode(main_program): def _is_pserver_mode(main_program):
main = main_program if main_program \ main = main_program if main_program \
else default_main_program() else framework.default_main_program()
for op in main.global_block().ops: for op in main.global_block().ops:
if op.type in ["send", "recv"]: if op.type in ["send", "recv"]:
return True return True
......
...@@ -123,7 +123,7 @@ class TestQuantizationTransformPass(unittest.TestCase): ...@@ -123,7 +123,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
arg_name.endswith('.quantized.dequantized')) arg_name.endswith('.quantized.dequantized'))
self.assertTrue(arg_name in quantized_ops) self.assertTrue(arg_name in quantized_ops)
def linear_fc_quant(self, quant_type): def linear_fc_quant(self, quant_type, for_ci=False):
main = fluid.Program() main = fluid.Program()
startup = fluid.Program() startup = fluid.Program()
with fluid.program_guard(main, startup): with fluid.program_guard(main, startup):
...@@ -138,29 +138,29 @@ class TestQuantizationTransformPass(unittest.TestCase): ...@@ -138,29 +138,29 @@ class TestQuantizationTransformPass(unittest.TestCase):
place=place, place=place,
activation_quantize_type=quant_type) activation_quantize_type=quant_type)
transform_pass.apply(graph) transform_pass.apply(graph)
marked_nodes = set() if not for_ci:
for op in graph.all_op_nodes(): marked_nodes = set()
if op.name().find('quantize') > -1: for op in graph.all_op_nodes():
marked_nodes.add(op) if op.name().find('quantize') > -1:
graph.draw('.', 'quantize_fc_' + quant_type, marked_nodes) marked_nodes.add(op)
graph.draw('.', 'quantize_fc_' + quant_type, marked_nodes)
program = graph.to_program() program = graph.to_program()
self.check_program(transform_pass, program) self.check_program(transform_pass, program)
val_graph = IrGraph(core.Graph(program.desc), for_test=False) val_graph = IrGraph(core.Graph(program.desc), for_test=False)
val_marked_nodes = set() if not for_ci:
for op in val_graph.all_op_nodes(): val_marked_nodes = set()
if op.name().find('quantize') > -1: for op in val_graph.all_op_nodes():
val_marked_nodes.add(op) if op.name().find('quantize') > -1:
val_graph.draw('.', 'val_fc_' + quant_type, val_marked_nodes) val_marked_nodes.add(op)
val_graph.draw('.', 'val_fc_' + quant_type, val_marked_nodes)
def test_linear_fc_quant_abs_max(self): def test_linear_fc_quant_abs_max(self):
self.act_quant_op_type = 'fake_quantize_abs_max' self.linear_fc_quant('abs_max', for_ci=True)
self.linear_fc_quant('abs_max')
def test_linear_fc_quant_range_abs_max(self): def test_linear_fc_quant_range_abs_max(self):
self.act_quant_op_type = 'fake_quantize_range_abs_max' self.linear_fc_quant('range_abs_max', for_ci=True)
self.linear_fc_quant('range_abs_max')
def residual_block_quant(self, quant_type): def residual_block_quant(self, quant_type, for_ci=False):
main = fluid.Program() main = fluid.Program()
startup = fluid.Program() startup = fluid.Program()
with fluid.program_guard(main, startup): with fluid.program_guard(main, startup):
...@@ -175,31 +175,31 @@ class TestQuantizationTransformPass(unittest.TestCase): ...@@ -175,31 +175,31 @@ class TestQuantizationTransformPass(unittest.TestCase):
place=place, place=place,
activation_quantize_type=quant_type) activation_quantize_type=quant_type)
transform_pass.apply(graph) transform_pass.apply(graph)
marked_nodes = set() if not for_ci:
for op in graph.all_op_nodes(): marked_nodes = set()
if op.name().find('quantize') > -1: for op in graph.all_op_nodes():
marked_nodes.add(op) if op.name().find('quantize') > -1:
graph.draw('.', 'quantize_residual_' + quant_type, marked_nodes) marked_nodes.add(op)
graph.draw('.', 'quantize_residual_' + quant_type, marked_nodes)
program = graph.to_program() program = graph.to_program()
self.check_program(transform_pass, program) self.check_program(transform_pass, program)
val_graph = IrGraph(core.Graph(program.desc), for_test=False) val_graph = IrGraph(core.Graph(program.desc), for_test=False)
val_marked_nodes = set() if not for_ci:
for op in val_graph.all_op_nodes(): val_marked_nodes = set()
if op.name().find('quantize') > -1: for op in val_graph.all_op_nodes():
val_marked_nodes.add(op) if op.name().find('quantize') > -1:
val_graph.draw('.', 'val_residual_' + quant_type, val_marked_nodes) val_marked_nodes.add(op)
val_graph.draw('.', 'val_residual_' + quant_type, val_marked_nodes)
def test_residual_block_abs_max(self): def test_residual_block_abs_max(self):
self.act_quant_op_type = 'fake_quantize_abs_max' self.residual_block_quant('abs_max', for_ci=True)
self.residual_block_quant('abs_max')
def test_residual_block_range_abs_max(self): def test_residual_block_range_abs_max(self):
self.act_quant_op_type = 'fake_quantize_range_abs_max' self.residual_block_quant('range_abs_max', for_ci=True)
self.residual_block_quant('range_abs_max')
class TestQuantizationFreezePass(unittest.TestCase): class TestQuantizationFreezePass(unittest.TestCase):
def freeze_graph(self, use_cuda, seed, quant_type): def freeze_graph(self, use_cuda, seed, quant_type, for_ci=False):
def build_program(main, startup, is_test): def build_program(main, startup, is_test):
main.random_seed = seed main.random_seed = seed
startup.random_seed = seed startup.random_seed = seed
...@@ -237,16 +237,17 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -237,16 +237,17 @@ class TestQuantizationFreezePass(unittest.TestCase):
transform_pass.apply(main_graph) transform_pass.apply(main_graph)
transform_pass.apply(test_graph) transform_pass.apply(test_graph)
dev_name = '_gpu_' if use_cuda else '_cpu_' dev_name = '_gpu_' if use_cuda else '_cpu_'
marked_nodes = set() if not for_ci:
for op in main_graph.all_op_nodes(): marked_nodes = set()
if op.name().find('quantize') > -1: for op in main_graph.all_op_nodes():
marked_nodes.add(op) if op.name().find('quantize') > -1:
main_graph.draw('.', 'main' + dev_name + quant_type, marked_nodes) marked_nodes.add(op)
marked_nodes = set() main_graph.draw('.', 'main' + dev_name + quant_type, marked_nodes)
for op in test_graph.all_op_nodes(): marked_nodes = set()
if op.name().find('quantize') > -1: for op in test_graph.all_op_nodes():
marked_nodes.add(op) if op.name().find('quantize') > -1:
test_graph.draw('.', 'test' + dev_name + quant_type, marked_nodes) marked_nodes.add(op)
test_graph.draw('.', 'test' + dev_name + quant_type, marked_nodes)
quantized_main_program = main_graph.to_program() quantized_main_program = main_graph.to_program()
quantized_test_program = test_graph.to_program() quantized_test_program = test_graph.to_program()
...@@ -266,7 +267,9 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -266,7 +267,9 @@ class TestQuantizationFreezePass(unittest.TestCase):
loss_v = exe.run(program=quantized_main_program, loss_v = exe.run(program=quantized_main_program,
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[loss]) fetch_list=[loss])
print('{}: {}'.format('loss' + dev_name + quant_type, loss_v)) if not for_ci:
print('{}: {}'.format('loss' + dev_name + quant_type,
loss_v))
test_data = next(test_reader()) test_data = next(test_reader())
with fluid.program_guard(quantized_test_program): with fluid.program_guard(quantized_test_program):
...@@ -281,12 +284,13 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -281,12 +284,13 @@ class TestQuantizationFreezePass(unittest.TestCase):
# Freeze graph for inference, but the weight of fc/conv is still float type. # Freeze graph for inference, but the weight of fc/conv is still float type.
freeze_pass = QuantizationFreezePass(scope=scope, place=place) freeze_pass = QuantizationFreezePass(scope=scope, place=place)
freeze_pass.apply(test_graph) freeze_pass.apply(test_graph)
marked_nodes = set() if not for_ci:
for op in test_graph.all_op_nodes(): marked_nodes = set()
if op.name().find('quantize') > -1: for op in test_graph.all_op_nodes():
marked_nodes.add(op) if op.name().find('quantize') > -1:
test_graph.draw('.', 'test_freeze' + dev_name + quant_type, marked_nodes.add(op)
marked_nodes) test_graph.draw('.', 'test_freeze' + dev_name + quant_type,
marked_nodes)
server_program = test_graph.to_program() server_program = test_graph.to_program()
with fluid.scope_guard(scope): with fluid.scope_guard(scope):
...@@ -294,24 +298,30 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -294,24 +298,30 @@ class TestQuantizationFreezePass(unittest.TestCase):
feed=feeder.feed(test_data), feed=feeder.feed(test_data),
fetch_list=[loss]) fetch_list=[loss])
self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3) self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
print('{}: {}'.format('test_loss1' + dev_name + quant_type, test_loss1)) if not for_ci:
print('{}: {}'.format('test_loss2' + dev_name + quant_type, test_loss2)) print('{}: {}'.format('test_loss1' + dev_name + quant_type,
test_loss1))
print('{}: {}'.format('test_loss2' + dev_name + quant_type,
test_loss2))
w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor()) w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
            # This check may fail due to limited calculation precision # This check may fail due to limited calculation precision
# self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant)) # self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
print('{}: {}'.format('w_freeze' + dev_name + quant_type, if not for_ci:
np.sum(w_freeze))) print('{}: {}'.format('w_freeze' + dev_name + quant_type,
print('{}: {}'.format('w_quant' + dev_name + quant_type, np.sum(w_freeze)))
np.sum(w_quant))) print('{}: {}'.format('w_quant' + dev_name + quant_type,
np.sum(w_quant)))
# Convert parameter to 8-bit. # Convert parameter to 8-bit.
convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place) convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
convert_int8_pass.apply(test_graph) convert_int8_pass.apply(test_graph)
marked_nodes = set() if not for_ci:
for op in test_graph.all_op_nodes(): marked_nodes = set()
if op.name().find('quantize') > -1: for op in test_graph.all_op_nodes():
marked_nodes.add(op) if op.name().find('quantize') > -1:
test_graph.draw('.', 'test_int8' + dev_name + quant_type, marked_nodes) marked_nodes.add(op)
test_graph.draw('.', 'test_int8' + dev_name + quant_type,
marked_nodes)
server_program_int8 = test_graph.to_program() server_program_int8 = test_graph.to_program()
# Save the 8-bit parameter and model file. # Save the 8-bit parameter and model file.
with fluid.scope_guard(scope): with fluid.scope_guard(scope):
...@@ -325,18 +335,21 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -325,18 +335,21 @@ class TestQuantizationFreezePass(unittest.TestCase):
w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor()) w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor())
self.assertEqual(w_8bit.dtype, np.int8) self.assertEqual(w_8bit.dtype, np.int8)
self.assertEqual(np.sum(w_8bit), np.sum(w_freeze)) self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
print('{}: {}'.format('w_8bit' + dev_name + quant_type, np.sum(w_8bit))) if not for_ci:
print('{}: {}'.format('w_freeze' + dev_name + quant_type, print('{}: {}'.format('w_8bit' + dev_name + quant_type,
np.sum(w_freeze))) np.sum(w_8bit)))
print('{}: {}'.format('w_freeze' + dev_name + quant_type,
np.sum(w_freeze)))
mobile_pass = TransformForMobilePass() mobile_pass = TransformForMobilePass()
mobile_pass.apply(test_graph) mobile_pass.apply(test_graph)
marked_nodes = set() if not for_ci:
for op in test_graph.all_op_nodes(): marked_nodes = set()
if op.name().find('quantize') > -1: for op in test_graph.all_op_nodes():
marked_nodes.add(op) if op.name().find('quantize') > -1:
test_graph.draw('.', 'test_mobile' + dev_name + quant_type, marked_nodes.add(op)
marked_nodes) test_graph.draw('.', 'test_mobile' + dev_name + quant_type,
marked_nodes)
mobile_program = test_graph.to_program() mobile_program = test_graph.to_program()
with fluid.scope_guard(scope): with fluid.scope_guard(scope):
...@@ -347,20 +360,23 @@ class TestQuantizationFreezePass(unittest.TestCase): ...@@ -347,20 +360,23 @@ class TestQuantizationFreezePass(unittest.TestCase):
def test_freeze_graph_cuda_dynamic(self): def test_freeze_graph_cuda_dynamic(self):
if fluid.core.is_compiled_with_cuda(): if fluid.core.is_compiled_with_cuda():
with fluid.unique_name.guard(): with fluid.unique_name.guard():
self.freeze_graph(True, seed=1, quant_type='abs_max') self.freeze_graph(
True, seed=1, quant_type='abs_max', for_ci=True)
def test_freeze_graph_cpu_dynamic(self): def test_freeze_graph_cpu_dynamic(self):
with fluid.unique_name.guard(): with fluid.unique_name.guard():
self.freeze_graph(False, seed=2, quant_type='abs_max') self.freeze_graph(False, seed=2, quant_type='abs_max', for_ci=True)
def test_freeze_graph_cuda_static(self): def test_freeze_graph_cuda_static(self):
if fluid.core.is_compiled_with_cuda(): if fluid.core.is_compiled_with_cuda():
with fluid.unique_name.guard(): with fluid.unique_name.guard():
self.freeze_graph(True, seed=1, quant_type='range_abs_max') self.freeze_graph(
True, seed=1, quant_type='range_abs_max', for_ci=True)
def test_freeze_graph_cpu_static(self): def test_freeze_graph_cpu_static(self):
with fluid.unique_name.guard(): with fluid.unique_name.guard():
self.freeze_graph(False, seed=2, quant_type='range_abs_max') self.freeze_graph(
False, seed=2, quant_type='range_abs_max', for_ci=True)
if __name__ == '__main__': if __name__ == '__main__':
......
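The change repeated throughout the quantization tests above is a single pattern: graph visualization (graph.draw) and the debug prints only run when for_ci is False, so CI runs stay quiet and write no .dot files. A condensed sketch of that pattern (the helper name is illustrative and not part of the test file):

def draw_quant_ops(graph, name, for_ci=False):
    # Skip visualization entirely in CI; it is only useful for local debugging.
    if for_ci:
        return
    marked_nodes = set()
    for op in graph.all_op_nodes():
        if op.name().find('quantize') > -1:
            marked_nodes.add(op)
    graph.draw('.', name, marked_nodes)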
...@@ -205,7 +205,7 @@ class FC(layers.Layer): ...@@ -205,7 +205,7 @@ class FC(layers.Layer):
self._num_flatten_dims = num_flatten_dims self._num_flatten_dims = num_flatten_dims
self._dtype = dtype self._dtype = dtype
self._param_attr = param_attr self._param_attr = param_attr
self._bias_attr = param_attr self._bias_attr = bias_attr
self._act = act self._act = act
def _build_once(self, input): def _build_once(self, input):
...@@ -219,10 +219,10 @@ class FC(layers.Layer): ...@@ -219,10 +219,10 @@ class FC(layers.Layer):
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False)
if self._param_attr: if self._bias_attr:
size = list([self._size]) size = list([self._size])
self._b = self.create_parameter( self._b = self.create_parameter(
attr=self._param_attr, attr=self._bias_attr,
shape=size, shape=size,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True)
......
...@@ -270,6 +270,9 @@ class LayerHelperBase(object): ...@@ -270,6 +270,9 @@ class LayerHelperBase(object):
attr = copy.deepcopy(attr) attr = copy.deepcopy(attr)
if attr is None: if attr is None:
attr = ParamAttr._to_attr(attr) attr = ParamAttr._to_attr(attr)
if not attr:
return None
assert isinstance(attr, ParamAttr) assert isinstance(attr, ParamAttr)
suffix = 'b' if is_bias else 'w' suffix = 'b' if is_bias else 'w'
if attr.name is None: if attr.name is None:
......
...@@ -51,6 +51,7 @@ __all__ = [ ...@@ -51,6 +51,7 @@ __all__ = [
'yolov3_loss', 'yolov3_loss',
'box_clip', 'box_clip',
'multiclass_nms', 'multiclass_nms',
'distribute_fpn_proposals',
'box_decoder_and_assign', 'box_decoder_and_assign',
] ]
...@@ -2224,6 +2225,79 @@ def multiclass_nms(bboxes, ...@@ -2224,6 +2225,79 @@ def multiclass_nms(bboxes,
return output return output
def distribute_fpn_proposals(fpn_rois,
min_level,
max_level,
refer_level,
refer_scale,
name=None):
"""
    In Feature Pyramid Networks (FPN) models, all proposals need to be
    distributed to different FPN levels according to the scale of each
    proposal, the referring scale and the referring level. In addition, to
    restore the original order of the proposals, an index array is returned
    that records the position of each roi in the input. The FPN level of each
    roi is computed as follows:
.. math::
        roi\_scale &= \sqrt{BBoxArea(fpn\_roi)}
        level &= floor(\log_{2}(\\frac{roi\_scale}{refer\_scale}) + refer\_level)
where BBoxArea is a function to compute the area of each roi.
Args:
        fpn_rois(Variable): The input fpn_rois; the size of the second dimension is 4.
min_level(int): The lowest level of FPN layer where the proposals come
from.
max_level(int): The highest level of FPN layer where the proposals
come from.
refer_level(int): The referring level of FPN layer with specified scale.
refer_scale(int): The referring scale of FPN layer with specified level.
name(str|None): The name of this operator.
Returns:
tuple:
A tuple(multi_rois, restore_ind) is returned. The multi_rois is
a list of segmented tensor variables. The restore_ind is a 2D
Tensor with shape [N, 1], N is the number of total rois. It is
used to restore the order of fpn_rois.
Examples:
.. code-block:: python
fpn_rois = fluid.layers.data(
name='data', shape=[4], dtype='float32', lod_level=1)
multi_rois, restore_ind = fluid.layers.distribute_fpn_proposals(
fpn_rois=fpn_rois,
min_level=2,
max_level=5,
refer_level=4,
refer_scale=224)
"""
helper = LayerHelper('distribute_fpn_proposals', **locals())
dtype = helper.input_dtype()
num_lvl = max_level - min_level + 1
multi_rois = [
helper.create_variable_for_type_inference(dtype) for i in range(num_lvl)
]
restore_ind = helper.create_variable_for_type_inference(dtype='int32')
helper.append_op(
type='distribute_fpn_proposals',
inputs={'FpnRois': fpn_rois},
outputs={'MultiFpnRois': multi_rois,
'RestoreIndex': restore_ind},
attrs={
'min_level': min_level,
'max_level': max_level,
'refer_level': refer_level,
'refer_scale': refer_scale
})
return multi_rois, restore_ind
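As a quick numeric check of the level formula above, the NumPy sketch below mirrors the reference implementation used by the operator unit test later in this diff (base-2 logarithm, box width/height computed with +1); it is illustration only and not part of the fluid API:

import numpy as np

def fpn_level(roi, refer_level=4, refer_scale=224, min_level=2, max_level=5):
    # roi is [x1, y1, x2, y2]; the scale is the square root of the box area.
    w = roi[2] - roi[0] + 1
    h = roi[3] - roi[1] + 1
    scale = np.sqrt(w * h)
    lvl = np.floor(refer_level + np.log2(scale / refer_scale + 1e-6))
    return int(np.clip(lvl, min_level, max_level))

print(fpn_level([0, 0, 223, 223]))   # scale ~224 -> refer_level, i.e. level 4
print(fpn_level([0, 0, 55, 55]))     # scale ~56  -> two levels lower, level 2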
@templatedoc() @templatedoc()
def box_decoder_and_assign(prior_box, def box_decoder_and_assign(prior_box,
prior_box_var, prior_box_var,
......
...@@ -187,6 +187,7 @@ __all__ = [ ...@@ -187,6 +187,7 @@ __all__ = [
'teacher_student_sigmoid_loss', 'teacher_student_sigmoid_loss',
'huber_loss', 'huber_loss',
'tree_conv', 'tree_conv',
'npair_loss',
] ]
kIgnoreIndex = -100 kIgnoreIndex = -100
...@@ -6977,7 +6978,6 @@ def image_resize(input, ...@@ -6977,7 +6978,6 @@ def image_resize(input,
H_out = (H_{in}+0.5) * scale_{factor} - 0.5 H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5 W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else: else:
input : (N,C,H_in,W_in) input : (N,C,H_in,W_in)
...@@ -10652,3 +10652,60 @@ def tree_conv(nodes_vector, ...@@ -10652,3 +10652,60 @@ def tree_conv(nodes_vector,
else: else:
pre_activation = out pre_activation = out
return helper.append_activation(pre_activation) return helper.append_activation(pre_activation)
from .ops import square
from .control_flow import equal
def npair_loss(anchor, positive, labels, l2_reg=0.002):
'''
**Npair Loss Layer**
    Read `Improved Deep Metric Learning with Multi-class N-pair Loss Objective <http://www.nec-labs.com/uploads/images/Department-Images/MediaAnalytics/papers/nips16_npairmetriclearning.pdf>`_ .
    Npair loss requires paired data. It has two parts: the first part is the L2
    regularizer on the embedding vectors; the second part is the cross-entropy
    loss that takes the similarity matrix of anchor and positive as logits.
Args:
anchor(Variable): embedding vector for the anchor image. shape=[batch_size, embedding_dims]
positive(Variable): embedding vector for the positive image. shape=[batch_size, embedding_dims]
labels(Variable): 1-D tensor. shape=[batch_size]
l2_reg(float32): L2 regularization term on embedding vector, default: 0.002
Returns:
npair loss(Variable): return npair loss, shape=[1]
Examples:
.. code-block:: python
anchor = fluid.layers.data(
name = 'anchor', shape = [18, 6], dtype = 'float32', append_batch_size=False)
positive = fluid.layers.data(
name = 'positive', shape = [18, 6], dtype = 'float32', append_batch_size=False)
labels = fluid.layers.data(
name = 'labels', shape = [18], dtype = 'float32', append_batch_size=False)
npair_loss = fluid.layers.npair_loss(anchor, positive, labels, l2_reg = 0.002)
'''
Beta = 0.25
batch_size = labels.shape[0]
labels = reshape(labels, shape=[batch_size, 1], inplace=True)
labels = expand(labels, expand_times=[1, batch_size])
labels = equal(labels, transpose(labels, perm=[1, 0])).astype('float32')
labels = labels / reduce_sum(labels, dim=1, keep_dim=True)
l2loss = reduce_mean(reduce_sum(square(anchor), 1)) \
+ reduce_mean(reduce_sum(square(positive), 1))
l2loss = l2loss * Beta * l2_reg
similarity_matrix = matmul(
anchor, positive, transpose_x=False, transpose_y=True)
softmax_value = softmax(similarity_matrix)
cross_entropy = -1 * reduce_sum(labels * log(softmax_value), 0)
celoss = reduce_mean(cross_entropy)
return l2loss + celoss
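For reference, the same computation written directly in NumPy (a standalone sketch of the math above; nothing here is part of the fluid API, and the max-subtraction in the softmax is only a numerical-stability choice):

import numpy as np

def npair_loss_np(anchor, positive, labels, l2_reg=0.002):
    batch_size = labels.shape[0]
    # Row-normalized label-similarity matrix, as built by the layer above.
    lab = (labels.reshape(batch_size, 1) == labels.reshape(1, batch_size))
    lab = lab.astype('float32')
    lab = lab / lab.sum(axis=1, keepdims=True)
    # L2 term on both embeddings, scaled by Beta (0.25) and l2_reg.
    l2loss = (np.square(anchor).sum(axis=1).mean() +
              np.square(positive).sum(axis=1).mean()) * 0.25 * l2_reg
    # Similarity matrix as logits, softmax over the positive axis.
    logits = anchor.dot(positive.T)
    exp = np.exp(logits - logits.max(axis=1, keepdims=True))
    softmax = exp / exp.sum(axis=1, keepdims=True)
    celoss = (-(lab * np.log(softmax)).sum(axis=0)).mean()
    return l2loss + celoss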
...@@ -142,7 +142,8 @@ def create_global_var(shape, ...@@ -142,7 +142,8 @@ def create_global_var(shape,
def cast(x, dtype): def cast(x, dtype):
""" """
This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts
    it to the output with :attr:`dtype`. it to the output with :attr:`dtype`. Casting to the same dtype as the
    input is a no-op, but it is allowed.
Args: Args:
x (Variable): The input Variable for casting. x (Variable): The input Variable for casting.
......
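A minimal usage sketch of the cast behaviour documented above (layer names and shapes are illustrative):

import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[3], dtype='float32')
y = fluid.layers.cast(x, dtype='int64')      # cast float32 -> int64
z = fluid.layers.cast(x, dtype='float32')    # same dtype: allowed, effectively a no-op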
...@@ -504,5 +504,21 @@ class TestMulticlassNMS(unittest.TestCase): ...@@ -504,5 +504,21 @@ class TestMulticlassNMS(unittest.TestCase):
self.assertIsNotNone(output) self.assertIsNotNone(output)
class TestDistributeFpnProposals(unittest.TestCase):
def test_distribute_fpn_proposals(self):
program = Program()
with program_guard(program):
fpn_rois = fluid.layers.data(
name='data', shape=[4], dtype='float32', lod_level=1)
multi_rois, restore_ind = layers.distribute_fpn_proposals(
fpn_rois=fpn_rois,
min_level=2,
max_level=5,
refer_level=4,
refer_scale=224)
self.assertIsNotNone(multi_rois)
self.assertIsNotNone(restore_ind)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -26,8 +26,8 @@ class TestAccuracyOp(OpTest): ...@@ -26,8 +26,8 @@ class TestAccuracyOp(OpTest):
self.init_dtype() self.init_dtype()
n = 8192 n = 8192
infer = np.random.random((n, 1)).astype(self.dtype) infer = np.random.random((n, 1)).astype(self.dtype)
indices = np.random.randint(0, 2, (n, 1)) indices = np.random.randint(0, 2, (n, 1)).astype('int64')
label = np.random.randint(0, 2, (n, 1)) label = np.random.randint(0, 2, (n, 1)).astype('int64')
self.inputs = {'Out': infer, 'Indices': indices, "Label": label} self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
num_correct = 0 num_correct = 0
for rowid in range(n): for rowid in range(n):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from test_dist_base import TestDistBase
class TestDistMnistNCCL2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_reduce = False
self._use_reader_alloc = False
self._nccl2_mode = True
def test_dist_train(self):
import paddle.fluid as fluid
if fluid.core.is_compiled_with_cuda():
self.check_with_place(
"dist_mnist.py",
delta=1,
need_envs={
"FLAGS_enable_parallel_graph": "1",
"FLAGS_sync_nccl_allreduce": "1"
})
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import math
import sys
from op_test import OpTest
class TestDistributeFPNProposalsOp(OpTest):
def set_data(self):
self.init_test_case()
self.make_rois()
self.rois_fpn, self.rois_idx_restore = self.calc_rois_distribute()
self.inputs = {'FpnRois': (self.rois[:, 1:5], self.rois_lod)}
self.attrs = {
'max_level': self.roi_max_level,
'min_level': self.roi_min_level,
'refer_scale': self.canonical_scale,
'refer_level': self.canonical_level
}
output = [('out%d' % i, self.rois_fpn[i])
for i in range(len(self.rois_fpn))]
self.outputs = {
'MultiFpnRois': output,
'RestoreIndex': self.rois_idx_restore
}
def init_test_case(self):
self.roi_max_level = 5
self.roi_min_level = 2
self.canonical_scale = 224
self.canonical_level = 4
self.images_shape = [512, 512]
def boxes_area(self, boxes):
w = (boxes[:, 2] - boxes[:, 0] + 1)
h = (boxes[:, 3] - boxes[:, 1] + 1)
areas = w * h
        assert np.all(areas >= 0), 'Negative areas found'
return areas
def map_rois_to_fpn_levels(self, rois, lvl_min, lvl_max):
s = np.sqrt(self.boxes_area(rois))
s0 = self.canonical_scale
lvl0 = self.canonical_level
target_lvls = np.floor(lvl0 + np.log2(s / s0 + 1e-6))
target_lvls = np.clip(target_lvls, lvl_min, lvl_max)
return target_lvls
def get_sub_lod(self, sub_lvl):
sub_lod = []
max_batch_id = sub_lvl[-1]
for i in range(max_batch_id.astype(np.int32) + 1):
sub_lod.append(np.where(sub_lvl == i)[0].size)
return sub_lod
def add_multilevel_roi(self, rois, target_lvls, lvl_min, lvl_max):
rois_idx_order = np.empty((0, ))
rois_fpn = []
for lvl in range(lvl_min, lvl_max + 1):
idx_lvl = np.where(target_lvls == lvl)[0]
if len(idx_lvl) == 0:
rois_fpn.append((np.empty(shape=(0, 4)), [[0, 0]]))
continue
sub_lod = self.get_sub_lod(rois[idx_lvl, 0])
rois_fpn.append((rois[idx_lvl, 1:], [sub_lod]))
rois_idx_order = np.concatenate((rois_idx_order, idx_lvl))
rois_idx_restore = np.argsort(rois_idx_order).astype(
np.int32, copy=False)
return rois_fpn, rois_idx_restore
def calc_rois_distribute(self):
lvl_min = self.roi_min_level
lvl_max = self.roi_max_level
target_lvls = self.map_rois_to_fpn_levels(self.rois[:, 1:5], lvl_min,
lvl_max)
rois_fpn, rois_idx_restore = self.add_multilevel_roi(
self.rois, target_lvls, lvl_min, lvl_max)
return rois_fpn, rois_idx_restore
def make_rois(self):
self.rois_lod = [[100, 200]]
rois = []
lod = self.rois_lod[0]
bno = 0
for roi_num in lod:
for i in range(roi_num):
xywh = np.random.rand(4)
xy1 = xywh[0:2] * 20
wh = xywh[2:4] * (self.images_shape - xy1)
xy2 = xy1 + wh
roi = [bno, xy1[0], xy1[1], xy2[0], xy2[1]]
rois.append(roi)
bno += 1
self.rois = np.array(rois).astype("float32")
def setUp(self):
self.op_type = "distribute_fpn_proposals"
self.set_data()
def test_check_output(self):
self.check_output()
...@@ -31,7 +31,7 @@ class TestRandomCropOp(OpTest): ...@@ -31,7 +31,7 @@ class TestRandomCropOp(OpTest):
np.array([[6, 7, 8], [10, 11, 12]]).astype(np.int32) np.array([[6, 7, 8], [10, 11, 12]]).astype(np.int32)
] ]
self.op_type = "random_crop" self.op_type = "random_crop"
self.inputs = {'X': to_crop, 'Seed': np.array([10])} self.inputs = {'X': to_crop, 'Seed': np.array([10]).astype('int64')}
self.outputs = {'Out': np.array([]), 'SeedOut': np.array([])} self.outputs = {'Out': np.array([]), 'SeedOut': np.array([])}
self.attrs = {'shape': [2, 3]} self.attrs = {'shape': [2, 3]}
......