From b4194f294226a661688981929b864a760a3a73ec Mon Sep 17 00:00:00 2001
From: Jacek Czaja
Date: Tue, 21 Jan 2020 04:58:20 +0100
Subject: [PATCH] Design doc on NHWC layout support by MKL-DNN integration (#1727)

---
 .../data_transformation/data_transform.md     |  10 +-
 .../design/mkldnn/nhwc/images/nhwc-grad.svg   | 226 ++++++++++++++++++
 doc/fluid/design/mkldnn/nhwc/images/nhwc.svg  | 151 ++++++++++++
 doc/fluid/design/mkldnn/nhwc/index_en.rst     |   7 +
 doc/fluid/design/mkldnn/nhwc/nhwc.md          |  95 ++++++++
 .../design/mkldnn/nhwc/scripts/nhwc-grad.dot  |  45 ++++
 doc/fluid/design/mkldnn/nhwc/scripts/nhwc.dot |  17 ++
 7 files changed, 548 insertions(+), 3 deletions(-)
 create mode 100644 doc/fluid/design/mkldnn/nhwc/images/nhwc-grad.svg
 create mode 100644 doc/fluid/design/mkldnn/nhwc/images/nhwc.svg
 create mode 100644 doc/fluid/design/mkldnn/nhwc/index_en.rst
 create mode 100644 doc/fluid/design/mkldnn/nhwc/nhwc.md
 create mode 100644 doc/fluid/design/mkldnn/nhwc/scripts/nhwc-grad.dot
 create mode 100644 doc/fluid/design/mkldnn/nhwc/scripts/nhwc.dot

diff --git a/doc/fluid/design/mkldnn/data_transformation/data_transform.md b/doc/fluid/design/mkldnn/data_transformation/data_transform.md
index 0a11542b1..025277706 100644
--- a/doc/fluid/design/mkldnn/data_transformation/data_transform.md
+++ b/doc/fluid/design/mkldnn/data_transformation/data_transform.md
@@ -13,8 +13,12 @@ We can distinguish the following scenarios (presented in the picture below):

 ### Paddle(CPU) kernel is followed by MKL-DNN kernel
 When a Paddle(CPU) kernel has finished execution, its outcome is one or more Tensors of Paddle layout. Each of those
-Tensors to be feed into MKL-DNN kernel, needs to be transformed to be of MKL-DNN layout. For this scenario conversion of Paddle Tensor to MKL-DNN Tensor is done by just
-changing layout flag to MKL-DNN and picking MKL-DNN format that match Paddle Tensor rank. This is computationally cheap operation as there is no real data rearrangement.
+Tensors, to be fed into an MKL-DNN kernel, needs to be transformed into MKL-DNN layout. In this scenario the conversion of a Paddle Tensor to an MKL-DNN Tensor is done by:
+* changing the layout flag to MKL-DNN
+* picking the MKL-DNN format that matches the Paddle Tensor rank
+* rearranging the dims order to NCHW
+
+These are computationally cheap operations, as there is no real data rearrangement (see the sketch below). More information on the conversion from Paddle layout to an MKL-DNN Tensor can be found in the relevant [document](../nhwc/nhwc.md).

 This scenario is drawn in the picture with bold lines: starting from the Paddle(CPU) op on the left side, following the arrows drawn in bold, and finishing with the MKL-DNN op on the right side of the picture.
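
A minimal sketch of this metadata-only conversion (the enum values and helper names here are illustrative, not the actual Paddle internals):

    // Converting a Paddle Tensor descriptor to MKL-DNN layout touches only
    // metadata; the data buffer itself is left untouched.
    #include <algorithm>
    #include <cstdint>
    #include <vector>

    enum class DataLayout { kNCHW, kNHWC, kMKLDNN };
    enum class MKLDNNFormat { x, nc, nchw, ncdhw, undef };

    struct TensorDesc {
      std::vector<int64_t> dims;
      DataLayout layout;
      MKLDNNFormat format;
    };

    // Pick the plain MKL-DNN memory format that matches the Tensor rank.
    MKLDNNFormat FormatForRank(size_t rank) {
      switch (rank) {
        case 1: return MKLDNNFormat::x;
        case 2: return MKLDNNFormat::nc;
        case 4: return MKLDNNFormat::nchw;
        case 5: return MKLDNNFormat::ncdhw;
        default: return MKLDNNFormat::undef;
      }
    }

    void ToMKLDNN(TensorDesc* t) {
      if (t->layout == DataLayout::kNHWC && t->dims.size() == 4) {
        // dims order NHWC [N, H, W, C] -> NCHW [N, C, H, W]
        std::rotate(t->dims.begin() + 1, t->dims.end() - 1, t->dims.end());
      }
      t->format = FormatForRank(t->dims.size());
      t->layout = DataLayout::kMKLDNN;
    }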
@@ -22,7 +26,7 @@ This scenario is drawn in the picture with bold lines.

 ### MKL-DNN kernel is followed by Paddle(CPU) kernel
 In this situation an MKL-DNN kernel has finished its execution and as a result has produced one or more output Tensors. Each of those Tensors is of MKL-DNN layout and, to be fed into a Paddle(CPU) kernel, needs to be converted into Paddle layout. In detail, the MKL-DNN Tensor arrangement (mkl-dnn memory format) is checked for compatibility with the Paddle(CPU) layout; if it is compatible, then only the layout of the Tensor is set to Paddle and the mkl-dnn format is set to ``undef``. In case the MKL-DNN Tensor data arrangement is not compatible with the Paddle layout, an actual data rearrangement
-is performed. For example MKL-DNN Tensor is 4D and having format ``NCHW16C`` and to convert it into Paddle layout we need to rearrange data to be ``NCHW`` format. To do so
+is performed. For example, if an MKL-DNN Tensor is 4D with format ``NCHW16C``, then to convert it into a Paddle layout of ``NCHW`` we need to rearrange the data into the ``NCHW`` format. To do so,
 an MKL-DNN Reorder primitive is created that can do the data rearrangement (a sketch of the rearrangement follows below).

 This scenario is marked in the picture with outlined, empty-inside arrows: starting from the MKL-DNN op on the left side, following the empty arrows, and finishing with the Paddle(CPU) op on the right side of the picture.
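
For illustration only, this is the kind of data movement such a reorder performs when converting a blocked ``NCHW16C`` buffer into plain ``NCHW``. In Paddle the job is done by an MKL-DNN Reorder primitive, not by hand-written loops like these; C is assumed to be a multiple of 16 (no padding):

    #include <cstdint>
    #include <vector>

    // Copy a blocked NCHW16C buffer into a plain NCHW buffer.
    std::vector<float> NCHW16CToNCHW(const std::vector<float>& src, int64_t N,
                                     int64_t C, int64_t H, int64_t W) {
      const int64_t kBlock = 16;
      const int64_t Cb = C / kBlock;  // number of channel blocks
      std::vector<float> dst(static_cast<size_t>(N * C * H * W));
      for (int64_t n = 0; n < N; ++n)
        for (int64_t c = 0; c < C; ++c)
          for (int64_t h = 0; h < H; ++h)
            for (int64_t w = 0; w < W; ++w) {
              // source layout: [n][c / 16][h][w][c % 16]
              const int64_t src_idx =
                  (((n * Cb + c / kBlock) * H + h) * W + w) * kBlock +
                  c % kBlock;
              // destination layout: [n][c][h][w]
              const int64_t dst_idx = ((n * C + c) * H + h) * W + w;
              dst[dst_idx] = src[src_idx];
            }
      return dst;
    }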
diff --git a/doc/fluid/design/mkldnn/nhwc/images/nhwc-grad.svg b/doc/fluid/design/mkldnn/nhwc/images/nhwc-grad.svg
new file mode 100644
index 000000000..efc69b1cd
--- /dev/null
+++ b/doc/fluid/design/mkldnn/nhwc/images/nhwc-grad.svg
@@ -0,0 +1,226 @@
[SVG image: forward and backward execution of an NHWC model (Feed -> Conv -> Pool -> Mean -> Mean Grad -> Pool Grad -> Conv Grad -> Fetch) with layout/dim-format annotations on every Tensor; rendered from scripts/nhwc-grad.dot below]

diff --git a/doc/fluid/design/mkldnn/nhwc/images/nhwc.svg b/doc/fluid/design/mkldnn/nhwc/images/nhwc.svg
new file mode 100644
index 000000000..85eed3e7e
--- /dev/null
+++ b/doc/fluid/design/mkldnn/nhwc/images/nhwc.svg
@@ -0,0 +1,151 @@
[SVG image: forward execution of an NHWC model (Feed ops for input signal and filter data -> Conv -> Pool -> Fetch) with layout/dim-format annotations on every Tensor; rendered from scripts/nhwc.dot below]

diff --git a/doc/fluid/design/mkldnn/nhwc/index_en.rst b/doc/fluid/design/mkldnn/nhwc/index_en.rst
new file mode 100644
index 000000000..24bd3544a
--- /dev/null
+++ b/doc/fluid/design/mkldnn/nhwc/index_en.rst
@@ -0,0 +1,7 @@
MKL-DNN NHWC support
--------------------------------------

.. toctree::
   :maxdepth: 1

   nhwc.md

diff --git a/doc/fluid/design/mkldnn/nhwc/nhwc.md b/doc/fluid/design/mkldnn/nhwc/nhwc.md
new file mode 100644
index 000000000..2cba67eac
--- /dev/null
+++ b/doc/fluid/design/mkldnn/nhwc/nhwc.md
@@ -0,0 +1,95 @@
# Design Doc: MKL-DNN NHWC support

This document describes the design and implementation of ``NHWC`` model execution with the MKL-DNN engine. For an overall description of Tensor interoperability between Paddle and MKL-DNN Tensors, please refer to the relevant [document](../data_transformation/data_transform.md).

### Introduction

PaddlePaddle supports executing a program/model using either the ``NCHW`` or the ``NHWC`` data arrangement. The reasons for offering both data arrangements are:
* the execution of some non-MKL-DNN operators may be faster on ``NHWC`` data than on ``NCHW`` data
* convenience of use, as the user may already have data prepared in the ``NHWC`` arrangement

The choice between ``NCHW`` and ``NHWC`` is controlled by the ``data_format`` attribute of the following operators:
* conv
* conv transposed
* pool
* LRN
* batch norm

Other operators (those without ``data_format``) are implemented so that they execute properly regardless of the layout, for example elementwise operations.

Since operators control the layout (data arrangement) of their input and output data, it is in theory possible to specify models that work partially on ``NCHW`` and partially on ``NHWC`` data. However, it was agreed that a given model will use only one type of data arrangement during its execution. Hence either all ``data_format`` attributes are set to ``NCHW`` (the default) or all are set to ``NHWC``; there is no support for having some operators with ``data_format`` set to ``NCHW`` while others use ``NHWC``.

Another element to consider is that in the PaddlePaddle ``NHWC`` data layout, as supported by non-MKL-DNN CPU implementations, the ``NHWC`` arrangement applies only to the input signal; the parameters of the listed operators always use the ``NCHW`` PaddlePaddle layout.

The final element is that the PaddlePaddle data layout changes how the shape of the data is described. For example, ``NCHW`` data of shape [2, 3, 4, 5], when transformed to ``NHWC``, will have a shape of [2, 4, 5, 3]. This is different from the MKL-DNN shape description, which is always in ``NCHW`` order even if the data underneath is ``NHWC``, ``NCHW16C`` or other.
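
A short sketch of that shape permutation (illustrative code, not the actual Paddle implementation):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // NCHW shape [2, 3, 4, 5] -> NHWC shape [2, 4, 5, 3]
    std::vector<int64_t> NCHWToNHWC(std::vector<int64_t> shape) {
      // move C (index 1) to the back: [N, C, H, W] -> [N, H, W, C]
      std::rotate(shape.begin() + 1, shape.begin() + 2, shape.end());
      return shape;
    }

    // NHWC shape [2, 4, 5, 3] -> NCHW shape [2, 3, 4, 5]
    std::vector<int64_t> NHWCToNCHW(std::vector<int64_t> shape) {
      // move C (last index) back to position 1: [N, H, W, C] -> [N, C, H, W]
      std::rotate(shape.begin() + 1, shape.end() - 1, shape.end());
      return shape;
    }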

### Architecture of ``NHWC`` support in MKL-DNN integration

Initially, requests for ``NHWC`` and ``NCHW`` execution of a program were implemented literally, i.e. by having MKL-DNN work directly on the selected data arrangement. This proved to be very inefficient in terms of performance, as MKL-DNN is designed to work on data arrangements of its own choice (for example blocked formats such as ``NCHW16C``, ``NCHW8C`` etc.) rather than being forced to use the ``NHWC`` or ``NCHW`` data layout.

The current solution is that MKL-DNN kernels work on whatever data layout is best for their performance, and upon completion of the final MKL-DNN operator a conversion (reorder) to either the ``NCHW`` or the ``NHWC`` Paddle data arrangement is performed. An important note is that the last operator executing an MKL-DNN kernel may not have a ``data_format`` attribute, hence there is a need to store the information on which PaddlePaddle layout to convert to from the MKL-DNN layouts. For this purpose we keep a global variable per thread (Thread Local Storage), as sketched below.
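
A sketch of the idea (names are illustrative; the actual variable lives in Paddle's platform code):

    // Remember, per thread, which Paddle layout the data should be
    // converted back to when leaving the MKL-DNN execution path.
    enum class DataLayout { kNCHW, kNHWC };

    // Thread Local Storage slot, defaulting to the framework default NCHW.
    thread_local DataLayout cur_paddle_data_layout = DataLayout::kNCHW;

    void SetCurPaddleDataLayout(DataLayout layout) {
      cur_paddle_data_layout = layout;
    }

    DataLayout GetCurPaddleDataLayout() { return cur_paddle_data_layout; }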

To address the difference in shape description, a shape transformation mechanism was added: *platform::MatchShapeToLayout()*, which performs the needed shape modification upon entering and exiting the MKL-DNN execution path of operators.

The described architecture, applied to a simple execution of an ``NHWC`` model that consists of a convolution followed by pooling, is presented in the following picture:

![](images/nhwc.svg)

#### Notes on ``NHWC`` grad ops support

The corresponding grad MKL-DNN kernels of the operators listed at the beginning of this document also support the execution of ``NHWC`` models.

All design concepts described in the previous section apply to MKL-DNN grad operators as well. However, there is one additional element. Some grad operators, like *mean*, infer the shape of their output based on the shape of the data produced during the forward pass. In that situation the kernel actually has no need to operate on the data itself, as only the shape is needed to infer the grad output's shape. In this scenario there was originally no data transformation of the given variable, and in particular no change to the shape of the Tensor. This could result in a wrong shape being sent to *InferShape* of the grad op. The behaviour was modified to create a dummy Variable that carries the shape of the data in the Paddle format expected by the grad operator.

The described situation is presented in the following picture:
![](images/nhwc-grad.svg)

### Implementation guidelines

Instead of modifying each MKL-DNN operator to match the described architecture, common code was modified. The changes consist of:
* modifications to the data transformation code
* modifications to *InferShape* of each operator supporting the ``data_format`` attribute
* an overload of the *GetKernelTypeForVar* method added to each of those operators

Hence, when enabling ``NHWC`` data arrangement support for any operator, we need to extend its *InferShape* and *GetKernelTypeForVar*.

#### *InferShape()* modifications

This modification is related to the fact that an MKL-DNN kernel operates on data whose shape is described in ``NCHW`` order, hence we need to make sure that even if ``data_format`` has the value ``NHWC``, *InferShape* still works on the ``NCHW`` order.

A snippet from *PoolOp::InferShape()* that illustrates the idea of the modifications to *InferShape*:

    // MKL-DNN Kernels are using NCHW order of dims description
    // so we ignore data_format consideration for MKL-DNN kernel
    const bool channel_last = (this->IsMKLDNNType() == false) &&
                              (data_format == "NHWC" || data_format == "NDHWC");

#### *GetKernelTypeForVar()* overloading

When performing data transformation we need the value of ``data_format``; this value is acquired inside *GetKernelTypeForVar()*, and based on it the *data_layout* of the kernel type is set, to be used later by the data transformation code.
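
A sketch of such an overload (simplified relative to the actual Paddle sources; treat the exact signatures as illustrative):

    // When the op will run as an MKL-DNN kernel but the incoming variable is
    // still in a Paddle layout, record the op's data_format attribute as the
    // variable's layout, so the data transformation code knows the source
    // data arrangement.
    framework::OpKernelType GetKernelTypeForVar(
        const std::string& var_name, const framework::Tensor& tensor,
        const framework::OpKernelType& expected_kernel_type) const override {
    #ifdef PADDLE_WITH_MKLDNN
      if ((expected_kernel_type.data_layout_ ==
           framework::DataLayout::kMKLDNN) &&
          (tensor.layout() != framework::DataLayout::kMKLDNN)) {
        auto dl =
            framework::StringToDataLayout(Attr<std::string>("data_format"));
        // Pass NHWC/NCHW on, so the transformation can rearrange dims order.
        if (dl != framework::DataLayout::kAnyLayout) {
          return framework::OpKernelType(expected_kernel_type.data_type_,
                                         tensor.place(), dl);
        }
      }
    #endif
      return framework::OpKernelType(expected_kernel_type.data_type_,
                                     tensor.place(), tensor.layout());
    }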

diff --git a/doc/fluid/design/mkldnn/nhwc/scripts/nhwc-grad.dot b/doc/fluid/design/mkldnn/nhwc/scripts/nhwc-grad.dot
new file mode 100644
index 000000000..cceb8f6cc
--- /dev/null
+++ b/doc/fluid/design/mkldnn/nhwc/scripts/nhwc-grad.dot
@@ -0,0 +1,45 @@
digraph G {
splines=ortho
rankdir=LR
feed_op[shape=circle,label="Feed Op\n\nPaddle"]
fetch_op[shape=circle,label="Fetch Op\n\nPaddle"]
mean_op[shape=circle,label="Mean Op\n\nPaddle"]
mean_grad_op[shape=circle,label="Mean Grad Op\n\nPaddle"]
conv_mkldnn[shape=circle,label="Conv Op\ndata_format=NHWC\n\nMKL-DNN"];
conv_grad_mkldnn[shape=circle,label="Conv Grad Op\ndata_format=NHWC\n\nMKL-DNN"];
pool_mkldnn[shape=circle,label="Pool Op\ndata_format=NHWC\n\nMKL-DNN"];
pool_grad_mkldnn[shape=circle,label="Pool Grad Op\ndata_format=NHWC\n\nMKL-DNN"];
tensor_mkldnn[shape=rectangle,label="Tensor conv op\nlayout=kMKLDNN\nNCHW dim format"]
tensor_pool_grad_mkldnn[shape=rectangle,label="Tensor Pool Grad op\nlayout=kMKLDNN\nNCHW dim format"]
tensor_conv_grad_mkldnn[shape=rectangle,label="Tensor Conv Grad op\nlayout=kMKLDNN\nNCHW dim format"]
tensor_mkldnn2[shape=rectangle,label="Tensor pool op\nlayout=kMKLDNN\nNCHW dim format"]
//tensor_input_data[shape=rectangle,label="Input Data\nlayout=kNCHW\nNCHW dim format"]
tensor_mean_grad_mkldnn[shape=rectangle,label="Tensor Grad mean\nlayout=kMKLDNN\nNCHW dim format"]
//tensor_fetch[shape=rectangle,label="Tensor Fetch_op\nlayout=kNCHW\nNHWC dim format"]
tensor_input_mean[shape=rectangle,label="Tensor pool op\nlayout=kNHWC\nNHWC dim format"]
tensor_output_mean[shape=rectangle,label="Tensor mean op\nlayout=kNHWC\nNHWC dim format"]
tensor_mean_grad[shape=rectangle,label="Tensor mean grad op\nlayout=kNHWC\nNHWC dim format"]
tensor_feed[shape=rectangle,label="Tensor feed_op\nlayout=kNCHW\nNHWC dim format"]
tensor_feed2[shape=rectangle,label="Tensor feed_op\nlayout=kMKLDNN\nNCHW dim format"]

feed_op -> tensor_feed -> tensor_feed2 -> conv_mkldnn -> tensor_mkldnn -> pool_mkldnn -> tensor_mkldnn2 -> tensor_input_mean -> mean_op

mean_op -> tensor_output_mean

tensor_input_mean -> tensor_mean_grad[xlabel="InferShape", style=dashed]

tensor_output_mean -> mean_grad_op

mean_grad_op -> tensor_mean_grad -> tensor_mean_grad_mkldnn -> pool_grad_mkldnn -> tensor_pool_grad_mkldnn -> conv_grad_mkldnn -> tensor_conv_grad_mkldnn -> fetch_op

{rank="same" mean_op; mean_grad_op; tensor_output_mean}
{rank="same" pool_mkldnn; pool_grad_mkldnn}
{rank="same" conv_mkldnn; conv_grad_mkldnn}
{rank="same" tensor_mean_grad; tensor_input_mean}
{rank="same" tensor_mkldnn2; tensor_mean_grad_mkldnn}
{rank="same" tensor_pool_grad_mkldnn; tensor_mkldnn}
{rank="same" tensor_conv_grad_mkldnn; tensor_feed}
{rank="same" fetch_op; feed_op}

//tensor_mkldnn2 -> tensor_mean_grad[label="Infer shape"]
}

diff --git a/doc/fluid/design/mkldnn/nhwc/scripts/nhwc.dot b/doc/fluid/design/mkldnn/nhwc/scripts/nhwc.dot
new file mode 100644
index 000000000..dee0df615
--- /dev/null
+++ b/doc/fluid/design/mkldnn/nhwc/scripts/nhwc.dot
@@ -0,0 +1,17 @@
digraph G {
rankdir=LR
feed_op[shape=circle,label="Feed Op (Input signal)\n\nPaddle"]
feed_op2[shape=circle,label="Feed Op (Filter data)\n\nPaddle"]
fetch_op[shape=circle,label="Fetch Op\n\nPaddle"]
conv_mkldnn[shape=circle,label="conv Op\ndata_format=NHWC\n\nMKL-DNN"];
pool_mkldnn[shape=circle,label="pool Op\ndata_format=NHWC\n\nMKL-DNN"];
tensor_mkldnn[shape=rectangle,label="Tensor conv op\nlayout=kMKLDNN\nNCHW dim format"]
tensor_mkldnn2[shape=rectangle,label="Tensor conv op\nlayout=kMKLDNN\nNCHW dim format"]
tensor_fetch[shape=rectangle,label=<Tensor fetch_op<br/>layout=kNCHW<br/><b>NHWC</b> dim format>]
input_feed[shape=rectangle,label=<Tensor Input signal<br/>layout=kNCHW<br/><b>NHWC</b> dim format>]
input_feed2[shape=rectangle,label="Tensor Input signal\nlayout=kMKLDNN\nNCHW dim format"]
filter_feed[shape=rectangle,label="Tensor filter data\nlayout=kNCHW\nNCHW dim format"]
filter_feed2[shape=rectangle,label="Tensor filter data\nlayout=kMKLDNN\nNCHW dim format"]
feed_op -> input_feed -> input_feed2 -> conv_mkldnn -> tensor_mkldnn -> pool_mkldnn -> tensor_mkldnn2 -> fetch_op -> tensor_fetch
feed_op2 -> filter_feed -> filter_feed2 -> conv_mkldnn
}
--
GitLab