From d4710163eb04b2c31850a8c0bd770ca44d53c1d1 Mon Sep 17 00:00:00 2001
From: YUNSHEN XIE <1084314248@qq.com>
Date: Tue, 8 Sep 2020 16:30:08 +0800
Subject: [PATCH 001/261] add timeout unittests retry (#27152)

* add timeout unittests retry

* modifed parameter use
---
 paddle/scripts/paddle_build.bat | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 11932ce7288..5fe311eb693 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -242,7 +242,7 @@ dir %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin
 dir %THIRD_PARTY_PATH:/=\%\install\warpctc\bin
 
 set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;%THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH%
-ctest.exe --output-on-failure -C Release -j 8 --repeat until-pass:4
+ctest.exe --output-on-failure -C Release -j 8 --repeat until-pass:4 after-timeout:4
 goto:eof
 
 :unit_test_error
-- 
GitLab


From 753a0748eedb7cfe82756f7bd64abf4ee48b5805 Mon Sep 17 00:00:00 2001
From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Date: Tue, 8 Sep 2020 16:45:14 +0800
Subject: [PATCH 002/261] Temporarily turn off WITH_INFERENCE_API_TEST (#27170)

---
 paddle/scripts/paddle_build.bat | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 5fe311eb693..1616e237092 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -111,6 +111,8 @@ goto:success
 :CASE_wincheck_openblas
 set WITH_MKL=OFF
 set WITH_GPU=ON
+rem Temporarily turn off WITH_INFERENCE_API_TEST on GPU due to compile hang
+set WITH_INFERENCE_API_TEST=OFF
 call :cmake || goto cmake_error
 call :build || goto build_error
 call :test_whl_pacakage || goto test_whl_pacakage_error
-- 
GitLab


From 944f8ae09c177b44fb52158fa0baf6fc769dbbda Mon Sep 17 00:00:00 2001
From: chalsliu <45041955+chalsliu@users.noreply.github.com>
Date: Tue, 8 Sep 2020 17:21:42 +0800
Subject: [PATCH 003/261] Upgrade coverage tool to python3

---
 tools/coverage/coverage_diff.py      |  4 ++--
 tools/coverage/coverage_diff_list.py |  2 +-
 tools/coverage/coverage_lines.py     | 12 ++++++------
 tools/coverage/paddle_coverage.sh    | 13 +++++++------
 tools/coverage/pull_request.py       |  6 +++---
 tools/coverage/python_coverage.py    | 21 +++++++++------------
 6 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/tools/coverage/coverage_diff.py b/tools/coverage/coverage_diff.py
index 051348d358f..38f671fe408 100644
--- a/tools/coverage/coverage_diff.py
+++ b/tools/coverage/coverage_diff.py
@@ -90,12 +90,12 @@ def get_info_file_lines(info_file, diff_file):
                 continue
 
             elif line.startswith('LF:'):
-                print 'LF:{}'.format(current_lf)
+                print('LF:{}'.format(current_lf))
 
                 continue
 
             elif line.startswith('LH:'):
-                print 'LH:{}'.format(current_lh)
+                print('LH:{}'.format(current_lh))
 
                 continue
 
diff --git a/tools/coverage/coverage_diff_list.py b/tools/coverage/coverage_diff_list.py
index 57222da4d98..8975185edad 100644
--- a/tools/coverage/coverage_diff_list.py
+++ b/tools/coverage/coverage_diff_list.py
@@ -40,7 +40,7 @@ def filter_by(list_file, max_rate):
             except:
                 pass
 
-            print name, rate
+            print(name, rate)
 
 
 if __name__ == '__main__':
diff --git a/tools/coverage/coverage_lines.py b/tools/coverage/coverage_lines.py
index eb846cc9f24..cdec5b8b1bb 100644
--- a/tools/coverage/coverage_lines.py
+++ b/tools/coverage/coverage_lines.py
@@ -33,7 +33,7 @@ def get_lines(info_file):
                 hits += 1
 
     if total == 0:
-        print 'no data found'
+        print('no data found')
         exit()
 
     return hits / total
@@ -47,17 +47,17 @@ if __name__ == '__main__':
     expected = float(sys.argv[2])
 
     if not os.path.isfile(info_file):
-        print 'info file {} is not exists, ignored'.format(info_file)
+        print('info file {} is not exists, ignored'.format(info_file))
         exit()
 
     actual = get_lines(info_file)
     actual = round(actual, 3)
 
     if actual < expected:
-        print 'expected >= {} %, actual {} %, failed'.format(
-            round(expected * 100, 1), round(actual * 100, 1))
+        print('expected >= {} %, actual {} %, failed'.format(
+            round(expected * 100, 1), round(actual * 100, 1)))
 
         exit(1)
 
-    print 'expected >= {} %, actual {} %, passed'.format(
-        round(expected * 100, 1), round(actual * 100, 1))
+    print('expected >= {} %, actual {} %, passed'.format(
+        round(expected * 100, 1), round(actual * 100, 1)))
diff --git a/tools/coverage/paddle_coverage.sh b/tools/coverage/paddle_coverage.sh
index d54434b738d..2ea2b1fe351 100644
--- a/tools/coverage/paddle_coverage.sh
+++ b/tools/coverage/paddle_coverage.sh
@@ -5,7 +5,7 @@ set -xe
 PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )"
 
 # install lcov
-curl -o /lcov-1.14.tar.gz -x "" -s https://paddle-ci.gz.bcebos.com/coverage/lcov-1.14.tar.gz
+curl -o /lcov-1.14.tar.gz -s https://paddle-ci.gz.bcebos.com/coverage%2Flcov-1.14.tar.gz
 tar -xf /lcov-1.14.tar.gz -C /
 cd /lcov-1.14
 make install
@@ -14,7 +14,7 @@ make install
 
 cd /paddle/build
 
-python ${PADDLE_ROOT}/tools/coverage/gcda_clean.py ${GIT_PR_ID}
+python3 ${PADDLE_ROOT}/tools/coverage/gcda_clean.py ${GIT_PR_ID}
 
 lcov --capture -d ./ -o coverage.info --rc lcov_branch_coverage=0
 
@@ -53,9 +53,9 @@ gen_full_html_report || true
 function gen_diff_html_report() {
     if [ "${GIT_PR_ID}" != "" ]; then
 
-        COVERAGE_DIFF_PATTERN="`python ${PADDLE_ROOT}/tools/coverage/pull_request.py files ${GIT_PR_ID}`"
+        COVERAGE_DIFF_PATTERN="`python3 ${PADDLE_ROOT}/tools/coverage/pull_request.py files ${GIT_PR_ID}`"
 
-        python ${PADDLE_ROOT}/tools/coverage/pull_request.py diff ${GIT_PR_ID} > git-diff.out
+        python3 ${PADDLE_ROOT}/tools/coverage/pull_request.py diff ${GIT_PR_ID} > git-diff.out
     fi
 
     lcov --extract coverage-full.info \
@@ -63,7 +63,7 @@ function gen_diff_html_report() {
         -o coverage-diff.info \
         --rc lcov_branch_coverage=0
 
-    python ${PADDLE_ROOT}/tools/coverage/coverage_diff.py coverage-diff.info git-diff.out > coverage-diff.tmp
+    python3 ${PADDLE_ROOT}/tools/coverage/coverage_diff.py coverage-diff.info git-diff.out > coverage-diff.tmp
 
     mv -f coverage-diff.tmp coverage-diff.info
 
@@ -82,7 +82,7 @@ set -x
 
 coverage xml -i -o python-coverage.xml
 
-python ${PADDLE_ROOT}/tools/coverage/python_coverage.py > python-coverage.info
+python3 ${PADDLE_ROOT}/tools/coverage/python_coverage.py > python-coverage.info
 
 # python full html report
 #
@@ -143,5 +143,6 @@ echo "Assert Python Diff Coverage"
 python ${PADDLE_ROOT}/tools/coverage/coverage_lines.py python-coverage-diff.info 0.9 || PYTHON_COVERAGE_LINES_ASSERT=1
 
 if [ "$COVERAGE_LINES_ASSERT" = "1" ] || [ "$PYTHON_COVERAGE_LINES_ASSERT" = "1" ]; then
+    echo "exit 9" > /tmp/paddle_coverage.result
     exit 9
 fi
diff --git a/tools/coverage/pull_request.py b/tools/coverage/pull_request.py
index 979f476d2a1..105460032f7 100644
--- a/tools/coverage/pull_request.py
+++ b/tools/coverage/pull_request.py
@@ -40,7 +40,7 @@ def get_files(args):
     pull = get_pull(args.pull_id)
 
     for file in pull.get_files():
-        print '/paddle/{}'.format(file.filename)
+        print('/paddle/{}'.format(file.filename))
 
 
 def diff(args):
@@ -55,8 +55,8 @@ def diff(args):
     pull = get_pull(args.pull_id)
 
     for file in pull.get_files():
-        print '+++ {}'.format(file.filename)
-        print file.patch
+        print('+++ {}'.format(file.filename))
+        print(file.patch)
 
 
 if __name__ == '__main__':
diff --git a/tools/coverage/python_coverage.py b/tools/coverage/python_coverage.py
index ba67e12249b..8ad9d85c1bf 100644
--- a/tools/coverage/python_coverage.py
+++ b/tools/coverage/python_coverage.py
@@ -12,10 +12,7 @@ root = tree.getroot()
 
 sources = root.findall('sources/source')
 
-if len(sources) > 1:
-    exit(1)
-
-source = sources[0].text
+source = sources[-1].text
 
 for clazz in root.findall('packages/package/classes/class'):
     clazz_filename = clazz.attrib.get('filename')
@@ -28,8 +25,8 @@ for clazz in root.findall('packages/package/classes/class'):
     if not path.exists(clazz_filename):
         continue
 
-    print 'TN:'
-    print 'SF:{}'.format(clazz_filename)
+    print('TN:')
+    print('SF:{}'.format(clazz_filename))
 
     branch_index = 0
 
@@ -50,16 +47,16 @@ for clazz in root.findall('packages/package/classes/class'):
             taken = int(taken)
 
             for _ in range(taken):
-                print 'BRDA:{},{},{},{}'.format(line_number, 0, branch_index,
-                                                line_hits)
+                print('BRDA:{},{},{},{}'.format(line_number, 0, branch_index,
+                                                line_hits))
                 branch_index += 1
 
             if line_missing_branches:
                 for missing_branch in line_missing_branches.split(','):
-                    print 'BRDA:{},{},{},{}'.format(line_number, 0,
-                                                    branch_index, 0)
+                    print('BRDA:{},{},{},{}'.format(line_number, 0,
+                                                    branch_index, 0))
                     branch_index += 1
 
-        print 'DA:{},{}'.format(line_number, line_hits)
+        print('DA:{},{}'.format(line_number, line_hits))
 
-    print 'end_of_record'
+    print('end_of_record')
-- 
GitLab


From 0dab0fc23c0e7d0baa9ae713cc847f4d6419b90c Mon Sep 17 00:00:00 2001
From: yaoxuefeng <yaoxuefeng@baidu.com>
Date: Tue, 8 Sep 2020 18:35:37 +0800
Subject: [PATCH 004/261] add back triu in fluid (#27135)

---
 python/paddle/fluid/layers/tensor.py                 |  8 +++++++-
 .../fluid/tests/unittests/test_tril_triu_op.py       | 12 ++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index a90551c1b7b..89acfc6075b 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -36,7 +36,7 @@ __all__ = [
     'tensor_array_to_tensor', 'concat', 'sums', 'assign',
     'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax',
     'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite',
-    'range', 'linspace', 'zeros_like', 'ones_like', 'diag', 'eye'
+    'range', 'linspace', 'zeros_like', 'ones_like', 'diag', 'eye', 'triu'
 ]
 
 
@@ -1725,3 +1725,9 @@ def ones_like(x, out=None):
         attrs={'value': 1.0},
         outputs={'Out': [out]})
     return out
+
+
+@deprecated(since="2.0.0", update_to="paddle.triu")
+def triu(input, diagonal=0, name=None):
+    import paddle
+    return paddle.tensor.triu(x=input, diagonal=diagonal, name=name)
diff --git a/python/paddle/fluid/tests/unittests/test_tril_triu_op.py b/python/paddle/fluid/tests/unittests/test_tril_triu_op.py
index aed265b21b5..2cd2599f2ea 100644
--- a/python/paddle/fluid/tests/unittests/test_tril_triu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_tril_triu_op.py
@@ -142,6 +142,18 @@ class TestTrilTriuOpAPI(unittest.TestCase):
             self.assertTrue(np.allclose(tril_out, np.tril(data)))
             self.assertTrue(np.allclose(triu_out, np.triu(data)))
 
+    def test_fluid_api(self):
+        data = np.random.random([1, 9, 9, 4]).astype('float32')
+        x = fluid.data(shape=[1, 9, -1, 4], dtype='float32', name='x')
+        triu_out = fluid.layers.triu(x)
+
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        triu_out = exe.run(fluid.default_main_program(),
+                           feed={"x": data},
+                           fetch_list=[triu_out])
+
 
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab


From a28ae86e11f52f054f1c9dedc78ba9ddc3d83d7e Mon Sep 17 00:00:00 2001
From: wangguanzhong <jerrywgz@126.com>
Date: Tue, 8 Sep 2020 19:06:01 +0800
Subject: [PATCH 005/261] Enhance ops to support LoD as input for dygraph
 detection models. (#25316)

* enhance collect_op for dygraph, test=develop

* enhance detection ops with lod, test=develop

* support none bbox left in generate_proposals, test=develop

* unfiy MultiLevelRoisNum, test=develop

* update core.ops, test=develop

* add op register for new input & output, test=develop
---
 .../detection/collect_fpn_proposals_op.cc     |  26 +-
 .../detection/collect_fpn_proposals_op.cu     |  30 +-
 .../detection/collect_fpn_proposals_op.h      |  49 ++-
 .../detection/distribute_fpn_proposals_op.cc  |  32 +-
 .../detection/distribute_fpn_proposals_op.cu  |  25 +-
 .../detection/distribute_fpn_proposals_op.h   |  48 ++-
 .../detection/generate_proposals_op.cc        |  42 ++-
 .../detection/generate_proposals_op.cu        |  27 +-
 paddle/fluid/operators/roi_align_op.cc        |  26 +-
 paddle/fluid/operators/roi_align_op.cu        |  38 +--
 paddle/fluid/operators/roi_align_op.h         |  30 +-
 paddle/fluid/operators/roi_pool_op.cc         |  24 +-
 paddle/fluid/operators/roi_pool_op.cu         |  41 +--
 paddle/fluid/operators/roi_pool_op.h          |  30 +-
 paddle/fluid/pybind/op_function_generator.cc  |   9 +
 python/paddle/fluid/layers/detection.py       | 124 ++++++--
 python/paddle/fluid/layers/nn.py              |  63 ++--
 python/paddle/fluid/tests/test_detection.py   | 279 ++++++++++++++----
 .../test_collect_fpn_proposals_op.py          |  35 ++-
 .../test_distribute_fpn_proposals_op.py       |  32 +-
 .../unittests/test_generate_proposals_op.py   |  33 ++-
 .../fluid/tests/unittests/test_layers.py      |  61 ++--
 .../tests/unittests/test_roi_align_op.py      |   7 +-
 .../fluid/tests/unittests/test_roi_pool_op.py |   7 +-
 24 files changed, 854 insertions(+), 264 deletions(-)

diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc
index b3e3332fe34..44f602237da 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License.*/
 
 #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -54,11 +55,14 @@ class CollectFpnProposalsOp : public framework::OperatorWithKernel {
               score_dim[1]));
     }
     context->SetOutputDim("FpnRois", {post_nms_topN, 4});
+    if (context->HasOutput("RoisNum")) {
+      context->SetOutputDim("RoisNum", {-1});
+    }
     if (!context->IsRuntime()) {  // Runtime LoD infershape will be computed
       // in Kernel.
       context->ShareLoD("MultiLevelRois", "FpnRois");
     }
-    if (context->IsRuntime()) {
+    if (context->IsRuntime() && !context->HasInputs("MultiLevelRoIsNum")) {
       std::vector<framework::InferShapeVarPtr> roi_inputs =
           context->GetInputVarPtrs("MultiLevelRois");
       std::vector<framework::InferShapeVarPtr> score_inputs =
@@ -99,7 +103,16 @@ class CollectFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
              "(LoDTensor) Multiple score LoDTensors from each level in shape"
              " (N, 1), N is the number of RoIs.")
         .AsDuplicable();
+    AddInput(
+        "MultiLevelRoIsNum",
+        "(List of Tensor) The RoIs' number of each image on multiple levels."
+        "The number on each level has the shape of (N), N is the number of "
+        "images.")
+        .AsDuplicable()
+        .AsDispensable();
     AddOutput("FpnRois", "(LoDTensor) All selected RoIs with highest scores");
+    AddOutput("RoisNum", "(Tensor), Number of RoIs in each images.")
+        .AsDispensable();
     AddAttr<int>("post_nms_topN",
                  "Select post_nms_topN RoIs from"
                  " all images and all fpn layers");
@@ -123,3 +136,14 @@ REGISTER_OPERATOR(
 REGISTER_OP_CPU_KERNEL(collect_fpn_proposals,
                        ops::CollectFpnProposalsOpKernel<float>,
                        ops::CollectFpnProposalsOpKernel<double>);
+REGISTER_OP_VERSION(collect_fpn_proposals)
+    .AddCheckpoint(
+        R"ROC(
+              Upgrade collect_fpn_proposals add a new input 
+              [MultiLevelRoIsNum] and add a new output [RoisNum].)ROC",
+        paddle::framework::compatible::OpVersionDesc()
+            .NewInput("MultiLevelRoIsNum",
+                      "The RoIs' number of each image on multiple levels."
+                      "The number on each level has the shape of (N), "
+                      "N is the number of images.")
+            .NewOutput("RoisNum", "The number of RoIs in each image."));
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
index 35222a85cd3..86207052bb2 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
@@ -80,14 +80,27 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     int lod_size;
     auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
 
+    auto multi_rois_num = ctx.MultiInput<Tensor>("MultiLevelRoIsNum");
     for (size_t i = 0; i < roi_ins.size(); ++i) {
       auto roi_in = roi_ins[i];
       auto score_in = score_ins[i];
-      auto roi_lod = roi_in->lod().back();
-      lod_size = roi_lod.size() - 1;
-      for (size_t n = 0; n < lod_size; ++n) {
-        for (size_t j = roi_lod[n]; j < roi_lod[n + 1]; ++j) {
-          roi_batch_id_data[index++] = n;
+      if (multi_rois_num.size() > 0) {
+        framework::Tensor temp;
+        TensorCopySync(*multi_rois_num[i], platform::CPUPlace(), &temp);
+        const int* length_in = temp.data<int>();
+        lod_size = multi_rois_num[i]->numel();
+        for (size_t n = 0; n < lod_size; ++n) {
+          for (size_t j = 0; j < length_in[n]; ++j) {
+            roi_batch_id_data[index++] = n;
+          }
+        }
+      } else {
+        auto length_in = roi_in->lod().back();
+        lod_size = length_in.size() - 1;
+        for (size_t n = 0; n < lod_size; ++n) {
+          for (size_t j = length_in[n]; j < length_in[n + 1]; ++j) {
+            roi_batch_id_data[index++] = n;
+          }
         }
       }
 
@@ -190,6 +203,13 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
       offset.emplace_back(offset.back() + length_lod_cpu[i]);
     }
 
+    if (ctx.HasOutput("RoisNum")) {
+      auto* rois_num = ctx.Output<Tensor>("RoisNum");
+      int* rois_num_data = rois_num->mutable_data<int>({lod_size}, place);
+      memory::Copy(place, rois_num_data, place, length_lod_data,
+                   lod_size * sizeof(int), dev_ctx.stream());
+    }
+
     framework::LoD lod;
     lod.emplace_back(offset);
     fpn_rois->set_lod(lod);
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h
index badd88f0689..950b8b78933 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h
@@ -17,6 +17,7 @@ limitations under the License.*/
 #include <algorithm>
 #include <cmath>
 #include <cstring>
+#include <numeric>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
@@ -65,6 +66,8 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
 
     auto multi_layer_scores =
         context.MultiInput<paddle::framework::LoDTensor>("MultiLevelScores");
+    auto multi_rois_num = context.MultiInput<Tensor>("MultiLevelRoIsNum");
+    int num_size = multi_rois_num.size();
 
     auto* fpn_rois = context.Output<paddle::framework::LoDTensor>("FpnRois");
 
@@ -88,11 +91,21 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     const int num_fpn_level = multi_layer_rois.size();
     std::vector<int> integral_of_all_rois(num_fpn_level + 1, 0);
     for (int i = 0; i < num_fpn_level; ++i) {
-      auto cur_rois_lod = multi_layer_rois[i]->lod().back();
-      integral_of_all_rois[i + 1] =
-          integral_of_all_rois[i] + cur_rois_lod[cur_rois_lod.size() - 1];
+      int all_rois = 0;
+      if (num_size == 0) {
+        auto cur_rois_lod = multi_layer_rois[i]->lod().back();
+        all_rois = cur_rois_lod[cur_rois_lod.size() - 1];
+      } else {
+        const int* cur_rois_num = multi_rois_num[i]->data<int>();
+        all_rois = std::accumulate(
+            cur_rois_num, cur_rois_num + multi_rois_num[i]->numel(), 0);
+      }
+      integral_of_all_rois[i + 1] = integral_of_all_rois[i] + all_rois;
     }
 
+    const int batch_size = (num_size == 0)
+                               ? multi_layer_rois[0]->lod().back().size() - 1
+                               : multi_rois_num[0]->numel();
     // concatenate all fpn rois scores into a list
     // create a vector to store all scores
     std::vector<ScoreWithID<T>> scores_of_all_rois(
@@ -100,11 +113,20 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     for (int i = 0; i < num_fpn_level; ++i) {
       const T* cur_level_scores = multi_layer_scores[i]->data<T>();
       int cur_level_num = integral_of_all_rois[i + 1] - integral_of_all_rois[i];
-      auto cur_scores_lod = multi_layer_scores[i]->lod().back();
       int cur_batch_id = 0;
+      int pre_num = 0;
       for (int j = 0; j < cur_level_num; ++j) {
-        if (static_cast<size_t>(j) >= cur_scores_lod[cur_batch_id + 1]) {
-          cur_batch_id++;
+        if (num_size == 0) {
+          auto cur_scores_lod = multi_layer_scores[i]->lod().back();
+          if (static_cast<size_t>(j) >= cur_scores_lod[cur_batch_id + 1]) {
+            cur_batch_id++;
+          }
+        } else {
+          const int* rois_num_data = multi_rois_num[i]->data<int>();
+          if (j >= pre_num + rois_num_data[cur_batch_id]) {
+            pre_num += rois_num_data[cur_batch_id];
+            cur_batch_id++;
+          }
         }
         int cur_index = j + integral_of_all_rois[i];
         scores_of_all_rois[cur_index].score = cur_level_scores[j];
@@ -134,6 +156,9 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     T* fpn_rois_data = fpn_rois->data<T>();
     std::vector<size_t> lod0(1, 0);
     int cur_batch_id = 0;
+    std::vector<int64_t> num_per_batch;
+    int pre_idx = 0;
+    int cur_num = 0;
     for (int i = 0; i < post_nms_topN; ++i) {
       int cur_fpn_level = scores_of_all_rois[i].level;
       int cur_level_index = scores_of_all_rois[i].index;
@@ -144,6 +169,18 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
       if (scores_of_all_rois[i].batch_id != cur_batch_id) {
         cur_batch_id = scores_of_all_rois[i].batch_id;
         lod0.emplace_back(i);
+        cur_num = i - pre_idx;
+        pre_idx = i;
+        num_per_batch.emplace_back(cur_num);
+      }
+    }
+    num_per_batch.emplace_back(post_nms_topN - pre_idx);
+    if (context.HasOutput("RoisNum")) {
+      auto* rois_num = context.Output<Tensor>("RoisNum");
+      int* rois_num_data =
+          rois_num->mutable_data<int>({batch_size}, context.GetPlace());
+      for (int i = 0; i < batch_size; i++) {
+        rois_num_data[i] = num_per_batch[i];
       }
     }
     lod0.emplace_back(post_nms_topN);
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc
index 160d43a917b..614b37e703e 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -48,6 +49,14 @@ class DistributeFpnProposalsOp : public framework::OperatorWithKernel {
     }
     ctx->SetOutputsDim("MultiFpnRois", outs_dims);
     ctx->SetOutputDim("RestoreIndex", {-1, 1});
+
+    if (ctx->HasOutputs("MultiLevelRoIsNum")) {
+      std::vector<framework::DDim> outs_num_dims;
+      for (size_t i = 0; i < num_out_rois; ++i) {
+        outs_num_dims.push_back({-1});
+      }
+      ctx->SetOutputsDim("MultiLevelRoIsNum", outs_num_dims);
+    }
     if (!ctx->IsRuntime()) {
       for (size_t i = 0; i < num_out_rois; ++i) {
         ctx->SetLoDLevel("MultiFpnRois", ctx->GetLoDLevel("FpnRois"), i);
@@ -66,12 +75,22 @@ class DistributeFpnProposalsOp : public framework::OperatorWithKernel {
 class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("FpnRois", "(LoDTensor) The rois at all levels in shape (-1, 4)");
+    AddInput("FpnRois", "(LoDTensor) The RoIs at all levels in shape (-1, 4)");
+    AddInput("RoisNum",
+             "(Tensor) The number of RoIs in shape (B),"
+             "B is the number of images")
+        .AsDispensable();
     AddOutput("MultiFpnRois", "(LoDTensor) Output with distribute operator")
         .AsDuplicable();
     AddOutput("RestoreIndex",
               "(Tensor) An array of positive number which is "
               "used to restore the order of FpnRois");
+    AddOutput("MultiLevelRoIsNum",
+              "(List of Tensor) The RoIs' number of each image on multiple "
+              "levels. The number on each level has the shape of (B),"
+              "B is the number of images.")
+        .AsDuplicable()
+        .AsDispensable();
     AddAttr<int>("min_level",
                  "The lowest level of FPN layer where the"
                  " proposals come from");
@@ -105,3 +124,14 @@ REGISTER_OPERATOR(
 REGISTER_OP_CPU_KERNEL(distribute_fpn_proposals,
                        ops::DistributeFpnProposalsOpKernel<float>,
                        ops::DistributeFpnProposalsOpKernel<double>);
+REGISTER_OP_VERSION(distribute_fpn_proposals)
+    .AddCheckpoint(
+        R"ROC(
+              Upgrade distribute_fpn_proposals add a new input
+              [RoisNum] and add a new output [MultiLevelRoIsNum].)ROC",
+        paddle::framework::compatible::OpVersionDesc()
+            .NewInput("RoIsNum", "The number of RoIs in each image.")
+            .NewOutput("MultiLevelRoisNum",
+                       "The RoIs' number of each image on multiple "
+                       "levels. The number on each level has the shape of (B),"
+                       "B is the number of images."));
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
index 1e3cd9f36c5..27c06a0f8fb 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
@@ -76,12 +76,20 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     int num_level = max_level - min_level + 1;
 
     // check that the fpn_rois is not empty
-    PADDLE_ENFORCE_EQ(
-        fpn_rois->lod().size(), 1UL,
-        platform::errors::InvalidArgument("DistributeFpnProposalsOp needs LoD"
-                                          "with one level"));
+    if (!ctx.HasInput("RoisNum")) {
+      PADDLE_ENFORCE_EQ(
+          fpn_rois->lod().size(), 1UL,
+          platform::errors::InvalidArgument("DistributeFpnProposalsOp needs LoD"
+                                            "with one level"));
+    }
 
-    auto fpn_rois_lod = fpn_rois->lod().back();
+    std::vector<size_t> fpn_rois_lod;
+    if (ctx.HasInput("RoisNum")) {
+      auto* rois_num = ctx.Input<Tensor>("RoisNum");
+      fpn_rois_lod = GetLodFromRoisNum(rois_num);
+    } else {
+      fpn_rois_lod = fpn_rois->lod().back();
+    }
     int lod_size = fpn_rois_lod.size() - 1;
     int roi_num = fpn_rois_lod[lod_size];
 
@@ -154,6 +162,8 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
         restore_idx_data, roi_num);
 
     int start = 0;
+    auto multi_rois_num = ctx.MultiOutput<Tensor>("MultiLevelRoIsNum");
+
     for (int i = 0; i < num_level; ++i) {
       Tensor sub_lod = sub_lod_list.Slice(i, i + 1);
       int* sub_lod_data = sub_lod.data<int>();
@@ -180,6 +190,11 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
         multi_fpn_rois[i]->mutable_data<T>({sub_rois_num, kBoxDim},
                                            dev_ctx.GetPlace());
       }
+      if (multi_rois_num.size() > 0) {
+        Tensor* rois_num_t = multi_rois_num[i];
+        TensorCopySync(sub_lod, dev_ctx.GetPlace(), rois_num_t);
+        rois_num_t->Resize({lod_size});
+      }
       framework::LoD lod;
       lod.emplace_back(offset);
       multi_fpn_rois[i]->set_lod(lod);
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
index 0c84b385ccb..79498f01536 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
@@ -28,6 +28,21 @@ namespace operators {
 
 const int kBoxDim = 4;
 
+inline std::vector<size_t> GetLodFromRoisNum(const Tensor* rois_num) {
+  std::vector<size_t> rois_lod;
+  auto* rois_num_data = rois_num->data<int>();
+  Tensor cpu_tensor;
+  if (platform::is_gpu_place(rois_num->place())) {
+    TensorCopySync(*rois_num, platform::CPUPlace(), &cpu_tensor);
+    rois_num_data = cpu_tensor.data<int>();
+  }
+  rois_lod.push_back(static_cast<size_t>(0));
+  for (int i = 0; i < rois_num->numel(); ++i) {
+    rois_lod.push_back(rois_lod.back() + static_cast<size_t>(rois_num_data[i]));
+  }
+  return rois_lod;
+}
+
 template <typename T>
 static inline T BBoxArea(const T* box, bool normalized) {
   if (box[2] < box[0] || box[3] < box[1]) {
@@ -65,13 +80,22 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     const int num_level = max_level - min_level + 1;
 
     // check that the fpn_rois is not empty
-    PADDLE_ENFORCE_EQ(
-        fpn_rois->lod().size(), 1UL,
-        platform::errors::InvalidArgument("DistributeFpnProposalsOp needs LoD "
-                                          "with one level."));
+    if (!context.HasInput("RoisNum")) {
+      PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL,
+                        platform::errors::InvalidArgument(
+                            "DistributeFpnProposalsOp needs LoD "
+                            "with one level."));
+    }
 
-    auto fpn_rois_lod = fpn_rois->lod().back();
-    int fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1];
+    std::vector<size_t> fpn_rois_lod;
+    int fpn_rois_num;
+    if (context.HasInput("RoisNum")) {
+      auto* rois_num = context.Input<Tensor>("RoisNum");
+      fpn_rois_lod = GetLodFromRoisNum(rois_num);
+    } else {
+      fpn_rois_lod = fpn_rois->lod().back();
+    }
+    fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1];
     std::vector<int> target_level;
     // std::vector<int> target_level(fpn_rois_num, -1);
     // record the number of rois in each level
@@ -136,6 +160,18 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     for (int i = 0; i < fpn_rois_num; ++i) {
       restore_index_data[restore_index_inter[i]] = i;
     }
+    auto multi_rois_num = context.MultiOutput<Tensor>("MultiLevelRoIsNum");
+    if (multi_rois_num.size() > 0) {
+      int batch_size = fpn_rois_lod.size() - 1;
+      for (int i = 0; i < num_level; ++i) {
+        int* rois_num_data = multi_rois_num[i]->mutable_data<int>(
+            {batch_size}, context.GetPlace());
+        for (int j = 0; j < batch_size; ++j) {
+          rois_num_data[j] = static_cast<int>(multi_fpn_rois_lod0[i][j + 1] -
+                                              multi_fpn_rois_lod0[i][j]);
+        }
+      }
+    }
     // merge lod information into LoDTensor
     for (int i = 0; i < num_level; ++i) {
       framework::LoD lod;
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc
index 981a368e856..06e560f86d4 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
@@ -61,6 +62,10 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
 
     ctx->SetOutputDim("RpnRois", {-1, 4});
     ctx->SetOutputDim("RpnRoiProbs", {-1, 1});
+    if (!ctx->IsRuntime()) {
+      ctx->SetLoDLevel("RpnRois", std::max(ctx->GetLoDLevel("Scores"), 1));
+      ctx->SetLoDLevel("RpnRoiProbs", std::max(ctx->GetLoDLevel("Scores"), 1));
+    }
   }
 
  protected:
@@ -347,7 +352,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
     lod0.push_back(0);
     anchors.Resize({anchors.numel() / 4, 4});
     variances.Resize({variances.numel() / 4, 4});
-    std::vector<int64_t> tmp_lod;
+    std::vector<int> tmp_num;
 
     int64_t num_proposals = 0;
     for (int64_t i = 0; i < num; ++i) {
@@ -369,16 +374,16 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
       AppendProposals(rpn_roi_probs, num_proposals, scores);
       num_proposals += proposals.dims()[0];
       lod0.push_back(num_proposals);
-      tmp_lod.push_back(num_proposals);
+      tmp_num.push_back(proposals.dims()[0]);
     }
-    if (context.HasOutput("RpnRoisLod")) {
-      auto *rpn_rois_lod = context.Output<Tensor>("RpnRoisLod");
-      rpn_rois_lod->mutable_data<int64_t>({num}, context.GetPlace());
-      int64_t *lod_data = rpn_rois_lod->data<int64_t>();
+    if (context.HasOutput("RpnRoisNum")) {
+      auto *rpn_rois_num = context.Output<Tensor>("RpnRoisNum");
+      rpn_rois_num->mutable_data<int>({num}, context.GetPlace());
+      int *num_data = rpn_rois_num->data<int>();
       for (int i = 0; i < num; i++) {
-        lod_data[i] = tmp_lod[i];
+        num_data[i] = tmp_num[i];
       }
-      rpn_rois_lod->Resize({num});
+      rpn_rois_num->Resize({num});
     }
     rpn_rois->set_lod(lod);
     rpn_roi_probs->set_lod(lod);
@@ -433,6 +438,16 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
 
     Tensor keep;
     FilterBoxes<T>(ctx, &proposals, min_size, im_info_slice, &keep);
+    // Handle the case when there is no keep index left
+    if (keep.numel() == 0) {
+      math::SetConstant<platform::CPUDeviceContext, T> set_zero;
+      bbox_sel.mutable_data<T>({1, 4}, ctx.GetPlace());
+      set_zero(ctx, &bbox_sel, static_cast<T>(0));
+      Tensor scores_filter;
+      scores_filter.mutable_data<T>({1, 1}, ctx.GetPlace());
+      set_zero(ctx, &scores_filter, static_cast<T>(0));
+      return std::make_pair(bbox_sel, scores_filter);
+    }
 
     Tensor scores_filter;
     bbox_sel.mutable_data<T>({keep.numel(), 4}, ctx.GetPlace());
@@ -481,7 +496,8 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
               "(LoDTensor), Output proposals with shape (rois_num, 4).");
     AddOutput("RpnRoiProbs",
               "(LoDTensor) Scores of proposals with shape (rois_num, 1).");
-    AddOutput("RpnRoisLod", "(Tensor), rpn rois's lod info").AsDispensable();
+    AddOutput("RpnRoisNum", "(Tensor), The number of Rpn RoIs in each image")
+        .AsDispensable();
     AddAttr<int>("pre_nms_topN",
                  "Number of top scoring RPN proposals to keep before "
                  "applying NMS.");
@@ -515,3 +531,11 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OP_CPU_KERNEL(generate_proposals, ops::GenerateProposalsKernel<float>,
                        ops::GenerateProposalsKernel<double>);
+REGISTER_OP_VERSION(generate_proposals)
+    .AddCheckpoint(
+        R"ROC(
+              Upgrade generate_proposals add a new output [RpnRoisNum])ROC",
+        paddle::framework::compatible::OpVersionDesc().NewOutput(
+            "RpnRoisNum",
+            "The number of Rpn RoIs in each image. RpnRoisNum is "
+            "dispensable."));
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu
index fa7670f6d68..485136d8e2f 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
@@ -330,6 +330,15 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
   keep_index.Resize({keep_num});
 
   Tensor scores_filter, proposals_filter;
+  // Handle the case when there is no keep index left
+  if (keep_num == 0) {
+    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
+    proposals_filter.mutable_data<T>({1, 4}, ctx.GetPlace());
+    scores_filter.mutable_data<T>({1, 1}, ctx.GetPlace());
+    set_zero(ctx, &proposals_filter, static_cast<T>(0));
+    set_zero(ctx, &scores_filter, static_cast<T>(0));
+    return std::make_pair(proposals_filter, scores_filter);
+  }
   proposals_filter.mutable_data<T>({keep_num, 4}, ctx.GetPlace());
   scores_filter.mutable_data<T>({keep_num, 1}, ctx.GetPlace());
   GPUGather<T>(ctx, proposals, keep_index, &proposals_filter);
@@ -421,7 +430,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
 
     int64_t num_proposals = 0;
     std::vector<size_t> offset(1, 0);
-    std::vector<int64_t> tmp_lod;
+    std::vector<int> tmp_num;
 
     for (int64_t i = 0; i < num; ++i) {
       Tensor im_info_slice = im_info->Slice(i, i + 1);
@@ -448,15 +457,15 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
       dev_ctx.Wait();
       num_proposals += proposals.dims()[0];
       offset.emplace_back(num_proposals);
-      tmp_lod.push_back(num_proposals);
+      tmp_num.push_back(proposals.dims()[0]);
     }
-    if (context.HasOutput("RpnRoisLod")) {
-      auto *rpn_rois_lod = context.Output<Tensor>("RpnRoisLod");
-      rpn_rois_lod->mutable_data<int64_t>({num}, context.GetPlace());
-      int64_t *lod_data = rpn_rois_lod->data<int64_t>();
-      memory::Copy(place, lod_data, cpu_place, &tmp_lod[0],
-                   sizeof(int64_t) * num, dev_ctx.stream());
-      rpn_rois_lod->Resize({num});
+    if (context.HasOutput("RpnRoisNum")) {
+      auto *rpn_rois_num = context.Output<Tensor>("RpnRoisNum");
+      rpn_rois_num->mutable_data<int>({num}, context.GetPlace());
+      int *num_data = rpn_rois_num->data<int>();
+      memory::Copy(place, num_data, cpu_place, &tmp_num[0], sizeof(int) * num,
+                   dev_ctx.stream());
+      rpn_rois_num->Resize({num});
     }
     framework::LoD lod;
     lod.emplace_back(offset);
diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc
index 911dfea50e2..0eeb7e0bb24 100644
--- a/paddle/fluid/operators/roi_align_op.cc
+++ b/paddle/fluid/operators/roi_align_op.cc
@@ -11,6 +11,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/roi_align_op.h"
 #include <memory>
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -35,13 +36,13 @@ class ROIAlignOp : public framework::OperatorWithKernel {
     auto input_dims = ctx->GetInputDim("X");
     auto rois_dims = ctx->GetInputDim("ROIs");
 
-    if (ctx->HasInput("RoisLod")) {
-      auto rois_lod_dims = ctx->GetInputDim("RoisLod");
+    if (ctx->HasInput("RoisNum")) {
+      auto rois_num_dims = ctx->GetInputDim("RoisNum");
       PADDLE_ENFORCE_EQ(
-          rois_lod_dims.size(), 1,
-          platform::errors::InvalidArgument("The RoisLod dimension should be 1"
-                                            ", but got dimension = %d",
-                                            rois_lod_dims.size()));
+          rois_num_dims.size(), 1,
+          platform::errors::InvalidArgument("The size of RoisNum should be 1"
+                                            ", but received size = %d",
+                                            rois_num_dims.size()));
     }
     PADDLE_ENFORCE_EQ(
         input_dims.size(), 4,
@@ -145,9 +146,9 @@ class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker {
              "given as [[x1, y1, x2, y2], ...]. "
              "(x1, y1) is the top left coordinates, and "
              "(x2, y2) is the bottom right coordinates.");
-    AddInput("RoisLod",
+    AddInput("RoisNum",
              "(Tensor), "
-             "The lod info of rois.")
+             "The number of RoIs in each image.")
         .AsDispensable();
     AddOutput("Out",
               "(Tensor), "
@@ -203,7 +204,7 @@ class ROIAlignGradMaker : public framework::SingleGradOpMaker<T> {
     op->SetType("roi_align_grad");
     op->SetInput("X", this->Input("X"));
     op->SetInput("ROIs", this->Input("ROIs"));
-    op->SetInput("RoisLod", this->Input("RoisLod"));
+    op->SetInput("RoisNum", this->Input("RoisNum"));
     op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
     op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
     op->SetAttrMap(this->Attrs());
@@ -231,3 +232,10 @@ REGISTER_OP_CPU_KERNEL(
     ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, float>,
     ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, double>,
     ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, int>);
+REGISTER_OP_VERSION(roi_align)
+    .AddCheckpoint(
+        R"ROC(
+              Upgrade roi_align add a new input [RoisNum])ROC",
+        paddle::framework::compatible::OpVersionDesc().NewInput(
+            "RoisNum",
+            "The number of RoIs in each image. RoisNum is dispensable."));
diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu
index f7ec13e5bcc..3a4ce55f4fb 100644
--- a/paddle/fluid/operators/roi_align_op.cu
+++ b/paddle/fluid/operators/roi_align_op.cu
@@ -257,24 +257,26 @@ class GPUROIAlignOpKernel : public framework::OpKernel<T> {
     int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
     auto& dev_ctx = ctx.cuda_device_context();
     auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
-    if (ctx.HasInput("RoisLod")) {
-      auto* rois_lod = ctx.Input<Tensor>("RoisLod");
-      int rois_batch_size = rois_lod->numel();
+    if (ctx.HasInput("RoisNum")) {
+      auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
+      int rois_batch_size = rois_num_t->numel();
       PADDLE_ENFORCE_EQ(
-          rois_batch_size - 1, batch_size,
+          rois_batch_size, batch_size,
           platform::errors::InvalidArgument(
               "The rois_batch_size and imgs "
               "batch_size must be the same. But received rois_batch_size = %d, "
               "batch_size = %d",
               rois_batch_size, batch_size));
 
-      std::vector<int64_t> rois_lod_(rois_batch_size);
-      memory::Copy(cplace, rois_lod_.data(), gplace, rois_lod->data<int64_t>(),
-                   sizeof(int64_t) * rois_batch_size, 0);
-      for (int n = 0; n < rois_batch_size - 1; ++n) {
-        for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) {
+      std::vector<int> rois_num_list(rois_batch_size);
+      memory::Copy(cplace, rois_num_list.data(), gplace,
+                   rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
+      int start = 0;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (int i = start; i < start + rois_num_list[n]; ++i) {
           roi_batch_id_data[i] = n;
         }
+        start += rois_num_list[n];
       }
     } else {
       auto lod = rois->lod();
@@ -348,16 +350,18 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel<T> {
 
     auto& dev_ctx = ctx.cuda_device_context();
     auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
-    if (ctx.HasInput("RoisLod")) {
-      auto* rois_lod = ctx.Input<Tensor>("RoisLod");
-      int rois_batch_size = rois_lod->numel();
-      std::vector<int64_t> rois_lod_(rois_batch_size);
-      memory::Copy(cplace, rois_lod_.data(), gplace, rois_lod->data<int64_t>(),
-                   sizeof(int64_t) * rois_batch_size, 0);
-      for (int n = 0; n < rois_batch_size - 1; ++n) {
-        for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) {
+    if (ctx.HasInput("RoisNum")) {
+      auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
+      int rois_batch_size = rois_num_t->numel();
+      std::vector<int> rois_num_list(rois_batch_size);
+      memory::Copy(cplace, rois_num_list.data(), gplace,
+                   rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
+      int start = 0;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (size_t i = start; i < start + rois_num_list[n]; ++i) {
           roi_batch_id_data[i] = n;
         }
+        start += rois_num_list[n];
       }
     } else {
       auto rois_lod = rois->lod().back();
diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h
index 366f8654114..066125a92fb 100644
--- a/paddle/fluid/operators/roi_align_op.h
+++ b/paddle/fluid/operators/roi_align_op.h
@@ -165,21 +165,23 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
     int* roi_batch_id_data =
         roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
     int rois_batch_size;
-    if (ctx.HasInput("RoisLod")) {
-      auto* rois_lod_t = ctx.Input<framework::Tensor>("RoisLod");
-      rois_batch_size = rois_lod_t->numel();
+    if (ctx.HasInput("RoisNum")) {
+      auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
+      rois_batch_size = rois_num_t->numel();
       PADDLE_ENFORCE_EQ(
-          rois_batch_size - 1, batch_size,
+          rois_batch_size, batch_size,
           platform::errors::InvalidArgument(
               "The batch size of rois and the batch size of images "
               " must be the same. But received the batch size of rois is %d, "
               "and the batch size of images is %d",
               rois_batch_size, batch_size));
-      auto* rois_lod = rois_lod_t->data<int64_t>();
-      for (int n = 0; n < rois_batch_size - 1; ++n) {
-        for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+      auto* rois_num_data = rois_num_t->data<int>();
+      int start = 0;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (int i = start; i < start + rois_num_data[n]; ++i) {
           roi_batch_id_data[i] = n;
         }
+        start += rois_num_data[n];
       }
     } else {
       auto lod = rois->lod();
@@ -303,14 +305,16 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
         roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
 
     int rois_batch_size;
-    if (ctx.HasInput("RoisLod")) {
-      auto* rois_lod_t = ctx.Input<framework::Tensor>("RoisLod");
-      rois_batch_size = rois_lod_t->numel();
-      auto* rois_lod = rois_lod_t->data<int64_t>();
-      for (int n = 0; n < rois_batch_size - 1; ++n) {
-        for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+    if (ctx.HasInput("RoisNum")) {
+      auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
+      rois_batch_size = rois_num_t->numel();
+      auto* rois_num_data = rois_num_t->data<int>();
+      int start = 0;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (int i = start; i < start + rois_num_data[n]; ++i) {
           roi_batch_id_data[i] = n;
         }
+        start += rois_num_data[n];
       }
     } else {
       auto rois_lod = rois->lod().back();
diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc
index 8a34cb35f6b..be3187b7513 100644
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/roi_pool_op.h"
 #include <memory>
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -34,12 +35,13 @@ class ROIPoolOp : public framework::OperatorWithKernel {
     auto input_dims = ctx->GetInputDim("X");
     auto rois_dims = ctx->GetInputDim("ROIs");
 
-    if (ctx->HasInput("RoisLod")) {
-      auto rois_lod_dims = ctx->GetInputDim("RoisLod");
-      PADDLE_ENFORCE_EQ(rois_lod_dims.size(), 1,
+    if (ctx->HasInput("RoisNum")) {
+      auto rois_num_dims = ctx->GetInputDim("RoisNum");
+      PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1,
                         platform::errors::InvalidArgument(
-                            "The lod information tensor of ROIs should "
-                            "be one-dimensional"));
+                            "The second dimension of RoisNum should "
+                            "be 1, but received dimension is %d",
+                            rois_num_dims.size()));
     }
     PADDLE_ENFORCE_EQ(input_dims.size(), 4,
                       platform::errors::InvalidArgument(
@@ -140,7 +142,8 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
              "Where batch_id is the id of the data, "
              "(x1, y1) is the top left coordinates, and "
              "(x2, y2) is the bottom right coordinates.");
-    AddInput("RoisLod", "(Tensor), The lod info of rois.").AsDispensable();
+    AddInput("RoisNum", "(Tensor), The number of RoIs in each image.")
+        .AsDispensable();
     AddOutput("Out",
               "(Tensor), "
               "The output of ROIPoolOp is a 4-D tensor with shape "
@@ -197,7 +200,7 @@ class ROIPoolGradMaker : public framework::SingleGradOpMaker<T> {
     op->SetType("roi_pool_grad");
     op->SetInput("X", this->Input("X"));
     op->SetInput("ROIs", this->Input("ROIs"));
-    op->SetInput("RoisLod", this->Input("RoisLod"));
+    op->SetInput("RoisNum", this->Input("RoisNum"));
     op->SetInput("Argmax", this->Output("Argmax"));
     op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
     op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
@@ -223,3 +226,10 @@ REGISTER_OP_CPU_KERNEL(
     ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>,
     ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, double>,
     ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, int>);
+REGISTER_OP_VERSION(roi_pool)
+    .AddCheckpoint(
+        R"ROC(
+              Upgrade roi_pool add a new input [RoisNum])ROC",
+        paddle::framework::compatible::OpVersionDesc().NewInput(
+            "RoisNum",
+            "The number of RoIs in each image. RoisNum is dispensable."));
diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu
index 1e8a8e3037d..98d9ef6b6e1 100644
--- a/paddle/fluid/operators/roi_pool_op.cu
+++ b/paddle/fluid/operators/roi_pool_op.cu
@@ -157,19 +157,21 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
     int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
     auto& dev_ctx = ctx.cuda_device_context();
     auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
-    if (ctx.HasInput("RoisLod")) {
-      auto* rois_lod = ctx.Input<Tensor>("RoisLod");
-      int rois_batch_size = rois_lod->numel();
+    if (ctx.HasInput("RoisNum")) {
+      auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
+      int rois_batch_size = rois_num_t->numel();
       PADDLE_ENFORCE_EQ(
-          rois_batch_size - 1, batch_size,
+          rois_batch_size, batch_size,
           "The rois_batch_size and imgs batch_size must be the same.");
-      std::vector<int64_t> rois_lod_(rois_batch_size);
-      memory::Copy(cplace, rois_lod_.data(), gplace, rois_lod->data<int64_t>(),
-                   sizeof(int64_t) * rois_batch_size, 0);
-      for (int n = 0; n < rois_batch_size - 1; ++n) {
-        for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) {
+      std::vector<int> rois_num_list(rois_batch_size);
+      memory::Copy(cplace, rois_num_list.data(), gplace,
+                   rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
+      int start = 0;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (int i = start; i < start + rois_num_list[n]; ++i) {
           roi_batch_id_data[i] = n;
         }
+        start += rois_num_list[n];
       }
     } else {
       auto rois_lod = rois->lod().back();
@@ -206,7 +208,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<Tensor>("X");
     auto* rois = ctx.Input<LoDTensor>("ROIs");
-    auto* rois_lod = ctx.Input<Tensor>("RoisLod");
+    auto* rois_lod = ctx.Input<Tensor>("RoisNum");
     auto* argmax = ctx.Input<Tensor>("Argmax");
 
     auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
@@ -229,17 +231,18 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
 
       auto& dev_ctx = ctx.cuda_device_context();
       auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
-      if (ctx.HasInput("RoisLod")) {
-        auto* rois_lod = ctx.Input<Tensor>("RoisLod");
-        int rois_batch_size = rois_lod->numel();
-        std::vector<int64_t> rois_lod_(rois_batch_size);
-        memory::Copy(cplace, rois_lod_.data(), gplace,
-                     rois_lod->data<int64_t>(),
-                     sizeof(int64_t) * rois_batch_size, 0);
-        for (int n = 0; n < rois_batch_size - 1; ++n) {
-          for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) {
+      if (ctx.HasInput("RoisNum")) {
+        auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
+        int rois_batch_size = rois_num_t->numel();
+        std::vector<int> rois_num_list(rois_batch_size);
+        memory::Copy(cplace, rois_num_list.data(), gplace,
+                     rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
+        int start = 0;
+        for (int n = 0; n < rois_batch_size; ++n) {
+          for (int i = start; i < start + rois_num_list[n]; ++i) {
             roi_batch_id_data[i] = n;
           }
+          start += rois_num_list[n];
         }
       } else {
         auto rois_lod = rois->lod().back();
diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h
index 145b170dedf..40de6d0cf6a 100644
--- a/paddle/fluid/operators/roi_pool_op.h
+++ b/paddle/fluid/operators/roi_pool_op.h
@@ -58,18 +58,20 @@ class CPUROIPoolOpKernel : public framework::OpKernel<T> {
         roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
 
     int rois_batch_size;
-    if (ctx.HasInput("RoisLod")) {
-      auto* rois_lod_t = ctx.Input<framework::Tensor>("RoisLod");
-      rois_batch_size = rois_lod_t->numel();
+    if (ctx.HasInput("RoisNum")) {
+      auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
+      rois_batch_size = rois_num_t->numel();
       PADDLE_ENFORCE_EQ(
-          rois_batch_size - 1, batch_size,
+          rois_batch_size, batch_size,
           platform::errors::InvalidArgument("The rois_batch_size and imgs "
                                             "batch_size must be the same."));
-      auto* rois_lod = rois_lod_t->data<int64_t>();
-      for (int n = 0; n < rois_batch_size - 1; ++n) {
-        for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+      auto* rois_num_data = rois_num_t->data<int>();
+      int start = 0;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (int i = start; i < start + rois_num_data[n]; ++i) {
           roi_batch_id_data[i] = n;
         }
+        start += rois_num_data[n];
       }
     } else {
       auto rois_lod = rois->lod().back();
@@ -185,14 +187,16 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
           roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
 
       int rois_batch_size;
-      if (ctx.HasInput("RoisLod")) {
-        auto* rois_lod_t = ctx.Input<framework::Tensor>("RoisLod");
-        rois_batch_size = rois_lod_t->numel();
-        auto* rois_lod = rois_lod_t->data<int64_t>();
-        for (int n = 0; n < rois_batch_size - 1; ++n) {
-          for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+      if (ctx.HasInput("RoisNum")) {
+        auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
+        rois_batch_size = rois_num_t->numel();
+        auto* rois_num_data = rois_num_t->data<int>();
+        int start = 0;
+        for (int n = 0; n < rois_batch_size; ++n) {
+          for (int i = start; i < start + rois_num_data[n]; ++i) {
             roi_batch_id_data[i] = n;
           }
+          start += rois_num_data[n];
         }
       } else {
         auto rois_lod = rois->lod().back();
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index 256faf04ea6..178ecaff7e8 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -43,6 +43,11 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
     {"nll_loss", {"X", "Label", "Weight"}},
     {"bilinear_tensor_product", {"X", "Y", "Weight", "Bias"}},
     {"gather", {"X", "Index", "Axis"}},
+    {"roi_pool", {"X", "ROIs", "RoisNum"}},
+    {"roi_align", {"X", "ROIs", "RoisNum"}},
+    {"collect_fpn_proposals",
+     {"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}},
+    {"distribute_fpn_proposals", {"FpnRois", "RoisNum"}},
 };
 
 // NOTE(zhiqiu): Like op_ins_map.
@@ -63,6 +68,10 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
      {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance",
       "ReserveSpace"}},
     {"unique", {"Out", "Index", "Indices", "Counts"}},
+    {"generate_proposals", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}},
+    {"collect_fpn_proposals", {"FpnRois", "RoisNum"}},
+    {"distribute_fpn_proposals",
+     {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}},
 };
 
 // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index ea6abe2d335..bf87d1fc5a9 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -20,7 +20,8 @@ from __future__ import print_function
 from .layer_function_generator import generate_layer_fn
 from .layer_function_generator import autodoc, templatedoc
 from ..layer_helper import LayerHelper
-from ..framework import Variable
+from ..framework import Variable, in_dygraph_mode
+from .. import core
 from .loss import softmax_with_cross_entropy
 from . import tensor
 from . import nn
@@ -2893,8 +2894,8 @@ def generate_proposals(scores,
                        nms_thresh=0.5,
                        min_size=0.1,
                        eta=1.0,
-                       name=None,
-                       return_rois_num=False):
+                       return_rois_num=False,
+                       name=None):
     """
 	:alias_main: paddle.nn.functional.generate_proposals
 	:alias: paddle.nn.functional.generate_proposals,paddle.nn.functional.vision.generate_proposals
@@ -2949,6 +2950,10 @@ def generate_proposals(scores,
             num of each image in one batch. The N is the image's num. For example, the tensor has values [4,5] that represents
             the first image has 4 Rois, the second image has 5 Rois. It only used in rcnn model. 
             'False' by default. 
+        name(str, optional): For detailed information, please refer 
+            to :ref:`api_guide_Name`. Usually name is no need to set and 
+            None by default. 
+
     Returns:
         tuple:
         A tuple with format ``(rpn_rois, rpn_roi_probs)``.
@@ -2969,6 +2974,14 @@ def generate_proposals(scores,
                          im_info, anchors, variances)
 
     """
+    if in_dygraph_mode():
+        assert return_rois_num, "return_rois_num should be True in dygraph mode."
+        attrs = ('pre_nms_topN', pre_nms_top_n, 'post_nms_topN', post_nms_top_n,
+                 'nms_thresh', nms_thresh, 'min_size', min_size, 'eta', eta)
+        rpn_rois, rpn_roi_probs, rpn_rois_num = core.ops.generate_proposals(
+            scores, bbox_deltas, im_info, anchors, variances, *attrs)
+        return rpn_rois, rpn_roi_probs, rpn_rois_num
+
     helper = LayerHelper('generate_proposals', **locals())
 
     check_variable_and_dtype(scores, 'scores', ['float32'],
@@ -2986,7 +2999,14 @@ def generate_proposals(scores,
         dtype=bbox_deltas.dtype)
     rpn_roi_probs = helper.create_variable_for_type_inference(
         dtype=scores.dtype)
-    rpn_rois_lod = helper.create_variable_for_type_inference(dtype='int32')
+    outputs = {
+        'RpnRois': rpn_rois,
+        'RpnRoiProbs': rpn_roi_probs,
+    }
+    if return_rois_num:
+        rpn_rois_num = helper.create_variable_for_type_inference(dtype='int32')
+        rpn_rois_num.stop_gradient = True
+        outputs['RpnRoisNum'] = rpn_rois_num
 
     helper.append_op(
         type="generate_proposals",
@@ -3004,17 +3024,12 @@ def generate_proposals(scores,
             'min_size': min_size,
             'eta': eta
         },
-        outputs={
-            'RpnRois': rpn_rois,
-            'RpnRoiProbs': rpn_roi_probs,
-            'RpnRoisLod': rpn_rois_lod
-        })
+        outputs=outputs)
     rpn_rois.stop_gradient = True
     rpn_roi_probs.stop_gradient = True
-    rpn_rois_lod.stop_gradient = True
 
     if return_rois_num:
-        return rpn_rois, rpn_roi_probs, rpn_rois_lod
+        return rpn_rois, rpn_roi_probs, rpn_rois_num
     else:
         return rpn_rois, rpn_roi_probs
 
@@ -3656,6 +3671,7 @@ def distribute_fpn_proposals(fpn_rois,
                              max_level,
                              refer_level,
                              refer_scale,
+                             rois_num=None,
                              name=None):
     """
 	:alias_main: paddle.nn.functional.distribute_fpn_proposals
@@ -3687,6 +3703,11 @@ def distribute_fpn_proposals(fpn_rois,
             come from.
         refer_level(int32): The referring level of FPN layer with specified scale.
         refer_scale(int32): The referring scale of FPN layer with specified level.
+        rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. 
+            The shape is [B] and data type is int32. B is the number of images.
+            If it is not None then return a list of 1-D Tensor. Each element 
+            is the output RoIs' number of each image on the corresponding level
+            and the shape is [B]. None by default.
         name(str, optional): For detailed information, please refer 
             to :ref:`api_guide_Name`. Usually name is no need to set and 
             None by default. 
@@ -3702,6 +3723,10 @@ def distribute_fpn_proposals(fpn_rois,
         the number of total rois. The data type is int32. It is
         used to restore the order of fpn_rois.
 
+        rois_num_per_level(List): A list of 1-D Tensor and each Tensor is 
+        the RoIs' number in each image on the corresponding level. The shape 
+        is [B] and data type of int32. B is the number of images
+
 
     Examples:
         .. code-block:: python
@@ -3716,26 +3741,52 @@ def distribute_fpn_proposals(fpn_rois,
                 refer_level=4,
                 refer_scale=224)
     """
+    num_lvl = max_level - min_level + 1
+
+    if in_dygraph_mode():
+        assert rois_num is not None, "rois_num should not be None in dygraph mode."
+        attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level',
+                 refer_level, 'refer_scale', refer_scale)
+        multi_rois, restore_ind, rois_num_per_level = core.ops.distribute_fpn_proposals(
+            fpn_rois, rois_num, num_lvl, num_lvl, *attrs)
+        return multi_rois, restore_ind, rois_num_per_level
+
     check_variable_and_dtype(fpn_rois, 'fpn_rois', ['float32', 'float64'],
                              'distribute_fpn_proposals')
     helper = LayerHelper('distribute_fpn_proposals', **locals())
     dtype = helper.input_dtype('fpn_rois')
-    num_lvl = max_level - min_level + 1
     multi_rois = [
         helper.create_variable_for_type_inference(dtype) for i in range(num_lvl)
     ]
+
     restore_ind = helper.create_variable_for_type_inference(dtype='int32')
+
+    inputs = {'FpnRois': fpn_rois}
+    outputs = {
+        'MultiFpnRois': multi_rois,
+        'RestoreIndex': restore_ind,
+    }
+
+    if rois_num is not None:
+        inputs['RoisNum'] = rois_num
+        rois_num_per_level = [
+            helper.create_variable_for_type_inference(dtype='int32')
+            for i in range(num_lvl)
+        ]
+        outputs['MultiLevelRoIsNum'] = rois_num_per_level
+
     helper.append_op(
         type='distribute_fpn_proposals',
-        inputs={'FpnRois': fpn_rois},
-        outputs={'MultiFpnRois': multi_rois,
-                 'RestoreIndex': restore_ind},
+        inputs=inputs,
+        outputs=outputs,
         attrs={
             'min_level': min_level,
             'max_level': max_level,
             'refer_level': refer_level,
             'refer_scale': refer_scale
         })
+    if rois_num is not None:
+        return multi_rois, restore_ind, rois_num_per_level
     return multi_rois, restore_ind
 
 
@@ -3820,6 +3871,7 @@ def collect_fpn_proposals(multi_rois,
                           min_level,
                           max_level,
                           post_nms_top_n,
+                          rois_num_per_level=None,
                           name=None):
     """
 	:alias_main: paddle.nn.functional.collect_fpn_proposals
@@ -3846,6 +3898,12 @@ def collect_fpn_proposals(multi_rois,
         min_level(int): The lowest level of FPN layer to collect
         max_level(int): The highest level of FPN layer to collect
         post_nms_top_n(int): The number of selected RoIs
+        rois_num_per_level(list, optional): The List of RoIs' numbers. 
+            Each element is 1-D Tensor which contains the RoIs' number of each 
+            image on each level and the shape is [B] and data type is 
+            int32, B is the number of images. If it is not None then return 
+            a 1-D Tensor contains the output RoIs' number of each image and 
+            the shape is [B]. Default: None
         name(str, optional): For detailed information, please refer 
             to :ref:`api_guide_Name`. Usually name is no need to set and 
             None by default.        
@@ -3856,6 +3914,9 @@ def collect_fpn_proposals(multi_rois,
         fpn_rois(Variable): 2-D LoDTensor with shape [N, 4] and data type is 
         float32 or float64. Selected RoIs. 
 
+        rois_num(Tensor): 1-D Tensor contains the RoIs's number of each 
+        image. The shape is [B] and data type is int32. B is the number of 
+        images. 
 
     Examples:
         .. code-block:: python
@@ -3879,21 +3940,38 @@ def collect_fpn_proposals(multi_rois,
     """
     check_type(multi_rois, 'multi_rois', list, 'collect_fpn_proposals')
     check_type(multi_scores, 'multi_scores', list, 'collect_fpn_proposals')
+    num_lvl = max_level - min_level + 1
+    input_rois = multi_rois[:num_lvl]
+    input_scores = multi_scores[:num_lvl]
+
+    if in_dygraph_mode():
+        assert rois_num_per_level is not None, "rois_num_per_level should not be None in dygraph mode."
+        attrs = ('post_nms_topN', post_nms_top_n)
+        output_rois, rois_num = core.ops.collect_fpn_proposals(
+            input_rois, input_scores, rois_num_per_level, *attrs)
+
     helper = LayerHelper('collect_fpn_proposals', **locals())
     dtype = helper.input_dtype('multi_rois')
     check_dtype(dtype, 'multi_rois', ['float32', 'float64'],
                 'collect_fpn_proposals')
-    num_lvl = max_level - min_level + 1
-    input_rois = multi_rois[:num_lvl]
-    input_scores = multi_scores[:num_lvl]
     output_rois = helper.create_variable_for_type_inference(dtype)
     output_rois.stop_gradient = True
+
+    inputs = {
+        'MultiLevelRois': input_rois,
+        'MultiLevelScores': input_scores,
+    }
+    outputs = {'FpnRois': output_rois}
+    if rois_num_per_level is not None:
+        inputs['MultiLevelRoIsNum'] = rois_num_per_level
+        rois_num = helper.create_variable_for_type_inference(dtype='int32')
+        rois_num.stop_gradient = True
+        outputs['RoisNum'] = rois_num
     helper.append_op(
         type='collect_fpn_proposals',
-        inputs={
-            'MultiLevelRois': input_rois,
-            'MultiLevelScores': input_scores
-        },
-        outputs={'FpnRois': output_rois},
+        inputs=inputs,
+        outputs=outputs,
         attrs={'post_nms_topN': post_nms_top_n})
+    if rois_num_per_level is not None:
+        return output_rois, rois_num
     return output_rois
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 868deb66280..5a14b9fdc7b 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -6862,7 +6862,8 @@ def roi_pool(input,
              pooled_height=1,
              pooled_width=1,
              spatial_scale=1.0,
-             rois_lod=None):
+             rois_num=None,
+             name=None):
     """
     :alias_main: paddle.nn.functional.roi_pool
 	:alias: paddle.nn.functional.roi_pool,paddle.nn.functional.vision.roi_pool
@@ -6882,10 +6883,14 @@ def roi_pool(input,
     Args:
         input (Variable): Input feature, 4D-Tensor with the shape of [N,C,H,W], where N is the batch size, C is the input channel, H is Height, W is weight. The data type is float32 or float64.
         rois (Variable): ROIs (Regions of Interest) to pool over. 2D-LoDTensor with the shape of [num_rois,4], the lod level is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, and (x2, y2) is the bottom right coordinates.
-        rois_lod (Variable): The lod info of rois. Default: None
         pooled_height (int, optional): The pooled output height, data type is int32. Default: 1
         pooled_width (int, optional): The pooled output height, data type is int32. Default: 1
         spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0
+        rois_num (Tensor): The number of RoIs in each image. Default: None
+        name(str, optional): For detailed information, please refer
+            to :ref:`api_guide_Name`. Usually name is no need to set and
+            None by default.
+
 
     Returns:
         Variable: The pooled feature, 4D-Tensor with the shape of [num_rois, C, pooled_height, pooled_width].
@@ -6905,11 +6910,11 @@ def roi_pool(input,
 
         input_data = np.array([i for i in range(1,17)]).reshape(1,1,4,4).astype(DATATYPE)
         roi_data =fluid.create_lod_tensor(np.array([[1., 1., 2., 2.], [1.5, 1.5, 3., 3.]]).astype(DATATYPE),[[2]], place)
-        rois_lod_data = np.array([0, 2])
+        rois_num_data = np.array([2]).astype('int32')
 
         x = fluid.data(name='input', shape=[None,1,4,4], dtype=DATATYPE)
         rois = fluid.data(name='roi', shape=[None,4], dtype=DATATYPE)
-        rois_lod = fluid.data(name='rois_lod', shape=[None], dtype='int64')
+        rois_num = fluid.data(name='rois_num', shape=[None], dtype='int32')
 
         pool_out = fluid.layers.roi_pool(
                 input=x,
@@ -6917,24 +6922,36 @@ def roi_pool(input,
                 pooled_height=1,
                 pooled_width=1,
                 spatial_scale=1.0,
-                rois_lod=rois_lod)
+                rois_num=rois_num)
 
         exe = fluid.Executor(place)
-        out, = exe.run(feed={'input':input_data ,'roi':roi_data, 'rois_lod': rois_lod_data}, fetch_list=[pool_out.name])
+        out, = exe.run(feed={'input':input_data ,'roi':roi_data, 'rois_num': rois_num_data}, fetch_list=[pool_out.name])
         print(out)   #array([[[[11.]]], [[[16.]]]], dtype=float32)
         print(np.array(out).shape)  # (2, 1, 1, 1)
     """
+    if in_dygraph_mode():
+        assert rois_num is not None, "rois_num should not be None in dygraph mode."
+        pool_out, argmaxes = core.ops.roi_pool(
+            input, rois, rois_num, "pooled_height", pooled_height,
+            "pooled_width", pooled_width, "spatial_scale", spatial_scale)
+        return pool_out, argmaxes
+
     check_variable_and_dtype(input, 'input', ['float32'], 'roi_pool')
     check_variable_and_dtype(rois, 'rois', ['float32'], 'roi_pool')
     helper = LayerHelper('roi_pool', **locals())
     dtype = helper.input_dtype()
     pool_out = helper.create_variable_for_type_inference(dtype)
     argmaxes = helper.create_variable_for_type_inference(dtype='int32')
+
+    inputs = {
+        "X": input,
+        "ROIs": rois,
+    }
+    if rois_num is not None:
+        inputs['RoisNum'] = rois_num
     helper.append_op(
         type="roi_pool",
-        inputs={"X": input,
-                "ROIs": rois,
-                "RoisLod": rois_lod},
+        inputs=inputs,
         outputs={"Out": pool_out,
                  "Argmax": argmaxes},
         attrs={
@@ -6952,8 +6969,8 @@ def roi_align(input,
               pooled_width=1,
               spatial_scale=1.0,
               sampling_ratio=-1,
-              name=None,
-              rois_lod=None):
+              rois_num=None,
+              name=None):
     """
     :alias_main: paddle.nn.functional.roi_align
 	:alias: paddle.nn.functional.roi_align,paddle.nn.functional.vision.roi_align
@@ -6968,11 +6985,11 @@ def roi_align(input,
             data type is float32 or float64. Given as [[x1, y1, x2, y2], ...],
             (x1, y1) is the top left coordinates, and (x2, y2) is the bottom
             right coordinates.
-        rois_lod (Variable): The lod info of rois. Default: None
         pooled_height (int32, optional): ${pooled_height_comment} Default: 1
         pooled_width (int32, optional): ${pooled_width_comment} Default: 1
         spatial_scale (float32, optional): ${spatial_scale_comment} Default: 1.0
         sampling_ratio(int32, optional): ${sampling_ratio_comment} Default: -1
+        rois_num (Tensor): The number of RoIs in each image. Default: None
         name(str, optional): For detailed information, please refer
             to :ref:`api_guide_Name`. Usually name is no need to set and
             None by default.
@@ -6991,26 +7008,38 @@ def roi_align(input,
                 name='data', shape=[None, 256, 32, 32], dtype='float32')
             rois = fluid.data(
                 name='rois', shape=[None, 4], dtype='float32')
-            rois_lod = fluid.data(name='rois_lod', shape=[None], dtype='int64')
+            rois_num = fluid.data(name='rois_num', shape=[None], dtype='int32')
             align_out = fluid.layers.roi_align(input=x,
                                                rois=rois,
                                                pooled_height=7,
                                                pooled_width=7,
                                                spatial_scale=0.5,
                                                sampling_ratio=-1,
-                                               rois_lod=rois_lod)
+                                               rois_num=rois_num)
     """
+    if in_dygraph_mode():
+        assert rois_num is not None, "rois_num should not be None in dygraph mode."
+        align_out = core.ops.roi_align(
+            input, rois, rois_num, "pooled_height", pooled_height,
+            "pooled_width", pooled_width, "spatial_scale", spatial_scale,
+            "sampling_ratio", sampling_ratio)
+        return align_out
+
     check_variable_and_dtype(input, 'input', ['float32', 'float64'],
                              'roi_align')
     check_variable_and_dtype(rois, 'rois', ['float32', 'float64'], 'roi_align')
     helper = LayerHelper('roi_align', **locals())
     dtype = helper.input_dtype()
     align_out = helper.create_variable_for_type_inference(dtype)
+    inputs = {
+        "X": input,
+        "ROIs": rois,
+    }
+    if rois_num is not None:
+        inputs['RoisNum'] = rois_num
     helper.append_op(
         type="roi_align",
-        inputs={"X": input,
-                "ROIs": rois,
-                "RoisLod": rois_lod},
+        inputs=inputs,
         outputs={"Out": align_out},
         attrs={
             "pooled_height": pooled_height,
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index a1526934f4a..425c4e3c7e3 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -19,6 +19,57 @@ import paddle.fluid.layers as layers
 from paddle.fluid.layers import detection
 from paddle.fluid.framework import Program, program_guard
 import unittest
+import contextlib
+import numpy as np
+from unittests.test_imperative_base import new_program_scope
+from paddle.fluid.dygraph import base
+from paddle.fluid import core
+
+
+class LayerTest(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.seed = 111
+
+    @classmethod
+    def tearDownClass(cls):
+        pass
+
+    def _get_place(self, force_to_use_cpu=False):
+        # this option for ops that only have cpu kernel
+        if force_to_use_cpu:
+            return core.CPUPlace()
+        else:
+            if core.is_compiled_with_cuda():
+                return core.CUDAPlace(0)
+            return core.CPUPlace()
+
+    @contextlib.contextmanager
+    def static_graph(self):
+        with new_program_scope():
+            fluid.default_startup_program().random_seed = self.seed
+            fluid.default_main_program().random_seed = self.seed
+            yield
+
+    def get_static_graph_result(self,
+                                feed,
+                                fetch_list,
+                                with_lod=False,
+                                force_to_use_cpu=False):
+        exe = fluid.Executor(self._get_place(force_to_use_cpu))
+        exe.run(fluid.default_startup_program())
+        return exe.run(fluid.default_main_program(),
+                       feed=feed,
+                       fetch_list=fetch_list,
+                       return_numpy=(not with_lod))
+
+    @contextlib.contextmanager
+    def dynamic_graph(self, force_to_use_cpu=False):
+        with fluid.dygraph.guard(
+                self._get_place(force_to_use_cpu=force_to_use_cpu)):
+            fluid.default_startup_program().random_seed = self.seed
+            fluid.default_main_program().random_seed = self.seed
+            yield
 
 
 class TestDetection(unittest.TestCase):
@@ -481,45 +532,67 @@ class TestRpnTargetAssign(unittest.TestCase):
             print(str(program))
 
 
-class TestGenerateProposals(unittest.TestCase):
+class TestGenerateProposals(LayerTest):
     def test_generate_proposals(self):
-        program = Program()
-        with program_guard(program):
-            data_shape = [20, 64, 64]
-            images = fluid.layers.data(
-                name='images', shape=data_shape, dtype='float32')
-            im_info = fluid.layers.data(
-                name='im_info', shape=[3], dtype='float32')
-            anchors, variances = fluid.layers.anchor_generator(
-                name='anchor_generator',
-                input=images,
-                anchor_sizes=[32, 64],
-                aspect_ratios=[1.0],
-                variance=[0.1, 0.1, 0.2, 0.2],
-                stride=[16.0, 16.0],
-                offset=0.5)
-            num_anchors = anchors.shape[2]
-            scores = fluid.layers.data(
-                name='scores', shape=[num_anchors, 8, 8], dtype='float32')
-            bbox_deltas = fluid.layers.data(
-                name='bbox_deltas',
-                shape=[num_anchors * 4, 8, 8],
-                dtype='float32')
-            rpn_rois, rpn_roi_probs = fluid.layers.generate_proposals(
-                name='generate_proposals',
-                scores=scores,
-                bbox_deltas=bbox_deltas,
-                im_info=im_info,
-                anchors=anchors,
-                variances=variances,
-                pre_nms_top_n=6000,
-                post_nms_top_n=1000,
-                nms_thresh=0.5,
-                min_size=0.1,
-                eta=1.0)
-            self.assertIsNotNone(rpn_rois)
-            self.assertIsNotNone(rpn_roi_probs)
-            print(rpn_rois.shape)
+        scores_np = np.random.rand(2, 3, 4, 4).astype('float32')
+        bbox_deltas_np = np.random.rand(2, 12, 4, 4).astype('float32')
+        im_info_np = np.array([[8, 8, 0.5], [6, 6, 0.5]]).astype('float32')
+        anchors_np = np.reshape(np.arange(4 * 4 * 3 * 4),
+                                [4, 4, 3, 4]).astype('float32')
+        variances_np = np.ones((4, 4, 3, 4)).astype('float32')
+
+        with self.static_graph():
+            scores = fluid.data(
+                name='scores', shape=[2, 3, 4, 4], dtype='float32')
+            bbox_deltas = fluid.data(
+                name='bbox_deltas', shape=[2, 12, 4, 4], dtype='float32')
+            im_info = fluid.data(name='im_info', shape=[2, 3], dtype='float32')
+            anchors = fluid.data(
+                name='anchors', shape=[4, 4, 3, 4], dtype='float32')
+            variances = fluid.data(
+                name='var', shape=[4, 4, 3, 4], dtype='float32')
+            rois, roi_probs, rois_num = fluid.layers.generate_proposals(
+                scores,
+                bbox_deltas,
+                im_info,
+                anchors,
+                variances,
+                pre_nms_top_n=10,
+                post_nms_top_n=5,
+                return_rois_num=True)
+            rois_stat, roi_probs_stat, rois_num_stat = self.get_static_graph_result(
+                feed={
+                    'scores': scores_np,
+                    'bbox_deltas': bbox_deltas_np,
+                    'im_info': im_info_np,
+                    'anchors': anchors_np,
+                    'var': variances_np
+                },
+                fetch_list=[rois, roi_probs, rois_num],
+                with_lod=True)
+
+        with self.dynamic_graph():
+            scores_dy = base.to_variable(scores_np)
+            bbox_deltas_dy = base.to_variable(bbox_deltas_np)
+            im_info_dy = base.to_variable(im_info_np)
+            anchors_dy = base.to_variable(anchors_np)
+            variances_dy = base.to_variable(variances_np)
+            rois, roi_probs, rois_num = fluid.layers.generate_proposals(
+                scores_dy,
+                bbox_deltas_dy,
+                im_info_dy,
+                anchors_dy,
+                variances_dy,
+                pre_nms_top_n=10,
+                post_nms_top_n=5,
+                return_rois_num=True)
+            rois_dy = rois.numpy()
+            roi_probs_dy = roi_probs.numpy()
+            rois_num_dy = rois_num.numpy()
+
+        self.assertTrue(np.array_equal(np.array(rois_stat), rois_dy))
+        self.assertTrue(np.array_equal(np.array(roi_probs_stat), roi_probs_dy))
+        self.assertTrue(np.array_equal(np.array(rois_num_stat), rois_num_dy))
 
 
 class TestYoloDetection(unittest.TestCase):
@@ -648,30 +721,81 @@ class TestMulticlassNMS2(unittest.TestCase):
             self.assertIsNotNone(index)
 
 
-class TestCollectFpnPropsals(unittest.TestCase):
+class TestCollectFpnPropsals(LayerTest):
     def test_collect_fpn_proposals(self):
-        program = Program()
-        with program_guard(program):
+        multi_bboxes_np = []
+        multi_scores_np = []
+        rois_num_per_level_np = []
+        for i in range(4):
+            bboxes_np = np.random.rand(5, 4).astype('float32')
+            scores_np = np.random.rand(5, 1).astype('float32')
+            rois_num = np.array([2, 3]).astype('int32')
+            multi_bboxes_np.append(bboxes_np)
+            multi_scores_np.append(scores_np)
+            rois_num_per_level_np.append(rois_num)
+
+        with self.static_graph():
             multi_bboxes = []
             multi_scores = []
+            rois_num_per_level = []
             for i in range(4):
-                bboxes = layers.data(
+                bboxes = fluid.data(
                     name='rois' + str(i),
-                    shape=[10, 4],
+                    shape=[5, 4],
                     dtype='float32',
-                    lod_level=1,
-                    append_batch_size=False)
-                scores = layers.data(
+                    lod_level=1)
+                scores = fluid.data(
                     name='scores' + str(i),
-                    shape=[10, 1],
+                    shape=[5, 1],
                     dtype='float32',
-                    lod_level=1,
-                    append_batch_size=False)
+                    lod_level=1)
+                rois_num = fluid.data(
+                    name='rois_num' + str(i), shape=[None], dtype='int32')
+
                 multi_bboxes.append(bboxes)
                 multi_scores.append(scores)
-            fpn_rois = layers.collect_fpn_proposals(multi_bboxes, multi_scores,
-                                                    2, 5, 10)
-            self.assertIsNotNone(fpn_rois)
+                rois_num_per_level.append(rois_num)
+
+            fpn_rois, rois_num = layers.collect_fpn_proposals(
+                multi_bboxes,
+                multi_scores,
+                2,
+                5,
+                10,
+                rois_num_per_level=rois_num_per_level)
+            feed = {}
+            for i in range(4):
+                feed['rois' + str(i)] = multi_bboxes_np[i]
+                feed['scores' + str(i)] = multi_scores_np[i]
+                feed['rois_num' + str(i)] = rois_num_per_level_np[i]
+            fpn_rois_stat, rois_num_stat = self.get_static_graph_result(
+                feed=feed, fetch_list=[fpn_rois, rois_num], with_lod=True)
+            fpn_rois_stat = np.array(fpn_rois_stat)
+            rois_num_stat = np.array(rois_num_stat)
+
+        with self.dynamic_graph():
+            multi_bboxes_dy = []
+            multi_scores_dy = []
+            rois_num_per_level_dy = []
+            for i in range(4):
+                bboxes_dy = base.to_variable(multi_bboxes_np[i])
+                scores_dy = base.to_variable(multi_scores_np[i])
+                rois_num_dy = base.to_variable(rois_num_per_level_np[i])
+                multi_bboxes_dy.append(bboxes_dy)
+                multi_scores_dy.append(scores_dy)
+                rois_num_per_level_dy.append(rois_num_dy)
+            fpn_rois_dy, rois_num_dy = fluid.layers.collect_fpn_proposals(
+                multi_bboxes_dy,
+                multi_scores_dy,
+                2,
+                5,
+                10,
+                rois_num_per_level=rois_num_per_level_dy)
+            fpn_rois_dy = fpn_rois_dy.numpy()
+            rois_num_dy = rois_num_dy.numpy()
+
+        self.assertTrue(np.array_equal(fpn_rois_stat, fpn_rois_dy))
+        self.assertTrue(np.array_equal(rois_num_stat, rois_num_dy))
 
     def test_collect_fpn_proposals_error(self):
         def generate_input(bbox_type, score_type, name):
@@ -717,20 +841,51 @@ class TestCollectFpnPropsals(unittest.TestCase):
                 post_nms_top_n=2000)
 
 
-class TestDistributeFpnProposals(unittest.TestCase):
+class TestDistributeFpnProposals(LayerTest):
     def test_distribute_fpn_proposals(self):
-        program = Program()
-        with program_guard(program):
-            fpn_rois = fluid.layers.data(
-                name='data', shape=[4], dtype='float32', lod_level=1)
-            multi_rois, restore_ind = layers.distribute_fpn_proposals(
-                fpn_rois=fpn_rois,
+        rois_np = np.random.rand(10, 4).astype('float32')
+        rois_num_np = np.array([4, 6]).astype('int32')
+        with self.static_graph():
+            rois = fluid.data(name='rois', shape=[10, 4], dtype='float32')
+            rois_num = fluid.data(name='rois_num', shape=[None], dtype='int32')
+            multi_rois, restore_ind, rois_num_per_level = layers.distribute_fpn_proposals(
+                fpn_rois=rois,
                 min_level=2,
                 max_level=5,
                 refer_level=4,
-                refer_scale=224)
-            self.assertIsNotNone(multi_rois)
-            self.assertIsNotNone(restore_ind)
+                refer_scale=224,
+                rois_num=rois_num)
+            fetch_list = multi_rois + [restore_ind] + rois_num_per_level
+            output_stat = self.get_static_graph_result(
+                feed={'rois': rois_np,
+                      'rois_num': rois_num_np},
+                fetch_list=fetch_list,
+                with_lod=True)
+            output_stat_np = []
+            for output in output_stat:
+                output_np = np.array(output)
+                if len(output_np) > 0:
+                    output_stat_np.append(output_np)
+
+        with self.dynamic_graph():
+            rois_dy = base.to_variable(rois_np)
+            rois_num_dy = base.to_variable(rois_num_np)
+            multi_rois_dy, restore_ind_dy, rois_num_per_level_dy = layers.distribute_fpn_proposals(
+                fpn_rois=rois_dy,
+                min_level=2,
+                max_level=5,
+                refer_level=4,
+                refer_scale=224,
+                rois_num=rois_num_dy)
+            output_dy = multi_rois_dy + [restore_ind_dy] + rois_num_per_level_dy
+            output_dy_np = []
+            for output in output_dy:
+                output_np = output.numpy()
+                if len(output_np) > 0:
+                    output_dy_np.append(output_np)
+
+        for res_stat, res_dy in zip(output_stat_np, output_dy_np):
+            self.assertTrue(np.array_equal(res_stat, res_dy))
 
     def test_distribute_fpn_proposals_error(self):
         program = Program()
diff --git a/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py b/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py
index 034bb7f8dc7..a2f56c42801 100644
--- a/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py
+++ b/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py
@@ -33,10 +33,14 @@ class TestCollectFPNProposalstOp(OpTest):
                     for i in range(self.num_level)]
         self.inputs = {
             'MultiLevelRois': inputs_x,
-            "MultiLevelScores": self.scores_input
+            "MultiLevelScores": self.scores_input,
+            'MultiLevelRoIsNum': []
         }
         self.attrs = {'post_nms_topN': self.post_nms_top_n, }
-        self.outputs = {'FpnRois': (self.rois, [self.lod])}
+        self.outputs = {
+            'FpnRois': (self.rois, [self.lod]),
+            'RoisNum': np.array(self.lod).astype('int32')
+        }
 
     def init_test_case(self):
         self.post_nms_top_n = 20
@@ -96,5 +100,32 @@ class TestCollectFPNProposalstOp(OpTest):
         self.check_output(check_dygraph=False)
 
 
+class TestCollectFPNProposalstOpWithRoisNum(TestCollectFPNProposalstOp):
+    def set_data(self):
+        self.init_test_case()
+        self.make_rois()
+        self.scores_input = [('y%d' % i,
+                              (self.scores[i].reshape(-1, 1), self.rois_lod[i]))
+                             for i in range(self.num_level)]
+        self.rois, self.lod = self.calc_rois_collect()
+        inputs_x = [('x%d' % i, (self.roi_inputs[i][:, 1:], self.rois_lod[i]))
+                    for i in range(self.num_level)]
+        rois_num_per_level = [
+            ('rois%d' % i, np.array(self.rois_lod[i][0]).astype('int32'))
+            for i in range(self.num_level)
+        ]
+
+        self.inputs = {
+            'MultiLevelRois': inputs_x,
+            "MultiLevelScores": self.scores_input,
+            'MultiLevelRoIsNum': rois_num_per_level
+        }
+        self.attrs = {'post_nms_topN': self.post_nms_top_n, }
+        self.outputs = {
+            'FpnRois': (self.rois, [self.lod]),
+            'RoisNum': np.array(self.lod).astype('int32')
+        }
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py
index 55b21f1a722..ec0125b28ed 100644
--- a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py
+++ b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py
@@ -35,9 +35,10 @@ class TestDistributeFPNProposalsOp(OpTest):
         }
         output = [('out%d' % i, self.rois_fpn[i])
                   for i in range(len(self.rois_fpn))]
+
         self.outputs = {
             'MultiFpnRois': output,
-            'RestoreIndex': self.rois_idx_restore.reshape(-1, 1)
+            'RestoreIndex': self.rois_idx_restore.reshape(-1, 1),
         }
 
     def init_test_case(self):
@@ -117,5 +118,34 @@ class TestDistributeFPNProposalsOp(OpTest):
         self.check_output()
 
 
+class TestDistributeFPNProposalsOpWithRoisNum(TestDistributeFPNProposalsOp):
+    def set_data(self):
+        self.init_test_case()
+        self.make_rois()
+        self.rois_fpn, self.rois_idx_restore = self.calc_rois_distribute()
+        self.inputs = {
+            'FpnRois': (self.rois[:, 1:5], self.rois_lod),
+            'RoisNum': np.array(self.rois_lod[0]).astype('int32')
+        }
+        self.attrs = {
+            'max_level': self.roi_max_level,
+            'min_level': self.roi_min_level,
+            'refer_scale': self.canonical_scale,
+            'refer_level': self.canonical_level
+        }
+        output = [('out%d' % i, self.rois_fpn[i])
+                  for i in range(len(self.rois_fpn))]
+        rois_num_per_level = [
+            ('rois_num%d' % i, np.array(self.rois_fpn[i][1][0]).astype('int32'))
+            for i in range(len(self.rois_fpn))
+        ]
+
+        self.outputs = {
+            'MultiFpnRois': output,
+            'RestoreIndex': self.rois_idx_restore.reshape(-1, 1),
+            'MultiLevelRoIsNum': rois_num_per_level
+        }
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
index ce561cd317c..26fc01ca045 100644
--- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
@@ -34,18 +34,18 @@ def generate_proposals_in_python(scores, bbox_deltas, im_info, anchors,
 
     rpn_rois = []
     rpn_roi_probs = []
-    lod = []
+    rois_num = []
     num_images = scores.shape[0]
     for img_idx in range(num_images):
         img_i_boxes, img_i_probs = proposal_for_one_image(
             im_info[img_idx, :], all_anchors, variances,
             bbox_deltas[img_idx, :, :, :], scores[img_idx, :, :, :],
             pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta)
-        lod.append(img_i_probs.shape[0])
+        rois_num.append(img_i_probs.shape[0])
         rpn_rois.append(img_i_boxes)
         rpn_roi_probs.append(img_i_probs)
 
-    return rpn_rois, rpn_roi_probs, lod
+    return rpn_rois, rpn_roi_probs, rois_num
 
 
 def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas, scores,
@@ -87,6 +87,10 @@ def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas, scores,
     proposals = clip_tiled_boxes(proposals, im_info[:2])
     # remove predicted boxes with height or width < min_size
     keep = filter_boxes(proposals, min_size, im_info)
+    if len(keep) == 0:
+        proposals = np.zeros((1, 4)).astype('float32')
+        scores = np.zeros((1, 1)).astype('float32')
+        return proposals, scores
     proposals = proposals[keep, :]
     scores = scores[keep, :]
 
@@ -280,8 +284,8 @@ class TestGenerateProposalsOp(OpTest):
         }
 
         self.outputs = {
-            'RpnRois': (self.rpn_rois[0], [self.lod]),
-            'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod]),
+            'RpnRois': (self.rpn_rois[0], [self.rois_num]),
+            'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]),
         }
 
     def test_check_output(self):
@@ -320,7 +324,7 @@ class TestGenerateProposalsOp(OpTest):
             (batch_size, num_anchors * 4, layer_h, layer_w)).astype('float32')
 
     def init_test_output(self):
-        self.rpn_rois, self.rpn_roi_probs, self.lod = generate_proposals_in_python(
+        self.rpn_rois, self.rpn_roi_probs, self.rois_num = generate_proposals_in_python(
             self.scores, self.bbox_deltas, self.im_info, self.anchors,
             self.variances, self.pre_nms_topN, self.post_nms_topN,
             self.nms_thresh, self.min_size, self.eta)
@@ -349,12 +353,21 @@ class TestGenerateProposalsOutLodOp(TestGenerateProposalsOp):
         }
 
         self.outputs = {
-            'RpnRois': (self.rpn_rois[0], [self.lod]),
-            'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod]),
-            'RpnRoisLod': (np.asarray(
-                self.lod, dtype=np.int32))
+            'RpnRois': (self.rpn_rois[0], [self.rois_num]),
+            'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]),
+            'RpnRoisNum': (np.asarray(
+                self.rois_num, dtype=np.int32))
         }
 
 
+class TestGenerateProposalsOpNoBoxLeft(TestGenerateProposalsOp):
+    def init_test_params(self):
+        self.pre_nms_topN = 12000  # train 12000, test 2000
+        self.post_nms_topN = 5000  # train 6000, test 1000
+        self.nms_thresh = 0.7
+        self.min_size = 1000.0
+        self.eta = 1.
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index b76887f0965..89e9f7aad85 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -3318,15 +3318,29 @@ class TestBook(LayerTest):
             return (out)
 
     def test_roi_pool(self):
-        # TODO(minqiyang): dygraph do not support lod now
+        x_np = np.random.rand(2, 3, 8, 8).astype('float32')
+        rois_np = np.random.rand(3, 4).astype('float32')
+        rois_num_np = np.array([1, 2]).astype('int32')
+
         with self.static_graph():
-            x = layers.data(name="x", shape=[256, 30, 30], dtype="float32")
-            rois = layers.data(
-                name="rois", shape=[4], dtype="float32", lod_level=1)
-            rois_lod = layers.data(
-                name="rois_lod", shape=[None, ], dtype="int", lod_level=1)
-            output = layers.roi_pool(x, rois, 7, 7, 0.6, rois_lod)
-            return (output)
+            x = layers.data(name="x", shape=[3, 8, 8], dtype="float32")
+            rois = layers.data(name="rois", shape=[4], dtype="float32")
+            rois_num = fluid.data(name="rois_num", shape=[None], dtype="int32")
+            output = layers.roi_pool(x, rois, 4, 4, 0.5, rois_num=rois_num)
+            static_res = self.get_static_graph_result(
+                feed={'x': x_np,
+                      'rois': rois_np,
+                      'rois_num': rois_num_np},
+                fetch_list=[output])[0]
+
+        with self.dynamic_graph():
+            x_dy = base.to_variable(x_np)
+            rois_dy = base.to_variable(rois_np)
+            rois_num_dy = base.to_variable(rois_num_np)
+            dy_res = layers.roi_pool(
+                x_dy, rois_dy, 4, 4, 0.5, rois_num=rois_num_dy)
+            dy_res_value = dy_res[0].numpy()
+        self.assertTrue(np.array_equal(static_res, dy_res_value))
 
     def test_sequence_enumerate(self):
         # TODO(minqiyang): dygraph do not support lod now
@@ -3335,16 +3349,29 @@ class TestBook(LayerTest):
             out = layers.sequence_enumerate(input=x, win_size=2, pad_value=0)
 
     def test_roi_align(self):
-        # TODO(minqiyang): dygraph do not support lod now
+        x_np = np.random.rand(2, 3, 8, 8).astype('float32')
+        rois_np = np.random.rand(3, 4).astype('float32')
+        rois_num_np = np.array([1, 2]).astype('int32')
+
         with self.static_graph():
-            x = layers.data(name="x", shape=[256, 30, 30], dtype="float32")
-            rois = layers.data(
-                name="rois", shape=[4], dtype="float32", lod_level=1)
-            rois_lod = layers.data(
-                name="rois_lod", shape=[None, ], dtype="int", lod_level=1)
-            output = layers.roi_align(x, rois, 14, 14, 0.5, 2, 'roi_align',
-                                      rois_lod)
-            return (output)
+            x = layers.data(name="x", shape=[3, 8, 8], dtype="float32")
+            rois = layers.data(name="rois", shape=[4], dtype="float32")
+            rois_num = fluid.data(name="rois_num", shape=[None], dtype="int32")
+            output = layers.roi_align(x, rois, 4, 4, 0.5, 2, rois_num=rois_num)
+            static_res = self.get_static_graph_result(
+                feed={'x': x_np,
+                      'rois': rois_np,
+                      'rois_num': rois_num_np},
+                fetch_list=[output])[0]
+
+        with self.dynamic_graph():
+            x_dy = base.to_variable(x_np)
+            rois_dy = base.to_variable(rois_np)
+            rois_num_dy = base.to_variable(rois_num_np)
+            dy_res = layers.roi_align(
+                x_dy, rois_dy, 4, 4, 0.5, 2, rois_num=rois_num_dy)
+            dy_res_value = dy_res.numpy()
+        self.assertTrue(np.array_equal(static_res, dy_res_value))
 
     def test_roi_perspective_transform(self):
         # TODO(minqiyang): dygraph do not support lod now
diff --git a/python/paddle/fluid/tests/unittests/test_roi_align_op.py b/python/paddle/fluid/tests/unittests/test_roi_align_op.py
index b0186388086..fb8a090b807 100644
--- a/python/paddle/fluid/tests/unittests/test_roi_align_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_align_op.py
@@ -181,16 +181,11 @@ class TestROIAlignInLodOp(TestROIAlignOp):
         self.calc_roi_align()
 
         seq_len = self.rois_lod[0]
-        cur_len = 0
-        lod = [cur_len]
-        for l in seq_len:
-            cur_len += l
-            lod.append(cur_len)
 
         self.inputs = {
             'X': self.x,
             'ROIs': (self.rois[:, 1:5], self.rois_lod),
-            'RoisLod': np.asarray(lod).astype('int64')
+            'RoisNum': np.asarray(seq_len).astype('int32')
         }
 
         self.attrs = {
diff --git a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
index 1200b0e3470..c6622cf8d9c 100644
--- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
@@ -174,16 +174,11 @@ class TestROIPoolInLodOp(TestROIPoolOp):
         self.calc_roi_pool()
 
         seq_len = self.rois_lod[0]
-        cur_len = 0
-        lod = [cur_len]
-        for l in seq_len:
-            cur_len += l
-            lod.append(cur_len)
 
         self.inputs = {
             'X': self.x,
             'ROIs': (self.rois[:, 1:5], self.rois_lod),
-            'RoisLod': np.asarray(lod).astype('int64')
+            'RoisNum': np.asarray(seq_len).astype('int32')
         }
 
         self.attrs = {
-- 
GitLab


From eb019760374defc98cebc0a69577906856ccc672 Mon Sep 17 00:00:00 2001
From: whs <wanghaoshuang@baidu.com>
Date: Tue, 8 Sep 2020 19:11:08 +0800
Subject: [PATCH 006/261] [2.0 API]Add checker in grid_sample_grad op (#27126)

---
 paddle/fluid/operators/grid_sampler_op.cc                 | 6 +++++-
 paddle/fluid/operators/grid_sampler_op.cu                 | 4 ++--
 paddle/fluid/operators/grid_sampler_op.h                  | 4 ++--
 .../fluid/tests/unittests/test_grid_sample_function.py    | 2 +-
 .../paddle/fluid/tests/unittests/test_grid_sampler_op.py  | 8 ++++----
 python/paddle/nn/functional/vision.py                     | 4 ++--
 6 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc
index deb71b80712..f5224239eb2 100644
--- a/paddle/fluid/operators/grid_sampler_op.cc
+++ b/paddle/fluid/operators/grid_sampler_op.cc
@@ -115,7 +115,7 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<std::string>(
         "padding_mode",
         "(bool, default true) The padding method used when source"
-        "index is out of input images. It can be 'zeros', 'reflect' and "
+        "index is out of input images. It can be 'zeros', 'reflection' and "
         "'border'.")
         .SetDefault("zeros");
 
@@ -174,6 +174,10 @@ class GridSampleOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
   void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
+                   framework::GradVarName("X"), "grid_sampler");
+    OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Grid")), "Output",
+                   framework::GradVarName("Grid"), "grid_sampler");
     auto input_dims = ctx->GetInputDim("X");
     auto grid_dims = ctx->GetInputDim("Grid");
     if (ctx->HasOutput(framework::GradVarName("X"))) {
diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu
index 999f990448c..4e61d0c2ea7 100644
--- a/paddle/fluid/operators/grid_sampler_op.cu
+++ b/paddle/fluid/operators/grid_sampler_op.cu
@@ -268,7 +268,7 @@ class GridSampleOpCUDAKernel : public framework::OpKernel<T> {
     Mode mode;
     if (padding_mode_s == "border") {
       padding_mode = PaddingMode::border;
-    } else if (padding_mode_s == "reflect") {
+    } else if (padding_mode_s == "reflection") {
       padding_mode = PaddingMode::reflect;
     } else {
       padding_mode = PaddingMode::zeros;
@@ -432,7 +432,7 @@ class GridSampleGradOpCUDAKernel : public framework::OpKernel<T> {
     Mode mode;
     if (padding_mode_s == "border") {
       padding_mode = PaddingMode::border;
-    } else if (padding_mode_s == "reflect") {
+    } else if (padding_mode_s == "reflection") {
       padding_mode = PaddingMode::reflect;
     } else {
       padding_mode = PaddingMode::zeros;
diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h
index eda800e78fa..b8faef759ae 100644
--- a/paddle/fluid/operators/grid_sampler_op.h
+++ b/paddle/fluid/operators/grid_sampler_op.h
@@ -76,7 +76,7 @@ static inline void clip(const platform::CPUDeviceContext& ctx,
   if (padding_mode == "border") {
     grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
                                      .cwiseMin(static_cast<T>(max_val));
-  } else if (padding_mode == "reflect") {
+  } else if (padding_mode == "reflection") {
     if (align_corners) {
       auto double_range = static_cast<T>(max_val * 2);
       auto grid_abs = grid_slice_t.abs();
@@ -117,7 +117,7 @@ static inline void clipWithMask(const platform::CPUDeviceContext& ctx,
     auto in_bound = (res == grid_slice_t);
     grid_scale_t.device(place) = grid_scale_t * in_bound.template cast<T>();
     grid_slice_t.device(place) = res;
-  } else if (padding_mode == "reflect") {
+  } else if (padding_mode == "reflection") {
     if (align_corners) {
       auto double_range = static_cast<T>(max_val * 2);
       auto is_neg = (grid_slice_t < static_cast<T>(0));
diff --git a/python/paddle/fluid/tests/unittests/test_grid_sample_function.py b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py
index 4a33f32a0b6..ea94a8ba69a 100644
--- a/python/paddle/fluid/tests/unittests/test_grid_sample_function.py
+++ b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py
@@ -100,7 +100,7 @@ def add_cases(suite):
         GridSampleTestCase(
             methodName='runTest',
             mode='bilinear',
-            padding_mode='reflect',
+            padding_mode='reflection',
             align_corners=True))
     suite.addTest(
         GridSampleTestCase(
diff --git a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
index 4d1ed5aeb96..bf2f9518fb0 100644
--- a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
+++ b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
@@ -73,7 +73,7 @@ def unnormalizeAndClip(grid_slice, max_val, align_corners, padding_mode):
 
     if padding_mode == "border":
         grid_slice = clip(grid_slice, 0, max_val)
-    elif padding_mode == "reflect":
+    elif padding_mode == "reflection":
         double_range = 2 * max_val if align_corners else (max_val + 1) * 2
         grid_abs = np.abs(grid_slice) if align_corners else np.abs(grid_slice +
                                                                    0.5)
@@ -211,7 +211,7 @@ class Case2(TestGridSamplerOp):
         self.grid_shape = (2, 8, 9, 2)
         self.theta_shape = (2, 2, 3)
         self.align_corners = False
-        self.padding_mode = "reflect"
+        self.padding_mode = "reflection"
         self.mode = "bilinear"
 
 
@@ -221,7 +221,7 @@ class Case3(TestGridSamplerOp):
         self.grid_shape = (2, 8, 9, 2)
         self.theta_shape = (2, 2, 3)
         self.align_corners = True
-        self.padding_mode = "reflect"
+        self.padding_mode = "reflection"
         self.mode = "bilinear"
 
 
@@ -231,7 +231,7 @@ class Case4(TestGridSamplerOp):
         self.grid_shape = (2, 8, 9, 2)
         self.theta_shape = (2, 2, 3)
         self.align_corners = False
-        self.padding_mode = "reflect"
+        self.padding_mode = "reflection"
         self.mode = "nearest"
         self.numeric_grad_delta = 0.0001
 
diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py
index 1dfdac26e99..a74a98d5ed4 100644
--- a/python/paddle/nn/functional/vision.py
+++ b/python/paddle/nn/functional/vision.py
@@ -249,7 +249,7 @@ def grid_sample(x,
         mode(str, optional): The interpolation method which can be 'bilinear' or 'nearest'.
                          Default: 'bilinear'.
         padding_mode(str, optional) The padding method used when source index
-                   is out of input images. It can be 'zeros', 'reflect' and 'border'.
+                   is out of input images. It can be 'zeros', 'reflection' and 'border'.
                    Default: zeros.
         align_corners(bool, optional): If `align_corners` is true, it will projects
                    -1 and 1 to the centers of the corner pixels. Otherwise, it will
@@ -312,7 +312,7 @@ def grid_sample(x,
     if not isinstance(grid, Variable):
         raise ValueError("The grid should be a Variable")
     _modes = ['bilinear', 'nearest']
-    _padding_modes = ['zeros', 'reflect', 'border']
+    _padding_modes = ['zeros', 'reflection', 'border']
     if mode not in _modes:
         raise ValueError(
             "The mode of grid sample function should be in {}, but got: {}".
-- 
GitLab


From c7b9d97fa95f63eaa74e5da5c6592e1025c81cba Mon Sep 17 00:00:00 2001
From: Double_V <liuvv0203@163.com>
Date: Tue, 8 Sep 2020 19:23:00 +0800
Subject: [PATCH 007/261] fix avg_pool3d count_include_pad as True,test=develop
 (#27155)

---
 python/paddle/fluid/tests/unittests/test_pool3d_api.py | 2 --
 python/paddle/nn/functional/pooling.py                 | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_api.py b/python/paddle/fluid/tests/unittests/test_pool3d_api.py
index a77f1cdd57d..505a1c73838 100644
--- a/python/paddle/fluid/tests/unittests/test_pool3d_api.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_api.py
@@ -165,7 +165,6 @@ class TestPool3d_API(unittest.TestCase):
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
     def check_max_dygraph_ndhwc_results(self, place):
-        print("run ndchw max pool3d")
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(
@@ -190,7 +189,6 @@ class TestPool3d_API(unittest.TestCase):
                     np.transpose(result.numpy(), [0, 4, 1, 2, 3]), result_np))
 
     def check_max_dygraph_ceilmode_results(self, place):
-        print("run ceil mode max pool3d")
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py
index 662205ab695..042625a3dbd 100755
--- a/python/paddle/nn/functional/pooling.py
+++ b/python/paddle/nn/functional/pooling.py
@@ -389,7 +389,7 @@ def avg_pool3d(x,
                stride=None,
                padding=0,
                ceil_mode=False,
-               count_include_pad=False,
+               count_include_pad=True,
                divisor_override=None,
                data_format="NCDHW",
                name=None):
-- 
GitLab


From 13804ed80cfd01e47d52876f8694cc0a8a3ad311 Mon Sep 17 00:00:00 2001
From: WeiXin <2279280558@qq.com>
Date: Tue, 8 Sep 2020 19:56:39 +0800
Subject: [PATCH 008/261] Error msg/polish tensor error msg (#26976)

* polish one line error message in tensor.cc

* polish error messages in tensor.cc,tensor.h tensor_impl.h

* polish error messages in tensor.cc tensor.h tensor_impl.h

* polish error messages in tensor.cc,tensor.h tensor_impl.h

* polish error messages in tensor.cc tensor.h tensor_impl.h tensor_test.cc

* polish error messages in tensor.cc tensor.h tensor_impl.h
---
 paddle/fluid/framework/tensor.cc      | 54 ++++++++++++++++++---------
 paddle/fluid/framework/tensor.h       |  8 +++-
 paddle/fluid/framework/tensor_impl.h  | 19 +++++++---
 paddle/fluid/framework/tensor_test.cc |  4 +-
 4 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index 544c014eaf9..0b22bab2678 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -19,13 +19,17 @@ namespace paddle {
 namespace framework {
 extern size_t SizeOfType(proto::VarType::Type type);
 void Tensor::check_memory_size() const {
-  PADDLE_ENFORCE_NOT_NULL(
-      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
+  PADDLE_ENFORCE_NOT_NULL(holder_, platform::errors::PreconditionNotMet(
+                                       "Tensor holds no memory. "
+                                       "Call Tensor::mutable_data firstly."));
   PADDLE_ENFORCE_LE(
       numel() * SizeOfType(type()), memory_size(),
-      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
-      "first to re-allocate memory.\n"
-      "or maybe the required data-type mismatches the data already stored.");
+      platform::errors::PreconditionNotMet(
+          "Tensor's dimension is out of bound."
+          "Tensor's dimension must be equal or less than the size of its "
+          "memory."
+          "But received  Tensor's dimension is d%, memory's size is %d.",
+          numel() * SizeOfType(type()), memory_size()));
 }
 
 Tensor::Tensor(const proto::VarType::Type& dtype) : type_(dtype), offset_(0) {}
@@ -37,15 +41,21 @@ size_t Tensor::memory_size() const {
 void* Tensor::mutable_data(const platform::Place& place,
                            proto::VarType::Type type, size_t requested_size) {
   type_ = type;
-  PADDLE_ENFORCE_GE(numel(), 0,
-                    "When calling this method, the Tensor's numel must be "
-                    "equal or larger than zero. "
-                    "Please check Tensor::dims, or Tensor::Resize has been "
-                    "called first. The Tensor's shape is [",
-                    dims(), "] now");
+  PADDLE_ENFORCE_GE(
+      numel(), 0,
+      platform::errors::PreconditionNotMet(
+          "The Tensor's element number must be equal or greater than zero. "
+          "The Tensor's shape is [",
+          dims(), "] now"));
   size_t size = numel() * SizeOfType(type);
   if (requested_size) {
-    PADDLE_ENFORCE_GE(requested_size, size);
+    PADDLE_ENFORCE_GE(
+        requested_size, size,
+        platform::errors::InvalidArgument(
+            "The requested memory size is less than the memory size of Tensor. "
+            "But received requested memory size is d%, "
+            "memory size of Tensor is %d.",
+            requested_size, size));
     size = requested_size;
   }
   /* some versions of boost::variant don't have operator!= */
@@ -62,8 +72,8 @@ void* Tensor::mutable_data(const platform::Place& place,
 
 void* Tensor::mutable_data(const platform::Place& place,
                            size_t requested_size) {
-  PADDLE_ENFORCE_NOT_NULL(
-      this->holder_, "Cannot invoke mutable data if current hold nothing.");
+  PADDLE_ENFORCE_NOT_NULL(this->holder_, platform::errors::PreconditionNotMet(
+                                             "The tensor is not initialized."));
   return mutable_data(place, type_, requested_size);
 }
 
@@ -75,12 +85,20 @@ Tensor& Tensor::ShareDataWith(const Tensor& src) {
 
 Tensor Tensor::Slice(int64_t begin_idx, int64_t end_idx) const {
   check_memory_size();
-  PADDLE_ENFORCE_GE(begin_idx, 0,
-                    "The start row index must be greater than 0.");
-  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
+  PADDLE_ENFORCE_GE(
+      begin_idx, 0,
+      platform::errors::OutOfRange("The start row index must be greater than 0."
+                                   "But received the start index is d%.",
+                                   begin_idx));
+  PADDLE_ENFORCE_LE(
+      end_idx, dims_[0],
+      platform::errors::OutOfRange("The end row index is out of bound."));
   PADDLE_ENFORCE_LT(
       begin_idx, end_idx,
-      "The start row index must be lesser than the end row index.");
+      platform::errors::InvalidArgument(
+          "The start row index must be less than the end row index."
+          "But received the start index = %d, the end index = %d.",
+          begin_idx, end_idx));
 
   if (dims_[0] == 1) {
     return *this;
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index d9fddc4c77d..f2ccff2c133 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -131,13 +131,17 @@ class Tensor {
 
   const platform::Place& place() const {
     PADDLE_ENFORCE_NOT_NULL(
-        holder_, "Tensor not initialized yet when Tensor::place() is called.");
+        holder_,
+        platform::errors::PreconditionNotMet(
+            "Tensor not initialized yet when Tensor::place() is called."));
     return holder_->place();
   }
 
   proto::VarType::Type type() const {
     PADDLE_ENFORCE_NOT_NULL(
-        holder_, "Tensor not initialized yet when Tensor::type() is called.");
+        holder_,
+        platform::errors::PreconditionNotMet(
+            "Tensor not initialized yet when Tensor::type() is called."));
     return type_;
   }
 
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index f5171b0a8d1..986551b935e 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -43,9 +43,13 @@ inline T* Tensor::data() {
   check_memory_size();
   bool valid =
       std::is_same<T, void>::value || type_ == DataTypeTrait<T>::DataType();
-  PADDLE_ENFORCE(
-      valid, "Tensor holds the wrong type, it holds %s, but desires to be %s",
-      DataTypeToString(type_), DataTypeToString(DataTypeTrait<T>::DataType()));
+  PADDLE_ENFORCE_EQ(
+      valid, true,
+      platform::errors::InvalidArgument(
+          "Tensor holds the wrong type, it holds %s, but desires to be %s",
+          DataTypeToString(type_),
+          DataTypeToString(DataTypeTrait<T>::DataType())));
+
   return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                               offset_);
 }
@@ -69,9 +73,12 @@ inline T* Tensor::mutable_data(const platform::Place& place,
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
   int rank = src.dims().size();
   PADDLE_ENFORCE_GE(
-      rank, 2,
-      "'ReshapeToMatrix()' is only used for flatten high rank "
-      "tensors to matrixs. Can not be used in reshaping vectors.");
+      rank, 2, platform::errors::InvalidArgument(
+                   "'ReshapeToMatrix()' is only used for flatten high rank "
+                   "tensors to matrixs. The dimensions of Tensor must be "
+                   "greater or equal than 2. "
+                   "But received dimensions of Tensor is %d",
+                   rank));
   if (rank == 2) {
     return src;
   }
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc
index 84f98d339a2..cc972dd93d0 100644
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@@ -41,7 +41,7 @@ TEST(Tensor, DataAssert) {
     std::string ex_msg = err.what();
     EXPECT_TRUE(ex_msg.find("holder_ should not be null") != std::string::npos);
     EXPECT_TRUE(ex_msg.find("Tensor holds no memory. Call "
-                            "Tensor::mutable_data first.") !=
+                            "Tensor::mutable_data firstly.") !=
                 std::string::npos);
   }
   ASSERT_TRUE(caught);
@@ -157,7 +157,7 @@ TEST(Tensor, ShareDataWith) {
       EXPECT_TRUE(ex_msg.find("holder_ should not be null") !=
                   std::string::npos);
       EXPECT_TRUE(ex_msg.find("Tensor holds no memory. Call "
-                              "Tensor::mutable_data first.") !=
+                              "Tensor::mutable_data firstly.") !=
                   std::string::npos);
     }
     ASSERT_TRUE(caught);
-- 
GitLab


From 4d7d661249652f957ee918c02760213cd3681799 Mon Sep 17 00:00:00 2001
From: LielinJiang <50691816+LielinJiang@users.noreply.github.com>
Date: Tue, 8 Sep 2020 21:37:46 +0800
Subject: [PATCH 009/261] Fix kl and summary bug (#27132)

* fix summary rnn

* fix kl_div bug when input shape is [1] and reduction is batchmean
---
 paddle/fluid/operators/kldiv_loss_op.h        |  6 +-
 .../tests/unittests/test_kldiv_loss_op.py     |  8 +-
 python/paddle/hapi/model_summary.py           | 73 ++++++++++++++-----
 python/paddle/nn/functional/loss.py           |  4 +-
 python/paddle/nn/layer/loss.py                | 17 +++--
 python/paddle/tests/test_model.py             | 18 +++++
 6 files changed, 98 insertions(+), 28 deletions(-)

diff --git a/paddle/fluid/operators/kldiv_loss_op.h b/paddle/fluid/operators/kldiv_loss_op.h
index 369fdb4872b..857ecda303c 100644
--- a/paddle/fluid/operators/kldiv_loss_op.h
+++ b/paddle/fluid/operators/kldiv_loss_op.h
@@ -72,7 +72,11 @@ class KLDivLossKernel : public framework::OpKernel<T> {
       loss_t.device(place) = output;
     } else if ("batchmean" == reduction) {
       auto output_sum = output.sum();
-      loss_t.device(place) = output_sum / output_sum.constant(n);
+      if (n > 0) {
+        loss_t.device(place) = output_sum / output_sum.constant(n);
+      } else {
+        loss_t.device(place) = output_sum;
+      }
     } else if ("mean" == reduction) {
       loss_t.device(place) = output.mean();
     } else if ("sum" == reduction) {
diff --git a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
index 8780727e4cb..041fe4e9043 100644
--- a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
@@ -24,7 +24,10 @@ def kldiv_loss(x, target, reduction):
     loss = np.where(target >= 0, output, np.zeros_like(x))
 
     if reduction == "batchmean":
-        return loss.sum() / x.shape[0]
+        if len(x.shape) > 0:
+            return loss.sum() / x.shape[0]
+        else:
+            return loss.sum()
     if reduction == "mean":
         return loss.mean()
     if reduction == "sum":
@@ -93,6 +96,9 @@ class TestKLDivLossDygraph(unittest.TestCase):
     def test_kl_loss_batchmean(self):
         self.run_kl_loss('batchmean')
 
+    def test_kl_loss_batchmean_shape(self):
+        self.run_kl_loss('batchmean', ())
+
     def test_kl_loss_mean(self):
         self.run_kl_loss('mean')
 
diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py
index 716be1b5398..d388ba62f2a 100644
--- a/python/paddle/hapi/model_summary.py
+++ b/python/paddle/hapi/model_summary.py
@@ -12,7 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import warnings
 import numpy as np
+import numbers
 
 import paddle
 import paddle.nn as nn
@@ -107,6 +109,11 @@ def summary(net, input_size, batch_size=None, dtypes=None):
     if batch_size is None:
         batch_size = -1
 
+    if not paddle.in_dynamic_mode():
+        warnings.warn(
+            "Your model was created in static mode, this may not get correct summary information!"
+        )
+
     result, params_info = summary_string(net, _input_size, batch_size, dtypes)
     print(result)
 
@@ -121,16 +128,16 @@ def summary_string(model, input_size, batch_size=-1, dtypes=None):
 
     depth = len(list(model.sublayers()))
 
-    def register_hook(module):
-        def hook(module, input, output):
-            class_name = str(module.__class__).split(".")[-1].split("'")[0]
+    def register_hook(layer):
+        def hook(layer, input, output):
+            class_name = str(layer.__class__).split(".")[-1].split("'")[0]
 
             try:
-                module_idx = int(module._full_name.split('_')[-1])
+                layer_idx = int(layer._full_name.split('_')[-1])
             except:
-                module_idx = len(summary)
+                layer_idx = len(summary)
 
-            m_key = "%s-%i" % (class_name, module_idx + 1)
+            m_key = "%s-%i" % (class_name, layer_idx + 1)
             summary[m_key] = OrderedDict()
             summary[m_key]["input_shape"] = list(input[0].shape)
             summary[m_key]["input_shape"][0] = batch_size
@@ -142,23 +149,50 @@ def summary_string(model, input_size, batch_size=-1, dtypes=None):
                 summary[m_key]["output_shape"][0] = batch_size
 
             params = 0
-            if hasattr(module, "weight") and hasattr(module.weight, "shape"):
-                params += np.prod(module.weight.shape)
-                summary[m_key]["trainable"] = module.weight.trainable or (
-                    not module.weight.stop_gradient)
-            if hasattr(module, "bias") and hasattr(module.bias, "shape"):
-                params += np.prod(module.bias.shape)
+
+            if paddle.in_dynamic_mode():
+                layer_state_dict = layer._parameters
+            else:
+                layer_state_dict = layer.state_dict()
+
+            for k, v in layer_state_dict.items():
+                params += np.prod(v.shape)
+
+                try:
+                    if (getattr(getattr(layer, k), 'trainable')) and (
+                            not getattr(getattr(layer, k), 'stop_gradient')):
+                        summary[m_key]["trainable"] = True
+                    else:
+                        summary[m_key]["trainable"] = False
+                except:
+                    summary[m_key]["trainable"] = True
+
             summary[m_key]["nb_params"] = params
 
-        if (not isinstance(module, nn.Sequential) and
-                not isinstance(module, nn.LayerList) and
-            (not (module == model) or depth < 1)):
+        if (not isinstance(layer, nn.Sequential) and
+                not isinstance(layer, nn.LayerList) and
+            (not (layer == model) or depth < 1)):
+
+            hooks.append(layer.register_forward_post_hook(hook))
+
+    def _check_input_size(input_sizes):
+        for input_size in input_sizes:
+            for item in input_size:
+                if not isinstance(item, numbers.Number):
+                    raise TypeError(
+                        "Expected item in input size be a number, but got {}".
+                        format(type(item)))
 
-            hooks.append(module.register_forward_post_hook(hook))
+                if item <= 0:
+                    raise ValueError(
+                        "Expected item in input size greater than zero, but got {}".
+                        format(item))
 
     if isinstance(input_size, tuple):
         input_size = [input_size]
 
+    _check_input_size(input_size)
+
     x = [
         paddle.rand(
             [2] + list(in_size), dtype=dtype)
@@ -197,7 +231,12 @@ def summary_string(model, input_size, batch_size=-1, dtypes=None):
             "{0:,}".format(summary[layer]["nb_params"]), )
         total_params += summary[layer]["nb_params"]
 
-        total_output += np.prod(summary[layer]["output_shape"])
+        try:
+            total_output += np.prod(summary[layer]["output_shape"])
+        except:
+            for output_shape in summary[layer]["output_shape"]:
+                total_output += np.prod(output_shape)
+
         if "trainable" in summary[layer]:
             if summary[layer]["trainable"] == True:
                 trainable_params += summary[layer]["nb_params"]
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index 3d5894064c4..6c139b0ddbb 100644
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -780,10 +780,10 @@ def kl_div(input, label, reduction='mean', name=None):
             input = np.random.uniform(-10, 10, shape).astype('float32')
             target = np.random.uniform(-10, 10, shape).astype('float32')
 
-            # 'batchmean' reduction, loss shape will be [N]
+            # 'batchmean' reduction, loss shape will be [1]
             pred_loss = F.kl_div(paddle.to_tensor(input),
                                  paddle.to_tensor(target), reduction='batchmean')
-            # shape=[5]
+            # shape=[1]
 
             # 'mean' reduction, loss shape will be [1]
             pred_loss = F.kl_div(paddle.to_tensor(input),
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py
index a60e615d506..271dc9b4e68 100644
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -627,10 +627,13 @@ class KLDivLoss(fluid.dygraph.Layer):
     $$l(x, y) = y * (\log(y) - x)$$
 
     Parameters:
-        reduction (str, optional): Indicate how to average the loss,
-            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
-            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
-            Default is ``'mean'``.
+        reduction (Tensor): Indicate how to average the loss,
+             the candicates are ``'none'`` | ``'batchmean'`` | ``'mean'`` | ``'sum'``.
+             If `reduction` is ``'mean'``, the reduced mean loss is returned;
+             If `reduction` is ``'batchmean'``, the sum loss divided by batch size is returned;
+             if `reduction` is ``'sum'``, the reduced sum loss is returned;
+             if `reduction` is ``'none'``, no reduction will be apllied.
+             Default is ``'mean'``.
 
     Shape:
 
@@ -654,11 +657,11 @@ class KLDivLoss(fluid.dygraph.Layer):
             x = np.random.uniform(-10, 10, shape).astype('float32')
             target = np.random.uniform(-10, 10, shape).astype('float32')
 
-            # 'batchmean' reduction, loss shape will be [N]
+            # 'batchmean' reduction, loss shape will be [1]
             kldiv_criterion = nn.KLDivLoss(reduction='batchmean')
             pred_loss = kldiv_criterion(paddle.to_tensor(x),
                                         paddle.to_tensor(target))
-            # shape=[5]
+            # shape=[1]
 
             # 'mean' reduction, loss shape will be [1]
             kldiv_criterion = nn.KLDivLoss(reduction='mean')
@@ -684,7 +687,7 @@ class KLDivLoss(fluid.dygraph.Layer):
         self.reduction = reduction
 
     def forward(self, input, label):
-        out = paddle.nn.functional.kl_div(input, label, self.reduction)
+        out = F.kl_div(input, label, self.reduction)
         return out
 
 
diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py
index b7b5d44650f..5c4e98feaa6 100644
--- a/python/paddle/tests/test_model.py
+++ b/python/paddle/tests/test_model.py
@@ -523,6 +523,24 @@ class TestModelFunction(unittest.TestCase):
             model.summary(input_size=[(20)])
             model.summary(input_size=(20), batch_size=2)
 
+    def test_summary_nlp(self):
+        paddle.enable_static()
+        nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3)
+        paddle.summary(nlp_net, (1, 2))
+
+    def test_summary_error(self):
+        with self.assertRaises(TypeError):
+            nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3)
+            paddle.summary(nlp_net, (1, '2'))
+
+        with self.assertRaises(ValueError):
+            nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3)
+            paddle.summary(nlp_net, (-1, -1))
+
+        paddle.disable_static()
+        nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3)
+        paddle.summary(nlp_net, (1, 2))
+
     def test_export_deploy_model(self):
         for dynamic in [True, False]:
             fluid.enable_dygraph() if dynamic else None
-- 
GitLab


From 4558d395e91695f788d662bd3d8312726c5a96e2 Mon Sep 17 00:00:00 2001
From: yongqiangma <xing.wo@163.com>
Date: Tue, 8 Sep 2020 21:39:05 +0800
Subject: [PATCH 010/261] fix Norm op error (#26771)

* fix frobenius_norm error, rm p=0 2-axis support. test=develop
---
 paddle/fluid/operators/p_norm_op.cc           |   6 +
 .../fluid/tests/unittests/test_norm_all.py    | 186 ++++++++++++++----
 python/paddle/tensor/linalg.py                |  84 +++-----
 3 files changed, 187 insertions(+), 89 deletions(-)

diff --git a/paddle/fluid/operators/p_norm_op.cc b/paddle/fluid/operators/p_norm_op.cc
index 59035d5a8ca..cd7a8c6d24e 100644
--- a/paddle/fluid/operators/p_norm_op.cc
+++ b/paddle/fluid/operators/p_norm_op.cc
@@ -105,6 +105,12 @@ class PnormOp : public framework::OperatorWithKernel {
     bool asvector = ctx->Attrs().Get<bool>("asvector");
     if (asvector) {
       reduce_dims.emplace_back(1);
+      if (keepdim) {
+        for (int i = 1; i < x_dim.size(); ++i) {
+          reduce_dims.emplace_back(1);
+        }
+        x_dim = framework::make_ddim(reduce_dims);
+      }
     } else {
       if (axis < 0) axis = x_dim.size() + axis;
       for (int i = 0; i < x_dim.size(); ++i) {
diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py
index c047cf6ddff..352089e1fb7 100644
--- a/python/paddle/fluid/tests/unittests/test_norm_all.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_all.py
@@ -26,11 +26,11 @@ def p_norm(x, axis, porder, keepdims=False):
     if axis is None:
         x = x.flatten()
         if porder == np.inf:
-            r = np.amax(np.abs(x))
+            r = np.amax(np.abs(x), keepdims=keepdims)
         elif porder == -np.inf:
-            r = np.amin(np.abs(x))
+            r = np.amin(np.abs(x), keepdims=keepdims)
         else:
-            r = np.linalg.norm(x, ord=porder)
+            r = np.linalg.norm(x, ord=porder, keepdims=keepdims)
     elif isinstance(axis, list or tuple) and len(axis) == 2:
         if porder == np.inf:
             axis = tuple(axis)
@@ -41,10 +41,10 @@ def p_norm(x, axis, porder, keepdims=False):
         elif porder == 0:
             axis = tuple(axis)
             r = x.astype(bool)
-            r = np.sum(r, axis)
+            r = np.sum(r, axis, keepdims=keepdims)
         elif porder == 1:
             axis = tuple(axis)
-            r = np.sum(np.abs(x), axis)
+            r = np.sum(np.abs(x), axis, keepdims=keepdims)
         else:
             axis = tuple(axis)
             xp = np.power(np.abs(x), porder)
@@ -61,7 +61,7 @@ def p_norm(x, axis, porder, keepdims=False):
 
 def frobenius_norm(x, axis=None, keepdims=False):
     if isinstance(axis, list): axis = tuple(axis)
-    if axis is None: axis = (-2, -1)
+    if axis is None: x = x.reshape(1, x.size)
     r = np.linalg.norm(
         x, ord='fro', axis=axis, keepdims=keepdims).astype(x.dtype)
     return r
@@ -217,28 +217,37 @@ class TestPnormOp5(TestPnormOp):
         self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
 
 
-def run_fro(self, p, axis, shape_x, dtype):
+def run_fro(self, p, axis, shape_x, dtype, keep_dim, check_dim=False):
     with fluid.program_guard(fluid.Program()):
         data = fluid.data(name="X", shape=shape_x, dtype=dtype)
-        out = paddle.norm(x=data, p=p, axis=axis)
+        out = paddle.norm(x=data, p=p, axis=axis, keepdim=keep_dim)
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         np_input = (np.random.rand(*shape_x) + 1.0).astype(dtype)
-        expected_result = frobenius_norm(np_input, axis=axis)
+        expected_result = frobenius_norm(np_input, axis=axis, keepdims=keep_dim)
         result, = exe.run(feed={"X": np_input}, fetch_list=[out])
     self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True)
+    if keep_dim and check_dim:
+        self.assertEqual(
+            (np.abs(np.array(result.shape) - np.array(expected_result.shape)) <
+             1e-6).all(), True)
 
 
-def run_pnorm(self, p, axis, shape_x, dtype):
+def run_pnorm(self, p, axis, shape_x, dtype, keep_dim, check_dim=False):
     with fluid.program_guard(fluid.Program()):
         data = fluid.data(name="X", shape=shape_x, dtype=dtype)
-        out = paddle.norm(x=data, p=p, axis=axis)
+        out = paddle.norm(x=data, p=p, axis=axis, keepdim=keep_dim)
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         np_input = (np.random.rand(*shape_x) + 1.0).astype(dtype)
-        expected_result = p_norm(np_input, porder=p, axis=axis).astype(dtype)
+        expected_result = p_norm(
+            np_input, porder=p, axis=axis, keepdims=keep_dim).astype(dtype)
         result, = exe.run(feed={"X": np_input}, fetch_list=[out])
-        self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True)
+    self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True)
+    if keep_dim and check_dim:
+        self.assertEqual(
+            (np.abs(np.array(result.shape) - np.array(expected_result.shape)) <
+             1e-6).all(), True)
 
 
 def run_graph(self, p, axis, shape_x, dtype):
@@ -253,6 +262,7 @@ def run_graph(self, p, axis, shape_x, dtype):
 
     # compute frobenius norm along last two dimensions.
     out_fro = paddle.norm(x, p='fro')
+    out_fro = paddle.norm(x, p='fro', axis=0)
     out_fro = paddle.norm(x, p='fro', axis=[0, 1])
     # compute 2-order  norm along [0,1] dimension.
     out_pnorm = paddle.norm(x, p=2, axis=[0, 1])
@@ -274,27 +284,133 @@ def run_graph(self, p, axis, shape_x, dtype):
 
 class API_NormTest(unittest.TestCase):
     def test_basic(self):
-        run_fro(self, p='fro', axis=None, shape_x=[2, 3, 4], dtype="float32")
-        run_fro(self, p='fro', axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
-        run_pnorm(self, p=2, axis=None, shape_x=[3, 4], dtype="float32")
-        run_pnorm(self, p=2, axis=1, shape_x=[3, 4], dtype="float64")
-        run_pnorm(self, p=np.inf, axis=0, shape_x=[2, 3, 4], dtype="float32")
-        run_pnorm(self, p=np.inf, axis=None, shape_x=[2, 3, 4], dtype="float32")
-        run_pnorm(self, p=-np.inf, axis=0, shape_x=[2, 3, 4], dtype="float64")
-        run_pnorm(
-            self, p=-np.inf, axis=None, shape_x=[2, 3, 4], dtype="float64")
-        run_pnorm(self, p=0, axis=1, shape_x=[3, 4], dtype="float64")
-
-        run_pnorm(self, p=1, axis=1, shape_x=[3, 4], dtype="float64")
-        run_pnorm(self, p=0, axis=None, shape_x=[3, 4], dtype="float64")
-        run_pnorm(self, p=2, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
-        run_pnorm(self, p=2, axis=-1, shape_x=[2, 3, 4], dtype="float64")
-        run_pnorm(self, p=1, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
-        run_pnorm(self, p=0, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
-        run_pnorm(
-            self, p=np.inf, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
-        run_pnorm(
-            self, p=-np.inf, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
+        keep_dims = {False, True}
+        for keep in keep_dims:
+            run_fro(
+                self,
+                p='fro',
+                axis=None,
+                shape_x=[2, 3, 4],
+                dtype="float32",
+                keep_dim=keep)
+            run_fro(
+                self,
+                p='fro',
+                axis=[0, 1],
+                shape_x=[2, 3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=2,
+                axis=None,
+                shape_x=[3, 4],
+                dtype="float32",
+                keep_dim=keep)
+            run_pnorm(
+                self,
+                p=2,
+                axis=1,
+                shape_x=[3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=np.inf,
+                axis=0,
+                shape_x=[2, 3, 4],
+                dtype="float32",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=np.inf,
+                axis=None,
+                shape_x=[2, 3, 4],
+                dtype="float32",
+                keep_dim=keep)
+            run_pnorm(
+                self,
+                p=-np.inf,
+                axis=0,
+                shape_x=[2, 3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=-np.inf,
+                axis=None,
+                shape_x=[2, 3, 4],
+                dtype="float64",
+                keep_dim=keep)
+            run_pnorm(
+                self,
+                p=0,
+                axis=1,
+                shape_x=[3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+
+            run_pnorm(
+                self,
+                p=1,
+                axis=1,
+                shape_x=[3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=0,
+                axis=None,
+                shape_x=[3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=2,
+                axis=[0, 1],
+                shape_x=[2, 3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=2,
+                axis=-1,
+                shape_x=[2, 3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=1,
+                axis=[0, 1],
+                shape_x=[2, 3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=np.inf,
+                axis=[0, 1],
+                shape_x=[2, 3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=-np.inf,
+                axis=[0, 1],
+                shape_x=[2, 3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
 
     def test_dygraph(self):
         run_graph(self, p='fro', axis=None, shape_x=[2, 3, 4], dtype="float32")
@@ -315,6 +431,7 @@ class API_NormTest(unittest.TestCase):
                 paddle.norm(data, p=p, out=out)
 
             self.assertRaises(TypeError, err_dtype, "fro", [2, 2], "int64")
+            self.assertRaises(ValueError, paddle.norm, "inf", [2], "int64")
             out = fluid.data(name="out", shape=[1], dtype="int64")
             self.assertRaises(TypeError, err_dtype, "fro", [2, 2], "float64",
                               out)
@@ -325,6 +442,7 @@ class API_NormTest(unittest.TestCase):
             self.assertRaises(ValueError, paddle.norm, data, p="unsupport norm")
             self.assertRaises(ValueError, paddle.norm, data, p=[1])
             self.assertRaises(ValueError, paddle.norm, data, p=[1], axis=-1)
+            self.assertRaises(ValueError, paddle.norm, 0, [1, 0], "float64")
             data = fluid.data(name="data_3d", shape=[2, 2, 2], dtype="float64")
             self.assertRaises(
                 ValueError, paddle.norm, data, p='unspport', axis=[-3, -2, -1])
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 7ddda5091a0..67e3ce21ffb 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -183,12 +183,13 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
         x (Tensor): The input tensor could be N-D tensor, and the input data
             type could be float32 or float64.
         p (float|string, optional): Order of the norm. Supported values are `fro`, `0`, `1`, `2`,
-           `inf`,`-inf` and any positive real number yielding the corresponding p-norm.
-            Not supported: ord < 0, nuclear norm.
+            `inf`, `-inf` and any positive real number yielding the corresponding p-norm. Not supported: ord < 0 and nuclear norm. 
+            Default value is `fro`.
         axis (int|list|tuple, optional): The axis on which to apply norm operation. If axis is int
             or list(int)/tuple(int)  with only one element, the vector norm is computed over the axis.
             If `axis < 0`, the dimension to norm operation is rank(input) + axis.
             If axis is a list(int)/tuple(int) with two elements, the matrix norm is computed over the axis.
+            Defalut value is `None`.
         keepdim (bool, optional): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have fewer dimension
             than the :attr:`input` unless :attr:`keepdim` is true, default
@@ -197,13 +198,9 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
             user to set this property. For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        Variable: Tensor, results of norm operation on the specified axis of input tensor,
+        Tensor: results of norm operation on the specified axis of input tensor,
         it's data type is the same as input's Tensor.
  
-    Raises:
-        TypeError, if out data type is different with the input data type.
-        ValueError, If `p` or `axis` is invalid.
-    
     Examples:
         .. code-block:: python
             
@@ -256,15 +253,13 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
                 "The dim of frobenius norm op should be None or two elements list!"
             )
         if in_dygraph_mode():
-            if dim is None: dim = [-1]
-            return core.ops.frobenius_norm(input, 'dim', dim, 'keepdim',
-                                           keepdim)
-        attrs = {
-            'dim': dim if dim != None else [-2, -1],
-            'keep_dim': keepdim,
-            'reduce_all': False
-        }
-        if len(attrs['dim']) == len(input.shape):
+            if dim is None:
+                return core.ops.frobenius_norm(input, 'keep_dim', keepdim,
+                                               'reduce_all', True)
+            return core.ops.frobenius_norm(input, 'dim', dim, 'keep_dim',
+                                           keepdim, 'reduce_all', False)
+        attrs = {'dim': dim, 'keep_dim': keepdim, 'reduce_all': False}
+        if dim is None:
             attrs['reduce_all'] = True
         check_variable_and_dtype(input, 'input', ['float32', 'float64'],
                                  'frobenius_norm')
@@ -351,42 +346,6 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
 
         return reduce_out
 
-    def p0_matrix_norm(input, porder=0., axis=axis, keepdim=False, name=None):
-        block = LayerHelper('norm', **locals())
-        out = block.create_variable_for_type_inference(
-            dtype=block.input_dtype())
-
-        cast_out = block.create_variable_for_type_inference(dtype=bool)
-        block.append_op(
-            type='cast',
-            inputs={'X': input},
-            outputs={'Out': cast_out},
-            attrs={
-                'in_dtype': input.dtype,
-                'out_dtype': int(core.VarDesc.VarType.BOOL)
-            })
-        cast_out2 = block.create_variable_for_type_inference(dtype=bool)
-        block.append_op(
-            type='cast',
-            inputs={'X': cast_out},
-            outputs={'Out': cast_out2},
-            attrs={
-                'in_dtype': cast_out.dtype,
-                'out_dtype': int(core.VarDesc.VarType.FP32)
-            })
-        sum_out = block.create_variable_for_type_inference(
-            dtype=block.input_dtype())
-        block.append_op(
-            type='reduce_sum',
-            inputs={'X': cast_out2},
-            outputs={'Out': sum_out},
-            attrs={
-                'dim': axis,
-                'keep_dim': keepdim,
-                'reduce_all': True if axis is None else False
-            })
-        return sum_out
-
     def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None):
         block = LayerHelper('norm', **locals())
         out = block.create_variable_for_type_inference(
@@ -448,7 +407,20 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
 
     #calculate vector norm, where axis is int or list with only one integer
     if isinstance(axis, int):
-        if isinstance(p, (int, float)):
+        if isinstance(p, str):
+            if p == "fro":
+                return vector_norm(
+                    x,
+                    porder=2,
+                    axis=axis,
+                    keepdim=keepdim,
+                    asvector=False,
+                    name=name)
+
+            else:
+                raise ValueError(
+                    "only valid string values are 'fro', found {}".format(p))
+        elif isinstance(p, (int, float)):
             return vector_norm(
                 x,
                 axis=axis,
@@ -464,10 +436,12 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
     elif isinstance(axis, list) and len(axis) == 2:
         if p == "fro":
             return frobenius_norm(x, dim=axis, keepdim=keepdim, name=name)
-        elif p == 0:
-            return p0_matrix_norm(x, axis=axis, keepdim=keepdim, name=name)
         elif p == np.inf or p == -np.inf:
             return inf_norm(x, porder=p, axis=axis, keepdim=keepdim, name=name)
+        elif p == 0:
+            raise ValueError(
+                "just suport axis type int or list (length of list <=1) if p = 0, found {}".
+                format(axis))
         else:
             return p_matrix_norm(
                 x, porder=p, axis=axis, keepdim=keepdim, name=name)
-- 
GitLab


From eb276632c61ae569d811520bec8565ccf6d42ff7 Mon Sep 17 00:00:00 2001
From: YUNSHEN XIE <1084314248@qq.com>
Date: Tue, 8 Sep 2020 23:43:01 +0800
Subject: [PATCH 011/261] resolve the issue of curl having same exit code with
 paddle build failed (#27035)

* resolve the issue of curl having same exit code with paddle build failed

* modifed the value of exit code

* simplify the code

Co-authored-by: chalsliu <45041955+chalsliu@users.noreply.github.com>
---
 tools/coverage/paddle_coverage.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/coverage/paddle_coverage.sh b/tools/coverage/paddle_coverage.sh
index 2ea2b1fe351..008b35d01ca 100644
--- a/tools/coverage/paddle_coverage.sh
+++ b/tools/coverage/paddle_coverage.sh
@@ -5,7 +5,7 @@ set -xe
 PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )"
 
 # install lcov
-curl -o /lcov-1.14.tar.gz -s https://paddle-ci.gz.bcebos.com/coverage%2Flcov-1.14.tar.gz
+curl -o /lcov-1.14.tar.gz -x "" -s https://paddle-ci.gz.bcebos.com/coverage/lcov-1.14.tar.gz || exit 101
 tar -xf /lcov-1.14.tar.gz -C /
 cd /lcov-1.14
 make install
-- 
GitLab


From ed292695c577c85e730931daf948b8e4dd0236dc Mon Sep 17 00:00:00 2001
From: kinghuin <kinghuin_chull@163.com>
Date: Wed, 9 Sep 2020 10:10:33 +0800
Subject: [PATCH 012/261] optimize the error message for math dir

optimize the error message for math dir
---
 paddle/fluid/operators/math/sequence2batch.cc | 21 ++++-
 paddle/fluid/operators/math/sequence2batch.cu | 21 ++++-
 paddle/fluid/operators/math/sequence2batch.h  | 36 +++++--
 .../fluid/operators/math/sequence_padding.cc  | 17 +++-
 .../fluid/operators/math/sequence_padding.cu  | 20 ++--
 .../fluid/operators/math/sequence_padding.h   | 25 +++--
 .../fluid/operators/math/sequence_pooling.cc  | 93 ++++++++++++++-----
 .../fluid/operators/math/sequence_pooling.cu  | 10 +-
 .../operators/math/sequence_pooling_test.cc   | 16 +++-
 paddle/fluid/operators/math/tree2col.cc       |  6 +-
 10 files changed, 203 insertions(+), 62 deletions(-)

diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/fluid/operators/math/sequence2batch.cc
index e4ffeedb5a0..300a3692012 100644
--- a/paddle/fluid/operators/math/sequence2batch.cc
+++ b/paddle/fluid/operators/math/sequence2batch.cc
@@ -29,11 +29,24 @@ class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
     auto src_dims = src.dims();
     auto dst_dims = dst->dims();
     PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,
-                      "The src must be matrix with rank 2.");
+                      platform::errors::InvalidArgument(
+                          "The source tensor must be a matrix with rank 2, but "
+                          "got the source tensor rank is %lu. "
+                          "Please check the rank of the source tensor",
+                          src_dims.size()));
     PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL,
-                      "The dst must be matrix with rank 2.");
-    PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1],
-                      "The width of src and dst must be same.");
+                      platform::errors::InvalidArgument(
+                          "The destination tensor must be a matrix with rank, "
+                          "but got the destination tensor rank is %lu. "
+                          "Please check the rank of the destination tensor",
+                          dst_dims.size()));
+    PADDLE_ENFORCE_EQ(
+        src_dims[1], dst_dims[1],
+        platform::errors::InvalidArgument(
+            "The width of the source tensor and the destination tensor must be "
+            "same. But got %lu != %lu.Please check the rank of the source "
+            "tensor",
+            src_dims.size(), dst_dims.size()));
     auto height = dst_dims[0];
     auto width = dst_dims[1];
     auto* src_data = src.data<T>();
diff --git a/paddle/fluid/operators/math/sequence2batch.cu b/paddle/fluid/operators/math/sequence2batch.cu
index 9ab13659c1c..cd1ca572689 100644
--- a/paddle/fluid/operators/math/sequence2batch.cu
+++ b/paddle/fluid/operators/math/sequence2batch.cu
@@ -46,11 +46,24 @@ class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
     auto src_dims = src.dims();
     auto dst_dims = dst->dims();
     PADDLE_ENFORCE_EQ(src_dims.size(), 2,
-                      "The src must be matrix with rank 2.");
+                      platform::errors::InvalidArgument(
+                          "The source tensor must be a matrix with rank 2, but "
+                          "got the source tensor rank is %lu. "
+                          "Please check the rank of the source tensor",
+                          src_dims.size()));
     PADDLE_ENFORCE_EQ(dst_dims.size(), 2,
-                      "The dst must be matrix with rank 2.");
-    PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1],
-                      "The width of src and dst must be same.");
+                      platform::errors::InvalidArgument(
+                          "The destination tensor must be a matrix with rank, "
+                          "but got the destination tensor rank is %lu. "
+                          "Please check the rank of the destination tensor",
+                          dst_dims.size()));
+    PADDLE_ENFORCE_EQ(
+        src_dims[1], dst_dims[1],
+        platform::errors::InvalidArgument(
+            "The width of the source tensor and the destination tensor must be "
+            "same. But got %lu != %lu.Please check the rank of the source "
+            "tensor",
+            src_dims.size(), dst_dims.size()));
     auto height = dst_dims[0];
     auto width = dst_dims[1];
     auto* src_data = src.data<T>();
diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/fluid/operators/math/sequence2batch.h
index 9d9f7ef00b8..6aa513e4d10 100644
--- a/paddle/fluid/operators/math/sequence2batch.h
+++ b/paddle/fluid/operators/math/sequence2batch.h
@@ -64,19 +64,30 @@ class LoDTensor2BatchFunctor {
                   bool is_reverse = false) const {
     if (!is_cal_batch_lod) {
       auto lods = batch->lod();
-      PADDLE_ENFORCE_GT(lods.size(), 2UL,
-                        "The LoD of LoDTensor should inlcude at least 2-level "
-                        "sequence information.");
+      PADDLE_ENFORCE_GT(
+          lods.size(), 2UL,
+          platform::errors::InvalidArgument(
+              "The LoD of LoDTensor should inlcude at least 2-level "
+              "sequence information, but got the LoD level is %lu. Please "
+              "check the input value.",
+              lods.size()));
       PADDLE_ENFORCE_EQ(
           lods[1].size(), static_cast<size_t>(lod_tensor.dims()[0]),
-          "The LoD information should be consistent with the dims.");
+          platform::errors::InvalidArgument(
+              "The LoD information should be consistent with the dims, but got "
+              "%lu != %lu. Please check the input value.",
+              lods[1].size(), static_cast<size_t>(lod_tensor.dims()[0])));
       CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
       to_batch(context, lod_tensor, lods[1], batch, true);
       return;
     }
 
     auto lods = lod_tensor.lod();
-    PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
+    PADDLE_ENFORCE_EQ(lods.size(), 1UL,
+                      platform::errors::InvalidArgument(
+                          "Only support one level sequence now, but got the "
+                          "LoD level is %lu. Please check the input value.",
+                          lods.size()));
 
     const auto& lod = lods[0];
 
@@ -161,12 +172,19 @@ class Batch2LoDTensorFunctor {
                   const framework::LoDTensor& batch,
                   framework::LoDTensor* lod_tensor) const {
     auto in_lod = batch.lod();
-    PADDLE_ENFORCE_GT(in_lod.size(), 2UL,
-                      "The LoD of LoDTensor should inlcude at least 2-level "
-                      "sequence information.");
+    PADDLE_ENFORCE_GT(
+        in_lod.size(), 2UL,
+        platform::errors::InvalidArgument(
+            "The LoD of LoDTensor should inlcude at least 2-level "
+            "sequence information, but got the LoD level is %lu. Please check "
+            "the input value.",
+            in_lod.size()));
     PADDLE_ENFORCE_EQ(
         in_lod[1].size(), static_cast<size_t>(lod_tensor->dims()[0]),
-        "The LoD information should be consistent with the dims.");
+        platform::errors::InvalidArgument(
+            "The LoD information should be consistent with the dims, but got "
+            "%lu != %lu. Please check the input value.",
+            in_lod[1].size(), static_cast<size_t>(lod_tensor->dims()[0])));
     CopyMatrixRowsFunctor<DeviceContext, T> to_seq;
     to_seq(context, batch, in_lod[1], lod_tensor, false);
   }
diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc
index 4630689dec1..076df017642 100644
--- a/paddle/fluid/operators/math/sequence_padding.cc
+++ b/paddle/fluid/operators/math/sequence_padding.cc
@@ -35,7 +35,11 @@ void CopyValidData(framework::Tensor* dst_tensor,
     int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
     PADDLE_ENFORCE_GE(
         pad_seq_len, valid_seq_len,
-        "The padded sequence length can not be less than its original length.");
+        platform::errors::InvalidArgument(
+            "The padded sequence length can not "
+            "be less than its original length. Expected %ld >= %ld, but got "
+            "%ld < %ld. Please check input value.",
+            pad_seq_len, valid_seq_len, pad_seq_len, valid_seq_len));
     int seq_data_offset = seq_offsets[seq_idx] * step_width;
     int pad_data_offset = layout == kBatchLengthWidth
                               ? seq_idx * pad_seq_len * step_width
@@ -95,9 +99,14 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
 
     CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
               step_width, layout);
-    PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width,
-                   "The numel of 'pad_value' can only be 1 or be equal to the "
-                   "'step_width'.");
+
+    PADDLE_ENFORCE_EQ(
+        pad_value.numel() == 1 || pad_value.numel() == step_width, true,
+        platform::errors::InvalidArgument(
+            "The numel of 'pad_value' can only be 1 or be equal to the "
+            "'step_width', but got %ld != 1 and %ld. Please check the input "
+            "value.",
+            pad_value.numel(), step_width));
 
     // fill padding value
     T* pad_data = pad_tensor->data<T>();
diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu
index 1b433067900..19c3af03411 100644
--- a/paddle/fluid/operators/math/sequence_padding.cu
+++ b/paddle/fluid/operators/math/sequence_padding.cu
@@ -66,17 +66,25 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     if (pad_seq_len == -1) {
       pad_seq_len = max_seq_len;
     }
-    PADDLE_ENFORCE_GE(pad_seq_len, max_seq_len,
-                      "The pad_seq_len must be equal to or greater than the "
-                      "original max sequence length.");
+    PADDLE_ENFORCE_GE(
+        pad_seq_len, max_seq_len,
+        platform::errors::InvalidArgument(
+            "The pad_seq_len must be equal to or greater than the "
+            "original max sequence length. Expected %ld >= %ld, but got %ld < "
+            "%ld. Please check the input value.",
+            pad_seq_len, max_seq_len, pad_seq_len, max_seq_len));
     int step_width = seq_tensor.numel() / seq_tensor_dims[0];
     int seq_num = seq_offsets.size() - 1;
 
     CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
               step_width, layout);
-    PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width,
-                   "The numel of 'pad_value' can only be 1 or be equal to the "
-                   "'step_width'.");
+    PADDLE_ENFORCE_EQ(
+        pad_value.numel() == 1 || pad_value.numel() == step_width, true,
+        platform::errors::InvalidArgument(
+            "The numel of 'pad_value' can only be 1 or be equal to "
+            "the 'step_width', but got %ld != 1 and %ld. Please check the "
+            "input value.",
+            pad_value.numel(), step_width));
 
     const int kBlockSize = 512;
 
diff --git a/paddle/fluid/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h
index 5580ee53746..956a4ff6a2d 100644
--- a/paddle/fluid/operators/math/sequence_padding.h
+++ b/paddle/fluid/operators/math/sequence_padding.h
@@ -52,14 +52,25 @@ inline static void CheckDims(const framework::DDim& seq_tensor_dims,
                              const framework::Vector<size_t>& seq_offset,
                              int64_t padded_seq_len, int64_t step_width,
                              const PadLayout& layout) {
-  PADDLE_ENFORCE_EQ(static_cast<size_t>(seq_tensor_dims[0]), seq_offset.back(),
-                    "Value of 1st dimension of the sequence tensor should be "
-                    "equal to sum of lengths of all sequences.");
+  PADDLE_ENFORCE_EQ(
+      static_cast<size_t>(seq_tensor_dims[0]), seq_offset.back(),
+      platform::errors::InvalidArgument(
+          "Value of 1st dimension of the sequence tensor should be "
+          "equal to sum of lengths of all sequences. Expected %ld == %ld, but "
+          "got %ld != %ld. Please check the input value.",
+          static_cast<size_t>(seq_tensor_dims[0]), seq_offset.back(),
+          static_cast<size_t>(seq_tensor_dims[0]), seq_offset.back()));
 
-  PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() ||
-                     seq_tensor_dims.size() == pad_tensor_dims.size(),
-                 "pad_tensor's rank should be 1 greater than seq_tensor's "
-                 "rank, or be equal with it.");
+  PADDLE_ENFORCE_EQ(
+      seq_tensor_dims.size() + 1 == pad_tensor_dims.size() ||
+          seq_tensor_dims.size() == pad_tensor_dims.size(),
+      true, platform::errors::InvalidArgument(
+                "pad_tensor's rank should be 1 greater than seq_tensor's "
+                "rank, or be equal with it. The pad_tensor's rank is %ld, "
+                "expected the seq_tensor's rank is %ld or %ld, but got %ld. "
+                "Please check the input value.",
+                pad_tensor_dims.size(), pad_tensor_dims.size(),
+                pad_tensor_dims.size() - 1, seq_tensor_dims.size()));
 }
 
 /*
diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc
index cc3fbd58766..2eee4d0a6c1 100644
--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
@@ -42,15 +42,29 @@ class MaxSeqPoolFunctor {
     auto out_dims = output->dims();
     auto idx_dims = index->dims();
     PADDLE_ENFORCE_GT(in_dims.size(), 1,
-                      "The rank of input shall be greater than 1.");
+                      platform::errors::InvalidArgument(
+                          "The rank of input shall be greater than 1, but got "
+                          "the rank is %ld. Please check the input value",
+                          in_dims.size()));
     PADDLE_ENFORCE_GT(out_dims.size(), 1,
-                      "The rank of output shall be greater than 1.");
+                      platform::errors::InvalidArgument(
+                          "The rank of output shall be greater than 1, but got "
+                          "the rank is %ld. Please check the input value",
+                          out_dims.size()));
     for (int64_t i = 1; i < in_dims.size(); ++i) {
-      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i],
-                        "The dimension of input and output shall be same.");
+      PADDLE_ENFORCE_EQ(
+          in_dims[i], out_dims[i],
+          platform::errors::InvalidArgument(
+              "The dimension of input and output shall be same. Expected %ld "
+              "== %ld, but got %ld != %ld. Please check the input value.",
+              in_dims[i], out_dims[i], in_dims[i], out_dims[i]));
     }
-    PADDLE_ENFORCE_EQ(idx_dims, out_dims,
-                      "The dimension of index and output shall be same.");
+    PADDLE_ENFORCE_EQ(
+        idx_dims, out_dims,
+        platform::errors::InvalidArgument(
+            "The dimension of index and output shall be same. Expected %ld == "
+            "%ld, but got %ld != %ld. Please check the input value.",
+            idx_dims, out_dims, idx_dims, out_dims));
 
     auto lod_level = input.lod().size();
     auto starts = input.lod()[lod_level - 1];
@@ -94,12 +108,22 @@ class MaxSeqPoolFunctor<T, true> {
     auto in_dims = input.dims();
     auto out_dims = output->dims();
     PADDLE_ENFORCE_GT(in_dims.size(), 1,
-                      "The rank of input shall be greater than 1.");
+                      platform::errors::InvalidArgument(
+                          "The rank of input shall be greater than 1, but got "
+                          "%ld <= 1. Please check the input value.",
+                          in_dims.size()));
     PADDLE_ENFORCE_GT(out_dims.size(), 1,
-                      "The rank of output shall be greater than 1.");
+                      platform::errors::InvalidArgument(
+                          "The rank of output shall be greater than 1, but got "
+                          "%ld <= 1. Please check the input value.",
+                          out_dims.size()));
     for (int64_t i = 1; i < in_dims.size(); ++i) {
-      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i],
-                        "The dimension of input and output shall be same.");
+      PADDLE_ENFORCE_EQ(
+          in_dims[i], out_dims[i],
+          platform::errors::InvalidArgument(
+              "The dimension of input and output shall be same. Expected %ld "
+              "== %ld, but got %ld != %ld. Please check the input value.",
+              in_dims[i], out_dims[i], in_dims[i], out_dims[i]));
     }
 
     auto lod_level = input.lod().size();
@@ -139,16 +163,29 @@ class MaxSeqPoolGradFunctor {
     auto ig_dims = in_grad->dims();
     auto idx_dims = index.dims();
     PADDLE_ENFORCE_GT(og_dims.size(), 1,
-                      "The rank of output@Grad shall be greater than 1.");
+                      platform::errors::InvalidArgument(
+                          "The rank of output@Grad shall be greater than 1, "
+                          "but got %ld <= 1. Please check the input value.",
+                          og_dims.size()));
     PADDLE_ENFORCE_GT(ig_dims.size(), 1,
-                      "The rank of input@Grad shall be greater than 1.");
+                      platform::errors::InvalidArgument(
+                          "The rank of input@Grad shall be greater than 1, but "
+                          "got %ld <= 1. Please check the input value.",
+                          ig_dims.size()));
     for (int64_t i = 1; i < og_dims.size(); ++i) {
-      PADDLE_ENFORCE_EQ(
-          og_dims[i], ig_dims[i],
-          "The dimension of input@Grad and output@Grad shall be same.");
+      PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i],
+                        platform::errors::InvalidArgument(
+                            "The dimension of input@Grad and output@Grad shall "
+                            "be same. Expected %ld == %ld, but got %ld != %ld. "
+                            "Please check the input value.",
+                            og_dims[i], ig_dims[i], og_dims[i], ig_dims[i]));
     }
-    PADDLE_ENFORCE_EQ(idx_dims, og_dims,
-                      "The dimension of index and output@Grad shall be same.");
+    PADDLE_ENFORCE_EQ(
+        idx_dims, og_dims,
+        platform::errors::InvalidArgument(
+            "The dimension of index and output@Grad shall be same. Expected "
+            "%ld == %ld, but got %ld != %ld. Please check the input value.",
+            idx_dims, og_dims, idx_dims, og_dims));
 
     const T* og_data = out_grad.data<T>();
     const int* max_index = index.data<int>();
@@ -244,9 +281,12 @@ class SumSeqPoolGradFunctor {
     auto lod = in_grad->lod()[lod_level - 1];
     int64_t out_w = out_grad.numel() / out_grad.dims()[0];
     int64_t in_w = in_grad->numel() / in_grad->dims()[0];
-    PADDLE_ENFORCE_EQ(
-        in_w, out_w,
-        "The feature size of input@Grad and output@Grad shall be same.");
+    PADDLE_ENFORCE_EQ(in_w, out_w,
+                      platform::errors::InvalidArgument(
+                          "The feature size of input@Grad and output@Grad "
+                          "shall be same. Expected %ld == %ld, but got %ld != "
+                          "%ld. Please check the input value.",
+                          in_w, out_w, in_w, out_w));
     const T* out_g_data = out_grad.data<T>();
     T* in_g_data = in_grad->mutable_data<T>(context.GetPlace());
     auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
@@ -298,7 +338,8 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
       auto place = context.GetPlace();
       PADDLE_ENFORCE_EQ(
           platform::is_cpu_place(place), true,
-          "Sequence_pool should run on CPU Device when pooltype is SUM");
+          platform::errors::InvalidArgument(
+              "Sequence_pool should run on CPU Device when pooltype is SUM"));
       const T* src = input.data<T>();
       T* dst = output->mutable_data<T>(place);
       jit::seq_pool_attr_t attr(
@@ -342,7 +383,10 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
         out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
                               std::sqrt(static_cast<T>(h));
       } else {
-        PADDLE_THROW("unsupported pooling pooltype");
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "unsupported pooling pooltype: %s. Only support \"AVERAGE\" and "
+            "\"SQRT\"",
+            pooltype));
       }
     }
   }
@@ -400,7 +444,10 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
       } else if (pooltype == "FIRST") {
         in_g_e.chip(0, 0).device(place) = out_g_e_v;
       } else {
-        PADDLE_THROW("unsupported pooling pooltype");
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "unsupported pooling pooltype: %s. Only support \"AVERAGE\", "
+            "\"SQRT\", \"LAST\" and \"FIRST\"",
+            pooltype));
       }
     }
   }
diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu
index 422b06c70eb..cba8dd935ef 100644
--- a/paddle/fluid/operators/math/sequence_pooling.cu
+++ b/paddle/fluid/operators/math/sequence_pooling.cu
@@ -205,7 +205,10 @@ class SequencePoolFunctor<platform::CUDADeviceContext, T> {
           lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
           output->mutable_data<T>(context.GetPlace()), nullptr);
     } else {
-      PADDLE_THROW("unsupported pooling pooltype");
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "unsupported pooling pooltype: %s. Only support \"MAX\", "
+          "\"AVERAGE\", \"SUM\", \"SQRT\", \"LAST\" and \"FIRST\"",
+          pooltype));
     }
   }
 };
@@ -370,7 +373,10 @@ class SequencePoolGradFunctor<platform::CUDADeviceContext, T> {
           in_grad->mutable_data<T>(context.GetPlace()), nullptr);
 
     } else {
-      PADDLE_THROW("unsupported pooling pooltype");
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "unsupported pooling pooltype: %s. Only support \"MAX\", "
+          "\"AVERAGE\", \"SUM\", \"SQRT\", \"LAST\" and \"FIRST\"",
+          pooltype));
     }
   }
 };
diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc
index efab1a375b5..4b5f484e52c 100644
--- a/paddle/fluid/operators/math/sequence_pooling_test.cc
+++ b/paddle/fluid/operators/math/sequence_pooling_test.cc
@@ -50,9 +50,21 @@ void TestSequencePoolingSum(const DeviceContext &context,
   in_grad.mutable_data<T>(in_dims, place);
 
   // check tensor contruction result
-  PADDLE_ENFORCE_EQ(in_grad.dims().size(), out_grad.dims().size());
+  PADDLE_ENFORCE_EQ(
+      in_grad.dims().size(), out_grad.dims().size(),
+      paddle::platform::errors::InvalidArgument(
+          "The dimension of input and output shall be same. Expected %ld == "
+          "%ld, but got %ld != %ld. Please check the input value.",
+          in_grad.dims().size(), out_grad.dims().size(), in_grad.dims().size(),
+          out_grad.dims().size()));
   for (int64_t i = 1; i < out_grad.dims().size(); ++i) {
-    PADDLE_ENFORCE_EQ(in_grad.dims()[i], out_grad.dims()[i]);
+    PADDLE_ENFORCE_EQ(
+        in_grad.dims()[i], out_grad.dims()[i],
+        paddle::platform::errors::InvalidArgument(
+            "The dimension of input and output shall be same. Expected %ld == "
+            "%ld, but got %ld != %ld. Please check the input value.",
+            in_grad.dims()[i], out_grad.dims()[i], in_grad.dims()[i],
+            out_grad.dims()[i]));
   }
 
   // call functor
diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc
index cafcf631932..0344226ea66 100644
--- a/paddle/fluid/operators/math/tree2col.cc
+++ b/paddle/fluid/operators/math/tree2col.cc
@@ -55,7 +55,11 @@ void Tree2ColUtil::construct_tree(const paddle::Tensor &EdgeSet,
                                   std::vector<std::vector<int>> *tr,
                                   size_t *node_count) {
   auto edge_set_dims = EdgeSet.dims();
-  PADDLE_ENFORCE_EQ(edge_set_dims[1], 2);
+  PADDLE_ENFORCE_EQ(edge_set_dims[1], 2,
+                    platform::errors::InvalidArgument(
+                        "The second dimension of the EdgeSet shall be 2, but "
+                        "got %ld != 2. Please check the input value.",
+                        edge_set_dims[1]));
   int64_t edge_count = EdgeSet.numel();
 
   const int *edge_data = EdgeSet.data<int>();
-- 
GitLab


From 252aeb1a4663eb0042768926ba534b2b27c7c03e Mon Sep 17 00:00:00 2001
From: Aurelius84 <zhangliujie@baidu.com>
Date: Wed, 9 Sep 2020 10:52:15 +0800
Subject: [PATCH 013/261] [Dy2stat]Add naming rule if not specific
 InputSpec.name (#26997)

* Add naming rule if not specific InputSpec.name

* fix function name typo

* refine comment

* remove print statement
---
 .../dygraph_to_static/function_spec.py        | 63 +++++++++++++++++++
 .../dygraph_to_static/test_declarative.py     | 35 ++++++++---
 2 files changed, 91 insertions(+), 7 deletions(-)

diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
index 90e38bd9886..37ce8b0a152 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
@@ -135,6 +135,11 @@ class FunctionSpec(object):
 
             input_with_spec = pack_sequence_as(args, input_with_spec)
 
+        # If without specificing name in input_spec, add default name
+        # according to argument name from decorated function.
+        input_with_spec = replace_spec_empty_name(self._arg_names,
+                                                  input_with_spec)
+
         return input_with_spec
 
     @switch_to_static_graph
@@ -309,3 +314,61 @@ def convert_to_input_spec(inputs, input_spec):
         raise TypeError(
             "The type(input_spec) should be a `InputSpec` or dict/list/tuple of it, but received {}.".
             type_name(input_spec))
+
+
+def replace_spec_empty_name(args_name, input_with_spec):
+    """
+    Adds default name according to argument name from decorated function
+    if without specificing InputSpec.name
+
+    The naming rule are as followed:
+        1. If InputSpec.name is not None, do nothing.
+        2. If each argument `x` corresponds to an InputSpec, using the argument name like `x`
+        3. If the arguments `inputs` corresponds to a list(InputSpec), using name like `inputs_0`, `inputs_1`
+        4. If the arguments `input_dic` corresponds to a dict(InputSpec), using key as name.
+
+    For example:
+        
+        # case 1: foo(x, y)
+        foo = to_static(foo, input_spec=[InputSpec([None, 10]), InputSpec([None])])
+        print([in_var.name for in_var in foo.inputs])  # [x, y]
+
+        # case 2: foo(inputs) where inputs is a list
+        foo = to_static(foo, input_spec=[[InputSpec([None, 10]), InputSpec([None])]])
+        print([in_var.name for in_var in foo.inputs])  # [inputs_0, inputs_1]
+
+        # case 3: foo(inputs) where inputs is a dict
+        foo = to_static(foo, input_spec=[{'x': InputSpec([None, 10]), 'y': InputSpec([None])}])
+        print([in_var.name for in_var in foo.inputs])  # [x, y]
+    """
+    input_with_spec = list(input_with_spec)
+    candidate_arg_names = args_name[:len(input_with_spec)]
+
+    for i, arg_name in enumerate(candidate_arg_names):
+        input_spec = input_with_spec[i]
+        input_with_spec[i] = _replace_spec_name(arg_name, input_spec)
+
+    return input_with_spec
+
+
+def _replace_spec_name(name, input_spec):
+    """
+    Replaces InputSpec.name with given `name` while not specificing it.
+    """
+    if isinstance(input_spec, paddle.static.InputSpec):
+        if input_spec.name is None:
+            input_spec.name = name
+        return input_spec
+    elif isinstance(input_spec, (list, tuple)):
+        processed_specs = []
+        for i, spec in enumerate(input_spec):
+            new_name = "{}_{}".format(name, i)
+            processed_specs.append(_replace_spec_name(new_name, spec))
+        return processed_specs
+    elif isinstance(input_spec, dict):
+        processed_specs = {}
+        for key, spec in six.iteritems(input_spec):
+            processed_specs[key] = _replace_spec_name(key, spec)
+        return processed_specs
+    else:
+        return input_spec
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
index 0b8df63d666..eed02ea655e 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
@@ -47,8 +47,8 @@ class SimpleNet(Layer):
         return z
 
     @declarative(input_spec=[[InputSpec([None, 10]), InputSpec([None, 10])]])
-    def func_with_list(self, l):
-        x, y, int_val = l
+    def func_with_list(self, l, int_val=1):
+        x, y = l
         z = x + y
         z = z + int_val
         return z
@@ -60,10 +60,7 @@ class SimpleNet(Layer):
     def func_with_dict(self, d):
         x = d['x']
         y = d['y']
-        int_val = d['int_val']
-
         z = x + y
-        z = z + int_val
 
         return z
 
@@ -131,10 +128,10 @@ class TestInputSpec(unittest.TestCase):
             self.assertTrue(len(net.add_func.program_cache) == 1)
 
             # 5. test input with list
-            out = net.func_with_list([x, y, int_val])
+            out = net.func_with_list([x, y], int_val)
 
             # 6. test input with dict
-            out = net.func_with_dict({'x': x, 'y': y, 'int_val': int_val})
+            out = net.func_with_dict({'x': x, 'y': y})
 
             # 7. test input with lits contains dict
             int_np = np.ones([1]).astype('float32')
@@ -293,6 +290,30 @@ class TestDifferentInputSpecCacheProgram(unittest.TestCase):
                 foo_3.concrete_program
 
 
+class TestInputDefaultName(unittest.TestCase):
+    def setUp(self):
+        paddle.disable_static()
+        self.net = SimpleNet()
+
+    def assert_default_name(self, func_name, input_names):
+        decorated_func = getattr(self.net, func_name)
+
+        spec_names = [x.name for x in decorated_func.inputs]
+        self.assertListEqual(spec_names, input_names)
+
+    def test_common_input(self):
+        self.assert_default_name('forward', ['x'])
+
+    def test_list_input(self):
+        self.assert_default_name('func_with_list', ['l_0', 'l_1'])
+
+    def test_dict_input(self):
+        self.assert_default_name('func_with_dict', ['x', 'y'])
+
+    def test_nest_input(self):
+        self.assert_default_name('func_with_list_dict', ['dl_0', 'x', 'y'])
+
+
 class TestDeclarativeAPI(unittest.TestCase):
     def test_error(self):
         func = declarative(dyfunc_to_variable)
-- 
GitLab


From 3497fbe473f3763a8e134b0ff1a240ac24827f67 Mon Sep 17 00:00:00 2001
From: liym27 <33742067+liym27@users.noreply.github.com>
Date: Wed, 9 Sep 2020 13:26:27 +0800
Subject: [PATCH 014/261] Use paddle.disable_static() to replace with
 dygraph.guard(). (#27139)

---
 python/paddle/fluid/framework.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 5281df9ead1..797b32f5d47 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -217,7 +217,7 @@ def _dygraph_not_support_(func):
 def _dygraph_only_(func):
     def __impl__(*args, **kwargs):
         assert in_dygraph_mode(
-        ), "We Only support %s in imperative mode, please use fluid.dygraph.guard() as context to run it in imperative Mode" % func.__name__
+        ), "We Only support %s in dynamic mode, please call 'paddle.disable_static()' to enter dynamic mode." % func.__name__
         return func(*args, **kwargs)
 
     return __impl__
-- 
GitLab


From ca6100de5fb9d438038b1d6ec95a417c784218ff Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Wed, 9 Sep 2020 13:37:05 +0800
Subject: [PATCH 015/261] disable ut, fix it @malin (#27200)

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 93581325193..db472ec0166 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -440,6 +440,8 @@ if(WITH_DISTRIBUTE)
     # FIXME(seiriosX) will fix this
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_sparse_embedding_ctr")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_gloo")
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_a_sync_optimizer_auto")
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ctr")
 
     py_test_modules(test_recv_save_op MODULES test_recv_save_op ENVS ${dist_ENVS})
     py_test_modules(test_transpiler_ops MODULES test_transpiler_ops ENVS ${dist_ENVS})
-- 
GitLab


From c60352b3d4bf6b2176f68172e403a4e209d9cb49 Mon Sep 17 00:00:00 2001
From: iducn <45056973+iducn@users.noreply.github.com>
Date: Wed, 9 Sep 2020 14:23:54 +0800
Subject: [PATCH 016/261] update requirements (#27172)

* update requirements.txt

* add unittest_py directory and add new requirements.txt file,test=document_fix

* add unittest_py directory and add new requirements.txt file,test=document_fix
---
 python/unittest_py/requirements.txt | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 python/unittest_py/requirements.txt

diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt
new file mode 100644
index 00000000000..3b6a7546616
--- /dev/null
+++ b/python/unittest_py/requirements.txt
@@ -0,0 +1,4 @@
+PyGithub
+coverage
+pycrypto
+mock
-- 
GitLab


From 43b0445b2945a429d5c8fb49c94924668e9ddaa0 Mon Sep 17 00:00:00 2001
From: Qinghe JING <jingqinghe@baidu.com>
Date: Wed, 9 Sep 2020 15:04:36 +0800
Subject: [PATCH 017/261] Add double grad in reduce sum (#27115)

* set default value to strategy in distributed_optimizer test=develop
---
 .../operators/reduce_ops/reduce_sum_op.cc     | 16 +++++++++++++
 .../fluid/tests/unittests/test_nn_grad.py     | 23 +++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
index 6e470e3af4e..54818470b27 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
@@ -51,6 +51,20 @@ class ReduceSumOpGradMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
+template <typename T>
+class ReduceSumDoubleOpGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
+    op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
+    op->SetAttrMap(this->Attrs());
+    op->SetType("reduce_sum");
+  }
+};
+
 DECLARE_NO_NEED_BUFFER_VARS_INFERER(ReduceSumGradNoNeedBufferVarInferer, "X");
 class ReduceSumVarTypeInference : public paddle::framework::VarTypeInference {
  public:
@@ -77,6 +91,8 @@ REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker,
                   ops::ReduceSumOpGradMaker<paddle::framework::OpDesc>,
                   ops::ReduceSumOpGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp,
+                  ops::ReduceSumDoubleOpGradMaker<paddle::framework::OpDesc>,
+                  ops::ReduceSumDoubleOpGradMaker<paddle::imperative::OpBase>,
                   ops::ReduceSumGradNoNeedBufferVarInferer);
 
 REGISTER_OP_CPU_KERNEL(
diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py
index c6cfe01dce4..0c39dc5e731 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py
@@ -101,6 +101,29 @@ class TestReduceMeanWithDimDoubleGradCheck(unittest.TestCase):
             self.func(p)
 
 
+class TestReduceSumWithDimDoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        shape = [7, 11]
+        eps = 0.05
+        dtype = np.float64
+
+        x = layers.data('x', shape, False, dtype)
+        x.persistable = True
+        y = layers.reduce_sum(x, dim=0)
+        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
+
+        gradient_checker.double_grad_check(
+            [x], y, x_init=x_arr, place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
 class TestMulDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
-- 
GitLab


From f7d08b7db8c1e49d1c10384d4dbf94cb2f8aa1a2 Mon Sep 17 00:00:00 2001
From: Dong Daxiang <35550832+guru4elephant@users.noreply.github.com>
Date: Wed, 9 Sep 2020 15:44:43 +0800
Subject: [PATCH 018/261] =?UTF-8?q?=E3=80=90paddle.fleet=E3=80=91refine=20?=
 =?UTF-8?q?launch=20and=20distributed=20repr=20string=20for=20print=20(#27?=
 =?UTF-8?q?093)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* refine launch and distributed repr string for print
---
 .../fleet/base/distributed_strategy.py        | 89 ++++++++++++++++++-
 .../paddle/distributed/fleet/launch_utils.py  |  9 +-
 .../fluid/tests/unittests/CMakeLists.txt      |  2 +
 .../test_fleet_distributed_strategy.py        |  8 ++
 4 files changed, 102 insertions(+), 6 deletions(-)

diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index 62967a202ab..d65be0dd4b1 100755
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -17,6 +17,7 @@ from paddle.distributed.fleet.proto import distributed_strategy_pb2
 from paddle.fluid.framework import Variable, set_flags, core
 from paddle.fluid.wrapped_decorator import wrap_decorator
 import google.protobuf.text_format
+import google.protobuf
 
 __all__ = ["DistributedStrategy"]
 
@@ -1133,7 +1134,91 @@ class DistributedStrategy(object):
         return False
 
     def __repr__(self):
+        spacing = 2
+        max_k = 38
+        max_v = 38
+
+        length = max_k + max_v + spacing
+
+        h1_format = "    " + "|{{:^{}s}}|\n".format(length)
+        h2_format = "    " + "|{{:>{}s}}{}{{:^{}s}}|\n".format(max_k, " " *
+                                                               spacing, max_v)
+
+        border = "    +" + "".join(["="] * length) + "+"
+        line = "    +" + "".join(["-"] * length) + "+"
+
+        draws = border + "\n"
+        draws += h1_format.format("")
+        draws += h1_format.format("DistributedStrategy Overview")
+        draws += h1_format.format("")
+
         fields = self.strategy.DESCRIPTOR.fields
+        str_res = ""
+
+        env_draws = line + "\n"
+        for f in fields:
+            if "build_strategy" in f.name or "execution_strategy" in f.name:
+                continue
+            if "_configs" in f.name:
+                continue
+            else:
+                if isinstance(getattr(self.strategy, f.name), bool):
+                    if hasattr(self.strategy, f.name + "_configs"):
+                        if getattr(self.strategy, f.name):
+                            draws += border + "\n"
+                            draws += h1_format.format(
+                                "{} = True, please check {}_configs".format(
+                                    f.name, f.name))
+                            draws += line + "\n"
+                            my_configs = getattr(self.strategy,
+                                                 f.name + "_configs")
+                            config_fields = my_configs.DESCRIPTOR.fields
+                            for ff in config_fields:
+                                if isinstance(
+                                        getattr(my_configs, ff.name),
+                                        google.protobuf.pyext._message.
+                                        RepeatedScalarContainer):
+                                    values = getattr(my_configs, ff.name)
+                                    for i, v in enumerate(values):
+                                        if i == 0:
+                                            draws += h2_format.format(ff.name,
+                                                                      str(v))
+                                        else:
+                                            draws += h2_format.format("",
+                                                                      str(v))
+                                else:
+                                    draws += h2_format.format(
+                                        ff.name,
+                                        str(getattr(my_configs, ff.name)))
+                    else:
+                        env_draws += h2_format.format(
+                            f.name, str(getattr(self.strategy, f.name)))
+                else:
+                    env_draws += h2_format.format(
+                        f.name, str(getattr(self.strategy, f.name)))
+
+        result_res = draws + border + "\n" + h1_format.format(
+            "Environment Flags, Communication Flags")
+        result_res += env_draws
+
+        build_strategy_str = border + "\n"
+        build_strategy_str += h1_format.format("Build Strategy")
+        build_strategy_str += line + "\n"
+
+        fields = self.strategy.build_strategy.DESCRIPTOR.fields
         for f in fields:
-            print("{}: {}".format(f.name, f.default_value))
-        return str(self.strategy)
+            build_strategy_str += h2_format.format(
+                f.name, str(getattr(self.strategy.build_strategy, f.name)))
+        build_strategy_str += border + "\n"
+
+        execution_strategy_str = h1_format.format("Execution Strategy")
+        execution_strategy_str += line + "\n"
+
+        fields = self.strategy.execution_strategy.DESCRIPTOR.fields
+        for f in fields:
+            execution_strategy_str += h2_format.format(
+                f.name, str(getattr(self.strategy.execution_strategy, f.name)))
+        execution_strategy_str += border + "\n"
+
+        result_res += build_strategy_str + execution_strategy_str
+        return result_res
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index 3da5aed8201..0e995200dde 100644
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -347,12 +347,13 @@ def pretty_print_envs(envs, header=None):
     for k, v in envs.items():
         max_k = max(max_k, len(k))
 
-    h_format = "{{:^{}s}}{}{{:<{}s}}\n".format(max_k, " " * spacing, max_v)
-    l_format = "{{:<{}s}}{{}}{{:<{}s}}\n".format(max_k, max_v)
+    h_format = "    " + "|{{:>{}s}}{}{{:^{}s}}|\n".format(max_k, " " * spacing,
+                                                          max_v)
+    l_format = "    " + "|{{:>{}s}}{{}}{{:^{}s}}|\n".format(max_k, max_v)
     length = max_k + max_v + spacing
 
-    border = "".join(["="] * length)
-    line = "".join(["-"] * length)
+    border = "    +" + "".join(["="] * length) + "+"
+    line = "    +" + "".join(["-"] * length) + "+"
 
     draws = ""
     draws += border + "\n"
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index db472ec0166..422cc0eddd0 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -47,6 +47,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_distributed_strategy)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_auto)
 foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
   list(REMOVE_ITEM TEST_OPS ${TEST_OP})
@@ -461,6 +462,7 @@ if(WITH_DISTRIBUTE)
     	   py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS})
     	   py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS})
 	   py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS})
+	   py_test_modules(test_fleet_distributed_strategy MODULES test_fleet_distributed_strategy)
 	   py_test_modules(test_fleet_auto MODULES test_fleet_auto ENVS ${dist_ENVS})
         if(NOT WIN32)
             py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS})
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
index 8d715674cc6..83db1b33551 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
@@ -316,6 +316,14 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.conv_workspace_size_limit, 1000)
         strategy._enable_env()
 
+    def test_distributed_strategy_repr(self):
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.recompute = True
+        strategy.recompute_configs = {"checkpoints": ["a1", "a2", "a3"]}
+        strategy.amp = True
+        strategy.localsgd = True
+        print(str(strategy))
+
 
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab


From c71d79b1d2f9eaec2461c5226d56a9e26947a4ca Mon Sep 17 00:00:00 2001
From: wangchaochaohu <wangchao66@baidu.com>
Date: Wed, 9 Sep 2020 16:34:19 +0800
Subject: [PATCH 019/261] [cuda11 support] change the CMakeLists to support the
 cuda11  (#27124)

---
 cmake/cuda.cmake                                 |  2 +-
 cmake/external/warpctc.cmake                     |  2 +-
 cmake/flags.cmake                                | 10 +++++++++-
 cmake/third_party.cmake                          |  7 ++++---
 paddle/fluid/operators/CMakeLists.txt            |  4 +++-
 paddle/fluid/operators/detection/CMakeLists.txt  | 10 +++++++---
 paddle/fluid/operators/math/CMakeLists.txt       |  6 +++++-
 paddle/fluid/operators/reduce_ops/CMakeLists.txt | 12 ++++++++++--
 8 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 6f4671c13a9..7a94bda0f5f 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -16,7 +16,7 @@ else()
   set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
   set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
   set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75")
-  set(paddle_known_gpu_archs11 "35 50 52 60 61 70 75 80")
+  set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
 endif()
 
 ######################################################################################
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 0f6b1c182d5..ac6cf624e82 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -18,7 +18,7 @@ SET(WARPCTC_PREFIX_DIR  ${THIRD_PARTY_PATH}/warpctc)
 SET(WARPCTC_SOURCE_DIR  ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
 SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
 set(WARPCTC_REPOSITORY  https://github.com/baidu-research/warp-ctc.git)
-set(WARPCTC_TAG         bc29dcfff07ced1c7a19a4ecee48e5ad583cef8e)
+set(WARPCTC_TAG         fc7f226b93758216a03b1be9d24593a12819b984)
 
 SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
     CACHE PATH "Warp-ctc Directory" FORCE)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 9d07a0979d9..415e07c7542 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -28,7 +28,15 @@ function(CheckCompilerCXX11Flag)
 endfunction()
 
 CheckCompilerCXX11Flag()
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+if (WITH_GPU)
+    if (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0)
+       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
+    else()
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+    endif()
+else()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+endif()
 # safe_set_flag
 #
 # Set a compile flag only if compiler is support
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index c9442e8f843..9edfcb967ab 100644
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -243,9 +243,10 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
 ENDIF()
 
 if(WITH_GPU)
-    include(external/cub)       # download cub
-    list(APPEND third_party_deps extern_cub)
-  
+    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+        include(external/cub)       # download cub
+        list(APPEND third_party_deps extern_cub)
+    endif()
     set(CUDAERROR_URL  "http://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE)
     file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
 endif(WITH_GPU)
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 6e8ff52ed4a..f0a04d850df 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -45,7 +45,9 @@ endif()
 SET(OP_HEADER_DEPS xxhash executor)
 
 if (WITH_GPU)
-    SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
+    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+        SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
+    endif()
 endif()
 
 SET(OP_PREFETCH_DEPS "")
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index 537063640e4..c2b7c27ab4a 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -41,9 +41,13 @@ detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_fo
 detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc)
 
 if(WITH_GPU)
-  detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
-  detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub)
-  detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS memory cub)
+  set(TMPDEPS memory)
+  if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+      set(TMPDEPS memory cub)
+  endif()
+  detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS ${TMPDEPS})
+  detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS ${TMPDEPS})
+  detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS ${TMPDEPS})
 else()
   detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
   detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc)
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 3a19c7edff3..10d335b828b 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -9,7 +9,11 @@ function(math_library TARGET)
     set(hip_srcs)
     set(math_common_deps device_context framework_proto enforce)
     if (WITH_GPU)
-        list(APPEND math_common_deps cub)
+        if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)	
+            list(APPEND math_common_deps cub)
+	else()
+            list(APPEND math_common_deps)
+	endif()
     endif()
     set(multiValueArgs DEPS)
     cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
diff --git a/paddle/fluid/operators/reduce_ops/CMakeLists.txt b/paddle/fluid/operators/reduce_ops/CMakeLists.txt
index 3da481a142a..a68666b100c 100644
--- a/paddle/fluid/operators/reduce_ops/CMakeLists.txt
+++ b/paddle/fluid/operators/reduce_ops/CMakeLists.txt
@@ -1,6 +1,10 @@
 include(operators)
 if(WITH_GPU)
-    register_operators(DEPS cub)
+    if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+        register_operators(DEPS cub)
+    else()
+        register_operators()
+    endif()
 else()
     register_operators()
 endif()
@@ -24,5 +28,9 @@ if(WITH_GPU)
 endif()
 
 if(WITH_GPU)
-    nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor cub)
+    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+	nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor cub)
+    else()
+	nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor)
+    endif()
 endif()
-- 
GitLab


From 3b8f5200a549affab23a4b6e483ae6eb240e7c8c Mon Sep 17 00:00:00 2001
From: wangchaochaohu <wangchao66@baidu.com>
Date: Wed, 9 Sep 2020 16:53:45 +0800
Subject: [PATCH 020/261] add dgc cuda11 support for Paddle (#27204)

---
 cmake/external/dgc.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/external/dgc.cmake b/cmake/external/dgc.cmake
index 8472a0743b9..bc8611f3862 100644
--- a/cmake/external/dgc.cmake
+++ b/cmake/external/dgc.cmake
@@ -19,7 +19,7 @@ SET(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc/src/extern_dgc")
 SET(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc")
 SET(DGC_INCLUDE_DIR "${DGC_INSTALL_DIR}/include" CACHE PATH "dgc include directory." FORCE)
 SET(DGC_LIBRARIES   "${DGC_INSTALL_DIR}/lib/libdgc.a" CACHE FILEPATH "dgc library." FORCE)
-SET(DGC_URL         "http://fleet.bj.bcebos.com/collective_ef2216a.tgz")
+SET(DGC_URL         "https://fleet.bj.bcebos.com/dgc/collective_f66ef73.tgz")
 INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR})
 
 cache_third_party(extern_dgc
@@ -30,7 +30,7 @@ ExternalProject_Add(
     extern_dgc
     ${EXTERNAL_PROJECT_LOG_ARGS}
     "${DGC_DOWNLOAD_CMD}"
-    URL_MD5         "2f67549fd5f1262383d83289abc4f88f"
+    URL_MD5         "94e6fa1bc97169d0e1aad44570fe3251"
     PREFIX          "${DGC_PREFIX_DIR}"
     SOURCE_DIR      "${DGC_SOURCES_DIR}"
     CONFIGURE_COMMAND ""
-- 
GitLab


From a1b640bc66a5cc9583de503e7406aeba67565e8d Mon Sep 17 00:00:00 2001
From: liym27 <33742067+liym27@users.noreply.github.com>
Date: Wed, 9 Sep 2020 17:19:07 +0800
Subject: [PATCH 021/261] Fix test_origin_info to be compatible with PY3.8,
 because ast module is different in PY3.8 (#27201)

---
 .../dygraph/dygraph_to_static/origin_info.py  |  9 +++++--
 .../dygraph_to_static/test_origin_info.py     | 26 ++++++++++++++++---
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py
index 13f38b0726c..76e732d4d37 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py
@@ -124,8 +124,13 @@ class OriginInfoAttacher(gast.NodeTransformer):
 
     def _abs_lineno(self, node):
         # NOTE(liym27):
-        #   If the first gast.FunctionDef has decorator, its lineno is 1, which
-        #   equals to the lineno of the first decorator node.
+        #   There are differences in ast_node.lineno between PY3.8+ and PY3.8-.
+        #   If the first gast.FunctionDef has decorator, the lineno of gast.FunctionDef is differs.
+        #       1. < PY3.8
+        #           its lineno equals to the lineno of the first decorator node, which is not right.
+        #       2. >= PY3.8
+        #           its lineno is the actual lineno, which is right.
+
         return self.lineno_offset + node.lineno
 
     def _abs_col_offset(self, node):
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py
index b03777b6ebc..3f77e9ade28 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 
+import sys
 import unittest
 
 from paddle.fluid.dygraph.dygraph_to_static.ast_transformer import DygraphToStaticAst
@@ -177,8 +178,20 @@ class TestOriginInfoWithDecoratedFunc(TestOriginInfo):
 
     def set_dygraph_info(self):
         self.line_num = 2
-        self.line_index_list = [0, 2]
-        self.dy_rel_lineno_list = [0, 2]
+
+        # NOTE(liym27):
+        #   There are differences in ast_node.lineno between PY3.8+ and PY3.8-.
+        #   If the first gast.FunctionDef has decorator, the lineno of gast.FunctionDef is differs.
+        #       1. < PY3.8
+        #           its lineno equals to the lineno of the first decorator node, which is not right.
+        #       2. >= PY3.8
+        #           its lineno is the actual lineno, which is right.
+        if sys.version_info >= (3, 8):
+            self.line_index_list = [1, 2]
+            self.dy_rel_lineno_list = [1, 2]
+        else:
+            self.line_index_list = [0, 2]
+            self.dy_rel_lineno_list = [0, 2]
         self.dy_abs_col_offset = [0, 4]
         self.dy_func_name = [self.dygraph_func.__name__] * self.line_num
 
@@ -199,8 +212,13 @@ class TestOriginInfoWithDecoratedFunc2(TestOriginInfo):
 
     def set_dygraph_info(self):
         self.line_num = 2
-        self.line_index_list = [0, 3]
-        self.dy_rel_lineno_list = [0, 3]
+
+        if sys.version_info >= (3, 8):
+            self.line_index_list = [2, 3]
+            self.dy_rel_lineno_list = [2, 3]
+        else:
+            self.line_index_list = [0, 3]
+            self.dy_rel_lineno_list = [0, 3]
         self.dy_abs_col_offset = [0, 4]
         self.dy_func_name = [self.dygraph_func.__name__] * self.line_num
 
-- 
GitLab


From 5d039f40866ea2d879483668c685c5f18c4fc37d Mon Sep 17 00:00:00 2001
From: JZ-LIANG <38102074+JZ-LIANG@users.noreply.github.com>
Date: Wed, 9 Sep 2020 19:19:47 +0800
Subject: [PATCH 022/261] modified the implement of Lars optimizer (#26733)

add lars to fleet meta optimizer
---
 .../framework/distributed_strategy.proto      |  2 +
 .../operators/optimizers/lars_momentum_op.cc  |  3 +
 .../operators/optimizers/lars_momentum_op.cu  | 11 ++-
 .../operators/optimizers/lars_momentum_op.h   |  5 +-
 .../fleet/meta_optimizers/lamb_optimizer.py   |  4 +
 .../fleet/meta_optimizers/lars_optimizer.py   | 11 ++-
 python/paddle/fluid/optimizer.py              | 26 ++++++-
 .../test_fleet_lamb_meta_optimizer.py         | 53 +++++++++++--
 .../test_fleet_lars_meta_optimizer.py         | 74 +++++++++++++++++--
 9 files changed, 165 insertions(+), 24 deletions(-)
 mode change 100644 => 100755 paddle/fluid/operators/optimizers/lars_momentum_op.cc
 mode change 100644 => 100755 paddle/fluid/operators/optimizers/lars_momentum_op.h
 mode change 100644 => 100755 python/paddle/fluid/optimizer.py

diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
index 551d1342ede..8d0093388b4 100755
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -52,6 +52,8 @@ message DGCConfig {
 message LarsConfig {
   optional float lars_coeff = 1 [ default = 0.001 ];
   optional float lars_weight_decay = 2 [ default = 0.0005 ];
+  optional float epsilon = 3 [ default = 0.0 ];
+  repeated string exclude_from_weight_decay = 4;
 }
 
 message LambConfig {
diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cc b/paddle/fluid/operators/optimizers/lars_momentum_op.cc
old mode 100644
new mode 100755
index 5f0500d2faa..479f9643749
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc
+++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cc
@@ -48,6 +48,9 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<float>("lars_weight_decay",
                    "(float, default 0.0005) LARS weight decay")
         .SetDefault(0.0005);
+    AddAttr<float>("epsilon",
+                   "(float, default 0.0) epsilon to avoid Division by Zero.")
+        .SetDefault(0.0);
 
     AddComment(R"DOC(
 Lars Momentum Optimizer.
diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu
index 1dace4ed6ab..eb0111ae4de 100644
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu
+++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu
@@ -23,14 +23,16 @@ __global__ void MomentumLarsKernel(const T* p, const T* g, const T* v,
                                    const T* learning_rate, const T mu,
                                    const int64_t num, const T lars_coeff,
                                    const T lars_weight_decay, const T* p_norm,
-                                   const T* g_norm, T* p_out, T* v_out) {
+                                   const T* g_norm, T* p_out, T* v_out,
+                                   const T epsilon) {
   T lr = learning_rate[0];
   T local_lr = learning_rate[0];
   CUDA_KERNEL_LOOP(i, num) {
-    if (p_norm[0] > 0 && g_norm[0] > 0) {
+    if (lars_weight_decay > 0 && p_norm[0] > 0 && g_norm[0] > 0) {
       local_lr = lr * lars_coeff * p_norm[0] /
-                 (g_norm[0] + lars_weight_decay * p_norm[0]);
+                 (g_norm[0] + lars_weight_decay * p_norm[0] + epsilon);
     }
+
     T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]);
     v_out[i] = v_new;
     p_out[i] = p[i] - v_new;
@@ -54,6 +56,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
     T mu = static_cast<T>(ctx.Attr<float>("mu"));
     T lars_coeff = ctx.Attr<float>("lars_coeff");
     T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
+    T epsilon = ctx.Attr<float>("epsilon");
 
     auto* p = param->data<T>();
     auto* v = velocity->data<T>();
@@ -79,7 +82,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
     eg_norm.device(*place) = eigen_g.square().sum().sqrt();
     MomentumLarsKernel<<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
         p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay,
-        p_norm_data, g_norm_data, p_out, v_out);
+        p_norm_data, g_norm_data, p_out, v_out, epsilon);
   }
 };
 
diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.h b/paddle/fluid/operators/optimizers/lars_momentum_op.h
old mode 100644
new mode 100755
index e0064c20182..b579b5143dd
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.h
+++ b/paddle/fluid/operators/optimizers/lars_momentum_op.h
@@ -39,6 +39,7 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> {
     T mu = static_cast<T>(ctx.Attr<float>("mu"));
     T lars_coeff = ctx.Attr<float>("lars_coeff");
     T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
+    T epsilon = ctx.Attr<float>("epsilon");
 
     auto p_out = framework::EigenVector<T>::Flatten(*param_out);
     auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
@@ -59,9 +60,9 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> {
     ep_norm = p.square().sum().sqrt();
     eg_norm = g.square().sum().sqrt();
     T local_lr = lr[0];
-    if (ep_norm(0) > 0 && eg_norm(0) > 0) {
+    if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) {
       local_lr = lr[0] * lars_coeff * ep_norm(0) /
-                 (eg_norm(0) + lars_weight_decay * ep_norm(0));
+                 (eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon);
     }
     v_out = v * mu + local_lr * (g + lars_weight_decay * p);
     p_out = p - v_out;
diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
index 3a9f2be533b..bfa186a1e7c 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
@@ -91,6 +91,10 @@ class LambOptimizer(MetaOptimizerBase):
         return self.lamb_opt.backward(loss, startup_program, parameter_list,
                                       no_grad_set, callbacks)
 
+    # the following function will be used by AMP if both LARS and AMP are turn on together.
+    def apply_gradients(self, params_grads):
+        return self.lamb_opt.apply_gradients(params_grads=params_grads)
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
index cb12154ddc5..ec7a7eb18bc 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
@@ -44,13 +44,16 @@ class LarsOptimizer(MetaOptimizerBase):
             parameter_list=opt._parameter_list,
             regularization=opt.regularization,
             grad_clip=opt._grad_clip,
-            name=opt._name)
+            name=opt._name,
+            exclude_from_weight_decay=configs['exclude_from_weight_decay'],
+            epsilon=configs['epsilon'])
 
     def _can_apply(self):
         if self.user_defined_strategy.lars:
             if not isinstance(self.inner_opt, Momentum):
                 logging.warn(
-                    "lars need the inner optimizer to be Momentum optimizer.")
+                    "lars need the inner optimizer to be Momentum optimizer but got {}.".
+                    format(self.inner_opt.type))
                 return False
             return True
         return False
@@ -75,6 +78,10 @@ class LarsOptimizer(MetaOptimizerBase):
         return self.lars_opt.backward(loss, startup_program, parameter_list,
                                       no_grad_set, callbacks)
 
+    # the following function will be used by AMP if both LARS and AMP are turn on together.
+    def apply_gradients(self, params_grads):
+        return self.lars_opt.apply_gradients(params_grads=params_grads)
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
old mode 100644
new mode 100755
index 8b37cfef389..192effd2e42
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -1604,7 +1604,7 @@ class LarsMomentumOptimizer(Optimizer):
         & local\_learning\_rate = learning\_rate * lars\_coeff * \\
           \\frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||}
 
-        & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param)
+        & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param + epsilon)
 
         & param = param - velocity
 
@@ -1628,7 +1628,9 @@ class LarsMomentumOptimizer(Optimizer):
             :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
         name (str, optional): This parameter is used by developers to print debugging information. \
             For details, please refer to :ref:`api_guide_Name`. Default is None.
-
+        exclude_from_weight_decay (list[str], optional): Name string of layers which will be exclude from lars weight decay. Default is None.
+        epsilon (float, optional): Epsilon to avoid Division by Zero when calculate local lr. Default is 0.
+        
     Examples:
         .. code-block:: python
 
@@ -1659,7 +1661,9 @@ class LarsMomentumOptimizer(Optimizer):
                  parameter_list=None,
                  regularization=None,
                  grad_clip=None,
-                 name=None):
+                 name=None,
+                 exclude_from_weight_decay=None,
+                 epsilon=0):
         assert learning_rate is not None
         assert momentum is not None
         super(LarsMomentumOptimizer, self).__init__(
@@ -1672,6 +1676,11 @@ class LarsMomentumOptimizer(Optimizer):
         self._momentum = momentum
         self._lars_coeff = float(lars_coeff)
         self._lars_weight_decay = float(lars_weight_decay)
+        self._epsilon = float(epsilon)
+        if exclude_from_weight_decay is None:
+            self._exclude_from_weight_decay = []
+        else:
+            self._exclude_from_weight_decay = exclude_from_weight_decay
 
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
@@ -1682,6 +1691,14 @@ class LarsMomentumOptimizer(Optimizer):
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
 
+        _lars_weight_decay = self._lars_weight_decay
+        param_name = param_and_grad[0].name
+        if len(self._exclude_from_weight_decay) > 0:
+            for name in self._exclude_from_weight_decay:
+                if name in param_name:
+                    _lars_weight_decay = 0.0
+                    break
+
         velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                              param_and_grad[0])
         # create the momentum optimize op
@@ -1700,7 +1717,8 @@ class LarsMomentumOptimizer(Optimizer):
             attrs={
                 "mu": self._momentum,
                 "lars_coeff": self._lars_coeff,
-                "lars_weight_decay": self._lars_weight_decay
+                "lars_weight_decay": _lars_weight_decay,
+                "epsilon": self._epsilon
             },
             stop_gradient=True)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
index 3f140f53b04..ff305fb9523 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
@@ -22,11 +22,9 @@ import paddle.distributed.fleet.base.role_maker as role_maker
 
 class TestFleetLambMetaOptimizer(unittest.TestCase):
     def setUp(self):
-        os.environ["POD_IP"] = "127.0.0.1"
-        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
-        os.environ["PADDLE_TRAINERS_NUM"] = "2"
-        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
-                       "127.0.0.1:36001,127.0.0.2:36001"
+        os.environ["PADDLE_TRAINER_ID"] = "1"
+        os.environ[
+            "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002"
 
     def net(self, main_prog, startup_prog):
         with fluid.program_guard(main_prog, startup_prog):
@@ -97,13 +95,54 @@ class TestFleetLambMetaOptimizer(unittest.TestCase):
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
-        ops_with_bias = [
+        ops_without_wd = [
             op for op in avg_cost.block.ops
             if op.type == 'lamb' and op.attr('op_role_var')[0].endswith('.b_0')
         ]
-        for op in ops_with_bias:
+        for op in ops_without_wd:
             self.assertEqual(op.attr('weight_decay'), 0)
 
+    def test_lamb_apply_with_amp(self):
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.amp = True
+        strategy.amp_configs = {
+            "init_loss_scaling": 32768,
+            "decr_every_n_nan_or_inf": 2,
+            "incr_every_n_steps": 1000,
+            "incr_ratio": 2.0,
+            "use_dynamic_loss_scaling": True,
+            "decr_ratio": 0.5,
+            "custom_white_list": ['softmax'],
+            "custom_black_list": ['tanh'],
+        }
+        strategy.lamb = True
+        strategy.lamb_configs = {
+            'lamb_weight_decay': 0.01,
+            'exclude_from_weight_decay': [],
+        }
+
+        optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        ops = [op.type for op in avg_cost.block.ops]
+        self.assertIn('lamb', ops)
+        self.assertIn('cast', ops)
+        self.assertIn('isfinite', ops)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
index 3caa1a4eac0..34ab423e064 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
@@ -22,11 +22,9 @@ import paddle.distributed.fleet.base.role_maker as role_maker
 
 class TestFleetLarsMetaOptimizer(unittest.TestCase):
     def setUp(self):
-        os.environ["POD_IP"] = "127.0.0.1"
-        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
-        os.environ["PADDLE_TRAINERS_NUM"] = "2"
-        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
-                       "127.0.0.1:36001,127.0.0.2:36001"
+        os.environ["PADDLE_TRAINER_ID"] = "1"
+        os.environ[
+            "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002"
 
     def net(self, main_prog, startup_prog):
         with fluid.program_guard(main_prog, startup_prog):
@@ -52,6 +50,8 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
                 strategy.lars_configs = {
                     "lars_coeff": 0.001,
                     "lars_weight_decay": 0.0005,
+                    "epsilon": 0,
+                    "exclude_from_weight_decay": ["batch_norm", ".b"],
                 }
 
         return avg_cost, strategy
@@ -83,6 +83,70 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
         ops = [op.type for op in avg_cost.block.ops]
         self.assertNotIn('lars_momentum', ops)
 
+    def test_lars_exclude_fn(self):
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        startup_prog = fluid.Program()
+        train_prog = fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        optimizer = paddle.fluid.optimizer.Momentum(
+            learning_rate=0.01, momentum=0.9)
+
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        ops_without_wd = [
+            op for op in avg_cost.block.ops
+            if op.type == 'lars_momentum' and ("batch_norm" in op.attr(
+                'op_role_var')[0] or ".b" in op.attr('op_role_var')[0])
+        ]
+        for op in ops_without_wd:
+            self.assertEqual(op.attr('lars_weight_decay'), 0)
+
+    def test_lars_apply_with_amp(self):
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.amp = True
+        strategy.amp_configs = {
+            "init_loss_scaling": 32768,
+            "decr_every_n_nan_or_inf": 2,
+            "incr_every_n_steps": 1000,
+            "incr_ratio": 2.0,
+            "use_dynamic_loss_scaling": True,
+            "decr_ratio": 0.5,
+            "custom_white_list": ['softmax'],
+            "custom_black_list": ['tanh'],
+        }
+        strategy.lars = True
+        strategy.lars_configs = {
+            "lars_coeff": 0.001,
+            "lars_weight_decay": 0.0005,
+            "epsilon": 0,
+            "exclude_from_weight_decay": ["batch_norm", ".b"],
+        }
+
+        optimizer = paddle.fluid.optimizer.Momentum(
+            learning_rate=0.01, momentum=0.9)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        ops = [op.type for op in avg_cost.block.ops]
+        self.assertIn('lars_momentum', ops)
+        self.assertIn('cast', ops)
+        self.assertIn('isfinite', ops)
+
 
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab


From edd962b1d0e70f8234946598beded178d83034a6 Mon Sep 17 00:00:00 2001
From: Wilber <jiweibo@baidu.com>
Date: Wed, 9 Sep 2020 19:20:11 +0800
Subject: [PATCH 023/261] Add 2.0 inference api doc. (#27125)

---
 .../inference/api/paddle_inference_api.h      | 157 +++++++++++++++++-
 1 file changed, 156 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index a58b510ecf1..5dc4430fde4 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -31,12 +31,30 @@ limitations under the License. */
 #include "paddle_analysis_config.h"  // NOLINT
 #include "paddle_api.h"              // NOLINT
 
+///
+/// \file paddle_inference_api.h
+///
+/// \brief Paddle Inference API
+///
+/// \author paddle-infer@baidu.com
+/// \date 2020-09-01
+/// \since 2.0.0-beta
+///
+
 namespace paddle_infer {
 using DataType = paddle::PaddleDType;
 using PlaceType = paddle::PaddlePlace;
 using PrecisionType = paddle::AnalysisConfig::Precision;
 using Config = paddle::AnalysisConfig;
 
+///
+/// \class Tensor
+///
+/// \brief Represents an n-dimensional array of values.
+/// The Tensor is used to store the input or output of the network.
+/// It is obtained through Predictor::GetinputHandle()
+/// and Predictor::GetOutputHandle() interface.
+///
 class PD_INFER_DECL Tensor {
  public:
   // Can only be created by predictor->GetInputHandle(cosnt std::string& name)
@@ -44,33 +62,106 @@ class PD_INFER_DECL Tensor {
   Tensor() = delete;
   explicit Tensor(std::unique_ptr<paddle::ZeroCopyTensor>&& tensor)
       : tensor_(std::move(tensor)) {}
+
+  ///
+  /// \brief Reset the shape of the tensor.
+  /// Generally it's only used for the input tensor.
+  /// Reshape must be called before calling mutable_data() or CopyFromCpu()
+  /// \param shape The shape to set.
+  ///
   void Reshape(const std::vector<int>& shape);
 
+  ///
+  /// \brief Copy the host memory to tensor data.
+  /// It's usually used to set the input tensor data.
+  /// \param data The pointer of the data, from which the tensor will copy.
+  ///
   template <typename T>
   void CopyFromCpu(const T* data);
 
-  // should add the place
+  ///
+  /// \brief Get the memory pointer in CPU or GPU with specific data type.
+  /// Please Reshape the tensor first before call this.
+  /// It's usually used to get input data pointer.
+  /// \param place The place of the tensor.
+  /// \return The tensor data buffer pointer.
+  ///
   template <typename T>
   T* mutable_data(PlaceType place);
 
+  ///
+  /// \brief Copy the tensor data to the host memory.
+  /// It's usually used to get the output tensor data.
+  /// \param[out] data The tensor will copy the data to the address.
+  ///
   template <typename T>
   void CopyToCpu(T* data);
 
+  ///
+  /// \brief Get the memory pointer directly.
+  /// It's usually used to get the output data pointer.
+  /// \param[out] place To get the device type of the tensor.
+  /// \param[out] size To get the data size of the tensor.
+  /// \return The tensor data buffer pointer.
+  ///
   template <typename T>
   T* data(PlaceType* place, int* size) const;
 
+  ///
+  /// \brief Set lod info of the tensor.
+  /// More about LOD can be seen here:
+  ///  https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor
+  /// \param x the lod info.
+  ///
   void SetLoD(const std::vector<std::vector<size_t>>& x);
+
+  /// \brief Return the lod info of the tensor.
   std::vector<std::vector<size_t>> lod() const;
 
+  /// \brief Return the data type of the tensor.
+  /// It's usually used to get the output tensor data type.
+  /// \return The data type of the tensor.
   DataType type() const;
 
+  /// \brief Return the shape of the Tensor.
   std::vector<int> shape() const;
+
+  /// \brief Return the name of the tensor.
   const std::string& name() const;
 
  private:
   std::unique_ptr<paddle::ZeroCopyTensor> tensor_;
 };
 
+///
+/// \class Predictor
+///
+/// \brief Predictor is the interface for model prediction.
+///
+/// The predictor has the following typical uses:
+///
+/// Get predictor
+/// \code{cpp}
+///   auto predictor = CreatePredictor(config);
+/// \endcode
+///
+/// Get input or output names
+/// \code{cpp}
+///   auto input_names = predictor->GetInputNames();
+///   auto output_names = predictor->GetOutputNames();
+/// \endcode
+///
+/// Get input or output handle
+/// \code{cpp}
+///   auto input_t = predictor->GetInputHandle(input_names[0]);
+///   auto output_t = predictor->GetOutputHandle(output_names[0]);
+/// \endcode
+///
+/// Run predictor
+/// \code{cpp}
+///   predictor->Run();
+/// \endcode
+///
 class PD_INFER_DECL Predictor {
  public:
   Predictor() = delete;
@@ -79,25 +170,78 @@ class PD_INFER_DECL Predictor {
   explicit Predictor(std::unique_ptr<paddle::PaddlePredictor>&& pred)
       : predictor_(std::move(pred)) {}
 
+  ///
+  /// \brief Construct a new Predictor object
+  ///
+  /// \param[in] Config config
+  ///
   explicit Predictor(const Config& config);
 
+  ///
+  /// \brief Get the input names
+  ///
+  /// \return input names
+  ///
   std::vector<std::string> GetInputNames();
+
+  ///
+  /// \brief Get the Input Tensor object
+  ///
+  /// \param[in] name input name
+  /// \return input tensor
+  ///
   std::unique_ptr<Tensor> GetInputHandle(const std::string& name);
 
+  ///
+  /// \brief Run the prediction engine
+  ///
+  /// \return Whether the function executed successfully
+  ///
   bool Run();
 
+  ///
+  /// \brief Get the output names
+  ///
+  /// \return output names
+  ///
   std::vector<std::string> GetOutputNames();
+
+  ///
+  /// \brief Get the Output Tensor object
+  ///
+  /// \param[in] name otuput name
+  /// \return output tensor
+  ///
   std::unique_ptr<Tensor> GetOutputHandle(const std::string& name);
 
+  ///
+  /// \brief Clone to get the new predictor. thread safe.
+  ///
+  /// \return get a new predictor
+  ///
   std::unique_ptr<Predictor> Clone();
+
+  /// \brief Clear the intermediate tensors of the predictor
   void ClearIntermediateTensor();
 
  private:
   std::unique_ptr<paddle::PaddlePredictor> predictor_;
 };
 
+///
+/// \brief A factory to help create predictors.
+///
+/// Usage:
+///
+/// \code{.cpp}
+/// Config config;
+/// ... // change the configs.
+/// auto predictor = CreatePredictor(config);
+/// \endcode
+///
 PD_INFER_DECL std::shared_ptr<Predictor> CreatePredictor(
     const Config& config);  // NOLINT
+
 PD_INFER_DECL int GetNumBytesOfDataType(DataType dtype);
 
 PD_INFER_DECL std::string GetVersion();
@@ -128,13 +272,24 @@ T* Tensor::data(PlaceType* place, int* size) const {
 namespace paddle_infer {
 namespace services {
 
+///
+/// \class PredictorPool
+///
+/// \brief PredictorPool is a simple encapsulation of Predictor, suitable for
+/// use in multi-threaded situations. According to the thread id, the
+/// corresponding Predictor is taken out from PredictorPool to complete the
+/// prediction.
+///
 class PD_INFER_DECL PredictorPool {
  public:
   PredictorPool() = delete;
   PredictorPool(const PredictorPool&) = delete;
   PredictorPool& operator=(const PredictorPool&) = delete;
 
+  /// \brief Construct the predictor pool with \param size predictor instances.
   explicit PredictorPool(const Config& config, size_t size = 1);
+
+  /// \brief Get \param id-th predictor.
   Predictor* Retrive(size_t idx);
 
  private:
-- 
GitLab


From 7c8e980a48620eb7d30d16c659be9ca7183c78ad Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 9 Sep 2020 20:52:16 +0800
Subject: [PATCH 024/261] fix enforce shell dir, test=document_fix (#27215)

---
 tools/enforce/count_all_enforce.sh    | 2 +-
 tools/enforce/count_enforce_by_dir.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/enforce/count_all_enforce.sh b/tools/enforce/count_all_enforce.sh
index c1b7508de03..683b73614d2 100644
--- a/tools/enforce/count_all_enforce.sh
+++ b/tools/enforce/count_all_enforce.sh
@@ -39,7 +39,7 @@
 #     Valid PADDLE_ENFORCE{_**} & PADDLE_THROW Count: 1706
 #     Invalid PADDLE_ENFORCE{_**} & PADDLE_THROW Count: 4572
 
-ROOT_DIR=../paddle/fluid
+ROOT_DIR=../../paddle/fluid
 ALL_PADDLE_CHECK_CNT=0
 VALID_PADDLE_CHECK_CNT=0
 
diff --git a/tools/enforce/count_enforce_by_dir.sh b/tools/enforce/count_enforce_by_dir.sh
index 03233d417ac..3cb13edf7cc 100644
--- a/tools/enforce/count_enforce_by_dir.sh
+++ b/tools/enforce/count_enforce_by_dir.sh
@@ -59,7 +59,7 @@
 
 . ./count_all_enforce.sh --source-only
 
-ROOT_DIR=../paddle/fluid
+ROOT_DIR=../../paddle/fluid
 
 function count_dir_independently(){
     local sub_dir_total_check_cnt=0
-- 
GitLab


From 50e60e8779931809fb0e478fee93629f795acbeb Mon Sep 17 00:00:00 2001
From: Steffy-zxf <48793257+Steffy-zxf@users.noreply.github.com>
Date: Thu, 10 Sep 2020 10:11:29 +0800
Subject: [PATCH 025/261] update error info for selected_rows_functor

update error info for selected_rows_functor
---
 .../operators/math/selected_rows_functor.cc   | 127 ++++++++++++++----
 .../operators/math/selected_rows_functor.cu   | 112 ++++++++++++---
 .../math/selected_rows_functor_test.cu.cc     |   4 +-
 3 files changed, 198 insertions(+), 45 deletions(-)

diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index 3bb9efc5315..c2595beb0cb 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -29,7 +29,12 @@ struct SelectedRowsAdd<platform::CPUDeviceContext, T> {
                   const framework::SelectedRows& input2,
                   framework::SelectedRows* output) {
     auto in1_height = input1.height();
-    PADDLE_ENFORCE_EQ(in1_height, input2.height());
+    PADDLE_ENFORCE_EQ(
+        in1_height, input2.height(),
+        platform::errors::InvalidArgument("The two inputs height must be equal."
+                                          "But recieved first input height  = "
+                                          "[%d], second input height = [%d]",
+                                          in1_height, input2.height()));
     output->set_height(in1_height);
 
     auto& in1_rows = input1.rows();
@@ -47,15 +52,31 @@ struct SelectedRowsAdd<platform::CPUDeviceContext, T> {
     auto& in2_value = input2.value();
 
     auto in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
-    PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
+    PADDLE_ENFORCE_EQ(
+        in1_row_numel, in2_value.numel() / in2_rows.size(),
+        platform::errors::InvalidArgument(
+            "The two inputs width must be equal."
+            "But recieved first input width = [%d], second input width = [%d]",
+            in1_row_numel, in2_value.numel() / in2_rows.size()));
+    PADDLE_ENFORCE_EQ(
+        in1_row_numel, out_value->numel() / out_rows.size(),
+        platform::errors::InvalidArgument(
+            "The input and oupput width must be equal."
+            "But recieved input width = [%d], output width = [%d]",
+            in1_row_numel, out_value->numel() / out_rows.size()));
 
     auto in1_place = input1.place();
-    PADDLE_ENFORCE(platform::is_cpu_place(in1_place));
+    PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true,
+                      platform::errors::InvalidArgument(
+                          "The running enviroment is not on the CPU place."));
     auto in2_place = input2.place();
-    PADDLE_ENFORCE(platform::is_cpu_place(in2_place));
+    PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true,
+                      platform::errors::InvalidArgument(
+                          "The running enviroment is not on the CPU place."));
     auto out_place = context.GetPlace();
-    PADDLE_ENFORCE(platform::is_cpu_place(out_place));
+    PADDLE_ENFORCE_EQ(platform::is_cpu_place(out_place), true,
+                      platform::errors::InvalidArgument(
+                          "The running enviroment is not on the CPU place."));
 
     auto* out_data = out_value->data<T>();
     auto* in1_data = in1_value.data<T>();
@@ -82,15 +103,35 @@ struct SelectedRowsAddTensor<platform::CPUDeviceContext, T> {
     auto in1_height = input1.height();
     auto in2_dims = input2.dims();
     auto out_dims = output->dims();
-    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
-    PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
+    PADDLE_ENFORCE_EQ(
+        in1_height, in2_dims[0],
+        platform::errors::InvalidArgument("The two inputs height must be equal."
+                                          "But recieved first input height = "
+                                          "[%d], second input height = [%d]",
+                                          in1_height, in2_dims[0]));
+    PADDLE_ENFORCE_EQ(
+        in1_height, out_dims[0],
+        platform::errors::InvalidArgument(
+            "The input and output height must be equal."
+            "But recieved input height = [%d], output height = [%d]",
+            in1_height, out_dims[0]));
 
     auto& in1_value = input1.value();
     auto& in1_rows = input1.rows();
 
     int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
-    PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height);
+    PADDLE_ENFORCE_EQ(
+        in1_row_numel, input2.numel() / in1_height,
+        platform::errors::InvalidArgument(
+            "The two inputs width must be equal."
+            "But recieved first input width = [%d], second input width = [%d]",
+            in1_row_numel, input2.numel() / in1_height));
+    PADDLE_ENFORCE_EQ(
+        in1_row_numel, output->numel() / in1_height,
+        platform::errors::InvalidArgument(
+            "The input and output width must be equal."
+            "But recieved input width = [%d], output width = [%d]",
+            in1_row_numel, output->numel() / in1_height));
 
     SetConstant<platform::CPUDeviceContext, T> functor;
     functor(context, output, 0.0);
@@ -121,7 +162,12 @@ struct SelectedRowsAddTo<platform::CPUDeviceContext, T> {
                   const int64_t input2_offset,
                   framework::SelectedRows* input2) {
     auto in1_height = input1.height();
-    PADDLE_ENFORCE_EQ(in1_height, input2->height());
+    PADDLE_ENFORCE_EQ(
+        in1_height, input2->height(),
+        platform::errors::InvalidArgument("The two inputs height must be equal."
+                                          "But recieved first input height = "
+                                          "[%d], second input height = [%d]",
+                                          in1_height, input2->height()));
 
     auto& in1_rows = input1.rows();
     auto& in2_rows = *(input2->mutable_rows());
@@ -133,9 +179,13 @@ struct SelectedRowsAddTo<platform::CPUDeviceContext, T> {
     in2_rows.Extend(in1_rows.begin(), in1_rows.end());
 
     auto in1_place = input1.place();
-    PADDLE_ENFORCE(platform::is_cpu_place(in1_place));
+    PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true,
+                      platform::errors::InvalidArgument(
+                          "The running enviroment is not on the CPU place."));
     auto in2_place = input2->place();
-    PADDLE_ENFORCE(platform::is_cpu_place(in2_place));
+    PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true,
+                      platform::errors::InvalidArgument(
+                          "The running enviroment is not on the CPU place."));
 
     auto* in1_data = in1_value.data<T>();
     auto* in2_data = in2_value->data<T>();
@@ -163,7 +213,12 @@ struct SelectedRowsSumTo<platform::CPUDeviceContext, T> {
       auto& in_rows = (*iter)->rows();
       size += in_rows.end() - in_rows.begin();
       auto in1_height = (*iter)->height();
-      PADDLE_ENFORCE_EQ(in1_height, input2->height());
+      PADDLE_ENFORCE_EQ(in1_height, input2->height(),
+                        platform::errors::InvalidArgument(
+                            "The two inputs height must be equal."
+                            "But recieved first input height = [%d], second "
+                            "input height = [%d]",
+                            in1_height, input2->height()));
     }
     // concat rows
     std::vector<int64_t> in2_rows;
@@ -201,13 +256,23 @@ struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> {
     }
     auto in1_height = input1.height();
     auto in2_dims = input2->dims();
-    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+    PADDLE_ENFORCE_EQ(
+        in1_height, in2_dims[0],
+        platform::errors::InvalidArgument("The two inputs height must be equal."
+                                          "But recieved first input height = "
+                                          "[%d], second input height = [%d]",
+                                          in1_height, in2_dims[0]));
 
     auto& in1_value = input1.value();
     auto& in1_rows = input1.rows();
 
     int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
+    PADDLE_ENFORCE_EQ(
+        in1_row_numel, input2->numel() / in1_height,
+        platform::errors::InvalidArgument(
+            "The two inputs width must be equal."
+            "But recieved first input width = [%d], second input width = [%d]",
+            in1_row_numel, input2->numel() / in1_height));
 
     auto* in1_data = in1_value.data<T>();
     auto* input2_data = input2->data<T>();
@@ -302,10 +367,12 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
         continue;
       }
       PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
-                        "all input should have same "
-                        "dimension except for the first one");
+                        platform::errors::InvalidArgument(
+                            "All inputs should have same "
+                            "dimension except for the first one."));
       PADDLE_ENFORCE_EQ(input_height, input->height(),
-                        "all input should have same height");
+                        platform::errors::InvalidArgument(
+                            "All inputs should have same height."));
       row_num += input->rows().size();
       merged_row_set.insert(input->rows().begin(), input->rows().end());
     }
@@ -421,10 +488,12 @@ struct MergeAverage<platform::CPUDeviceContext, T> {
         continue;
       }
       PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
-                        "all input should have same "
-                        "dimension except for the first one");
+                        platform::errors::InvalidArgument(
+                            "All inputs should have same "
+                            "dimension except for the first one."));
       PADDLE_ENFORCE_EQ(input_height, input->height(),
-                        "all input should have same height");
+                        platform::errors::InvalidArgument(
+                            "All input should have same height."));
       row_num += input->rows().size();
       merged_row_set.insert(input->rows().begin(), input->rows().end());
     }
@@ -492,13 +561,23 @@ struct UpdateToTensor<platform::CPUDeviceContext, T> {
                   framework::Tensor* input2) {
     auto in1_height = input1.height();
     auto in2_dims = input2->dims();
-    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+    PADDLE_ENFORCE_EQ(
+        in1_height, in2_dims[0],
+        platform::errors::InvalidArgument("The two inputs height must be equal."
+                                          "But recieved first input height = "
+                                          "[%d], second input height = [%d]",
+                                          in1_height, in2_dims[0]));
 
     auto& in1_value = input1.value();
     auto& in1_rows = input1.rows();
 
     int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
+    PADDLE_ENFORCE_EQ(
+        in1_row_numel, input2->numel() / in1_height,
+        platform::errors::InvalidArgument(
+            "The two inputs width must be equal."
+            "But recieved first input width = [%d], second input width = [%d]",
+            in1_row_numel, input2->numel() / in1_height));
 
     auto* in1_data = in1_value.data<T>();
     auto* input2_data = input2->data<T>();
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
index 9cce52c6d45..35bd02ad35b 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -30,7 +30,12 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
                   const framework::SelectedRows& input2,
                   framework::SelectedRows* output) {
     auto in1_height = input1.height();
-    PADDLE_ENFORCE_EQ(in1_height, input2.height());
+    PADDLE_ENFORCE_EQ(
+        in1_height, input2.height(),
+        platform::errors::InvalidArgument("The two inputs height must be equal."
+                                          "But recieved first input height  = "
+                                          "[%d], second input height = [%d]",
+                                          in1_height, input2.height()));
     output->set_height(in1_height);
 
     framework::Vector<int64_t> in1_rows(input1.rows());
@@ -48,18 +53,34 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
     auto& in2_value = input2.value();
 
     auto in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
-    PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
+    PADDLE_ENFORCE_EQ(
+        in1_row_numel, in2_value.numel() / in2_rows.size(),
+        platform::errors::InvalidArgument(
+            "The two inputs width must be equal."
+            "But recieved first input width = [%d], second input width = [%d]",
+            in1_row_numel, in2_value.numel() / in2_rows.size()));
+    PADDLE_ENFORCE_EQ(
+        in1_row_numel, out_value->numel() / out_rows.size(),
+        platform::errors::InvalidArgument(
+            "The input and oupput width must be equal."
+            "But recieved input width = [%d], output width = [%d]",
+            in1_row_numel, out_value->numel() / out_rows.size()));
 
     auto* out_data = out_value->data<T>();
     auto* in1_data = in1_value.data<T>();
 
     auto in1_place = input1.place();
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true);
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true,
+                      platform::errors::InvalidArgument(
+                          "The running enviroment is not on the GPU place."));
     auto in2_place = input2.place();
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true);
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true,
+                      platform::errors::InvalidArgument(
+                          "The running enviroment is not on the GPU place."));
     auto out_place = context.GetPlace();
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true);
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true,
+                      platform::errors::InvalidArgument(
+                          "The running enviroment is not on the GPU place."));
 
     memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, out_place), out_data,
                  BOOST_GET_CONST(platform::CUDAPlace, in1_place), in1_data,
@@ -104,15 +125,35 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
     auto in1_height = input1.height();
     auto in2_dims = input2.dims();
     auto out_dims = output->dims();
-    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
-    PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
+    PADDLE_ENFORCE_EQ(
+        in1_height, in2_dims[0],
+        platform::errors::InvalidArgument(
+            "The two inputs height must be equal."
+            "But recieved first input height = [%d], first input height = [%d]",
+            in1_height, in2_dims[0]));
+    PADDLE_ENFORCE_EQ(
+        in1_height, out_dims[0],
+        platform::errors::InvalidArgument(
+            "The input and output height must be equal."
+            "But recieved input height = [%d], output height = [%d]",
+            in1_height, out_dims[0]));
 
     auto& in1_value = input1.value();
     auto& in1_rows = input1.rows();
 
     int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
-    PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height);
+    PADDLE_ENFORCE_EQ(
+        in1_row_numel, input2.numel() / in1_height,
+        platform::errors::InvalidArgument(
+            "The two inputs width must be equal."
+            "But recieved first input width = [%d], second input width = [%d]",
+            in1_row_numel, input2.numel() / in1_height));
+    PADDLE_ENFORCE_EQ(
+        in1_row_numel, output->numel() / in1_height,
+        platform::errors::InvalidArgument(
+            "The input and output width must be equal."
+            "But recieved input width = [%d], output width = [%d]",
+            in1_row_numel, output->numel() / in1_height));
 
     auto* in1_data = in1_value.data<T>();
     auto* in2_data = input2.data<T>();
@@ -148,7 +189,12 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
                   const int64_t input2_offset,
                   framework::SelectedRows* input2) {
     auto in1_height = input1.height();
-    PADDLE_ENFORCE_EQ(in1_height, input2->height());
+    PADDLE_ENFORCE_EQ(
+        in1_height, input2->height(),
+        platform::errors::InvalidArgument("The two inputs height must be equal."
+                                          "But recieved first input height = "
+                                          "[%d], second input height = [%d]",
+                                          in1_height, input2->height()));
 
     auto& in1_rows = input1.rows();
     auto& in2_rows = *(input2->mutable_rows());
@@ -162,9 +208,13 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
     }
 
     auto in1_place = input1.place();
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true);
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true,
+                      platform::errors::InvalidArgument(
+                          "The running enviroment is not on the GPU place."));
     auto in2_place = input2->place();
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true);
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true,
+                      platform::errors::InvalidArgument(
+                          "The running enviroment is not on the GPU place."));
 
     auto* in1_data = in1_value.data<T>();
     auto* in2_data = in2_value->data<T>();
@@ -209,13 +259,23 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
                   framework::Tensor* input2) {
     auto in1_height = input1.height();
     auto in2_dims = input2->dims();
-    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+    PADDLE_ENFORCE_EQ(
+        in1_height, in2_dims[0],
+        platform::errors::InvalidArgument("The two inputs height must be equal."
+                                          "But recieved first input height = "
+                                          "[%d], second input height = [%d]",
+                                          in1_height, in2_dims[0]));
 
     auto& in1_value = input1.value();
     auto& in1_rows = input1.rows();
 
     int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
+    PADDLE_ENFORCE_EQ(
+        in1_row_numel, input2->numel() / in1_height,
+        platform::errors::InvalidArgument(
+            "The two inputs width must be equal."
+            "But recieved first input width = [%d], second input width = [%d]",
+            in1_row_numel, input2->numel() / in1_height));
 
     auto* in1_data = in1_value.data<T>();
     auto* in2_data = input2->data<T>();
@@ -340,10 +400,12 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
         continue;
       }
       PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
-                        "all input should have same "
-                        "dimension except for the first one");
+                        platform::errors::InvalidArgument(
+                            "All input should have same "
+                            "dimension except for the first one."));
       PADDLE_ENFORCE_EQ(input_height, input->height(),
-                        "all input should have same height");
+                        platform::errors::InvalidArgument(
+                            "All input should have same height."));
       merged_row_set.insert(input->rows().begin(), input->rows().end());
     }
     std::vector<int64_t> merge_rows_cpu(merged_row_set.begin(),
@@ -448,13 +510,23 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> {
 
     auto in1_height = merged_in1.height();
     auto in2_dims = input2->dims();
-    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+    PADDLE_ENFORCE_EQ(
+        in1_height, in2_dims[0],
+        platform::errors::InvalidArgument("The two inputs height must be equal."
+                                          "But recieved first input height = "
+                                          "[%d], second input height = [%d]",
+                                          in1_height, in2_dims[0]));
 
     auto& in1_value = merged_in1.value();
     auto& in1_rows = merged_in1.rows();
 
     int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
+    PADDLE_ENFORCE_EQ(
+        in1_row_numel, input2->numel() / in1_height,
+        platform::errors::InvalidArgument(
+            "The two inputs width must be equal."
+            "But recieved first input width = [%d], second input width = [%d]",
+            in1_row_numel, input2->numel() / in1_height));
 
     auto* in1_data = in1_value.template data<T>();
     auto* in2_data = input2->data<T>();
diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc
index 74892316e6d..81ad620466e 100644
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc
@@ -38,7 +38,9 @@ TEST(selected_rows_functor, gpu_add) {
           {static_cast<int64_t>(rows1.size()), row_numel}),
       gpu_place);
   functor(ctx, in1_value, 1.0);
-  PADDLE_ENFORCE(cudaDeviceSynchronize());
+  PADDLE_ENFORCE_EQ(cudaDeviceSynchronize(), 0,
+                    paddle::platform::errors::PreconditionNotMet(
+                        "The all synchronization on the cuda is error!"));
 
   std::vector<int64_t> rows2{0, 5, 7, 9};
   std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
-- 
GitLab


From cc3306f7c8d62e42196ac3d61e744c0e9d1a1563 Mon Sep 17 00:00:00 2001
From: zhupengyang <zhu_py@qq.com>
Date: Thu, 10 Sep 2020 10:20:04 +0800
Subject: [PATCH 026/261] restruct logsumexp to speed up compiling (#27191)

---
 .../operators/reduce_ops/logsumexp_op.cc      | 154 ++++++++++++++++--
 .../operators/reduce_ops/logsumexp_op.cu      |  10 +-
 .../fluid/operators/reduce_ops/logsumexp_op.h | 112 ++++++++++++-
 .../operators/reduce_ops/logsumexp_op.part.cu |   9 +-
 .../fluid/tests/unittests/test_logsumexp.py   |   4 +-
 python/paddle/tensor/math.py                  |   5 +-
 6 files changed, 261 insertions(+), 33 deletions(-)

diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc
index 322a1637f5d..7cd164bfd3a 100644
--- a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc
+++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc
@@ -13,18 +13,138 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
-#include <memory>
+#include <algorithm>
 #include <string>
-#include <utility>
 #include <vector>
 
 namespace paddle {
 namespace operators {
 
-class LogsumexpOpMaker : public ops::ReduceOpMaker {
- protected:
-  virtual std::string GetName() const { return "logsumexp"; }
-  virtual std::string GetOpType() const { return "Reduce logsumexp"; }
+class LogsumexpOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "logsumexp");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "logsumexp");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_rank = x_dims.size();
+    PADDLE_ENFORCE_LE(x_rank, 4,
+                      platform::errors::InvalidArgument(
+                          "The input tensor X's dimensions of logsumexp "
+                          "should be less equal than 4. But received X's "
+                          "dimensions = %d, X's shape = [%s].",
+                          x_rank, x_dims));
+    auto axis = ctx->Attrs().Get<std::vector<int>>("axis");
+    PADDLE_ENFORCE_GT(
+        axis.size(), 0,
+        platform::errors::InvalidArgument(
+            "The size of axis of logsumexp "
+            "should be greater than 0. But received the size of axis "
+            "of logsumexp is %d.",
+            axis.size()));
+
+    for (size_t i = 0; i < axis.size(); i++) {
+      PADDLE_ENFORCE_LT(
+          axis[i], x_rank,
+          platform::errors::InvalidArgument(
+              "axis[%d] should be in the "
+              "range [-dimension(X), dimension(X)] "
+              "where dimesion(X) is %d. But received axis[i] = %d.",
+              i, x_rank, axis[i]));
+      PADDLE_ENFORCE_GE(
+          axis[i], -x_rank,
+          platform::errors::InvalidArgument(
+              "axis[%d] should be in the "
+              "range [-dimension(X), dimension(X)] "
+              "where dimesion(X) is %d. But received axis[i] = %d.",
+              i, x_rank, axis[i]));
+      if (axis[i] < 0) {
+        axis[i] += x_rank;
+      }
+    }
+
+    bool keepdim = ctx->Attrs().Get<bool>("keepdim");
+    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
+    auto dims_vector = vectorize(x_dims);
+    if (reduce_all) {
+      if (keepdim)
+        ctx->SetOutputDim(
+            "Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
+      else
+        ctx->SetOutputDim("Out", {1});
+    } else {
+      auto dims_vector = vectorize(x_dims);
+      if (keepdim) {
+        for (size_t i = 0; i < axis.size(); ++i) {
+          dims_vector[axis[i]] = 1;
+        }
+      } else {
+        const int kDelFlag = -1;
+        for (size_t i = 0; i < axis.size(); ++i) {
+          dims_vector[axis[i]] = kDelFlag;
+        }
+        dims_vector.erase(
+            std::remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
+            dims_vector.end());
+      }
+      if (!keepdim && dims_vector.size() == 0) {
+        dims_vector.push_back(1);
+      }
+      auto out_dims = framework::make_ddim(dims_vector);
+      ctx->SetOutputDim("Out", out_dims);
+      if (axis.size() > 0 && axis[0] != 0) {
+        // Only pass LoD when not reducing on the first dim.
+        ctx->ShareLoD("X", /*->*/ "Out");
+      }
+    }
+  }
+};
+
+class LogsumexpOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor) The input tensor. Tensors with rank at most 4 are "
+             "supported.");
+    AddOutput("Out", "(Tensor) The result tensor.");
+    AddAttr<std::vector<int>>(
+        "axis",
+        "(list<int>, default {0}) The dimensions to reduce. "
+        "Must be in the range [-rank(input), rank(input)). "
+        "If `axis[i] < 0`, the axis[i] to reduce is `rank + axis[i]`. "
+        "Note that reducing on the first dim will make the LoD info lost.")
+        .SetDefault({0});
+    AddAttr<bool>("keepdim",
+                  "(bool, default false) "
+                  "If true, retain the reduced dimension with length 1.")
+        .SetDefault(false);
+    AddAttr<bool>("reduce_all",
+                  "(bool, default false) "
+                  "If true, output a scalar reduced along all dimensions.")
+        .SetDefault(false);
+    AddComment(string::Sprintf(R"DOC(
+logsumexp Operator.
+
+This operator computes the logsumexp of input tensor along the given axis.
+The result tensor has 1 fewer dimension than the input unless keep_dim is true.
+If reduce_all is true, just reduce along all dimensions and output a scalar.
+
+)DOC"));
+  }
+};
+
+class LogsumexpGrapOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "logsumexp");
+    OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "logsumexp");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   "Out@GRAD", "logsumexp");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
 };
 
 template <typename T>
@@ -32,7 +152,6 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker<T> {
  public:
   using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
 
- protected:
   void Apply(GradOpPtr<T> op) const override {
     op->SetType("logsumexp_grad");
     op->SetInput("X", this->Input("X"));
@@ -46,18 +165,17 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OPERATOR(logsumexp, ops::ReduceOp, ops::LogsumexpOpMaker,
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(logsumexp, ops::LogsumexpOp, ops::LogsumexpOpMaker,
                   ops::LogsumexpGradOpMaker<paddle::framework::OpDesc>,
                   ops::LogsumexpGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(logsumexp_grad, ops::ReduceGradOp);
+REGISTER_OPERATOR(logsumexp_grad, ops::LogsumexpGrapOp);
 
-REGISTER_OP_CPU_KERNEL(logsumexp,
-                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         float, ops::LogsumexpFunctor>,
-                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         double, ops::LogsumexpFunctor>);
 REGISTER_OP_CPU_KERNEL(
-    logsumexp_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                          float, ops::LogsumexpGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
-                          ops::LogsumexpGradFunctor>);
+    logsumexp, ops::LogsumexpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LogsumexpKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    logsumexp_grad,
+    ops::LogsumexpGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LogsumexpGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cu b/paddle/fluid/operators/reduce_ops/logsumexp_op.cu
index c9ad1075c0c..86a31595eba 100644
--- a/paddle/fluid/operators/reduce_ops/logsumexp_op.cu
+++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cu
@@ -14,8 +14,8 @@
 
 #include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
 
-REGISTER_OP_CUDA_KERNEL(logsumexp,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          float, ops::LogsumexpFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          double, ops::LogsumexpFunctor>);
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    logsumexp, ops::LogsumexpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LogsumexpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.h b/paddle/fluid/operators/reduce_ops/logsumexp_op.h
index 1d0e00262a3..a478690976b 100644
--- a/paddle/fluid/operators/reduce_ops/logsumexp_op.h
+++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.h
@@ -14,11 +14,20 @@
 
 #pragma once
 
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/operators/reduce_ops/reduce_op_function.h"
 
 namespace paddle {
 namespace operators {
 
+#define HANDLE_DIM(NDIM, RDIM)                                            \
+  if (ndim == NDIM && rdim == RDIM) {                                     \
+    ReduceFunctor<DeviceContext, OutT, NDIM, RDIM, LogsumexpFunctor>(     \
+        context.template device_context<DeviceContext>(), *input, output, \
+        axis, keepdim);                                                   \
+  }
+
 struct LogsumexpFunctor {
   template <typename DeviceContext, typename X, typename Y, typename Dim>
   void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
@@ -54,5 +63,106 @@ struct LogsumexpGradFunctor {
   }
 };
 
+template <typename DeviceContext, typename OutT>
+class LogsumexpKernel : public framework::OpKernel<OutT> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+    output->mutable_data<OutT>(context.GetPlace());
+
+    auto axis = context.Attr<std::vector<int>>("axis");
+    auto keepdim = context.Attr<bool>("keepdim");
+    auto reduce_all = context.Attr<bool>("reduce_all");
+
+    const auto& input_dim_size = input->dims().size();
+    // The dims has full dim, set the reduce_all is True
+    reduce_all |= (static_cast<const int>(axis.size()) == input_dim_size);
+
+    if (reduce_all) {
+      // Flatten and reduce 1-D tensor
+      auto x = EigenVector<OutT>::Flatten(*input);
+      auto out = EigenScalar<OutT>::From(*output);
+      auto& place =
+          *context.template device_context<DeviceContext>().eigen_device();
+      auto reduce_dim = Eigen::array<int, 1>({{0}});
+      LogsumexpFunctor()(place, &x, &out, reduce_dim);
+    } else {
+      int ndim = input_dim_size;
+      int rdim = axis.size();
+      // comments for accelerating compiling temporarily.
+      // HANDLE_DIM(6, 5);
+      // HANDLE_DIM(6, 4);
+      // HANDLE_DIM(6, 3);
+      // HANDLE_DIM(6, 2);
+      // HANDLE_DIM(6, 1);
+      // HANDLE_DIM(5, 4);
+      // HANDLE_DIM(5, 3);
+      // HANDLE_DIM(5, 2);
+      // HANDLE_DIM(5, 1);
+      HANDLE_DIM(4, 3);
+      HANDLE_DIM(4, 2);
+      HANDLE_DIM(4, 1);
+      HANDLE_DIM(3, 2);
+      HANDLE_DIM(3, 1);
+      HANDLE_DIM(2, 1);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class LogsumexpGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Input<Tensor>("Out");
+    auto* output_grad = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* input_grad = context.Output<Tensor>(framework::GradVarName("X"));
+    input_grad->mutable_data<T>(context.GetPlace());
+
+    auto axis = context.Attr<std::vector<int>>("axis");
+    auto reduce_all = context.Attr<bool>("reduce_all");
+    const auto input_dim_size = context.Input<Tensor>("X")->dims().size();
+    reduce_all |= (static_cast<const int>(axis.size()) == input_dim_size);
+
+    if (reduce_all) {
+      auto x = EigenVector<T>::Flatten(*input);
+      auto y = EigenVector<T>::Flatten(*output);
+      auto dy = EigenVector<T>::Flatten(*output_grad);
+      auto dx = EigenVector<T>::Flatten(*input_grad);
+      auto& place =
+          *context.template device_context<DeviceContext>().eigen_device();
+      auto broadcast_dim =
+          Eigen::array<int, 1>({{static_cast<int>(input->numel())}});
+      LogsumexpGradFunctor()(place, &x, &y, &dx, &dy, broadcast_dim,
+                             broadcast_dim[0]);
+    } else {
+      int rank = input->dims().size();
+      switch (rank) {
+        case 1:
+          ReduceGradFunctor<DeviceContext, T, 1, LogsumexpGradFunctor>(
+              context.template device_context<DeviceContext>(), *input, *output,
+              *output_grad, input_grad, axis);
+          break;
+        case 2:
+          ReduceGradFunctor<DeviceContext, T, 2, LogsumexpGradFunctor>(
+              context.template device_context<DeviceContext>(), *input, *output,
+              *output_grad, input_grad, axis);
+          break;
+        case 3:
+          ReduceGradFunctor<DeviceContext, T, 3, LogsumexpGradFunctor>(
+              context.template device_context<DeviceContext>(), *input, *output,
+              *output_grad, input_grad, axis);
+          break;
+        case 4:
+          ReduceGradFunctor<DeviceContext, T, 4, LogsumexpGradFunctor>(
+              context.template device_context<DeviceContext>(), *input, *output,
+              *output_grad, input_grad, axis);
+          break;
+      }
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.part.cu b/paddle/fluid/operators/reduce_ops/logsumexp_op.part.cu
index d6ad4863092..81124e4f070 100644
--- a/paddle/fluid/operators/reduce_ops/logsumexp_op.part.cu
+++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.part.cu
@@ -15,8 +15,9 @@
 // .part used to speed up nvcc compile
 #include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
 
+namespace ops = paddle::operators;
+
 REGISTER_OP_CUDA_KERNEL(
-    logsumexp_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                          float, ops::LogsumexpGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::LogsumexpGradFunctor>);
+    logsumexp_grad,
+    ops::LogsumexpGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LogsumexpGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/python/paddle/fluid/tests/unittests/test_logsumexp.py b/python/paddle/fluid/tests/unittests/test_logsumexp.py
index c2201a52605..cf9203dffcb 100644
--- a/python/paddle/fluid/tests/unittests/test_logsumexp.py
+++ b/python/paddle/fluid/tests/unittests/test_logsumexp.py
@@ -46,8 +46,8 @@ class TestLogsumexp(OpTest):
         self.inputs = {'X': x}
         self.outputs = {'Out': out}
         self.attrs = {
-            'dim': self.axis,
-            'keep_dim': self.keepdim,
+            'axis': self.axis,
+            'keepdim': self.keepdim,
             'reduce_all': self.reduce_all
         }
 
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index ed2bbe03a36..079178e1cf7 100755
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -1194,15 +1194,14 @@ def logsumexp(x, axis=None, keepdim=False, name=None):
         axis = [0]
 
     if in_dygraph_mode():
-        return core.ops.logsumexp(x, 'dim', axis, 'keep_dim', keepdim,
-                                    'reduce_all', reduce_all)
+        return core.ops.logsumexp(x, 'axis', axis, 'keepdim', keepdim, 'reduce_all', reduce_all)
 
     check_variable_and_dtype(x, 'x',
                              ['float32', 'float64'],
                              'logsumexp')
 
     helper = LayerHelper('logsumexp', **locals())
-    attrs = {'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all}
+    attrs = {'axis': axis, 'keepdim': keepdim, 'reduce_all':reduce_all}
     out = helper.create_variable_for_type_inference(x.dtype)
     helper.append_op(
         type='logsumexp', inputs={'X': x}, outputs={'Out': out}, attrs=attrs)
-- 
GitLab


From 40dd563dd77bf9643e192df611cfb5998a1fe405 Mon Sep 17 00:00:00 2001
From: Huihuang Zheng <zhhsplendid@gmail.com>
Date: Thu, 10 Sep 2020 10:31:07 +0800
Subject: [PATCH 027/261] Decrease test_parallel_executor_crf CI time,
 test=develop (#27212)

Decrease the number of running iterations to reduce CI time.

CI system shows it decreased the unittest time from about 90 seconds to about 30 seconds
---
 .../paddle/fluid/tests/unittests/test_parallel_executor_crf.py  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
index 6671a2def3c..ea59a7f584a 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
@@ -176,7 +176,7 @@ class TestCRFModel(unittest.TestCase):
                     place=fluid.CPUPlace())
 
             data = train_data()
-            for i in range(10):
+            for i in range(4):
                 cur_batch = next(data)
                 print(exe.run(train_cp,
                               feed=feeder.feed(cur_batch),
-- 
GitLab


From d3874ab44ac9546f81325ee8cbea82387246fec7 Mon Sep 17 00:00:00 2001
From: liym27 <33742067+liym27@users.noreply.github.com>
Date: Thu, 10 Sep 2020 10:37:14 +0800
Subject: [PATCH 028/261] Move unittest test_optimizer_in_control_flow from CI
 multi_cards. (#27185)

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt          | 2 +-
 .../fluid/tests/unittests/test_optimizer_in_control_flow.py | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 422cc0eddd0..8c9dbba2d02 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -562,7 +562,7 @@ set_tests_properties(test_parallel_executor_test_while_train test_parallel_execu
         test_parallel_executor_feed_persistable_var
         test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass
         test_data_norm_op test_imperative_using_non_zero_gpu test_fuse_bn_act_pass
-        test_optimizer_in_control_flow test_dataloader_keep_order
+        test_dataloader_keep_order
         test_dataloader_unkeep_order
         test_parallel_executor_fetch_isolated_var
         test_parallel_executor_inference_feed_partial_data
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py
index 4b2914c223a..c1992d0d539 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py
@@ -261,7 +261,13 @@ class TestMultiOptimizersMultiCardsError(unittest.TestCase):
             exe.run(startup_program)
 
             np.random.seed(SEED)
+
+            # NOTE(liym27):
+            # This test needs to run in multi cards to test NotImplementedError.
+            # Here, move this test from RUN_TYPE=DIST in tests/unittests/CMakeList.txt,
+            # to use multi cards ** only on CPU ** not GPU to reduce CI time.
             os.environ['CPU_NUM'] = str(2)
+
             pe_exe = fluid.ParallelExecutor(
                 use_cuda=use_cuda,
                 main_program=main_program,
-- 
GitLab


From fde5cfe88174108486115554f4a5b2ca40fa1125 Mon Sep 17 00:00:00 2001
From: wawltor <fangzeyang0904@hotmail.com>
Date: Thu, 10 Sep 2020 10:42:27 +0800
Subject: [PATCH 029/261] fix the CudaPinMemory bug for the equal op (#27176)

 fix the CudaPinMemory bug for the equal op and add the test case for the equal op
---
 .../fluid/operators/controlflow/compare_op.cc | 12 +++++--
 .../fluid/tests/unittests/test_compare_op.py  | 32 +++++++++++++++++++
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc
index 60f29ba39a8..4940649c2a3 100644
--- a/paddle/fluid/operators/controlflow/compare_op.cc
+++ b/paddle/fluid/operators/controlflow/compare_op.cc
@@ -111,8 +111,16 @@ class CompareOp : public framework::OperatorWithKernel {
     framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
     // CompareOp kernel's device type is decided by input tensor place
     bool force_cpu = ctx.Attr<bool>("force_cpu");
-    kt.place_ = force_cpu ? platform::CPUPlace()
-                          : ctx.Input<framework::LoDTensor>("X")->place();
+    if (force_cpu) {
+      kt.place_ = platform::CPUPlace();
+    } else {
+      if (ctx.Input<framework::LoDTensor>("X")->place().type() !=
+          typeid(platform::CUDAPinnedPlace)) {
+        kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
+      } else {
+        kt.place_ = ctx.GetPlace();
+      }
+    }
     return kt;
   }
 };
diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py
index cfad5040980..25ae65aa7c9 100644
--- a/python/paddle/fluid/tests/unittests/test_compare_op.py
+++ b/python/paddle/fluid/tests/unittests/test_compare_op.py
@@ -38,6 +38,7 @@ def create_test_class(op_type, typename, callback):
             self.check_output()
 
         def test_errors(self):
+            paddle.enable_static()
             with program_guard(Program(), Program()):
                 x = fluid.layers.data(name='x', shape=[2], dtype='int32')
                 y = fluid.layers.data(name='y', shape=[2], dtype='int32')
@@ -80,6 +81,7 @@ def create_paddle_case(op_type, callback):
                 self.place = paddle.CUDAPlace(0)
 
         def test_api(self):
+            paddle.enable_static()
             with program_guard(Program(), Program()):
                 x = fluid.data(name='x', shape=[4], dtype='int64')
                 y = fluid.data(name='y', shape=[4], dtype='int64')
@@ -92,6 +94,7 @@ def create_paddle_case(op_type, callback):
             self.assertEqual((res == self.real_result).all(), True)
 
         def test_broadcast_api_1(self):
+            paddle.enable_static()
             with program_guard(Program(), Program()):
                 x = paddle.static.data(
                     name='x', shape=[1, 2, 1, 3], dtype='int32')
@@ -108,6 +111,7 @@ def create_paddle_case(op_type, callback):
             self.assertEqual((res == real_result).all(), True)
 
         def test_attr_name(self):
+            paddle.enable_static()
             with program_guard(Program(), Program()):
                 x = fluid.layers.data(name='x', shape=[4], dtype='int32')
                 y = fluid.layers.data(name='y', shape=[4], dtype='int32')
@@ -130,6 +134,7 @@ create_paddle_case('not_equal', lambda _a, _b: _a != _b)
 
 class TestCompareOpError(unittest.TestCase):
     def test_errors(self):
+        paddle.enable_static()
         with program_guard(Program(), Program()):
             # The input x and y of compare_op must be Variable.
             x = fluid.layers.data(name='x', shape=[1], dtype="float32")
@@ -140,6 +145,7 @@ class TestCompareOpError(unittest.TestCase):
 
 class API_TestElementwise_Equal(unittest.TestCase):
     def test_api(self):
+        paddle.enable_static()
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             label = fluid.layers.assign(np.array([3, 3], dtype="int32"))
             limit = fluid.layers.assign(np.array([3, 2], dtype="int32"))
@@ -159,5 +165,31 @@ class API_TestElementwise_Equal(unittest.TestCase):
         self.assertEqual((res == np.array([True, True])).all(), True)
 
 
+class TestCompareOpPlace(unittest.TestCase):
+    def test_place_1(self):
+        paddle.enable_static()
+        place = paddle.CPUPlace()
+        if core.is_compiled_with_cuda():
+            place = paddle.CUDAPlace(0)
+        label = fluid.layers.assign(np.array([3, 3], dtype="int32"))
+        limit = fluid.layers.assign(np.array([3, 2], dtype="int32"))
+        out = fluid.layers.less_than(label, limit, force_cpu=True)
+        exe = fluid.Executor(place)
+        res, = exe.run(fetch_list=[out])
+        self.assertEqual((res == np.array([False, False])).all(), True)
+
+    def test_place_2(self):
+        place = paddle.CPUPlace()
+        data_place = place
+        if core.is_compiled_with_cuda():
+            place = paddle.CUDAPlace(0)
+            data_place = paddle.CUDAPinnedPlace()
+        paddle.disable_static(place)
+        data = np.array([9], dtype="int64")
+        data_tensor = paddle.to_tensor(data, place=data_place)
+        result = data_tensor == 0
+        self.assertEqual((result.numpy() == np.array([False])).all(), True)
+
+
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab


From 60c3ef3ab8143979905af9e0c30600c0a67743ed Mon Sep 17 00:00:00 2001
From: 123malin <malin10@baidu.com>
Date: Thu, 10 Sep 2020 10:58:58 +0800
Subject: [PATCH 030/261] =?UTF-8?q?=E3=80=90paddle.fleet=E3=80=91parameter?=
 =?UTF-8?q?=5Fserver=5Foptimizer=20support=20auto=5Fstrategy=20(#27181)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* parameter_server_optimizer support auto_strategy
---
 .../distributed/fleet/base/fleet_base.py      |  8 +-
 .../fleet/meta_optimizers/amp_optimizer.py    |  5 +-
 .../fleet/meta_optimizers/dgc_optimizer.py    |  7 +-
 .../gradient_merge_optimizer.py               |  7 +-
 .../graph_execution_optimizer.py              | 30 +++----
 .../fleet/meta_optimizers/lamb_optimizer.py   |  7 +-
 .../fleet/meta_optimizers/lars_optimizer.py   |  7 +-
 .../meta_optimizers/localsgd_optimizer.py     | 11 ++-
 .../meta_optimizers/meta_optimizer_base.py    |  2 +-
 .../parameter_server_graph_optimizer.py       | 10 ++-
 .../parameter_server_optimizer.py             | 78 +++++++++---------
 .../meta_optimizers/pipeline_optimizer.py     |  7 +-
 .../meta_optimizers/recompute_optimizer.py    |  5 +-
 .../fluid/tests/unittests/CMakeLists.txt      |  2 -
 .../test_dist_fleet_a_sync_optimizer_auto.py  | 76 ------------------
 ..._dist_fleet_a_sync_optimizer_auto_async.py | 79 +++++++++++++++++++
 ...st_dist_fleet_a_sync_optimizer_auto_geo.py | 67 ++++++++++++++++
 17 files changed, 251 insertions(+), 157 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py

diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
index b9189492694..0dfcd5f3255 100644
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -231,7 +231,7 @@ class Fleet(object):
 
         Returns:
             int: worker numbers
-        
+
         Examples:
             .. code-block:: python
 
@@ -737,7 +737,7 @@ class Fleet(object):
         """
         Set the value of the learning rate manually in the optimizer. 
         Only work in dygraph mode
- 
+
         Args:
             value (float|Tensor): the value of learning rate
 
@@ -877,7 +877,7 @@ class Fleet(object):
         """
         Execute the optimizer once.
         Only work in dygraph mode
- 
+
         Returns: None
 
         Examples:
@@ -1019,7 +1019,7 @@ class Fleet(object):
         if self.user_defined_strategy._is_strict_auto():
             # turn on all the strategy for each optimizer
             for opt in distributed_optimizer_list:
-                opt._enable_strategy(self.user_defined_strategy)
+                opt._enable_strategy(self.user_defined_strategy, context)
 
         valid_optimizer_list = []
         valid_graph_optimizer_list = []
diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
index 938bd258847..31a9913701c 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
@@ -34,6 +34,9 @@ class AMPOptimizer(MetaOptimizerBase):
             loss, role_maker, user_defined_optimizer, user_defined_strategy)
 
     def _can_apply(self):
+        if not self.role_maker._is_collective:
+            return False
+
         if self.user_defined_strategy.amp:
             return True
         return False
@@ -42,7 +45,7 @@ class AMPOptimizer(MetaOptimizerBase):
         dist_strategy.amp = False
         dist_strategy.amp_configs = {}
 
-    def _enable_strategy(self, dist_strategy):
+    def _enable_strategy(self, dist_strategy, context):
         dist_strategy.amp = True
         dist_strategy.amp_configs = {
             "init_loss_scaling": 32768.0,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
index d292f58456c..3f6ed1ed2f2 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
@@ -53,6 +53,9 @@ class DGCOptimizer(MetaOptimizerBase):
             name=opt._name)
 
     def _can_apply(self):
+        if not self.role_maker._is_collective:
+            return False
+
         if self.user_defined_strategy.dgc:
             if not isinstance(self.inner_opt, Momentum):
                 logging.warn("dgc only works on Momentum optimizer")
@@ -69,7 +72,7 @@ class DGCOptimizer(MetaOptimizerBase):
         dist_strategy.dgc = False
         dist_strategy.dgc_configs = {}
 
-    def _enable_strategy(self, dist_strategy):
+    def _enable_strategy(self, dist_strategy, context):
         dist_strategy.dgc = True
         dist_strategy.dgc_configs = {"rampup_begin_step": 0, "rampup_step": 1}
 
@@ -89,5 +92,5 @@ class DGCOptimizer(MetaOptimizerBase):
                       no_grad_set=None):
         optimize_ops, params_grads = \
             self.dgc_opt.minimize(loss, startup_program,
-                                      parameter_list, no_grad_set)
+                                  parameter_list, no_grad_set)
         return optimize_ops, params_grads
diff --git a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
index bb0c631e081..f1b36809765 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
@@ -37,15 +37,18 @@ class GradientMergeOptimizer(MetaOptimizerBase):
             self.user_defined_strategy.gradient_merge_configs["avg"])
 
     def _can_apply(self):
+        if not self.role_maker._is_collective:
+            return False
+
         can_apply = (self.user_defined_strategy.gradient_merge == True) and \
-                  self.user_defined_strategy.gradient_merge_configs["k_steps"] > 1
+            self.user_defined_strategy.gradient_merge_configs["k_steps"] > 1
         return can_apply
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.gradient_merge = False
         dist_strategy.gradient_merge_configs = {}
 
-    def _enable_strategy(self, dist_strategy):
+    def _enable_strategy(self, dist_strategy, context):
         # we currently do not support auto-enable gradient merge
         return
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
index 03304f1b68b..6c1cc3d7a97 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
@@ -48,7 +48,7 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
                  callbacks=None):
         pass
 
-    # should fix the variable 
+    # should fix the variable
     def _setup_nccl_op(self, startup_program, main_program, build_strategy):
         trainer_endpoints = self.role_maker.get_trainer_endpoints()
         trainers = trainer_endpoints
@@ -94,31 +94,31 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
         dist_strategy = self.user_defined_strategy
         local_build_strategy = paddle.fluid.BuildStrategy()
         local_build_strategy.enable_sequential_execution = \
-                    dist_strategy.build_strategy.enable_sequential_execution
+            dist_strategy.build_strategy.enable_sequential_execution
         local_build_strategy.fuse_elewise_add_act_ops = \
-                    dist_strategy.build_strategy.fuse_elewise_add_act_ops
+            dist_strategy.build_strategy.fuse_elewise_add_act_ops
         local_build_strategy.fuse_bn_act_ops = \
-                    dist_strategy.build_strategy.fuse_bn_act_ops
+            dist_strategy.build_strategy.fuse_bn_act_ops
         local_build_strategy.enable_auto_fusion = \
-                    dist_strategy.build_strategy.enable_auto_fusion
+            dist_strategy.build_strategy.enable_auto_fusion
         local_build_strategy.fuse_relu_depthwise_conv = \
-                    dist_strategy.build_strategy.fuse_relu_depthwise_conv
+            dist_strategy.build_strategy.fuse_relu_depthwise_conv
         local_build_strategy.fuse_broadcast_ops = \
-                    dist_strategy.build_strategy.fuse_broadcast_ops
+            dist_strategy.build_strategy.fuse_broadcast_ops
         local_build_strategy.fuse_all_optimizer_ops = \
-                    dist_strategy.build_strategy.fuse_all_optimizer_ops
+            dist_strategy.build_strategy.fuse_all_optimizer_ops
         local_build_strategy.enable_inplace = \
-                    dist_strategy.build_strategy.enable_inplace
+            dist_strategy.build_strategy.enable_inplace
         local_build_strategy.use_hierarchical_allreduce = \
-                    dist_strategy.use_hierarchical_allreduce
+            dist_strategy.use_hierarchical_allreduce
         local_build_strategy.hierarchical_allreduce_inter_nranks = \
-                    dist_strategy.hierarchical_allreduce_inter_nranks
+            dist_strategy.hierarchical_allreduce_inter_nranks
         local_build_strategy.sync_batch_norm = \
-                    dist_strategy.sync_batch_norm
+            dist_strategy.sync_batch_norm
         local_build_strategy.fuse_all_reduce_ops = \
-                    dist_strategy.fuse_all_reduce_ops
+            dist_strategy.fuse_all_reduce_ops
         local_build_strategy.nccl_comm_num = \
-                    dist_strategy.nccl_comm_num
+            dist_strategy.nccl_comm_num
 
         if self.user_defined_strategy.recompute == True:
             logging.warn(
@@ -190,7 +190,7 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
         # TODO(guru4elephant): should close all PE related flags here
         return
 
-    def _enable_strategy(self, dist_strategy):
+    def _enable_strategy(self, dist_strategy, context):
         # by default, graph execution strategy is enabled
         return
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
index bfa186a1e7c..df9887759e1 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
@@ -62,6 +62,9 @@ class LambOptimizer(MetaOptimizerBase):
             name=opt._name)
 
     def _can_apply(self):
+        if not self.role_maker._is_collective:
+            return False
+
         if self.user_defined_strategy.lamb:
             if not isinstance(self.inner_opt, AdamOptimizer):
                 logging.warn(
@@ -75,7 +78,7 @@ class LambOptimizer(MetaOptimizerBase):
         dist_strategy.lamb = False
         dist_strategy.lamb_configs = {}
 
-    def _enable_strategy(self, dist_strategy):
+    def _enable_strategy(self, dist_strategy, context):
         dist_strategy.lamb = True
         dist_strategy.lamb_configs = {
             "lamb_weight_decay": 0.01,
@@ -102,5 +105,5 @@ class LambOptimizer(MetaOptimizerBase):
                       no_grad_set=None):
         optimize_ops, params_grads = \
             self.lamb_opt.minimize(loss, startup_program,
-                                      parameter_list, no_grad_set)
+                                   parameter_list, no_grad_set)
         return optimize_ops, params_grads
diff --git a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
index ec7a7eb18bc..609d8b85e71 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
@@ -49,6 +49,9 @@ class LarsOptimizer(MetaOptimizerBase):
             epsilon=configs['epsilon'])
 
     def _can_apply(self):
+        if not self.role_maker._is_collective:
+            return False
+
         if self.user_defined_strategy.lars:
             if not isinstance(self.inner_opt, Momentum):
                 logging.warn(
@@ -62,7 +65,7 @@ class LarsOptimizer(MetaOptimizerBase):
         dist_strategy.lars = False
         dist_strategy.lars_configs = {}
 
-    def _enable_strategy(self, dist_strategy):
+    def _enable_strategy(self, dist_strategy, context):
         dist_strategy.lars = True
         dist_strategy.lars_configs = {
             "lars_coeff": 0.01,
@@ -89,5 +92,5 @@ class LarsOptimizer(MetaOptimizerBase):
                       no_grad_set=None):
         optimize_ops, params_grads = \
             self.lars_opt.minimize(loss, startup_program,
-                                      parameter_list, no_grad_set)
+                                   parameter_list, no_grad_set)
         return optimize_ops, params_grads
diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
index 3c1318301bb..4d33dfe7456 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
@@ -29,6 +29,9 @@ class LocalSGDOptimizer(MetaOptimizerBase):
         self.snapshot_key = '@SNAPSHOT'
 
     def _can_apply(self):
+        if not self.role_maker._is_collective:
+            return False
+
         if not self.user_defined_strategy.localsgd:
             return False
 
@@ -36,15 +39,15 @@ class LocalSGDOptimizer(MetaOptimizerBase):
             return False
 
         return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \
-                or isinstance(self.inner_opt, paddle.fluid.optimizer.Momentum) \
-                or isinstance(self.inner_opt, paddle.optimizer.sgd.SGD) \
-                or isinstance(self.inner_opt, paddle.fluid.optimizer.SGD)
+            or isinstance(self.inner_opt, paddle.fluid.optimizer.Momentum) \
+            or isinstance(self.inner_opt, paddle.optimizer.sgd.SGD) \
+            or isinstance(self.inner_opt, paddle.fluid.optimizer.SGD)
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.localsgd = False
         dist_strategy.localsgd_configs = {}
 
-    def _enable_strategy(self, dist_strategy):
+    def _enable_strategy(self, dist_strategy, context):
         dist_strategy.localsgd = True
         dist_strategy.localsgd_configs = {"k_steps": 1}
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
index b105c25b3ad..a12ca50442b 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
@@ -48,7 +48,7 @@ class MetaOptimizerBase(Optimizer):
         raise NotImplementedError("you should implement disable strategy in {}".
                                   format(type(self).__name__))
 
-    def _enable_strategy(self, dist_strategy):
+    def _enable_strategy(self, dist_strategy, context=None):
         raise NotImplementedError("you should implement enable strategy in {}".
                                   format(type(self).__name__))
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py
index c9260dd2f8c..7dc532c86ea 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py
@@ -24,6 +24,9 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer):
         self.meta_optimizers_white_list = []
 
     def _can_apply(self):
+        if self.role_maker._is_collective:
+            return False
+
         k_steps = self.user_defined_strategy.a_sync_configs["k_steps"]
         if k_steps < 0:
             return False
@@ -37,12 +40,11 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer):
         return True
 
     def _disable_strategy(self, dist_strategy):
-        dist_strategy.a_sync_configs = {}
+        return
 
-    def _enable_strategy(self, dist_strategy):
+    def _enable_strategy(self, dist_strategy, context):
         # only open up the async mode for auto-parallel
-        dist_strategy.a_sync = True
-        dist_strategy.a_sync_configs = {}
+        return
 
     def _is_graph_out(self):
         return True
diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
index 7dca7b9cb88..51d4d343165 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
@@ -32,8 +32,6 @@ class ParameterServerOptimizer(MetaOptimizerBase):
     def _can_apply(self):
         if self.role_maker._is_collective:
             return False
-        if self.user_defined_strategy.auto == True:
-            return True
 
         k_steps = self.user_defined_strategy.a_sync_configs["k_steps"]
         return True if k_steps >= 0 else False
@@ -134,7 +132,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
 
         return _main, _startup
 
-    def _try_auto_apply_geo(self, program, compiled_config):
+    def _can_apply_geo(self, dist_strategy, program):
         def get_sys_free_mem():
             plat = platform.system()
             if platform.system() == "Darwin":
@@ -163,36 +161,28 @@ class ParameterServerOptimizer(MetaOptimizerBase):
                     "%s platform is unsupported is parameter server optimizer" %
                     (platform.system()))
 
-        if self.user_defined_strategy.auto == False:
-            return
-
-        a_sync_configs = self.user_defined_strategy.a_sync_configs
-        if a_sync_configs["k_steps"] >= 0:
-            return
-
-        self.user_defined_strategy.a_sync = True
         if not isinstance(self.inner_opt, fluid.optimizer.SGDOptimizer):
-            # auto async
-            a_sync_configs["k_steps"] = 0
-            self.user_defined_strategy.a_sync_configs = a_sync_configs
-            return
+            return False
 
-        from paddle.fluid.incubate.fleet.parameter_server.ir.vars_metatools import dtype_to_size
         free = get_sys_free_mem()
 
-        param_grad_pairs = compiled_config.origin_sparse_pairs + compiled_config.origin_dense_pairs
-        processed_var_names = set(["@EMPTY@"])
+        from paddle.fluid.incubate.fleet.parameter_server.ir import vars_metatools
 
+        processed_var_names = set(["@EMPTY@"])
         param_memory_size = 0
-        for param_grad_pair in param_grad_pairs:
-            param, grad = param_grad_pair
+        for varname in program.global_block().vars:
+            var = program.global_block().vars[varname]
+            if not var.persistable or var.desc.type(
+            ) != core.VarDesc.VarType.LOD_TENSOR:
+                continue
+            param = vars_metatools.create_var_struct(var)
             param_memory_size += param.m_size
-            processed_var_names.add(param.name)
+            processed_var_names.add(varname)
 
         upper_mem_use = param_memory_size * 5.0
 
         program_tmp_vars = dict()
-        batch_size = 1024
+        eval_batch_size = 1024
         for op in program.global_block().ops:
             for var_name in op.output_arg_names:
                 if var_name in processed_var_names:
@@ -215,23 +205,21 @@ class ParameterServerOptimizer(MetaOptimizerBase):
                         data_count *= (-x)
                     else:
                         data_count *= x
-                program_tmp_vars[var_name] = (data_count, neg_dim_count,
-                                              dtype_to_size[var.dtype])
+                program_tmp_vars[var_name] = (
+                    data_count, neg_dim_count,
+                    vars_metatools.dtype_to_size[var.dtype])
 
         for varname in program_tmp_vars:
             data_count, neg_dim_count, type_size = program_tmp_vars[varname]
             if neg_dim_count == 1:
-                data_count *= batch_size
+                data_count *= eval_batch_size
             var_memory = data_count * type_size
             upper_mem_use += var_memory
 
         if upper_mem_use < free:
-            # auto geo
-            a_sync_configs["k_steps"] = 800
+            return True
         else:
-            # auto async
-            a_sync_configs["k_steps"] = 0
-        self.user_defined_strategy.a_sync_configs = a_sync_configs
+            return False
 
     def minimize_impl(self,
                       loss,
@@ -240,6 +228,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
                       no_grad_set=None):
         self.inner_opt.minimize(loss, startup_program, parameter_list,
                                 no_grad_set)
+        strategy = self._get_distributed_strategy()
 
         _origin_main_program = loss.block.program
         _origin_startup_program = startup_program
@@ -247,11 +236,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
 
         compiled_config = public.CompileTimeStrategy(_origin_main_program,
                                                      _origin_startup_program,
-                                                     None, self.role_maker)
-
-        self._try_auto_apply_geo(_origin_main_program, compiled_config)
-
-        strategy = self._get_distributed_strategy()
+                                                     strategy, self.role_maker)
         compiled_config.strategy = strategy
 
         if self.role_maker.is_worker() or self.role_maker._is_heter_worker():
@@ -267,9 +252,24 @@ class ParameterServerOptimizer(MetaOptimizerBase):
         return None, None
 
     def _disable_strategy(self, dist_strategy):
-        dist_strategy.a_sync_configs = {}
-        self.user_defined_strategy.a_sync_configs = {}
+        dist_strategy.a_sync = False
+        a_sync_configs = dist_strategy.a_sync_configs
+        a_sync_configs["k_steps"] = -1
+        dist_strategy.a_sync_configs = a_sync_configs
+
+    def _enable_strategy(self, dist_strategy, context):
+        a_sync_configs = dist_strategy.a_sync_configs
+        if a_sync_configs["k_steps"] >= 0:
+            return
 
-    def _enable_strategy(self, dist_strategy):
         dist_strategy.a_sync = True
-        dist_strategy.a_sync_configs = {}
+        a_sync_configs = dist_strategy.a_sync_configs
+
+        is_geo = self._can_apply_geo(dist_strategy,
+                                     context["origin_main_program"])
+
+        if is_geo:
+            a_sync_configs["k_steps"] = 800
+        else:
+            a_sync_configs["k_steps"] = 0
+        dist_strategy.a_sync_configs = a_sync_configs
diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
index 32c54d44867..87fa7077911 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
@@ -103,6 +103,9 @@ class PipelineOptimizer(MetaOptimizerBase):
         self.wrapped_opt = PO(self.inner_opt, num_microbatches=num_microbatches)
 
     def _can_apply(self):
+        if not self.role_maker._is_collective:
+            return False
+
         if self.user_defined_strategy.pipeline == True:
             return True
         return False
@@ -111,7 +114,7 @@ class PipelineOptimizer(MetaOptimizerBase):
         dist_strategy.pipeline = False
         dist_strategy.pipeline_configs = {}
 
-    def _enable_strategy(self, dist_strategy):
+    def _enable_strategy(self, dist_strategy, context):
         # we do not support enable pipeline automatically right now
         return
 
@@ -180,7 +183,7 @@ class PipelineOptimizer(MetaOptimizerBase):
         grad = None
         for idx, op in reversed(list(enumerate(block.ops))):
             if is_backward_op(op) and \
-                OP_ROLE_VAR_KEY in op.attr_names:
+                    OP_ROLE_VAR_KEY in op.attr_names:
                 op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY]
                 if len(op_role_var) == 0:
                     continue
diff --git a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
index 267656824c9..8f959548692 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
@@ -38,6 +38,9 @@ class RecomputeOptimizer(MetaOptimizerBase):
             list(user_defined_strategy.recompute_configs["checkpoints"]))
 
     def _can_apply(self):
+        if self.role_maker._is_collective:
+            return False
+
         if self.user_defined_strategy.recompute == True:
             if len(self.user_defined_strategy.recompute_configs[
                     "checkpoints"]) == 0:
@@ -49,7 +52,7 @@ class RecomputeOptimizer(MetaOptimizerBase):
         dist_strategy.recompute = False
         dist_strategy.recompute_configs = {}
 
-    def _enable_strategy(self, dist_strategy):
+    def _enable_strategy(self, dist_strategy, context):
         # we do not support automatically recompute checkpoints currently
         return
 
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 8c9dbba2d02..b496b7953a9 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -441,8 +441,6 @@ if(WITH_DISTRIBUTE)
     # FIXME(seiriosX) will fix this
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_sparse_embedding_ctr")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_gloo")
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_a_sync_optimizer_auto")
-    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ctr")
 
     py_test_modules(test_recv_save_op MODULES test_recv_save_op ENVS ${dist_ENVS})
     py_test_modules(test_transpiler_ops MODULES test_transpiler_ops ENVS ${dist_ENVS})
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py
index ab47659a88d..5a5d8afc55b 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py
@@ -62,82 +62,6 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
         a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
         self.assertTrue(a_sync_configs['k_steps'] == 0)
 
-    def test_a_sync_optimizer2(self):
-        os.environ["TRAINING_ROLE"] = "TRAINER"
-        import paddle.distributed.fleet as fleet
-
-        main_program = paddle.fluid.Program()
-        startup_program = paddle.fluid.Program()
-
-        paddle.fluid.framework.switch_main_program(main_program)
-        paddle.fluid.framework.switch_startup_program(startup_program)
-
-        fleet.init(role_maker.PaddleCloudRoleMaker())
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
-
-        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
-
-        strategy = paddle.distributed.fleet.DistributedStrategy()
-        strategy.auto = True
-        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
-        optimizer.minimize(avg_cost)
-
-        self.assertTrue(optimizer.user_defined_strategy.a_sync)
-        a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
-        self.assertTrue(a_sync_configs['k_steps'] == 800)
-
-    def test_a_sync_optimizer3(self):
-        os.environ["TRAINING_ROLE"] = "TRAINER"
-        import paddle.distributed.fleet as fleet
-
-        main_program = paddle.fluid.Program()
-        startup_program = paddle.fluid.Program()
-
-        paddle.fluid.framework.switch_main_program(main_program)
-        paddle.fluid.framework.switch_startup_program(startup_program)
-
-        fleet.init(role_maker.PaddleCloudRoleMaker())
-        input_x = paddle.fluid.layers.data(
-            name="x",
-            shape=[-1, 1],
-            dtype="int64",
-            lod_level=1,
-            append_batch_size=False)
-        x_embedding = paddle.fluid.layers.embedding(
-            is_distributed=False,
-            input=input_x,
-            size=[1000000000, 100000],
-            param_attr=paddle.fluid.ParamAttr(
-                name="embedding",
-                initializer=paddle.fluid.initializer.Constant(value=0.01)),
-            is_sparse=True)
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
-
-        fc_1 = paddle.fluid.layers.fc(input=x_embedding, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
-
-        strategy = paddle.distributed.fleet.DistributedStrategy()
-        strategy.auto = True
-        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
-        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
-        optimizer.minimize(avg_cost)
-
-        self.assertTrue(optimizer.user_defined_strategy.a_sync)
-        a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
-        self.assertTrue(a_sync_configs['k_steps'] == 0)
-
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py
new file mode 100644
index 00000000000..9085556c04c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import os
+import paddle.distributed.fleet.base.role_maker as role_maker
+import time
+
+
+class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_PSERVER_NUMS"] = "2"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+            "127.0.0.1:36001,127.0.0.2:36001"
+
+    def test_a_sync_optimizer3(self):
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        import paddle.distributed.fleet as fleet
+
+        main_program = paddle.fluid.Program()
+        startup_program = paddle.fluid.Program()
+
+        paddle.fluid.framework.switch_main_program(main_program)
+        paddle.fluid.framework.switch_startup_program(startup_program)
+
+        fleet.init(role_maker.PaddleCloudRoleMaker())
+        input_x = paddle.fluid.layers.data(
+            name="x",
+            shape=[-1, 1],
+            dtype="int64",
+            lod_level=1,
+            append_batch_size=False)
+        x_embedding = paddle.fluid.layers.embedding(
+            is_distributed=False,
+            input=input_x,
+            size=[1000000000, 100000],
+            param_attr=paddle.fluid.ParamAttr(
+                name="embedding",
+                initializer=paddle.fluid.initializer.Constant(value=0.01)),
+            is_sparse=True)
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=x_embedding, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.auto = True
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        self.assertTrue(optimizer.user_defined_strategy.a_sync)
+        a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
+        self.assertTrue(a_sync_configs['k_steps'] == 0)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py
new file mode 100644
index 00000000000..4787d048bd2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import os
+import paddle.distributed.fleet.base.role_maker as role_maker
+import time
+
+
+class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_PSERVER_NUMS"] = "2"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+            "127.0.0.1:36001,127.0.0.2:36001"
+
+    def test_a_sync_optimizer2(self):
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        import paddle.distributed.fleet as fleet
+
+        main_program = paddle.fluid.Program()
+        startup_program = paddle.fluid.Program()
+
+        paddle.fluid.framework.switch_main_program(main_program)
+        paddle.fluid.framework.switch_startup_program(startup_program)
+
+        fleet.init(role_maker.PaddleCloudRoleMaker())
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.auto = True
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        self.assertTrue(optimizer.user_defined_strategy.a_sync)
+        a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
+        self.assertTrue(a_sync_configs['k_steps'] == 800)
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab


From 5bd84b22c4f63948da884c480710987ac81464ff Mon Sep 17 00:00:00 2001
From: ShenLiang <shenliang03@baidu.com>
Date: Thu, 10 Sep 2020 11:34:35 +0800
Subject: [PATCH 031/261] revert divide (#27202)

---
 .../elementwise/elementwise_floordiv_op.cc    |   2 -
 .../elementwise/elementwise_floordiv_op.cu    |   2 -
 .../elementwise/elementwise_floordiv_op.h     |  12 +-
 python/paddle/fluid/dygraph/math_op_patch.py  |  39 +---
 python/paddle/fluid/layers/math_op_patch.py   |  41 ++---
 .../test_dist_transpiler_async_decay.py       |   4 +-
 .../unittests/test_elementwise_div_op.py      | 131 ++-----------
 .../unittests/test_elementwise_floordiv_op.py | 140 ++------------
 .../unittests/test_elementwise_mod_op.py      | 174 ++++--------------
 .../tests/unittests/test_math_op_patch.py     |   8 +-
 .../tests/unittests/test_rnn_decode_api.py    |   3 +-
 python/paddle/nn/functional/loss.py           |   3 +-
 python/paddle/tensor/math.py                  | 141 --------------
 13 files changed, 98 insertions(+), 602 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
index 457d9e79d7d..5a398fa50fe 100644
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
@@ -49,8 +49,6 @@ REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp,
 
 REGISTER_OP_CPU_KERNEL(
     elementwise_floordiv,
-    ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, double>,
     ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, int>,
     ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext,
                                    int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu
index f63d6f03763..60846d1e8fe 100644
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu
@@ -19,7 +19,5 @@ namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(
     elementwise_floordiv,
-    ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, double>,
     ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int>,
     ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
index 8afe2133c04..5dc93740949 100644
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include <math.h>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
@@ -62,15 +61,8 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx,
                            const framework::Tensor *x,
                            const framework::Tensor *y, framework::Tensor *z) {
   int axis = ctx.Attr<int>("axis");
-  auto x_dims = x->dims();
-  auto y_dims = y->dims();
-  if (x_dims.size() >= y_dims.size()) {
-    ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
-        ctx, x, y, axis, FloorDivFunctor<T>(), z);
-  } else {
-    ElementwiseComputeEx<InverseFloorDivFunctor<T>, DeviceContext, T>(
-        ctx, x, y, axis, InverseFloorDivFunctor<T>(), z);
-  }
+  ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
+      ctx, x, y, axis, FloorDivFunctor<T>(), z);
 }
 
 template <typename DeviceContext, typename T>
diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py
index 8c410967420..f9fe4198fec 100644
--- a/python/paddle/fluid/dygraph/math_op_patch.py
+++ b/python/paddle/fluid/dygraph/math_op_patch.py
@@ -19,7 +19,6 @@ from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator
 from ..layers.layer_function_generator import OpProtoHolder
 from ..layers import common_methods
 from . import to_variable, no_grad
-import paddle
 
 import numpy as np
 import six
@@ -163,26 +162,6 @@ def monkey_patch_math_varbase():
     def _scalar_div_(var, value):
         return _scalar_elementwise_op_(var, 1.0 / value, 0.0)
 
-    # TODO(shenliang03):  currently, it supports divide, floor_divide, remainder
-    # for binary operator by using the api to achieve the type promotion
-    def _binary_method_creator_(op_type, reverse=False):
-        import paddle
-
-        def __impl__(self, other_var):
-            import paddle
-            op = getattr(paddle, op_type)
-            if reverse:
-                return op(other_var, self)
-            else:
-                return op(self, other_var)
-
-        __impl__.__doc__ = """
-
-        See paddle.{}""".format(op_type)
-        __impl__.__name__ = op_type
-
-        return __impl__
-
     # for binary operator such as elementwise, compare
     def _binary_creator_(method_name,
                          op_type,
@@ -281,20 +260,22 @@ def monkey_patch_math_varbase():
         ## a*b == b*a. Do not need to reverse explicitly
         ('__rmul__',
          _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)),
+        ('__div__', _binary_creator_('__div__', 'elementwise_div', False,
+                                     _scalar_div_)),
+        ('__truediv__', _binary_creator_('__truediv__', 'elementwise_div',
+                                         False, _scalar_div_)),
+        ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True,
+                                      None)),
         ('__rtruediv__', _binary_creator_('rtruediv__', 'elementwise_div', True,
                                           None)),
         ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False,
                                      None)),
         ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True,
                                       None)),
-        # These binary use paddle.optype
-        ('__div__', _binary_method_creator_('divide', False)),
-        ('__truediv__', _binary_method_creator_('divide', False)),
-        ('__rtruediv__', _binary_method_creator_('divide', True)),
-        ('__rdiv__', _binary_method_creator_('divide', True)),
-        ('__floordiv__', _binary_method_creator_('floor_divide', False)),
-        ('__rfloordiv__', _binary_method_creator_('floor_divide', True)),
-        ('__mod__', _binary_method_creator_('remainder', False)),
+        ('__floordiv__', _binary_creator_('__floordiv__',
+                                          'elementwise_floordiv', False, None)),
+        ('__mod__', _binary_creator_('__mod__', 'elementwise_mod', False,
+                                     None)),
         ## for logical compare
         ('__eq__', _binary_creator_('__eq__', 'equal', False, None)),
         ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)),
diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py
index 38fc34472c8..4595f0cf939 100644
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
@@ -16,7 +16,6 @@ from __future__ import print_function
 
 import warnings
 import inspect
-import paddle
 
 from .. import core
 from ..framework import Variable, unique_name
@@ -46,7 +45,6 @@ EXPRESSION_MAP = {
     "__pow__": "A ** B",
     "__rpow__": "A **= B",
     "__floordiv__": "A //B",
-    "__rfloordiv__": "A //= B",
     "__mod__": "A % B",
     "__eq__": "A == B",
     "__ne__": "A != B",
@@ -235,25 +233,6 @@ def monkey_patch_variable():
     def _scalar_div_(var, value):
         return _scalar_op_(var, 1.0 / value, 0.0)
 
-    # TODO(shenliang03):  currently, it supports divide, floor_divide, remainder
-    # for binary operator by using the api to achieve the type promotion
-    def _binary_method_creator_(op_type, reverse=False):
-        import paddle
-
-        def __impl__(self, other_var):
-            op = getattr(paddle, op_type)
-            if reverse:
-                return op(other_var, self)
-            else:
-                return op(self, other_var)
-
-        __impl__.__doc__ = """
-
-        See paddle.{}""".format(op_type)
-        __impl__.__name__ = op_type
-
-        return __impl__
-
     def _binary_creator_(method_name,
                          op_type,
                          reverse=False,
@@ -360,18 +339,22 @@ def monkey_patch_variable():
         #  a*b == b*a. Do not need to reverse explicitly
         ('__rmul__',
          _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)),
+        ('__div__', _binary_creator_('__div__', 'elementwise_div', False,
+                                     _scalar_div_)),
+        ('__truediv__', _binary_creator_('__truediv__', 'elementwise_div',
+                                         False, _scalar_div_)),
+        ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True,
+                                      None)),
+        ('__rtruediv__', _binary_creator_('__rtruediv__', 'elementwise_div',
+                                          True, None)),
         ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False,
                                      None)),
         ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True,
                                       None)),
-        # These binary use paddle.optype
-        ('__div__', _binary_method_creator_('divide', False)),
-        ('__rdiv__', _binary_method_creator_('divide', True)),
-        ('__truediv__', _binary_method_creator_('divide', False)),
-        ('__rtruediv__', _binary_method_creator_('divide', True)),
-        ('__floordiv__', _binary_method_creator_('floor_divide', False)),
-        ('__rfloordiv__', _binary_method_creator_('floor_divide', True)),
-        ('__mod__', _binary_method_creator_('remainder', False)),
+        ('__floordiv__', _binary_creator_('__floordiv__',
+                                          'elementwise_floordiv', False, None)),
+        ('__mod__', _binary_creator_('__mod__', 'elementwise_mod', False,
+                                     None)),
         #  for logical compare
         ('__eq__', _binary_creator_('__eq__', 'equal', False, None)),
         ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)),
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py
index 10621239484..761d57408b9 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py
@@ -113,8 +113,8 @@ class TranspilerAsyncLRDecayTest(unittest.TestCase):
                          ["listen_and_serv"])
         # block1: sum,cast,scale,floor,fill_constant,elementwise_pow,scale
         self.assertEqual([op.type for op in pserver.blocks[1].ops], [
-            "sum", "cast", "fill_constant", "elementwise_div", "floor",
-            "fill_constant", "elementwise_pow", "scale"
+            "sum", "cast", "scale", "floor", "fill_constant", "elementwise_pow",
+            "scale"
         ])
 
         # block1~2: optimize pass
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
index 9ebaf8ff943..3cfbac8b613 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
@@ -240,124 +240,25 @@ class TestElementwiseDivBroadcast(unittest.TestCase):
             self.assertEqual((out_result == (2 / x)).all(), True)
 
 
-class TestDivideAPI(unittest.TestCase):
-    def setUp(self):
-        paddle.set_default_dtype("float64")
-        self.places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            self.places.append(fluid.CUDAPlace(0))
-
-    def check_static_result(self, place):
-        # rule 1
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            x = fluid.data(name="x", shape=[3], dtype="float64")
-            y = np.array([1, 2, 3])
-            self.assertRaises(TypeError, paddle.divide, x=x, y=y)
-
-        # rule 2: both the inputs are not Tensor
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            x = 2
-            y = 4
-            res = paddle.divide(x, y)
-            exe = fluid.Executor(place)
-            np_z = exe.run(fluid.default_main_program(),
-                           feed={},
-                           fetch_list=[res])
-            self.assertEqual(np_z[0] == 0.5, True)
-
-        # rule 3: 
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            x = fluid.data(name="x", shape=[3], dtype="float64")
-            y = fluid.data(name="y", shape=[3], dtype="float32")
-            self.assertRaises(TypeError, paddle.divide, x=x, y=y)
-
-        # rule 4: x is Tensor, y is scalar
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            x = fluid.data(name="x", shape=[3], dtype="float64")
-            y = 2
-            exe = fluid.Executor(place)
-            res = x / y
-            np_z = exe.run(fluid.default_main_program(),
-                           feed={"x": np.array([2, 3, 4]).astype('float64')},
-                           fetch_list=[res])
-            z_expected = np.array([1., 1.5, 2.])
-            self.assertEqual((np_z[0] == z_expected).all(), True)
-
-        # rule 5: y is Tensor, x is scalar
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            x = fluid.data(name="x", shape=[3], dtype="float64")
-            y = 2
-            exe = fluid.Executor(place)
-            res = y / x
-            np_z = exe.run(fluid.default_main_program(),
-                           feed={"x": np.array([2, 8, 4]).astype('float64')},
-                           fetch_list=[res])
-            z_expected = np.array([1., 0.25, 0.5])
-            self.assertEqual((np_z[0] == z_expected).all(), True)
-
-        # rule 6: y is Tensor, x is Tensor
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            x = fluid.data(name="x", shape=[3], dtype="float64")
-            y = fluid.data(name="y", shape=[3], dtype="float64")
-            exe = fluid.Executor(place)
-            res = x / y
-            np_z = exe.run(fluid.default_main_program(),
-                           feed={
-                               "x": np.array([2, 3, 4]).astype('float64'),
-                               "y": np.array([1, 5, 2]).astype('float64')
-                           },
-                           fetch_list=[res])
-            z_expected = np.array([2., 0.6, 2.])
-            self.assertEqual((np_z[0] == z_expected).all(), True)
+class TestDivideOp(unittest.TestCase):
+    def test_name(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data(name="x", shape=[2, 3], dtype="float32")
+            y = fluid.data(name='y', shape=[2, 3], dtype='float32')
 
-    def test_static(self):
-        for place in self.places:
-            self.check_static_result(place=place)
+            y_1 = paddle.divide(x, y, name='div_res')
+            self.assertEqual(('div_res' in y_1.name), True)
 
     def test_dygraph(self):
-        for place in self.places:
-            with fluid.dygraph.guard(place):
-                # rule 1 : avoid numpy.ndarray
-                np_x = np.array([2, 3, 4])
-                np_y = np.array([1, 5, 2])
-                x = paddle.to_tensor(np_x)
-                self.assertRaises(TypeError, paddle.divide, x=x, y=np_y)
-
-                # rule 2: both the inputs are not Tensor
-                z = paddle.divide(3, 2)
-                self.assertEqual(z.numpy()[0] == 1.5, True)
-
-                # rule 3: both the inputs are Tensor
-                np_x = np.array([2, 3, 4])
-                np_y = np.array([1, 5, 2])
-                x = paddle.to_tensor(np_x, dtype="float32")
-                y = paddle.to_tensor(np_y, dtype="float64")
-                self.assertRaises(TypeError, paddle.divide, x=x, y=y)
-
-                # rule 4: x is Tensor, y is scalar
-                np_x = np.array([2, 3, 4])
-                x = paddle.to_tensor(np_x, dtype="int32")
-                y = 2
-                z = x / y
-                z_expected = np.array([1., 1.5, 2.])
-                self.assertEqual((z_expected == z.numpy()).all(), True)
-
-                # rule 5: y is Tensor, x is scalar
-                np_x = np.array([2, 1, 4])
-                x = paddle.to_tensor(np_x, dtype="int32")
-                y = 2
-                z = y / x
-                z_expected = np.array([1., 2., 0.5])
-                self.assertEqual((z_expected == z.numpy()).all(), True)
-
-                # rule 6: y is Tensor, x is Tensor
-                np_x = np.array([2, 3, 4])
-                np_y = np.array([1, 5, 2])
-                x = paddle.to_tensor(np_x)
-                y = paddle.to_tensor(np_y)
-                z = x / y
-                z_expected = np.array([2., 0.6, 2.])
-                self.assertEqual((z_expected == z.numpy()).all(), True)
+        with fluid.dygraph.guard():
+            np_x = np.array([2, 3, 4]).astype('float64')
+            np_y = np.array([1, 5, 2]).astype('float64')
+            x = paddle.to_tensor(np_x)
+            y = paddle.to_tensor(np_y)
+            z = paddle.divide(x, y)
+            np_z = z.numpy()
+            z_expected = np.array([2., 0.6, 2.])
+            self.assertEqual((np_z == z_expected).all(), True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
index 0b6acc76153..f339081e31b 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
@@ -58,13 +58,6 @@ class TestElementwiseModOp(OpTest):
         pass
 
 
-class TestElementwiseModOpInverse(TestElementwiseModOp):
-    def init_input_output(self):
-        self.x = np.random.uniform(0, 10000, [10]).astype(self.dtype)
-        self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
-        self.out = np.floor_divide(self.x, self.y)
-
-
 class TestElementwiseModOp_scalar(TestElementwiseModOp):
     def init_input_output(self):
         scale_x = random.randint(0, 100000000)
@@ -74,124 +67,25 @@ class TestElementwiseModOp_scalar(TestElementwiseModOp):
         self.out = np.floor_divide(self.x, self.y)
 
 
-class TestFloorDivideAPI(unittest.TestCase):
-    def setUp(self):
-        paddle.set_default_dtype("float64")
-        self.places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            self.places.append(fluid.CUDAPlace(0))
-
-    def check_static_result(self, place):
-        # rule 1
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            x = fluid.data(name="x", shape=[3], dtype="float64")
-            y = np.array([1, 2, 3])
-            self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y)
-
-        # rule 2: both the inputs are not Tensor
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            x = 2
-            y = 4
-            res = paddle.floor_divide(x, y)
-            exe = fluid.Executor(place)
-            np_z = exe.run(fluid.default_main_program(),
-                           feed={},
-                           fetch_list=[res])
-            self.assertEqual(np_z[0] == 0., True)
-
-        # rule 3: 
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            x = fluid.data(name="x", shape=[3], dtype="float64")
-            y = fluid.data(name="y", shape=[3], dtype="float32")
-            self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y)
-
-        # rule 4: x is Tensor, y is scalar
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            x = fluid.data(name="x", shape=[3], dtype="float64")
-            y = 2
-            exe = fluid.Executor(place)
-            res = x // y
-            np_z = exe.run(fluid.default_main_program(),
-                           feed={"x": np.array([2, 3, 4]).astype('float64')},
-                           fetch_list=[res])
-            z_expected = np.array([1., 1., 2.])
-            self.assertEqual((np_z[0] == z_expected).all(), True)
-
-        # rule 5: y is Tensor, x is scalar
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            x = fluid.data(name="x", shape=[3], dtype="float64")
-            y = 2
-            exe = fluid.Executor(place)
-            res = y // x
-            np_z = exe.run(fluid.default_main_program(),
-                           feed={"x": np.array([2, 8, 4]).astype('float64')},
-                           fetch_list=[res])
-            z_expected = np.array([1., 0., 0.])
-            self.assertEqual((np_z[0] == z_expected).all(), True)
-
-        # rule 6: y is Tensor, x is Tensor
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            x = fluid.data(name="x", shape=[3], dtype="float64")
-            y = fluid.data(name="y", shape=[3], dtype="float64")
-            exe = fluid.Executor(place)
-            res = x // y
-            np_z = exe.run(fluid.default_main_program(),
-                           feed={
-                               "x": np.array([2, 3, 4]).astype('float64'),
-                               "y": np.array([1, 5, 2]).astype('float64')
-                           },
-                           fetch_list=[res])
-            z_expected = np.array([2., 0., 2.])
-            self.assertEqual((np_z[0] == z_expected).all(), True)
-
-    def test_static(self):
-        for place in self.places:
-            self.check_static_result(place=place)
+class TestFloorDivideOp(unittest.TestCase):
+    def test_name(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data(name="x", shape=[2, 3], dtype="int64")
+            y = fluid.data(name='y', shape=[2, 3], dtype='int64')
+
+            y_1 = paddle.floor_divide(x, y, name='div_res')
+            self.assertEqual(('div_res' in y_1.name), True)
 
     def test_dygraph(self):
-        for place in self.places:
-            with fluid.dygraph.guard(place):
-                # rule 1 : avoid numpy.ndarray
-                np_x = np.array([2, 3, 4])
-                np_y = np.array([1, 5, 2])
-                x = paddle.to_tensor(np_x)
-                self.assertRaises(TypeError, paddle.floor_divide, x=x, y=np_y)
-
-                # rule 2: both the inputs are not Tensor
-                z = paddle.floor_divide(3, 2)
-                self.assertEqual(z.numpy()[0] == 1., True)
-
-                # rule 3: both the inputs are Tensor
-                np_x = np.array([2, 3, 4])
-                np_y = np.array([1, 5, 2])
-                x = paddle.to_tensor(np_x, dtype="float32")
-                y = paddle.to_tensor(np_y, dtype="float64")
-                self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y)
-
-                # rule 4: x is Tensor, y is scalar
-                np_x = np.array([2, 3, 4])
-                x = paddle.to_tensor(np_x, dtype="int32")
-                y = 2
-                z = x // y
-                z_expected = np.array([1, 1, 2])
-                self.assertEqual((z_expected == z.numpy()).all(), True)
-
-                # rule 5: y is Tensor, x is scalar
-                np_x = np.array([2, 1, 4])
-                x = paddle.to_tensor(np_x, dtype="int32")
-                y = 2
-                z = y // x
-                z_expected = np.array([1, 2, 0])
-                self.assertEqual((z_expected == z.numpy()).all(), True)
-
-                # rule 6: y is Tensor, x is Tensor
-                np_x = np.array([2, 3, 4])
-                np_y = np.array([1, 5, 2])
-                x = paddle.to_tensor(np_x)
-                y = paddle.to_tensor(np_y)
-                z = x // y
-                z_expected = np.array([2., 0., 2.])
-                self.assertEqual((z_expected == z.numpy()).all(), True)
+        with fluid.dygraph.guard():
+            np_x = np.array([2, 3, 8, 7]).astype('int64')
+            np_y = np.array([1, 5, 3, 3]).astype('int64')
+            x = paddle.to_tensor(np_x)
+            y = paddle.to_tensor(np_y)
+            z = paddle.floor_divide(x, y)
+            np_z = z.numpy()
+            z_expected = np.array([2, 0, 2, 2])
+            self.assertEqual((np_z == z_expected).all(), True)
 
         with fluid.dygraph.guard(fluid.CPUPlace()):
             # divide by zero 
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
index cab6160d761..2a8ca51693e 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
@@ -84,149 +84,41 @@ class TestElementwiseModOpDouble(TestElementwiseModOpFloat):
         self.dtype = np.float64
 
 
-class TestRemainderAPI(unittest.TestCase):
-    def setUp(self):
-        paddle.set_default_dtype("float64")
-        self.places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            self.places.append(fluid.CUDAPlace(0))
-
-    def check_static_result(self, place):
-        # rule 1
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            x = fluid.data(name="x", shape=[3], dtype="float64")
-            y = np.array([1, 2, 3])
-            self.assertRaises(TypeError, paddle.remainder, x=x, y=y)
-
-        # rule 3: 
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            x = fluid.data(name="x", shape=[3], dtype="float64")
-            y = fluid.data(name="y", shape=[3], dtype="float32")
-            self.assertRaises(TypeError, paddle.remainder, x=x, y=y)
-
-        # rule 4: x is Tensor, y is scalar
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            x = fluid.data(name="x", shape=[3], dtype="float64")
-            y = 2
-            exe = fluid.Executor(place)
-            res = x % y
-            np_z = exe.run(fluid.default_main_program(),
-                           feed={"x": np.array([2, 3, 4]).astype('float64')},
-                           fetch_list=[res])
-            z_expected = np.array([0., 1., 0.])
-            self.assertEqual((np_z[0] == z_expected).all(), True)
-
-        # rule 5: y is Tensor, x is scalar
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            x = 3
-            y = fluid.data(name="y", shape=[3], dtype="float32")
-            self.assertRaises(TypeError, paddle.remainder, x=x, y=y)
-
-        # rule 6: y is Tensor, x is Tensor
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            x = fluid.data(name="x", shape=[3], dtype="float64")
-            y = fluid.data(name="y", shape=[1], dtype="float64")
-            exe = fluid.Executor(place)
-            res = x % y
-            np_z = exe.run(fluid.default_main_program(),
-                           feed={
-                               "x": np.array([1., 2., 4]).astype('float64'),
-                               "y": np.array([1.5]).astype('float64')
-                           },
-                           fetch_list=[res])
-            z_expected = np.array([1., 0.5, 1.0])
-            self.assertEqual((np_z[0] == z_expected).all(), True)
-
-        # rule 6: y is Tensor, x is Tensor
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            x = fluid.data(name="x", shape=[6], dtype="float64")
-            y = fluid.data(name="y", shape=[1], dtype="float64")
-            exe = fluid.Executor(place)
-            res = x % y
-            np_z = exe.run(
-                fluid.default_main_program(),
-                feed={
-                    "x": np.array([-3., -2, -1, 1, 2, 3]).astype('float64'),
-                    "y": np.array([2]).astype('float64')
-                },
-                fetch_list=[res])
-            z_expected = np.array([1., 0., 1., 1., 0., 1.])
-            self.assertEqual((np_z[0] == z_expected).all(), True)
-
-    def test_static(self):
-        for place in self.places:
-            self.check_static_result(place=place)
+class TestRemainderOp(unittest.TestCase):
+    def test_name(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data(name="x", shape=[2, 3], dtype="int64")
+            y = fluid.data(name='y', shape=[2, 3], dtype='int64')
+
+            y_1 = paddle.remainder(x, y, name='div_res')
+            self.assertEqual(('div_res' in y_1.name), True)
 
     def test_dygraph(self):
-        for place in self.places:
-            with fluid.dygraph.guard(place):
-                # rule 1 : avoid numpy.ndarray
-                np_x = np.array([2, 3, 4])
-                np_y = np.array([1, 5, 2])
-                x = paddle.to_tensor(np_x)
-                self.assertRaises(TypeError, paddle.remainder, x=x, y=np_y)
-
-                # rule 3: both the inputs are Tensor
-                np_x = np.array([2, 3, 4])
-                np_y = np.array([1, 5, 2])
-                x = paddle.to_tensor(np_x, dtype="float32")
-                y = paddle.to_tensor(np_y, dtype="float64")
-                self.assertRaises(TypeError, paddle.remainder, x=x, y=y)
-
-                # rule 4: x is Tensor, y is scalar
-                np_x = np.array([2, 3, 4])
-                x = paddle.to_tensor(np_x, dtype="int32")
-                y = 2
-                z = x % y
-                z_expected = np.array([0, 1, 0])
-                self.assertEqual((z_expected == z.numpy()).all(), True)
-
-                # rule 5: y is Tensor, x is scalar
-                np_x = np.array([2, 3, 4])
-                x = paddle.to_tensor(np_x)
-                self.assertRaises(TypeError, paddle.remainder, x=3, y=x)
-
-                # rule 6: y is Tensor, x is Tensor
-                np_x = np.array([1., 2., 4])
-                np_y = np.array([1.5])
-                x = paddle.to_tensor(np_x)
-                y = paddle.to_tensor(np_y)
-                z = x % y
-                z_expected = np.array([1., 0.5, 1.0])
-                self.assertEqual((z_expected == z.numpy()).all(), True)
-
-                # rule 6: y is Tensor, x is Tensor
-                np_x = np.array([-3., -2, -1, 1, 2, 3])
-                np_y = np.array([2.])
-                x = paddle.to_tensor(np_x)
-                y = paddle.to_tensor(np_y)
-                z = x % y
-                z_expected = np.array([1., 0., 1., 1., 0., 1.])
-                self.assertEqual((z_expected == z.numpy()).all(), True)
-
-                np_x = np.array([-3.3, 11.5, -2, 3.5])
-                np_y = np.array([-1.2, 2., 3.3, -2.3])
-                x = paddle.to_tensor(np_x)
-                y = paddle.to_tensor(np_y)
-                z = x % y
-                z_expected = np.array([-0.9, 1.5, 1.3, -1.1])
-                self.assertEqual(np.allclose(z_expected, z.numpy()), True)
-
-                np_x = np.array([-3, 11, -2, 3])
-                np_y = np.array([-1, 2, 3, -2])
-                x = paddle.to_tensor(np_x, dtype="int64")
-                y = paddle.to_tensor(np_y, dtype="int64")
-                z = x % y
-                z_expected = np.array([0, 1, 1, -1])
-                self.assertEqual(np.allclose(z_expected, z.numpy()), True)
-
-                np_x = np.array([-3, 3])
-                np_y = np.array([[2, 3], [-2, -1]])
-                x = paddle.to_tensor(np_x, dtype="int64")
-                y = paddle.to_tensor(np_y, dtype="int64")
-                z = x % y
-                z_expected = np.array([[1, 0], [-1, 0]])
-                self.assertEqual(np.allclose(z_expected, z.numpy()), True)
+        with fluid.dygraph.guard():
+            np_x = np.array([2, 3, 8, 7]).astype('int64')
+            np_y = np.array([1, 5, 3, 3]).astype('int64')
+            x = paddle.to_tensor(np_x)
+            y = paddle.to_tensor(np_y)
+            z = paddle.remainder(x, y)
+            np_z = z.numpy()
+            z_expected = np.array([0, 3, 2, 1])
+            self.assertEqual((np_z == z_expected).all(), True)
+
+            np_x = np.array([-3.3, 11.5, -2, 3.5])
+            np_y = np.array([-1.2, 2., 3.3, -2.3])
+            x = paddle.to_tensor(np_x)
+            y = paddle.to_tensor(np_y)
+            z = x % y
+            z_expected = np.array([-0.9, 1.5, 1.3, -1.1])
+            self.assertEqual(np.allclose(z_expected, z.numpy()), True)
+
+            np_x = np.array([-3, 11, -2, 3])
+            np_y = np.array([-1, 2, 3, -2])
+            x = paddle.to_tensor(np_x, dtype="int64")
+            y = paddle.to_tensor(np_y, dtype="int64")
+            z = x % y
+            z_expected = np.array([0, 1, 1, -1])
+            self.assertEqual(np.allclose(z_expected, z.numpy()), True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch.py b/python/paddle/fluid/tests/unittests/test_math_op_patch.py
index 00137f63e24..f6eff22d6ce 100644
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch.py
@@ -189,15 +189,15 @@ class TestMathOpPatches(unittest.TestCase):
     @prog_scope()
     def test_integer_div(self):
         a = fluid.layers.data(name="a", shape=[1], dtype='int64')
-        b = a / 2
+        b = a / 7
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
-        a_np = numpy.array([3, 4, 10, 14, 9, 18])
+        a_np = numpy.array([3, 4, 10, 14, 9, 18]).astype('int64')
         b_np, = exe.run(fluid.default_main_program(),
                         feed={"a": a_np},
                         fetch_list=[b])
-        # for paddle2.0, use true_divide
-        b_np_actual = (a_np / 2.0)
+
+        b_np_actual = (a_np / 7).astype('int64')
         self.assertTrue(numpy.array_equal(b_np, b_np_actual))
 
     @prog_scope()
diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py
index 7e2ef36c1a7..6ca194b2694 100644
--- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py
+++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py
@@ -248,8 +248,7 @@ class PolicyGradient(object):
             func=reward_func, x=[action, length], out=reward)
         neg_log_prob = layers.cross_entropy(act_prob, action)
         cost = neg_log_prob * reward
-        cost = (layers.reduce_sum(cost) /
-                layers.cast(layers.reduce_sum(length), "float32")
+        cost = (layers.reduce_sum(cost) / layers.reduce_sum(length)
                 ) if length is not None else layers.reduce_mean(cost)
         optimizer = fluid.optimizer.Adam(self.lr)
         optimizer.minimize(cost)
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index 6c139b0ddbb..da086c0955e 100644
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -1009,8 +1009,7 @@ def ctc_loss(log_probs,
     loss_out = fluid.layers.squeeze(loss_out, [-1])
     assert reduction in ['mean', 'sum', 'none']
     if reduction == 'mean':
-        loss_out = paddle.mean(loss_out / paddle.cast(label_lengths,
-                                                      loss_out.dtype))
+        loss_out = paddle.mean(loss_out / label_lengths)
     elif reduction == 'sum':
         loss_out = paddle.sum(loss_out)
     return loss_out
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 079178e1cf7..966544c7abb 100755
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -64,7 +64,6 @@ from ..fluid.layers import increment    #DEFINE_ALIAS
 from ..fluid.layers import multiplex    #DEFINE_ALIAS
 from ..fluid.layers import sums    #DEFINE_ALIAS
 from ..fluid import layers
-import paddle
 
 
 __all__ = [
@@ -343,69 +342,9 @@ def divide(x, y, name=None):
     axis = -1
     act = None
     if in_dygraph_mode():
-        # rule 1 : avoid numpy.ndarray
-        if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
-            raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.")
-
-        # rule 2: both the inputs are not Tensor
-        elif not isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor):
-            x = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=x)
-            y = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=y)
-
-        # rule 3: both the inputs are Tensor
-        elif isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor):
-            if y.dtype != x.dtype:
-                raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype."
-                                "But x is {}, y is {}".format(x.dtype, y.dtype))
-            elif x.dtype in _supported_int_dtype_:
-                x = x.astype(paddle.get_default_dtype())
-                y = y.astype(paddle.get_default_dtype())
-
-        # rule 4: x is Tensor, y is scalar
-        elif isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor):
-            if x.dtype in _supported_int_dtype_:
-                x = x.astype(paddle.get_default_dtype())
-            y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y)
-
-        # rule 5: x is scalar, y is Tensor
-        elif not isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor):
-            if y.dtype in _supported_int_dtype_:
-                y = y.astype(paddle.get_default_dtype())
-            x = paddle.full(shape=[1], dtype=y.dtype, fill_value=x)
-
         return _elementwise_op_in_dygraph(
             x, y, axis=axis, act=act, op_name=op_type)
 
-    # rule 1 : avoid numpy.ndarray
-    if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
-        raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.")
-
-    # rule 2: both the inputs are not Tensor
-    elif not isinstance(x, Variable) and not isinstance(y, Variable):
-        x = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=x)
-        y = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=y)
-
-    # rule 3: both the inputs are Tensor
-    elif isinstance(x, Variable) and isinstance(y, Variable):
-        if y.dtype != x.dtype:
-            raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype."
-                            "But x is {}, y is {}".format(x.dtype, y.dtype))
-        elif x.dtype in _supported_int_dtype_:
-            x = paddle.cast(x, paddle.get_default_dtype())
-            y = paddle.cast(y, paddle.get_default_dtype())
-
-    # rule 4: x is Tensor, y is scalar
-    elif isinstance(x, Variable) and not isinstance(y, Variable):
-        if x.dtype in _supported_int_dtype_:
-            x = paddle.cast(x, paddle.get_default_dtype())
-        y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y)
-
-    # rule 5: x is scalar, y is Tensor
-    elif not isinstance(x, Variable) and isinstance(y, Variable):
-        if y.dtype in _supported_int_dtype_:
-            y = paddle.cast(y, paddle.get_default_dtype())
-        x = paddle.fill_constant(shape=[1], dtype=y.dtype, value=x)
-
     return _elementwise_op(LayerHelper(op_type, **locals()))
 
 
@@ -444,55 +383,9 @@ def floor_divide(x, y, name=None):
     op_type = 'elementwise_floordiv'
     axis = -1
     if in_dygraph_mode():
-        # rule 1 : avoid numpy.ndarray
-        if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
-            raise TypeError("floor_divide(): arguments must be Tensor or scalar, not numpy.ndarray.")
-
-        # rule 2: both the inputs are not Tensor
-        elif not isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor):
-            x = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=x)
-            y = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=y)
-
-        # rule 3: both the inputs are Tensor
-        elif isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor):
-            if y.dtype != x.dtype:
-                raise TypeError("floor_divide(): argument position 1 and argument position 2 must have the same dtype."
-                                "But x is {}, y is {}".format(x.dtype, y.dtype))
-
-        # rule 4: x is Tensor, y is scalar
-        elif isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor):
-            y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y)
-
-        # rule 5: x is scalar, y is Tensor
-        elif not isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor):
-            x = paddle.full(shape=[1], dtype=y.dtype, fill_value=x)
-
         return _elementwise_op_in_dygraph(
             x, y, axis=axis, op_name=op_type)
 
-    # rule 1 : avoid numpy.ndarray
-    if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
-        raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.")
-
-    # rule 2: both the inputs are not Tensor
-    elif not isinstance(x, Variable) and not isinstance(y, Variable):
-        x = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=x)
-        y = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=y)
-
-    # rule 3: both the inputs are Tensor
-    elif isinstance(x, Variable) and isinstance(y, Variable):
-        if y.dtype != x.dtype:
-            raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype."
-                            "But x is {}, y is {}".format(x.dtype, y.dtype))
-
-    # rule 4: x is Tensor, y is scalar
-    elif isinstance(x, Variable) and not isinstance(y, Variable):
-        y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y)
-
-    # rule 5: x is scalar, y is Tensor
-    elif not isinstance(x, Variable) and isinstance(y, Variable):
-        x = paddle.fill_constant(shape=[1], dtype=y.dtype, value=x)
-
     return _elementwise_op(LayerHelper(op_type, **locals()))
 
 
@@ -531,43 +424,9 @@ def remainder(x, y, name=None):
     op_type = 'elementwise_mod'
     axis = -1
     if in_dygraph_mode():
-        # rule 1 : avoid numpy.ndarray
-        if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
-            raise TypeError("remainder(): arguments must be Tensor or scalar, not numpy.ndarray.")
-
-        elif not isinstance(x, paddle.Tensor):
-            raise TypeError("remainder(): arguments position 1 must be Tensor, not {}".format(type(x)))
-
-        # rule 3: both the inputs are Tensor
-        elif isinstance(y, paddle.Tensor):
-            if y.dtype != x.dtype:
-                raise TypeError("remainder(): argument position 1 and argument position 2 must have the same dtype."
-                                "But x is {}, y is {}".format(x.dtype, y.dtype))
-
-        # rule 4: x is Tensor, y is scalar
-        elif not isinstance(y, paddle.Tensor):
-            y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y)
-
         return _elementwise_op_in_dygraph(
             x, y, axis=axis, op_name=op_type)
 
-    # rule 1 : avoid numpy.ndarray
-    if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
-        raise TypeError("remainder(): arguments must be Tensor or scalar, not numpy.ndarray.")
-
-    elif not isinstance(x, Variable):
-        raise TypeError("remainder(): arguments position 1 must be Tensor, not {}".format(type(x)))
-
-    # rule 3: both the inputs are Tensor
-    elif isinstance(y, Variable):
-        if y.dtype != x.dtype:
-            raise TypeError("remainder(): argument position 1 and argument position 2 must have the same dtype."
-                            "But x is {}, y is {}".format(x.dtype, y.dtype))
-
-    # rule 4: x is Tensor, y is scalar
-    elif not isinstance(y, paddle.Tensor):
-        y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y)
-
     return _elementwise_op(LayerHelper(op_type, **locals()))
 
 
-- 
GitLab


From 07d089f6aa2d7a184f4d714926685158dac41c41 Mon Sep 17 00:00:00 2001
From: chalsliu <45041955+chalsliu@users.noreply.github.com>
Date: Thu, 10 Sep 2020 13:32:36 +0800
Subject: [PATCH 032/261] Check benchmark issues in CI

---
 tools/check_file_diff_approvals.sh |  7 +++++++
 tools/check_ut.py                  | 17 +++++++++--------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index 1e5179d0282..84254cc89bb 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -287,12 +287,19 @@ fi
 pip install PyGithub
 # For getting PR related data
 wget https://paddle-ci.gz.bcebos.com/blk/block.txt --no-check-certificate
+wget https://sys-p0.bj.bcebos.com/bk-ci/bk.txt --no-check-certificate
 HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true`
 if [ "${HASUTFIXED}" != "" ]; then
   echo_line="${HASUTFIXED} You must have one RD (chalsliu (Recommend) or kolinwei) approval.\n"
   check_approval 1 45041955 22165420
 fi
 
+HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has benchmark issue to be fixed" || true`
+if [ "${HASUTFIXED}" != "" ]; then
+    echo_line="${HASUTFIXED} You must have one RD (hysunflower or xiegegege or Xreki) approval.\n"
+  check_approval 1 52739577 46314656 12538138
+fi
+
 if [ -n "${echo_list}" ];then
   echo "****************"
   echo -e "${echo_list[@]}"
diff --git a/tools/check_ut.py b/tools/check_ut.py
index 7b5e5a4f1c5..f5fe4c687dd 100644
--- a/tools/check_ut.py
+++ b/tools/check_ut.py
@@ -27,9 +27,12 @@ class PRChecker(object):
         self.github = Github(os.getenv('GITHUB_API_TOKEN'), timeout=60)
         self.repo = None
 
-    def check(self):
-        """ check pr. """
-        filename = 'block.txt'
+    def check(self, filename, msg):
+        """ 
+        Args:
+            filename (str): File to get block names.  
+            msg (str): Error message.  
+        """
         pr_id = os.getenv('GIT_PR_ID')
         if not pr_id:
             print('No PR ID')
@@ -44,12 +47,10 @@ class PRChecker(object):
         with open(filename) as f:
             for l in f:
                 if l.rstrip('\r\n') == user:
-                    print('{} has unit-test to be fixed, so CI failed.'.format(
-                        user))
-                    exit(1)
-        exit(0)
+                    print('{} {}'.format(user, msg))
 
 
 if __name__ == '__main__':
     pr_checker = PRChecker()
-    pr_checker.check()
+    pr_checker.check('block.txt', 'has unit-test to be fixed, so CI failed.')
+    pr_checker.check('bk.txt', 'has benchmark issue to be fixed, so CI failed.')
-- 
GitLab


From 5406b014c016db112085cf8c6a085d86fff0b248 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 10 Sep 2020 15:12:56 +0800
Subject: [PATCH 033/261] Refine jit.save implement to adapt InputSpec using
 cases (#26959)

* add some unittest cases ot verify jit.save, no_test

* add more unittests

* add test with example inputs

* polish implement details

* remove useless blank

* fix fetch random error
---
 python/paddle/fluid/dygraph/jit.py            | 147 ++++++--
 .../tests/unittests/test_jit_save_load.py     | 345 +++++++++++++++---
 2 files changed, 400 insertions(+), 92 deletions(-)

diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py
index ad208811624..ec96bdd9786 100644
--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
@@ -18,6 +18,7 @@ import os
 import pickle
 import warnings
 import functools
+from collections import OrderedDict
 
 import six
 import paddle
@@ -633,6 +634,73 @@ class SaveLoadConfig(object):
         self._keep_name_table = value
 
 
+def _get_input_var_names(inputs, input_spec):
+    name_none_error = "The %s's name is None. " \
+        "When using jit.save, please set InputSepc's name in " \
+        "to_static(input_spec=[]) and jit.save(input_spec=[]) " \
+        "and make sure they are consistent."
+    name_no_exists_error = "The tensor `%s` does not exists. " \
+        "Please make sure the name of InputSpec or example Tensor " \
+        "in input_spec is the same as the name of InputSpec in " \
+        "`to_static` decorated on the Layer.forward method."
+    result_list = []
+    input_var_names = [var.name for var in inputs if isinstance(var, Variable)]
+    if input_spec is None:
+        # no prune
+        result_list = input_var_names
+    elif input_spec is not None and len(input_spec) == len(input_var_names):
+        # no prune
+        result_list = input_var_names
+        # if input spec name not in input_var_names, only raise warning 
+        for spec in input_spec:
+            if spec.name is None:
+                warnings.warn(name_none_error % spec)
+            elif spec.name not in input_var_names:
+                warnings.warn(name_no_exists_error % spec.name)
+            else:
+                # do nothing
+                pass
+    else:
+        # prune
+        for spec in input_spec:
+            if spec.name is None:
+                # name is None, the input_spec only can be InputSpec
+                raise ValueError(name_none_error % spec)
+            elif spec.name not in input_var_names:
+                # the input_spec can be `InputSpec` or `VarBase`
+                raise ValueError(name_no_exists_error % spec.name)
+            else:
+                result_list.append(spec.name)
+
+    return result_list
+
+
+def _get_output_vars(outputs, output_spec):
+    name_no_exists_error = "The tensor `%s` does not exists. " \
+        "Please make sure the name of example Tensor " \
+        "in configs.output_spec is the output tensor of " \
+        "Layer.forward method."
+    result_list = []
+    output_vars_dict = OrderedDict()
+    for var in outputs:
+        if isinstance(var, Variable):
+            output_vars_dict[var.name] = var
+    if output_spec is None:
+        result_list = output_vars_dict.values()
+    elif output_spec is not None and len(output_spec) == len(output_vars_dict):
+        result_list = output_vars_dict.values()
+        for var in output_spec:
+            if var.name not in output_vars_dict:
+                warnings.warn(name_no_exists_error % var.name)
+    else:
+        for var in output_spec:
+            if var.name not in output_vars_dict:
+                raise ValueError(name_no_exists_error % var.name)
+            else:
+                result_list.append(output_vars_dict[var.name])
+    return result_list
+
+
 # NOTE(chenweihang): change jit.save/load argument `configs` to `config`
 def deprecate_save_load_configs(func):
     @functools.wraps(func)
@@ -753,26 +821,6 @@ def save(layer, model_path, input_spec=None, config=None):
             paddle.jit.save(layer, model_path)
     """
 
-    def get_inout_spec(all_vars, target_vars, return_name=False):
-        result_list = []
-        valid_var_dict = {}
-        valid_vars = [var for var in all_vars if isinstance(var, Variable)]
-        for var in valid_vars:
-            valid_var_dict[var.name] = var
-        if target_vars:
-            for i, var in enumerate(target_vars):
-                # check target var whether exists
-                if var.name not in valid_var_dict:
-                    raise RuntimeError(
-                        "The variable to feed/fetch are not exist.")
-                result_list.append(valid_var_dict[var.name])
-        else:
-            result_list = valid_vars
-        if return_name:
-            result_list = [var.name for var in result_list]
-
-        return result_list
-
     # 1. input check
     prog_translator = ProgramTranslator()
     if not prog_translator.enable:
@@ -788,25 +836,58 @@ def save(layer, model_path, input_spec=None, config=None):
     if configs is None:
         configs = SaveLoadConfig()
 
+    # avoid change user given input_spec
+    inner_input_spec = None
     if input_spec is not None:
         if not isinstance(input_spec, list):
             raise TypeError(
                 "The input input_spec should be 'list', but received input_spec's type is %s."
                 % type(input_spec))
+        inner_input_spec = []
         for var in input_spec:
-            if not isinstance(var, (core.VarBase, Variable,
-                                    paddle.static.InputSpec)):
+            if isinstance(var, paddle.static.InputSpec):
+                inner_input_spec.append(var)
+            elif isinstance(var, (core.VarBase, Variable)):
+                inner_input_spec.append(
+                    paddle.static.InputSpec.from_tensor(var))
+            else:
                 raise TypeError(
                     "The element in input_spec list should be 'Variable' or `paddle.static.InputSpec`, but received element's type is %s."
                     % type(var))
 
-    # 2. get program of declarative Layer.forward
-    if not isinstance(layer.forward, StaticLayer):
-        raise RuntimeError(
-            "layer.forward need to be decorated by `@declarative`.")
-    concrete_program = layer.forward.concrete_program
-
-    # NOTE: we maintain the mapping of variable name to
+    # 2. get program from Layer
+    # TODO(chenweihang): add support for other method, not only forward
+    if isinstance(layer.forward, StaticLayer):
+        concrete_program = layer.forward.concrete_program
+    else:
+        # transform in jit.save, if input_spec is incomplete, declarative will throw error
+        static_forward = declarative(layer.forward, input_spec=inner_input_spec)
+        concrete_program = static_forward.concrete_program
+        # the input_spec has been used in declarative, which is equal to 
+        # @declarative with input_spec and jit.save without input_spec,
+        # avoid needless warning
+        inner_input_spec = None
+
+    # 3. build input & output of save_infernece_model
+    # NOTE(chenweihang): [ Get input variables name ]
+    # There are two cases, whether to prune the inputs or not
+    # - not prune inputs (recommend):
+    #   - the len(input_spec) == len((concrete_program.inputs) - 1
+    #   - here can use concrete_program.inputs directly
+    # - prune inputs:
+    #   - the input_spec length < len((concrete_program.inputs) - 1
+    #   - the input_spec's name should be in concrete_program.inputs
+    input_var_names = _get_input_var_names(concrete_program.inputs,
+                                           inner_input_spec)
+
+    # NOTE(chenweihang): [ Get output variables ]
+    # the rule is like [ Get input variables name ]. For output var, 
+    # we only support VarBase spec, and actually, we only need the 
+    # var name of output, and we don't recommended to use output_spec
+    output_vars = _get_output_vars(concrete_program.outputs,
+                                   configs.output_spec)
+
+    # NOTE(chenweihang): we maintain the mapping of variable name to
     # structured name, the buffer variable (non-persistable)
     # saved to inference program may not need by dygraph Layer, 
     # we only record the state_dict variable's structured name
@@ -814,7 +895,7 @@ def save(layer, model_path, input_spec=None, config=None):
     for structured_name, var in six.iteritems(layer.state_dict()):
         state_names_dict[var.name] = structured_name
 
-    # 3. share parameters from Layer to scope & record var info
+    # 4. share parameters from Layer to scope & record var info
     scope = core.Scope()
     extra_var_info = dict()
     for param_or_buffer in concrete_program.parameters:
@@ -832,10 +913,6 @@ def save(layer, model_path, input_spec=None, config=None):
             extra_info_dict['trainable'] = param_or_buffer.trainable
         extra_var_info[param_or_buffer.name] = extra_info_dict
 
-    # 4. build input & output spec
-    input_var_names = get_inout_spec(concrete_program.inputs, input_spec, True)
-    output_vars = get_inout_spec(concrete_program.outputs, configs.output_spec)
-
     # 5. save inference model
     from paddle.fluid.io import save_inference_model
 
@@ -856,7 +933,7 @@ def save(layer, model_path, input_spec=None, config=None):
             export_for_deployment=configs._export_for_deployment,
             program_only=configs._program_only)
 
-        # NOTE: [ Save extra variable info ]
+        # NOTE(chenweihang): [ Save extra variable info ]
         # save_inference_model will lose some important variable information, including:
         #   - Variable name and correspondence (when saved variables as one file)
         #   - Variable.stop_gradient information
diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
index f7fcc1ff561..7bf806bab55 100644
--- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
@@ -56,6 +56,16 @@ class LinearNet(fluid.dygraph.Layer):
         return self._linear(x)
 
 
+class LinearNetWithInputSpec(fluid.dygraph.Layer):
+    def __init__(self, in_size, out_size):
+        super(LinearNetWithInputSpec, self).__init__()
+        self._linear = Linear(in_size, out_size)
+
+    @declarative(input_spec=[InputSpec(shape=[None, 784], dtype='float32')])
+    def forward(self, x):
+        return self._linear(x)
+
+
 class LinearNetNotDeclarative(fluid.dygraph.Layer):
     def __init__(self, in_size, out_size):
         super(LinearNetNotDeclarative, self).__init__()
@@ -65,6 +75,23 @@ class LinearNetNotDeclarative(fluid.dygraph.Layer):
         return self._linear(x)
 
 
+class LinerNetWithLabel(paddle.nn.Layer):
+    def __init__(self, in_size, out_size):
+        super(LinerNetWithLabel, self).__init__()
+        self._linear = Linear(in_size, out_size)
+
+    @declarative(input_spec=[
+        InputSpec(
+            shape=[None, 784], dtype='float32', name="image"), InputSpec(
+                shape=[None, 1], dtype='int64', name="label")
+    ])
+    def forward(self, x, label):
+        out = self._linear(x)
+        loss = fluid.layers.cross_entropy(out, label)
+        avg_loss = fluid.layers.mean(loss)
+        return out, avg_loss
+
+
 class LinearNetReturnLoss(fluid.dygraph.Layer):
     def __init__(self, in_size, out_size):
         super(LinearNetReturnLoss, self).__init__()
@@ -78,6 +105,54 @@ class LinearNetReturnLoss(fluid.dygraph.Layer):
         return z, loss
 
 
+class LinearNetMultiInput(fluid.dygraph.Layer):
+    def __init__(self, in_size, out_size):
+        super(LinearNetMultiInput, self).__init__()
+        self._linear1 = Linear(in_size, out_size)
+        self._linear2 = Linear(in_size, out_size)
+
+    @declarative(input_spec=[
+        InputSpec(
+            [None, 8], dtype='float32'), InputSpec(
+                [None, 8], dtype='float32')
+    ])
+    def forward(self, x, y):
+        x_out = self._linear1(x)
+        y_out = self._linear2(y)
+        loss = fluid.layers.mean(x_out + y_out)
+        return x_out, y_out, loss
+
+
+class MultiLoadingLinearNet(fluid.dygraph.Layer):
+    def __init__(self, size, model_path):
+        super(MultiLoadingLinearNet, self).__init__()
+        self._linear = Linear(size, size)
+        self._load_linear1 = fluid.dygraph.jit.load(model_path)
+        self._load_linear2 = fluid.dygraph.jit.load(model_path)
+
+    @declarative
+    def forward(self, x):
+        tmp1 = self._linear(x)
+        tmp2 = self._load_linear1(tmp1)
+        tmp3 = self._load_linear2(tmp2)
+        y = self._linear(tmp3)
+        return y
+
+
+class LinearNetReturnHidden(fluid.dygraph.Layer):
+    def __init__(self, in_size, out_size):
+        super(LinearNetReturnHidden, self).__init__()
+        self._linear_1 = Linear(in_size, out_size)
+        self._linear_2 = Linear(in_size, out_size)
+
+    @declarative
+    def forward(self, x):
+        y = self._linear_1(x)
+        z = self._linear_2(y)
+        loss = fluid.layers.mean(z)
+        return y, loss
+
+
 def train(layer, input_size=784, label_size=1):
     # create optimizer
     sgd = fluid.optimizer.SGDOptimizer(
@@ -102,6 +177,27 @@ def train(layer, input_size=784, label_size=1):
     return [img], layer, avg_loss
 
 
+def train_with_label(layer, input_size=784, label_size=1):
+    # create optimizer
+    sgd = fluid.optimizer.SGDOptimizer(
+        learning_rate=0.01, parameter_list=layer.parameters())
+    # create data loader
+    train_loader = fluid.io.DataLoader.from_generator(capacity=5)
+    train_loader.set_batch_generator(
+        random_batch_reader(input_size, label_size))
+    # train
+    for data in train_loader():
+        img, label = data
+        label.stop_gradient = True
+
+        out, avg_loss = layer(img, label)
+
+        avg_loss.backward()
+        sgd.minimize(avg_loss)
+        layer.clear_gradients()
+    return out
+
+
 class TestJitSaveLoad(unittest.TestCase):
     def setUp(self):
         self.model_path = "model.test_jit_save_load"
@@ -168,15 +264,6 @@ class TestJitSaveLoad(unittest.TestCase):
         self.assertTrue(
             np.array_equal(train_layer(x).numpy(), new_layer(x).numpy()))
 
-    def test_save_get_program_failed(self):
-        layer = LinearNetNotDeclarative(784, 1)
-        example_inputs, layer, _ = train(layer)
-        with self.assertRaises(RuntimeError):
-            fluid.dygraph.jit.save(
-                layer=layer,
-                model_path=self.model_path,
-                input_spec=example_inputs)
-
     def test_load_dygraph_no_path(self):
         model_path = "model.test_jit_save_load.no_path"
         new_layer = LinearNet(784, 1)
@@ -184,24 +271,6 @@ class TestJitSaveLoad(unittest.TestCase):
             model_dict, _ = fluid.dygraph.load_dygraph(model_path)
 
 
-class LinearNetMultiInput(fluid.dygraph.Layer):
-    def __init__(self, in_size, out_size):
-        super(LinearNetMultiInput, self).__init__()
-        self._linear1 = Linear(in_size, out_size)
-        # self._linear2 = Linear(in_size, out_size)
-
-    @declarative(input_spec=[
-        InputSpec(
-            [None, 8], dtype='float32'), InputSpec(
-                [None, 8], dtype='float32')
-    ])
-    def forward(self, x, y):
-        x_out = self._linear1(x)
-        y_out = self._linear1(y)
-        loss = fluid.layers.mean(x_out + y_out)
-        return x_out, y_out, loss
-
-
 class TestSaveLoadWithInputSpec(unittest.TestCase):
     def setUp(self):
         # enable dygraph mode
@@ -345,22 +414,6 @@ class TestJitSaveLoadConfig(unittest.TestCase):
             np.array_equal(train_layer(x)[0].numpy(), infer_layer(x).numpy()))
 
 
-class MultiLoadingLinearNet(fluid.dygraph.Layer):
-    def __init__(self, size, model_path):
-        super(MultiLoadingLinearNet, self).__init__()
-        self._linear = Linear(size, size)
-        self._load_linear1 = fluid.dygraph.jit.load(model_path)
-        self._load_linear2 = fluid.dygraph.jit.load(model_path)
-
-    @declarative
-    def forward(self, x):
-        tmp1 = self._linear(x)
-        tmp2 = self._load_linear1(tmp1)
-        tmp3 = self._load_linear2(tmp2)
-        y = self._linear(tmp3)
-        return y
-
-
 class TestJitMultipleLoading(unittest.TestCase):
     def setUp(self):
         self.linear_size = 4
@@ -389,20 +442,6 @@ class TestJitMultipleLoading(unittest.TestCase):
             name_set.add(var.name)
 
 
-class LinearNetReturnHidden(fluid.dygraph.Layer):
-    def __init__(self, in_size, out_size):
-        super(LinearNetReturnHidden, self).__init__()
-        self._linear_1 = Linear(in_size, out_size)
-        self._linear_2 = Linear(in_size, out_size)
-
-    @declarative
-    def forward(self, x):
-        y = self._linear_1(x)
-        z = self._linear_2(y)
-        loss = fluid.layers.mean(z)
-        return y, loss
-
-
 class TestJitPruneModelAndLoad(unittest.TestCase):
     def setUp(self):
         self.linear_size = 4
@@ -461,5 +500,197 @@ class TestJitPruneModelAndLoad(unittest.TestCase):
             fluid.dygraph.jit.load(self.model_path)
 
 
+class TestJitSaveMultiCases(unittest.TestCase):
+    def setUp(self):
+        # enable dygraph mode
+        fluid.enable_dygraph()
+        # config seed
+        paddle.manual_seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
+
+    def verify_inference_correctness(self, layer, model_path, with_label=False):
+        layer.eval()
+        loaded_layer = paddle.jit.load(model_path)
+        loaded_layer.eval()
+        # inference & compare
+        x = paddle.to_variable(np.random.random((1, 784)).astype('float32'))
+        if with_label:
+            y = paddle.to_variable(np.random.random((1, 1)).astype('int64'))
+            pred, _ = layer(x, y)
+            pred = pred.numpy()
+        else:
+            pred = layer(x).numpy()
+        loaded_pred = loaded_layer(x).numpy()
+        self.assertTrue(
+            np.array_equal(pred, loaded_pred),
+            msg="Result diff when load and inference:\nlayer result:\n{}\n" \
+                "loaded layer result:\n{}".format(pred, loaded_pred))
+
+    def test_no_prune_to_static_after_train(self):
+        layer = LinearNet(784, 1)
+
+        train(layer)
+
+        model_path = "test_no_prune_to_static_after_train"
+        paddle.jit.save(layer, model_path)
+
+        self.verify_inference_correctness(layer, model_path)
+
+    def test_no_prune_to_static_no_train(self):
+        layer = LinearNetWithInputSpec(784, 1)
+
+        model_path = "test_no_prune_to_static_no_train"
+        paddle.jit.save(layer, model_path)
+
+        self.verify_inference_correctness(layer, model_path)
+
+    def test_no_prune_no_to_static_after_train(self):
+        layer = LinearNetNotDeclarative(784, 1)
+
+        train(layer)
+
+        model_path = "test_no_prune_no_to_static_after_train"
+        paddle.jit.save(
+            layer,
+            model_path,
+            input_spec=[InputSpec(
+                shape=[None, 784], dtype='float32')])
+
+        self.verify_inference_correctness(layer, model_path)
+
+    def test_no_prune_no_to_static_after_train_with_examples(self):
+        layer = LinearNetNotDeclarative(784, 1)
+
+        example_inputs, _, _ = train(layer)
+
+        model_path = "test_no_prune_no_to_static_after_train_with_examples"
+        fluid.dygraph.jit.save(
+            layer=layer, model_path=model_path, input_spec=example_inputs)
+
+        self.verify_inference_correctness(layer, model_path)
+
+    def test_no_prune_no_to_static_no_train(self):
+        layer = LinearNetNotDeclarative(784, 1)
+
+        model_path = "test_no_prune_no_to_static_no_train"
+        paddle.jit.save(
+            layer,
+            model_path,
+            input_spec=[InputSpec(
+                shape=[None, 784], dtype='float32')])
+
+        self.verify_inference_correctness(layer, model_path)
+
+    def test_prune_to_static_after_train(self):
+        layer = LinerNetWithLabel(784, 1)
+
+        out = train_with_label(layer)
+
+        model_path = "test_prune_to_static_after_train"
+        configs = paddle.SaveLoadConfig()
+        configs.output_spec = [out]
+        paddle.jit.save(
+            layer,
+            model_path,
+            input_spec=[
+                InputSpec(
+                    shape=[None, 784], dtype='float32', name="image")
+            ],
+            configs=configs)
+
+        self.verify_inference_correctness(layer, model_path, True)
+
+    def test_prune_to_static_no_train(self):
+        layer = LinerNetWithLabel(784, 1)
+
+        model_path = "test_prune_to_static_no_train"
+        configs = paddle.SaveLoadConfig()
+        # TODO: no train, cannot get output_spec var here
+        # now only can use index
+        configs.output_spec = layer.forward.outputs[:1]
+        paddle.jit.save(
+            layer,
+            model_path,
+            input_spec=[
+                InputSpec(
+                    shape=[None, 784], dtype='float32', name="image")
+            ],
+            configs=configs)
+
+        self.verify_inference_correctness(layer, model_path, True)
+
+    def test_no_prune_input_spec_name_warning(self):
+        layer = LinearNetWithInputSpec(784, 1)
+
+        train(layer)
+
+        model_path = "test_no_prune_input_spec_name_warning"
+        paddle.jit.save(
+            layer,
+            model_path,
+            input_spec=[InputSpec(
+                shape=[None, 784], dtype='float32')])
+        paddle.jit.save(
+            layer,
+            model_path,
+            input_spec=[
+                InputSpec(
+                    shape=[None, 784], dtype='float32', name='feed_input')
+            ])
+
+        self.verify_inference_correctness(layer, model_path)
+
+    def test_not_prune_output_spec_name_warning(self):
+        layer = LinearNet(784, 1)
+
+        train(layer)
+
+        model_path = "test_not_prune_output_spec_name_warning"
+        configs = paddle.SaveLoadConfig()
+        out = paddle.to_variable(np.random.random((1, 1)).astype('float'))
+        configs.output_spec = [out]
+        paddle.jit.save(layer, model_path, configs=configs)
+
+        self.verify_inference_correctness(layer, model_path)
+
+    def test_prune_input_spec_name_error(self):
+        layer = LinerNetWithLabel(784, 1)
+
+        model_path = "test_prune_input_spec_name_error"
+        with self.assertRaises(ValueError):
+            paddle.jit.save(
+                layer,
+                model_path,
+                input_spec=[InputSpec(
+                    shape=[None, 784], dtype='float32')])
+        with self.assertRaises(ValueError):
+            paddle.jit.save(
+                layer,
+                model_path,
+                input_spec=[
+                    InputSpec(
+                        shape=[None, 784], dtype='float32', name='feed_input')
+                ])
+
+    def test_prune_output_spec_name_error(self):
+        layer = LinerNetWithLabel(784, 1)
+
+        train_with_label(layer)
+
+        model_path = "test_prune_to_static_after_train"
+        configs = paddle.SaveLoadConfig()
+        out = paddle.to_variable(np.random.random((1, 1)).astype('float'))
+        configs.output_spec = [out]
+        with self.assertRaises(ValueError):
+            paddle.jit.save(
+                layer,
+                model_path,
+                input_spec=[
+                    InputSpec(
+                        shape=[None, 784], dtype='float32', name="image")
+                ],
+                configs=configs)
+
+
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab


From e00586159884ce0c5b1a5e39b6d30949507fb13d Mon Sep 17 00:00:00 2001
From: Jacek Czaja <jacek.czaja@intel.com>
Date: Thu, 10 Sep 2020 09:22:29 +0200
Subject: [PATCH 034/261] [oneDNN]Introducing oneDNN 1.6 (#27137)

* - introducing oneDNN 1.6

test=develop

* - Removed redundant code

test=develop
---
 cmake/external/mkldnn.cmake                     |  2 +-
 .../operators/elementwise/elementwise_mul_op.h  | 17 +----------------
 2 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index ae870b766fc..ad7b7c2c2ab 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR     ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn)
 SET(MKLDNN_INSTALL_DIR    ${THIRD_PARTY_PATH}/install/mkldnn)
 SET(MKLDNN_INC_DIR        "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
 SET(MKLDNN_REPOSITORY     https://github.com/intel/mkl-dnn.git)
-SET(MKLDNN_TAG            1ea812f4f5aa1bd989372a23ab50d0f0f81ee677)
+SET(MKLDNN_TAG            4c05c181b40cf7132f8943411fb3fab1786df0f7)
 
 # Introduce variables:
 # * CMAKE_INSTALL_LIBDIR
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
index 718321b441b..e4d3ea6d729 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
@@ -33,22 +33,7 @@ class ElementwiseMulOp : public ElementwiseOp {
     auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
 
 #ifdef PADDLE_WITH_MKLDNN
-    using mkldnn::memory;
-    auto CanMKLDNNElementwiseMulBeUsed = [&]() {
-      auto x_dims = ctx.Input<Tensor>("X")->dims();
-      auto y_dims = ctx.Input<Tensor>("Y")->dims();
-      int rankdiff = x_dims.size() - y_dims.size();
-      // TODO(jczaja): Remove this when oneDNN performance for scalar
-      // broadcasting
-      // is improved (Ernie large situation)
-      if (rankdiff != 0 && y_dims.size() == 1 && y_dims[0] == 1) {
-        return false;
-      }
-
-      return true;
-    };
-
-    if (platform::CanMKLDNNBeUsed(ctx) && CanMKLDNNElementwiseMulBeUsed()) {
+    if (platform::CanMKLDNNBeUsed(ctx)) {
       return framework::OpKernelType(input_data_type, ctx.GetPlace(),
                                      framework::DataLayout::kMKLDNN,
                                      framework::LibraryType::kMKLDNN);
-- 
GitLab


From 78446ecdba3c3d325237dc0456d8297e3c1ac584 Mon Sep 17 00:00:00 2001
From: Qi Li <qili93@qq.com>
Date: Thu, 10 Sep 2020 15:23:42 +0800
Subject: [PATCH 035/261] [UT] fix run type of ut test cases of
 test_train_recognize_digits and test_api_impl, test=develop (#27218)

---
 paddle/fluid/inference/api/CMakeLists.txt | 2 --
 paddle/fluid/train/CMakeLists.txt         | 2 --
 2 files changed, 4 deletions(-)

diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index fb0ad31a3e6..c0d3b14e0e4 100755
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -53,12 +53,10 @@ if(WITH_TESTING)
     inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_fluid_shared
                         ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
     set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
-    set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST")
   elseif(WIN32)
     inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
                         ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
     set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
-    set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST")
   endif()
 
 endif()
diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt
index 7eab677fac1..235d92ac4f9 100644
--- a/paddle/fluid/train/CMakeLists.txt
+++ b/paddle/fluid/train/CMakeLists.txt
@@ -27,8 +27,6 @@ function(train_test TARGET_NAME)
         endif()
         set_tests_properties(test_train_${TARGET_NAME}${arg}
                 PROPERTIES DEPENDS test_${TARGET_NAME})
-        set_tests_properties(test_train_${TARGET_NAME}${arg}
-                PROPERTIES LABELS "RUN_TYPE=DIST")
         if(NOT WIN32 AND NOT APPLE)
             set_tests_properties(test_train_${TARGET_NAME}${arg}
                     PROPERTIES TIMEOUT 150)
-- 
GitLab


From 7c7fbd32181cf30d9994623f29590160610d9c78 Mon Sep 17 00:00:00 2001
From: Qi Li <qili93@qq.com>
Date: Thu, 10 Sep 2020 15:27:19 +0800
Subject: [PATCH 036/261] fix error msg of fused_embedding_fc_lstm_op,
 test=develop (#27231)

---
 .../fused/fused_embedding_fc_lstm_op.cc        | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
index c698cb1405f..79fa268f388 100644
--- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
@@ -367,8 +367,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
     auto blas = math::GetBlas<DeviceContext, T>(ctx);
 
     for (int64_t i = 0; i < ids_numel; ++i) {
-      PADDLE_ENFORCE_LT(ids_data[i], row_number);
-      PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i);
+      PADDLE_ENFORCE_LT(
+          ids_data[i], row_number,
+          platform::errors::OutOfRange(
+              "Value of Ids %d should less than dict size %d.", i, row_number));
+      PADDLE_ENFORCE_GE(ids_data[i], 0,
+                        platform::errors::OutOfRange(
+                            "Value of Ids %d should greater than ZERO.", i));
       memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
              row_width * sizeof(T));
     }
@@ -473,8 +478,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
     auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
 
     for (int64_t i = 0; i < ids_numel; ++i) {
-      PADDLE_ENFORCE_LT(ids_data[i], row_number);
-      PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i);
+      PADDLE_ENFORCE_LT(
+          ids_data[i], row_number,
+          platform::errors::OutOfRange(
+              "Value of Ids %d should less than dict size %d.", i, row_number));
+      PADDLE_ENFORCE_GE(ids_data[i], 0,
+                        platform::errors::OutOfRange(
+                            "Value of Ids %d should greater than ZERO.", i));
       memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
              row_width * sizeof(T));
     }
-- 
GitLab


From 58a88ba9af79a720074b23892fb137fc7642449c Mon Sep 17 00:00:00 2001
From: lilong12 <lilong12@baidu.com>
Date: Thu, 10 Sep 2020 15:46:39 +0800
Subject: [PATCH 037/261] add double grad for expand (#27183)

* add double grad for expand, test=develop
---
 paddle/fluid/operators/expand_op.cc           | 22 +++++++++++++++++
 .../fluid/tests/unittests/test_nn_grad.py     | 24 +++++++++++++++++++
 2 files changed, 46 insertions(+)

diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc
index 3c898ac29f0..83e205367a7 100644
--- a/paddle/fluid/operators/expand_op.cc
+++ b/paddle/fluid/operators/expand_op.cc
@@ -228,6 +228,26 @@ class ExpandGradOpMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
+template <typename T>
+class ExpandDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
+    op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
+    if (this->HasInput("expand_times_tensor")) {
+      op->SetInput("expand_times_tensor", this->Input("expand_times_tensor"));
+    }
+    if (this->HasInput("ExpandTimes")) {
+      op->SetInput("ExpandTimes", this->Input("ExpandTimes"));
+    }
+    op->SetAttrMap(this->Attrs());
+    op->SetType("expand");
+  }
+};
+
 DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandGradNoNeedBufVarsInferer, "X");
 
 }  // namespace operators
@@ -238,6 +258,8 @@ REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
                   ops::ExpandGradOpMaker<paddle::framework::OpDesc>,
                   ops::ExpandGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp,
+                  ops::ExpandDoubleGradOpMaker<paddle::framework::OpDesc>,
+                  ops::ExpandDoubleGradOpMaker<paddle::imperative::OpBase>,
                   ops::ExpandGradNoNeedBufVarsInferer);
 REGISTER_OP_CPU_KERNEL(
     expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py
index 0c39dc5e731..0e4f89f6026 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py
@@ -153,6 +153,30 @@ class TestMulDoubleGradCheck(unittest.TestCase):
 
 
 class TestReshapeDoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        x_shape = [3, 12]
+        expand_times = [4, 9]
+        eps = 0.005
+        dtype = np.float64
+
+        x = layers.data('x', x_shape, False, dtype)
+        x.persistable = True
+        out = layers.expand(x, expand_times)
+        x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
+
+        gradient_checker.double_grad_check(
+            [x], out, x_init=x_arr, place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
+class TestExpandDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
         x_shape = [3, 12]
-- 
GitLab


From c5f957ae387f6193a399c8749f923f1635df19e3 Mon Sep 17 00:00:00 2001
From: lilong12 <lilong12@baidu.com>
Date: Thu, 10 Sep 2020 15:55:09 +0800
Subject: [PATCH 038/261] add double grad for tile op and expand_v2 op (#27114)

* add double grad for tile, test=develop

* add double grad for expand_v2 op, test=develop
---
 paddle/fluid/operators/expand_v2_op.cc        | 22 +++++++++
 paddle/fluid/operators/tile_op.cc             | 22 +++++++++
 .../fluid/tests/unittests/test_nn_grad.py     | 49 +++++++++++++++++++
 3 files changed, 93 insertions(+)

diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc
index 359d512c341..a1ee47b7f93 100644
--- a/paddle/fluid/operators/expand_v2_op.cc
+++ b/paddle/fluid/operators/expand_v2_op.cc
@@ -230,6 +230,26 @@ class ExpandV2GradOpMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
+template <typename T>
+class ExpandV2DoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("expand_v2");
+    op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
+    op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
+    if (this->HasInput("expand_shapes_tensor")) {
+      op->SetInput("expand_shapes_tensor", this->Input("expand_shapes_tensor"));
+    }
+    if (this->HasInput("Shape")) {
+      op->SetInput("Shape", this->Input("Shape"));
+    }
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
 DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandV2GradNoNeedBufVarsInferer, "X");
 
 }  // namespace operators
@@ -240,6 +260,8 @@ REGISTER_OPERATOR(expand_v2, ops::ExpandV2Op, ops::ExpandV2OpMaker,
                   ops::ExpandV2GradOpMaker<paddle::framework::OpDesc>,
                   ops::ExpandV2GradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(expand_v2_grad, ops::ExpandV2GradOp,
+                  ops::ExpandV2DoubleGradOpMaker<paddle::framework::OpDesc>,
+                  ops::ExpandV2DoubleGradOpMaker<paddle::imperative::OpBase>,
                   ops::ExpandV2GradNoNeedBufVarsInferer);
 REGISTER_OP_CPU_KERNEL(
     expand_v2, ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc
index da4ca87296d..bc1cb3b4aa1 100644
--- a/paddle/fluid/operators/tile_op.cc
+++ b/paddle/fluid/operators/tile_op.cc
@@ -241,6 +241,26 @@ class TileGradOpMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
+template <typename T>
+class TileDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("tile");
+    op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
+    op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
+    if (this->HasInput("repeat_times_tensor")) {
+      op->SetInput("repeat_times_tensor", this->Input("repeat_times_tensor"));
+    }
+    if (this->HasInput("RepeatTimes")) {
+      op->SetInput("RepeatTimes", this->Input("RepeatTimes"));
+    }
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
 DECLARE_NO_NEED_BUFFER_VARS_INFERER(TileGradNoNeedBufVarsInferer, "X");
 
 }  // namespace operators
@@ -251,6 +271,8 @@ REGISTER_OPERATOR(tile, ops::TileOp, ops::TileOpMaker,
                   ops::TileGradOpMaker<paddle::framework::OpDesc>,
                   ops::TileGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(tile_grad, ops::TileGradOp,
+                  ops::TileDoubleGradOpMaker<paddle::framework::OpDesc>,
+                  ops::TileDoubleGradOpMaker<paddle::imperative::OpBase>,
                   ops::TileGradNoNeedBufVarsInferer);
 REGISTER_OP_CPU_KERNEL(
     tile, ops::TileKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py
index 0e4f89f6026..5d1e016287e 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 import unittest
 import numpy as np
 
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 import paddle.fluid.core as core
@@ -200,5 +201,53 @@ class TestExpandDoubleGradCheck(unittest.TestCase):
             self.func(p)
 
 
+class TestTileDoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        x_shape = [3, 12]
+        repeat_times = [4, 9]
+        eps = 0.005
+        dtype = np.float64
+
+        x = layers.data('x', x_shape, False, dtype)
+        x.persistable = True
+        out = paddle.tile(x, repeat_times)
+        x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
+
+        gradient_checker.double_grad_check(
+            [x], out, x_init=x_arr, place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
+class TestExpandV2DoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        x_shape = [1, 12]
+        new_shape = [4, 12]
+        eps = 0.005
+        dtype = np.float64
+
+        x = layers.data('x', x_shape, False, dtype)
+        x.persistable = True
+        out = paddle.expand(x, new_shape)
+        x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
+
+        gradient_checker.double_grad_check(
+            [x], out, x_init=x_arr, place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab


From f6be5989fda4445d2b4d8b84f62a012b1d30f1c2 Mon Sep 17 00:00:00 2001
From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Date: Thu, 10 Sep 2020 16:57:40 +0800
Subject: [PATCH 039/261] Reduce the parallel compile count (#27187)

---
 paddle/scripts/paddle_build.bat | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 1616e237092..15610abef0f 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -154,6 +154,7 @@ echo    Step 2. Buile Paddle ...
 echo    ========================================
 call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64
 
+for /F %%# in ('wmic cpu get NumberOfLogicalProcessors^|findstr [0-9]') do set /a PARALLEL_PROJECT_COUNT=%%#*8/10
 set build_times=1
 :build_tp
 echo Build third_party the %build_times% time:
@@ -172,7 +173,7 @@ echo Build third_party successfully!
 set build_times=1
 :build_paddle
 echo Build Paddle the %build_times% time:
-msbuild /m /p:Configuration=Release /verbosity:minimal paddle.sln
+msbuild /m:%PARALLEL_PROJECT_COUNT% /p:Configuration=Release /verbosity:minimal paddle.sln
 if %ERRORLEVEL% NEQ 0 (
     set /a build_times=%build_times%+1
     if %build_times% GTR 2 (
-- 
GitLab


From ece74c4cd49189480bb196904e2577eba7653990 Mon Sep 17 00:00:00 2001
From: Zhen Wang <wangzhen31@baidu.com>
Date: Thu, 10 Sep 2020 19:11:18 +0800
Subject: [PATCH 040/261] Update the _get_fake_quant_type definition in
 imperative QAT. (#27222)

---
 .../slim/quantization/imperative/qat.py       |  1 -
 .../slim/quantization/imperative/quant_nn.py  | 49 ++++++++++++++-----
 2 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
index 5662284483b..8d399c92901 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -192,7 +192,6 @@ class ImperativeQuantAware(object):
         assert len(input_dtype) == len(
             feed), "The length of input_shape should be equal to  feed's."
 
-        prog_trans = dygraph.ProgramTranslator()
         with dygraph.guard():
             model.eval()
             input_vars = []
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py
index 59dd9867abb..e22c980b0a7 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py
@@ -209,15 +209,24 @@ class FakeQuantAbsMax(layers.Layer):
         return quant_out
 
 
-def _get_fake_quant_type(quant_type, name, moving_rate, quant_bits, dtype,
-                         quant_on_weight):
+def _get_fake_quant_type(quant_type, **kwargs):
+    call_args = {
+        "name": kwargs.get("name", None),
+        "quant_bits": kwargs.get("quant_bits", 8),
+        "dtype": kwargs.get("dtype", "float32")
+    }
+
+    if quant_type == 'abs_max':
+        call_args["quant_on_weight"] = kwargs.get("quant_on_weight", False)
+    elif quant_type == 'moving_average_abs_max':
+        call_args["moving_rate"] = kwargs.get("moving_rate", 0.9)
+
     fake_quant_map = {
-        'abs_max':
-        lambda: FakeQuantAbsMax(name, quant_bits, dtype, quant_on_weight),
-        'moving_average_abs_max':
-        lambda: FakeQuantMovingAverage(name, moving_rate, quant_bits, dtype)
+        'abs_max': FakeQuantAbsMax,
+        'moving_average_abs_max': FakeQuantMovingAverage
     }
-    return fake_quant_map[quant_type]()
+
+    return fake_quant_map[quant_type](**call_args)
 
 
 class QuantizedConv2D(layers.Layer):
@@ -247,11 +256,18 @@ class QuantizedConv2D(layers.Layer):
         self.bias = getattr(layer, 'bias')
         # For FakeQuant
         self._fake_quant_weight = _get_fake_quant_type(
-            weight_quantize_type, self.weight.name, moving_rate, weight_bits,
-            self._dtype, True)
+            weight_quantize_type,
+            name=self.weight.name,
+            moving_rate=moving_rate,
+            quant_bits=weight_bits,
+            dtype=self._dtype,
+            quant_on_weight=True)
         self._fake_quant_input = _get_fake_quant_type(
             activation_quantize_type,
-            layer.full_name(), moving_rate, activation_bits, self._dtype, False)
+            name=layer.full_name(),
+            moving_rate=moving_rate,
+            quant_bits=activation_bits,
+            dtype=self._dtype)
 
     def forward(self, input):
         quant_input = self._fake_quant_input(input)
@@ -326,11 +342,18 @@ class QuantizedLinear(layers.Layer):
         self.bias = getattr(layer, 'bias')
         # For FakeQuant
         self._fake_quant_weight = _get_fake_quant_type(
-            weight_quantize_type, self.weight.name, moving_rate, weight_bits,
-            self._dtype, True)
+            weight_quantize_type,
+            name=self.weight.name,
+            moving_rate=moving_rate,
+            quant_bits=weight_bits,
+            dtype=self._dtype,
+            quant_on_weight=True)
         self._fake_quant_input = _get_fake_quant_type(
             activation_quantize_type,
-            layer.full_name(), moving_rate, activation_bits, self._dtype, False)
+            name=layer.full_name(),
+            moving_rate=moving_rate,
+            quant_bits=activation_bits,
+            dtype=self._dtype)
 
     def forward(self, input):
         quant_input = self._fake_quant_input(input)
-- 
GitLab


From b6715386236f4db49cb47cb13b6afd21b474341c Mon Sep 17 00:00:00 2001
From: Zhen Wang <wangzhen31@baidu.com>
Date: Thu, 10 Sep 2020 19:38:18 +0800
Subject: [PATCH 041/261] * Reduce the training iterations in
 test_fetch_unmerged and test_fuse_bn_act_pass. (#27234)

* Use the single GPU card to execute the test_fuse_bn_act_pass UT.
---
 python/paddle/fluid/tests/unittests/CMakeLists.txt     |  2 +-
 .../fluid/tests/unittests/test_fetch_unmerged.py       | 10 +++++-----
 .../fluid/tests/unittests/test_fuse_bn_act_pass.py     |  6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index b496b7953a9..102bacff963 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -559,7 +559,7 @@ endif()
 set_tests_properties(test_parallel_executor_test_while_train test_parallel_executor_mnist
         test_parallel_executor_feed_persistable_var
         test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass
-        test_data_norm_op test_imperative_using_non_zero_gpu test_fuse_bn_act_pass
+        test_data_norm_op test_imperative_using_non_zero_gpu
         test_dataloader_keep_order
         test_dataloader_unkeep_order
         test_parallel_executor_fetch_isolated_var
diff --git a/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py b/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py
index 1181272bd98..37d269e3369 100644
--- a/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py
+++ b/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py
@@ -28,7 +28,7 @@ class TestFetchUnmerged(unittest.TestCase):
         conv_pool_1 = fluid.nets.simple_img_conv_pool(
             input=img,
             filter_size=5,
-            num_filters=20,
+            num_filters=8,
             pool_size=2,
             pool_stride=2,
             pool_type='max',
@@ -37,12 +37,12 @@ class TestFetchUnmerged(unittest.TestCase):
         conv_pool_2 = fluid.nets.simple_img_conv_pool(
             input=conv_pool_1,
             filter_size=5,
-            num_filters=50,
+            num_filters=16,
             pool_size=2,
             pool_stride=2,
             pool_type='avg',
             act="relu")
-        hidden = fluid.layers.fc(input=conv_pool_2, size=100, act='relu')
+        hidden = fluid.layers.fc(input=conv_pool_2, size=32, act='relu')
         prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
         loss = fluid.layers.cross_entropy(input=prediction, label=label)
         avg_loss = fluid.layers.mean(loss)
@@ -75,8 +75,8 @@ class TestFetchUnmerged(unittest.TestCase):
         binary = fluid.CompiledProgram(main_program).with_data_parallel(
             loss_name=loss.name, build_strategy=build_strategy)
 
-        iters = 3
-        batch_size = 64
+        iters = 2
+        batch_size = 16
         train_reader = paddle.batch(
             paddle.reader.shuffle(
                 paddle.dataset.mnist.train(), buf_size=500),
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py
index 921dbdbc6d4..5bcfc8720dd 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py
@@ -25,7 +25,7 @@ class TestFuseBatchNormActPass(unittest.TestCase):
             hidden1 = fluid.layers.conv2d(
                 input=x,
                 filter_size=3,
-                num_filters=32,
+                num_filters=16,
                 stride=1,
                 padding=1,
                 act=None,
@@ -43,7 +43,7 @@ class TestFuseBatchNormActPass(unittest.TestCase):
                 bias_attr=bias_attr,
                 act='relu',
                 data_layout='NHWC')
-            hidden3 = fluid.layers.fc(input=hidden2, size=128, act='relu')
+            hidden3 = fluid.layers.fc(input=hidden2, size=32, act='relu')
             hidden4 = fluid.layers.batch_norm(
                 input=hidden3, act='relu', data_layout='NHWC')
             prediction = fluid.layers.fc(input=hidden4, size=10, act='softmax')
@@ -63,7 +63,7 @@ class TestFuseBatchNormActPass(unittest.TestCase):
         startup_program = fluid.Program()
         x, y, loss = self.build_program(main_program, startup_program, use_cuda)
         exe = fluid.Executor(place)
-        iters = 10
+        iters = 8
         batch_size = 16
         feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
 
-- 
GitLab


From 2e59769612f6c9680cdfe89db6c56adc18693739 Mon Sep 17 00:00:00 2001
From: furnace <34057289+windstamp@users.noreply.github.com>
Date: Fri, 11 Sep 2020 09:02:04 +0800
Subject: [PATCH 042/261] add empty op (c++, python, unit test) (#26659)

---
 paddle/fluid/operators/empty_op.cc            | 132 +++++++++
 paddle/fluid/operators/empty_op.cu.cc         |  26 ++
 paddle/fluid/operators/empty_op.h             |  45 +++
 paddle/fluid/operators/fill_constant_op.h     |  24 +-
 paddle/fluid/operators/gaussian_random_op.cc  |   3 +-
 paddle/fluid/operators/gaussian_random_op.cu  |   3 +-
 .../mkldnn/gaussian_random_mkldnn_op.cc       |   3 +-
 paddle/fluid/operators/utils.h                |  21 ++
 python/paddle/__init__.py                     |   1 +
 .../fluid/tests/unittests/test_empty_op.py    | 270 ++++++++++++++++++
 python/paddle/tensor/__init__.py              |   1 +
 python/paddle/tensor/creation.py              |  88 ++++++
 12 files changed, 588 insertions(+), 29 deletions(-)
 create mode 100644 paddle/fluid/operators/empty_op.cc
 create mode 100644 paddle/fluid/operators/empty_op.cu.cc
 create mode 100644 paddle/fluid/operators/empty_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_empty_op.py

diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc
new file mode 100644
index 00000000000..f539e2e6f6d
--- /dev/null
+++ b/paddle/fluid/operators/empty_op.cc
@@ -0,0 +1,132 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/empty_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class EmptyOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("ShapeTensor",
+             "(Tensor<int>), optional). The shape of the output."
+             "It has a higher priority than Attr(shape).")
+        .AsDispensable();
+    AddInput("ShapeTensorList",
+             "(vector<Tensor<int>>, optional). The shape of the output. "
+             "It has a higher priority than Attr(shape)."
+             "The shape of the element in vector must be [1].")
+        .AsDuplicable()
+        .AsDispensable();
+    AddAttr<std::vector<int64_t>>("shape",
+                                  "(vector<int64_t>) The shape of the output")
+        .SetDefault({});
+    AddAttr<int>("dtype", "The data type of output tensor, Default is float")
+        .SetDefault(framework::proto::VarType::FP32);
+    AddOutput("Out", "(Tensor) The output tensor.");
+    AddComment(R"DOC(empty operator
+Returns a tensor filled with uninitialized data. The shape of the tensor is
+defined by the variable argument shape.
+
+
+The type of the tensor is specify by `dtype`.
+)DOC");
+  }
+};
+
+class EmptyOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* context) const override {
+    OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty");
+
+    if (context->HasInput("ShapeTensor")) {
+      auto dims = context->GetInputDim("ShapeTensor");
+      int num_ele = 1;
+      for (int i = 0; i < dims.size(); ++i) {
+        num_ele *= dims[i];
+      }
+
+      context->SetOutputDim("Out", framework::make_ddim({num_ele}));
+    } else if (context->HasInputs("ShapeTensorList")) {
+      std::vector<int> out_dims;
+      auto dims_list = context->GetInputsDim("ShapeTensorList");
+      for (size_t i = 0; i < dims_list.size(); ++i) {
+        auto& dims = dims_list[i];
+        PADDLE_ENFORCE_EQ(
+            dims, framework::make_ddim({1}),
+            "ShapeError: The shape of Tensor in list must be [1]. "
+            "But received the shape "
+            "is [%s]",
+            dims);
+
+        out_dims.push_back(dims[0]);
+      }
+
+      context->SetOutputDim("Out", framework::make_ddim(out_dims));
+    } else {
+      auto& shape = context->Attrs().Get<std::vector<int64_t>>("shape");
+      context->SetOutputDim("Out", framework::make_ddim(shape));
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const framework::Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    if (var_name == "ShapeTensor" || var_name == "ShapeTensorList") {
+      return expected_kernel_type;
+    } else {
+      return framework::OpKernelType(expected_kernel_type.data_type_,
+                                     tensor.place(), tensor.layout());
+    }
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& context) const override {
+    return framework::OpKernelType(
+        framework::proto::VarType::Type(context.Attr<int>("dtype")),
+        context.GetPlace());
+  }
+};
+
+class EmptyOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(framework::InferVarTypeContext* context) const override {
+    auto data_type = static_cast<framework::proto::VarType::Type>(
+        BOOST_GET_CONST(int, context->GetAttr("dtype")));
+    context->SetOutputDataType("Out", data_type);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OPERATOR(
+    empty, ops::EmptyOp, ops::EmptyOpMaker, ops::EmptyOpVarTypeInference,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+
+REGISTER_OP_CPU_KERNEL(empty, ops::EmptyKernel<plat::CPUDeviceContext, bool>,
+                       ops::EmptyKernel<plat::CPUDeviceContext, int>,
+                       ops::EmptyKernel<plat::CPUDeviceContext, int64_t>,
+                       ops::EmptyKernel<plat::CPUDeviceContext, float>,
+                       ops::EmptyKernel<plat::CPUDeviceContext, double>,
+                       ops::EmptyKernel<plat::CPUDeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/empty_op.cu.cc b/paddle/fluid/operators/empty_op.cu.cc
new file mode 100644
index 00000000000..22799e507ae
--- /dev/null
+++ b/paddle/fluid/operators/empty_op.cu.cc
@@ -0,0 +1,26 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/empty_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    empty, ops::EmptyKernel<plat::CUDADeviceContext, bool>,
+    ops::EmptyKernel<plat::CUDADeviceContext, int>,
+    ops::EmptyKernel<plat::CUDADeviceContext, int64_t>,
+    ops::EmptyKernel<plat::CUDADeviceContext, float>,
+    ops::EmptyKernel<plat::CUDADeviceContext, double>,
+    ops::EmptyKernel<plat::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/empty_op.h b/paddle/fluid/operators/empty_op.h
new file mode 100644
index 00000000000..9c913776838
--- /dev/null
+++ b/paddle/fluid/operators/empty_op.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/utils.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class EmptyKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto dtype = static_cast<framework::proto::VarType::Type>(
+        context.Attr<int>("dtype"));
+
+    Tensor *out_tensor = context.Output<Tensor>("Out");
+
+    auto shape = GetShape(context);
+    out_tensor->Resize(shape);
+
+    out_tensor->mutable_data(context.GetPlace(), dtype);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h
index 74939da08b3..6fea8fe98bf 100644
--- a/paddle/fluid/operators/fill_constant_op.h
+++ b/paddle/fluid/operators/fill_constant_op.h
@@ -27,27 +27,6 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-inline framework::DDim GetShape(const framework::ExecutionContext &ctx,
-                                std::string op_type) {
-  // 1. shape is a Tensor
-  if (ctx.HasInput("ShapeTensor")) {
-    auto *shape_tensor = ctx.Input<framework::LoDTensor>("ShapeTensor");
-    auto vec_shape = GetDataFromTensor<int>(shape_tensor);
-    return framework::make_ddim(vec_shape);
-  }
-
-  // 2. shape is a list/tuple containing Tensor
-  auto shape_tensor_list = ctx.MultiInput<framework::Tensor>("ShapeTensorList");
-  if (shape_tensor_list.size() > 0) {
-    auto vec_shape = GetDataFromTensorList(shape_tensor_list);
-    return framework::make_ddim(vec_shape);
-  }
-
-  // 3. shape is a list/tuple without containing Tensor
-  auto vec_shape = ctx.Attr<std::vector<int64_t>>("shape");
-  return framework::make_ddim(vec_shape);
-}
-
 template <typename T>
 class FillConstantKernel : public framework::OpKernel<T> {
  public:
@@ -93,8 +72,7 @@ class FillConstantKernel : public framework::OpKernel<T> {
       }
       value = tensor_data[0];
     }
-    const std::string op_type = "fill_constant";
-    auto shape = GetShape(ctx, op_type);
+    auto shape = GetShape(ctx);
 
     if (out_var->IsType<framework::LoDTensor>()) {
       tensor = out_var->GetMutable<framework::LoDTensor>();
diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc
index 4f128463375..17a71c67b8a 100644
--- a/paddle/fluid/operators/gaussian_random_op.cc
+++ b/paddle/fluid/operators/gaussian_random_op.cc
@@ -34,8 +34,7 @@ class CPUGaussianRandomKernel : public framework::OpKernel<T> {
     auto* tensor = context.Output<framework::Tensor>("Out");
 
     std::normal_distribution<T> dist(mean, std);
-    const std::string op_type = "gaussian_random";
-    auto shape = GetShape(context, op_type);
+    auto shape = GetShape(context);
     tensor->Resize(shape);
     int64_t size = tensor->numel();
     T* data = tensor->mutable_data<T>(context.GetPlace());
diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu
index 69c8b600406..7a0c93eb1b2 100644
--- a/paddle/fluid/operators/gaussian_random_op.cu
+++ b/paddle/fluid/operators/gaussian_random_op.cu
@@ -58,8 +58,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
     T mean = static_cast<T>(context.Attr<float>("mean"));
     T std = static_cast<T>(context.Attr<float>("std"));
     thrust::counting_iterator<unsigned int> index_sequence_begin(0);
-    const std::string op_type = "gaussian_random";
-    auto shape = GetShape(context, op_type);
+    auto shape = GetShape(context);
     tensor->Resize(shape);
     T* data = tensor->mutable_data<T>(context.GetPlace());
 
diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
index 98200caca8c..51fa5ad021a 100644
--- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
@@ -30,8 +30,7 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
     float std = context.Attr<float>("std");
     auto* tensor = context.Output<framework::Tensor>("Out");
 
-    const std::string op_type = "gaussian_random";
-    auto shape = GetShape(context, op_type);
+    auto shape = GetShape(context);
     tensor->Resize(shape);
     T* data = tensor->mutable_data<T>(context.GetPlace());
     int64_t size = tensor->numel();
diff --git a/paddle/fluid/operators/utils.h b/paddle/fluid/operators/utils.h
index e53981a5365..aec995304a7 100644
--- a/paddle/fluid/operators/utils.h
+++ b/paddle/fluid/operators/utils.h
@@ -81,5 +81,26 @@ inline std::vector<T> GetDataFromTensorList(
   }
   return vec_new_data;
 }
+
+inline framework::DDim GetShape(const framework::ExecutionContext& ctx) {
+  // 1. shape is a Tensor
+  if (ctx.HasInput("ShapeTensor")) {
+    auto* shape_tensor = ctx.Input<framework::LoDTensor>("ShapeTensor");
+    auto vec_shape = GetDataFromTensor<int>(shape_tensor);
+    return framework::make_ddim(vec_shape);
+  }
+
+  // 2. shape is a list/tuple containing Tensor
+  auto shape_tensor_list = ctx.MultiInput<framework::Tensor>("ShapeTensorList");
+  if (shape_tensor_list.size() > 0) {
+    auto vec_shape = GetDataFromTensorList(shape_tensor_list);
+    return framework::make_ddim(vec_shape);
+  }
+
+  // 3. shape is a list/tuple without containing Tensor
+  auto vec_shape = ctx.Attr<std::vector<int64_t>>("shape");
+  return framework::make_ddim(vec_shape);
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index d5793eb424a..ed0b415d0bf 100755
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -75,6 +75,7 @@ from .tensor.creation import full_like  #DEFINE_ALIAS
 from .tensor.creation import triu  #DEFINE_ALIAS
 from .tensor.creation import tril  #DEFINE_ALIAS
 from .tensor.creation import meshgrid  #DEFINE_ALIAS
+from .tensor.creation import empty  #DEFINE_ALIAS
 from .tensor.linalg import matmul  #DEFINE_ALIAS
 from .tensor.linalg import dot  #DEFINE_ALIAS
 # from .tensor.linalg import einsum        #DEFINE_ALIAS
diff --git a/python/paddle/fluid/tests/unittests/test_empty_op.py b/python/paddle/fluid/tests/unittests/test_empty_op.py
new file mode 100644
index 00000000000..e8b1f836fca
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_empty_op.py
@@ -0,0 +1,270 @@
+#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from op_test import OpTest
+from paddle.fluid import Program, program_guard
+from paddle.fluid.framework import convert_np_dtype_to_dtype_
+
+
+# Situation 1: Attr(shape) is a list(without tensor)
+class TestEmptyOp(OpTest):
+    def setUp(self):
+        self.op_type = "empty"
+        self.init_config()
+
+    def test_check_output(self):
+        self.check_output_customized(self.verify_output)
+
+    def verify_output(self, outs):
+        data_type = outs[0].dtype
+        if data_type in ['float32', 'float64', 'int32', 'int64']:
+            max_value = np.nanmax(outs[0])
+            min_value = np.nanmin(outs[0])
+
+            always_full_zero = max_value == 0.0 and min_value == 0.0
+            always_non_full_zero = max_value > min_value
+            self.assertTrue(always_full_zero or always_non_full_zero,
+                            'always_full_zero or always_non_full_zero.')
+        elif data_type in ['bool']:
+            total_num = outs[0].size
+            true_num = np.sum(outs[0] == True)
+            false_num = np.sum(outs[0] == False)
+            self.assertTrue(total_num == true_num + false_num,
+                            'The value should always be True or False.')
+        else:
+            self.assertTrue(False, 'invalid data type')
+
+    def init_config(self):
+        shape = [500, 3]
+        dtype = 'float32'
+        dtype_inner = convert_np_dtype_to_dtype_(dtype)
+        self.attrs = {'shape': shape, 'dtype': dtype_inner}
+        self.inputs = {}
+        self.outputs = {'Out': np.zeros(shape).astype(dtype)}
+
+
+class TestEmptyOp2(TestEmptyOp):
+    def init_config(self):
+        shape = [500, 3]
+        dtype = 'float64'
+        dtype_inner = convert_np_dtype_to_dtype_(dtype)
+        self.attrs = {'shape': shape, 'dtype': dtype_inner}
+        self.inputs = {}
+        self.outputs = {'Out': np.zeros(shape).astype(dtype)}
+
+
+class TestEmptyOp3(TestEmptyOp):
+    def init_config(self):
+        shape = [500, 3]
+        dtype = 'int32'
+        dtype_inner = convert_np_dtype_to_dtype_(dtype)
+        self.attrs = {'shape': shape, 'dtype': dtype_inner}
+        self.inputs = {}
+        self.outputs = {'Out': np.zeros(shape).astype(dtype)}
+
+
+class TestEmptyOp4(TestEmptyOp):
+    def init_config(self):
+        shape = [500, 3]
+        dtype = 'int64'
+        dtype_inner = convert_np_dtype_to_dtype_(dtype)
+        self.attrs = {'shape': shape, 'dtype': dtype_inner}
+        self.inputs = {}
+        self.outputs = {'Out': np.zeros(shape).astype(dtype)}
+
+
+class TestEmptyOp5(TestEmptyOp):
+    def init_config(self):
+        shape = [500, 3]
+        dtype = 'bool'
+        dtype_inner = convert_np_dtype_to_dtype_(dtype)
+        self.attrs = {'shape': shape, 'dtype': dtype_inner}
+        self.inputs = {}
+        self.outputs = {'Out': np.zeros(shape).astype(dtype)}
+
+
+# Situation 2: shape is a tensor
+class TestEmptyOp_ShapeTensor(OpTest):
+    def setUp(self):
+        self.op_type = "empty"
+        self.init_config()
+
+    def init_config(self):
+        self.shape = [500, 3]
+        dtype = 'float32'
+        dtype_inner = convert_np_dtype_to_dtype_(dtype)
+        self.attrs = {'dtype': dtype_inner}
+        self.inputs = {"ShapeTensor": np.array(self.shape).astype("int32")}
+        self.outputs = {'Out': np.zeros(self.shape).astype(dtype)}
+
+    def test_check_output(self):
+        self.check_output_customized(self.verify_output)
+
+    def verify_output(self, outs):
+        data_type = outs[0].dtype
+        if data_type in ['float32', 'float64', 'int32', 'int64']:
+            max_value = np.nanmax(outs[0])
+            min_value = np.nanmin(outs[0])
+
+            always_full_zero = max_value == 0.0 and min_value == 0.0
+            always_non_full_zero = max_value > min_value
+            self.assertTrue(always_full_zero or always_non_full_zero,
+                            'always_full_zero or always_non_full_zero.')
+        elif data_type in ['bool']:
+            total_num = outs[0].size
+            true_num = np.sum(outs[0] == True)
+            false_num = np.sum(outs[0] == False)
+            self.assertTrue(total_num == true_num + false_num,
+                            'The value should always be True or False.')
+        else:
+            self.assertTrue(False, 'invalid data type')
+
+
+# Situation 3: Attr(shape) is a list(with tensor)
+class TestEmptyOp_ShapeTensorList(OpTest):
+    def setUp(self):
+        self.op_type = "empty"
+        self.init_config()
+
+    def init_config(self):
+        self.shape = [123, 92]
+        self.infer_shape = [-1, 92]
+
+        dtype = 'float32'
+        dtype_inner = convert_np_dtype_to_dtype_(dtype)
+
+        shape_tensor_list = []
+        for index, ele in enumerate(self.shape):
+            shape_tensor_list.append(("x" + str(index), np.ones(
+                (1)).astype('int32') * ele))
+
+        self.inputs = {"ShapeTensorList": shape_tensor_list}
+        self.attrs = {'shape': self.infer_shape, 'dtype': dtype_inner}
+        self.outputs = {'Out': np.zeros(self.shape).astype(dtype)}
+
+    def test_check_output(self):
+        self.check_output_customized(self.verify_output)
+
+    def verify_output(self, outs):
+        data_type = outs[0].dtype
+        if data_type in ['float32', 'float64', 'int32', 'int64']:
+            max_value = np.nanmax(outs[0])
+            min_value = np.nanmin(outs[0])
+
+            always_full_zero = max_value == 0.0 and min_value == 0.0
+            always_non_full_zero = max_value > min_value
+            self.assertTrue(always_full_zero or always_non_full_zero,
+                            'always_full_zero or always_non_full_zero.')
+        elif data_type in ['bool']:
+            total_num = outs[0].size
+            true_num = np.sum(outs[0] == True)
+            false_num = np.sum(outs[0] == False)
+            self.assertTrue(total_num == true_num + false_num,
+                            'The value should always be True or False.')
+        else:
+            self.assertTrue(False, 'invalid data type')
+
+
+class TestEmptyAPI(unittest.TestCase):
+    def __check_out__(self, out, dtype='float32'):
+        max_value = np.nanmax(np.array(out))
+        min_value = np.nanmin(np.array(out))
+        always_non_full_zero = max_value > min_value
+        always_full_zero = max_value == 0.0 and min_value == 0.0
+        self.assertTrue(always_full_zero or always_non_full_zero,
+                        'always_full_zero or always_non_full_zero.')
+
+    def test_dygraph_api_out(self):
+        paddle.disable_static()
+        shape = [200, 3]
+        out = paddle.empty(shape=shape)
+        self.__check_out__(out)
+        paddle.enable_static()
+
+    def test_dygraph_api_out_2(self):
+        paddle.disable_static()
+        shape_data = np.array([200, 3]).astype('int32')
+        shape = paddle.to_tensor(shape_data)
+        out = paddle.empty(shape=shape)
+        self.__check_out__(out)
+        paddle.enable_static()
+
+    def test_dygraph_api_out_3(self):
+        paddle.disable_static()
+        shape_data = np.array([200, 3]).astype('int64')
+        shape = paddle.to_tensor(shape_data)
+        out = paddle.empty(shape=shape)
+        self.__check_out__(out)
+        paddle.enable_static()
+
+    def test_dygraph_api_attr(self):
+        paddle.disable_static()
+        shape = [200, 3]
+        dtype = 'float64'
+        out = paddle.empty(shape=shape, dtype=dtype)
+        self.__check_out__(out, dtype)
+        paddle.enable_static()
+
+    def test_static_graph(self):
+        dtype = 'float64'
+
+        positive_2_int32 = fluid.layers.fill_constant([1], "int32", 3)
+        positive_2_int64 = fluid.layers.fill_constant([1], "int64", 3)
+
+        shape_tensor_int32 = fluid.data(
+            name="shape_tensor_int32", shape=[2], dtype="int32")
+        shape_tensor_int64 = fluid.data(
+            name="shape_tensor_int64", shape=[2], dtype="int64")
+
+        out_1 = paddle.empty(shape=[200, 3], dtype=dtype)
+        out_2 = paddle.empty(shape=shape_tensor_int32, dtype=dtype)
+        out_3 = paddle.empty(shape=shape_tensor_int64, dtype=dtype)
+        out_4 = paddle.empty(shape=[200, positive_2_int32], dtype=dtype)
+        out_5 = paddle.empty(shape=[200, positive_2_int64], dtype=dtype)
+
+        place = paddle.CPUPlace()
+        exe = paddle.static.Executor(place)
+        res_1, res_2, res_3, res_4, res_5 = exe.run(
+            fluid.default_main_program(),
+            feed={
+                "shape_tensor_int32": np.array([200, 3]).astype("int32"),
+                "shape_tensor_int64": np.array([200, 3]).astype("int64"),
+            },
+            fetch_list=[out_1, out_2, out_3, out_4, out_5])
+
+        self.__check_out__(res_1, dtype)
+        self.__check_out__(res_2, dtype)
+        self.__check_out__(res_3, dtype)
+        self.__check_out__(res_4, dtype)
+        self.__check_out__(res_5, dtype)
+
+
+class TestEmptyError(unittest.TestCase):
+    def test_attr(self):
+        def test_dtype():
+            shape = [200, 3]
+            dtype = 'uint8'
+            result = paddle.empty(shape=shape, dtype=dtype)
+
+        self.assertRaises(TypeError, test_dtype)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 0fed32a1676..8bb584be236 100755
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -40,6 +40,7 @@ from .creation import full_like  #DEFINE_ALIAS
 from .creation import triu  #DEFINE_ALIAS
 from .creation import tril  #DEFINE_ALIAS
 from .creation import meshgrid  #DEFINE_ALIAS
+from .creation import empty  #DEFINE_ALIAS
 from .io import save  #DEFINE_ALIAS
 from .io import load  #DEFINE_ALIAS
 from .linalg import matmul  #DEFINE_ALIAS
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index 9eece1240d7..b75e2a8851f 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -48,6 +48,7 @@ __all__ = [
     'eye',
     'full',
     'full_like',
+    'empty',
     'triu',
     'tril',
     'meshgrid'
@@ -981,3 +982,90 @@ def diag(x, offset=0, padding_value=0, name=None):
 
     out.stop_gradient = True
     return out
+
+
+def empty(shape, dtype=None, name=None):
+    """
+    This Op returns a Tensor with uninitialized data which size is same as ``shape``.
+    
+    Args:
+        shape(list|tuple|Tensor): Shape of the Tensor to be created.
+                The data type of dimension of shape is ``int32`` or ``int64`` . If ``shape`` is a list or tuple,
+                the elements of it should be integers or Tensors with shape [1].
+                If ``shape`` is an Tensor, it should be an 1-D Tensor.
+        dtype(np.dtype|str, optional): Data type of the output Tensor
+            which can be bool, float16, float32, float64, int32, int64, if dytpe is `None`, the data
+            type of created Tensor use global default dtype (see ``get_default_dtype``
+            for details).
+        name(str, optional): The default value is None. Normally there is no need for user to set this
+            property. For more information, please refer to :ref:`api_guide_Name`.
+    
+    Returns:
+        Tensor: Tensor which is created according to ``shape`` and ``dtype``, and is uninitialized.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()   # Now we are in imperative mode
+          paddle.set_device("cpu")  # and use cpu device
+
+          # example 1: argument ``shape`` is a list which doesn't contain Tensor.
+          data1 = paddle.empty(shape=[2,3], dtype='float32')
+          #[[4.3612203e+27 1.8176809e+31 1.3555911e-19]     # uninitialized
+          # [1.1699684e-19 1.3563156e-19 3.6408321e-11]]    # uninitialized
+
+          # example 2: argument ``shape`` is a Tensor, the data type must be int64 or int32.
+          shape_data = np.array([2, 3]).astype('int32')
+          shape = paddle.to_tensor(shape_data)
+          data2 = paddle.empty(shape=shape, dtype='float32')
+          #[[1.7192326e-37 4.8125365e-38 1.9866003e-36]     # uninitialized
+          # [1.3284029e-40 7.1117408e-37 2.5353012e+30]]    # uninitialized
+
+          # example 3: argument ``shape`` is a list which contains Tensor.
+          dim2_data = np.array([3]).astype('int32')
+          dim2 = paddle.to_tensor(dim2_data)
+          data3 = paddle.empty(shape=[2, dim2], dtype='float32')
+          #[[1.1024214e+24 7.0379409e+22 6.5737699e-34]     # uninitialized
+          # [7.5563101e+31 7.7130405e+31 2.8020654e+20]]    # uninitialized
+    """
+
+    if dtype is None:
+        dtype = paddle.get_default_dtype()
+
+    dtype = convert_dtype(dtype)
+
+    if in_dygraph_mode():
+        shape = utils.convert_shape_to_list(shape)
+        out = core.ops.empty('shape', shape, 'dtype',
+                             convert_np_dtype_to_dtype_(dtype))
+        out.stop_gradient = True
+        return out
+
+    helper = LayerHelper("empty", **locals())
+    inputs = {}
+
+    check_dtype(dtype, 'dtype',
+                ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
+                'empty')
+    check_type(shape, 'shape', (Variable, list, tuple), 'empty')
+
+    if isinstance(shape, Variable):
+        check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'empty')
+
+    attrs = {}
+    utils.get_shape_tensor_inputs(
+        inputs=inputs, attrs=attrs, shape=shape, op_type='empty')
+
+    out = helper.create_variable_for_type_inference(dtype=dtype)
+    attrs['dtype'] = convert_np_dtype_to_dtype_(dtype)
+    helper.append_op(
+        type='empty',
+        inputs=inputs,
+        outputs={'Out': [out]},
+        attrs=attrs,
+        stop_gradient=True)
+    out.stop_gradient = True
+    return out
-- 
GitLab


From 1b84c0bf43519113952d7806815f15ee9a71fec9 Mon Sep 17 00:00:00 2001
From: Wilber <jiweibo@baidu.com>
Date: Fri, 11 Sep 2020 09:35:34 +0800
Subject: [PATCH 043/261] Lite subgraph refine predictor (#27167)

---
 cmake/external/lite.cmake                     |  2 +-
 paddle/fluid/inference/analysis/argument.h    |  4 +
 .../inference/analysis/ir_pass_manager.cc     |  2 +
 .../analysis/ir_passes/lite_subgraph_pass.cc  | 10 +-
 .../fluid/inference/api/analysis_predictor.cc |  2 +
 paddle/fluid/inference/lite/engine.cc         | 46 +++++----
 paddle/fluid/inference/lite/engine.h          | 27 +++---
 paddle/fluid/inference/lite/tensor_utils.cc   | 97 ++++++++++++++-----
 paddle/fluid/inference/lite/test_engine.cc    |  8 +-
 .../fluid/inference/lite/test_tensor_utils.cc | 37 ++++++-
 .../inference/tests/api/lite_resnet50_test.cc |  2 +-
 paddle/fluid/operators/lite/lite_engine_op.h  |  8 +-
 .../operators/lite/lite_engine_op_test.cc     |  6 +-
 13 files changed, 176 insertions(+), 75 deletions(-)

diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake
index 8a655b2954d..3da550519ba 100644
--- a/cmake/external/lite.cmake
+++ b/cmake/external/lite.cmake
@@ -34,7 +34,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
 
   if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG dfdfa6440c83bf0b415f9f5a9ff84842ce0bb0fa)
+    set(LITE_GIT_TAG 6d2b2a4028a58715b01887b04eb9bff8432eb184)
   endif()
 
   if(NOT CUDA_ARCH_NAME)
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 27bae7a71ea..8d28b8ace26 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -218,6 +218,10 @@ struct Argument {
 
   DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t);
 
+  // Only used in paddle-lite subgraph.
+  DECL_ARGUMENT_FIELD(cpu_math_library_num_threads, CpuMathLibraryNumThreads,
+                      int);
+
  private:
   std::unordered_set<std::string> valid_fields_;
 };
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index cd8d86d7293..d52d71f148c 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -150,6 +150,8 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("use_xpu", new bool(argument->use_xpu()));
       pass->Set("xpu_l3_workspace_size",
                 new int(argument->xpu_l3_workspace_size()));
+      pass->Set("cpu_math_library_num_threads",
+                new int(argument->cpu_math_library_num_threads()));
     }
     disable_logs_ = argument->disable_logs();
     if (pass_name == "fc_fuse_pass") {
diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
index 6b16a481dde..e78d5ef017b 100644
--- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
@@ -244,6 +244,7 @@ void LiteSubgraphPass::SetUpEngine(
   bool enable_int8 = Get<bool>("enable_int8");
   bool use_xpu = Get<bool>("use_xpu");
   int xpu_l3_workspace_size = Get<int>("xpu_l3_workspace_size");
+  int cpu_math_library_num_threads = Get<int>("cpu_math_library_num_threads");
 
   lite_api::TargetType target_type;
   if (use_gpu) {
@@ -263,11 +264,12 @@ void LiteSubgraphPass::SetUpEngine(
       // Notice: The ordering here determines the device where the
       // input tensor of the Lite engine is located, and then affects
       // whether tensor sharing is feasible.
-      paddle::lite::Place({target_type, precision_type}),
-      paddle::lite::Place({target_type, PRECISION(kInt64)}),
-      paddle::lite::Place({target_type, PRECISION(kFloat)}),
-      paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
+      paddle::lite_api::Place({target_type, precision_type}),
+      paddle::lite_api::Place({target_type, PRECISION(kInt64)}),
+      paddle::lite_api::Place({target_type, PRECISION(kFloat)}),
+      paddle::lite_api::Place({TARGET(kHost), PRECISION(kFloat)}),
   };
+  config.cpu_math_library_num_threads = cpu_math_library_num_threads;
   config.xpu_l3_workspace_size = xpu_l3_workspace_size;
   if (dump_model) {
     lite::StrToBinaryFile("./model.bin", config.model);
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 500aa8341d6..64dfdda54ac 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -461,6 +461,8 @@ void AnalysisPredictor::PrepareArgument() {
   }
 
   if (config_.lite_engine_enabled()) {
+    argument_.SetCpuMathLibraryNumThreads(
+        config_.cpu_math_library_num_threads());
     argument_.SetLitePrecisionMode(config_.lite_precision_mode_);
     argument_.SetLitePassesFilter(config_.lite_passes_filter_);
     argument_.SetLiteOpsFilter(config_.lite_ops_filter_);
diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc
index 8e88c944939..5f24ef00bce 100644
--- a/paddle/fluid/inference/lite/engine.cc
+++ b/paddle/fluid/inference/lite/engine.cc
@@ -20,8 +20,12 @@
 #define LITE_WITH_XPU 1
 #endif
 
+#ifndef PADDLE_WITH_ARM
+#define LITE_WITH_X86 1
+#endif
+
 #include "paddle/fluid/inference/lite/engine.h"
-#include "lite/api/paddle_use_passes.h"
+#include <utility>
 
 namespace paddle {
 namespace inference {
@@ -36,32 +40,40 @@ bool EngineManager::Has(const std::string& name) const {
   return engines_.at(name).get() != nullptr;
 }
 
-paddle::lite::Predictor* EngineManager::Get(const std::string& name) const {
+paddle::lite_api::PaddlePredictor* EngineManager::Get(
+    const std::string& name) const {
   return engines_.at(name).get();
 }
 
-paddle::lite::Predictor* EngineManager::Create(const std::string& name,
-                                               const EngineConfig& cfg) {
-  if (cfg.valid_places.front().target == TARGET(kCUDA)) {
-#ifdef PADDLE_WITH_CUDA
-    paddle::lite::Env<TARGET(kCUDA)>::Init();
+paddle::lite_api::PaddlePredictor* EngineManager::Create(
+    const std::string& name, const EngineConfig& cfg) {
+  // config info for predictor.
+  paddle::lite_api::CxxConfig lite_cxx_config;
+  lite_cxx_config.set_model_buffer(cfg.model.c_str(), cfg.model.size(),
+                                   cfg.param.c_str(), cfg.param.size());
+  lite_cxx_config.set_valid_places(cfg.valid_places);
+#ifdef PADDLE_WITH_ARM
+  set_threads.set_threads(cfg.cpu_math_library_num_threads);
+#else
+  lite_cxx_config.set_x86_math_library_num_threads(
+      cfg.cpu_math_library_num_threads);
 #endif
-  } else if (cfg.valid_places.front().target == TARGET(kXPU)) {
+
 #ifdef PADDLE_WITH_XPU
-    paddle::lite::TargetWrapper<TARGET(kXPU)>::workspace_l3_size_per_thread =
-        cfg.xpu_l3_workspace_size;
+  lite_cxx_config.set_xpu_workspace_l3_size_per_thread(
+      cfg.xpu_l3_workspace_size);
 #endif
-  }
-  auto* p = new paddle::lite::Predictor();
-  p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes,
-           cfg.model_type, cfg.model_from_memory);
-  engines_[name].reset(p);
-  return p;
+
+  // create predictor
+  std::shared_ptr<paddle::lite_api::PaddlePredictor> p =
+      paddle::lite_api::CreatePaddlePredictor(lite_cxx_config);
+  engines_[name] = std::move(p);
+  return engines_[name].get();
 }
 
 void EngineManager::DeleteAll() {
   for (auto& item : engines_) {
-    item.second.reset(nullptr);
+    item.second.reset();
   }
 }
 
diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h
index 345eb682e9f..5ba487cc24d 100644
--- a/paddle/fluid/inference/lite/engine.h
+++ b/paddle/fluid/inference/lite/engine.h
@@ -23,12 +23,9 @@
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wall"
 #include "lite/api/cxx_api.h"
+#include "lite/api/paddle_api.h"
 #include "lite/api/paddle_place.h"
-#include "lite/core/context.h"
-#include "lite/core/device_info.h"
-#include "lite/core/memory.h"
-#include "lite/core/op_registry.h"
-#include "lite/core/tensor.h"
+#include "lite/api/paddle_use_passes.h"
 #pragma GCC diagnostic pop
 
 namespace paddle {
@@ -38,25 +35,33 @@ namespace lite {
 struct EngineConfig {
   std::string model;
   std::string param;
-  paddle::lite::Place prefer_place;
-  std::vector<paddle::lite::Place> valid_places;
+  std::vector<paddle::lite_api::Place> valid_places;
   std::vector<std::string> neglected_passes;
   lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf};
   bool model_from_memory{true};
+
+  // for xpu
   size_t xpu_l3_workspace_size;
+
+  // for x86 or arm
+  int cpu_math_library_num_threads{1};
+
+  // for cuda
+  bool use_multi_stream{false};
 };
 
 class EngineManager {
  public:
   bool Empty() const;
   bool Has(const std::string& name) const;
-  paddle::lite::Predictor* Get(const std::string& name) const;
-  paddle::lite::Predictor* Create(const std::string& name,
-                                  const EngineConfig& cfg);
+  paddle::lite_api::PaddlePredictor* Get(const std::string& name) const;
+  paddle::lite_api::PaddlePredictor* Create(const std::string& name,
+                                            const EngineConfig& cfg);
   void DeleteAll();
 
  private:
-  std::unordered_map<std::string, std::unique_ptr<paddle::lite::Predictor>>
+  std::unordered_map<std::string,
+                     std::shared_ptr<paddle::lite_api::PaddlePredictor>>
       engines_;
 };
 
diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc
index d79a041ccf8..33661594b92 100644
--- a/paddle/fluid/inference/lite/tensor_utils.cc
+++ b/paddle/fluid/inference/lite/tensor_utils.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/lite/tensor_utils.h"
+#include <functional>
 #include <map>
 #include <memory>
 #include "paddle/fluid/framework/data_type.h"
@@ -144,16 +145,55 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data,
   }
 }
 
-void InitDstTensor(paddle::lite::Tensor* dst, const framework::LoDTensor& src) {
+void* GetLiteTensorDataPtr(paddle::lite_api::Tensor* src,
+                           PrecisionType precision_type,
+                           TargetType target_type) {
+  void* res{nullptr};
+  switch (precision_type) {
+    case PrecisionType::kFloat:
+      res = static_cast<void*>(src->mutable_data<float>(target_type));
+      break;
+    case PrecisionType::kInt8:
+      res = static_cast<void*>(src->mutable_data<int8_t>(target_type));
+      break;
+    case PrecisionType::kInt32:
+      res = static_cast<void*>(src->mutable_data<int32_t>(target_type));
+      break;
+    case PrecisionType::kInt64:
+      res = static_cast<void*>(src->mutable_data<int64_t>(target_type));
+      break;
+    default:
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Unsupported precision type. Now only supports FP32, INT8, INT32 and "
+          "INT64."));
+      break;
+  }
+  return res;
+}
+
+int64_t GetLiteTensorNumel(const paddle::lite_api::Tensor& tensor) {
+  auto shape = tensor.shape();
+  int64_t numel = std::accumulate(shape.begin(), shape.end(), 1,
+                                  std::multiplies<int64_t>());
+  return numel;
+}
+
+void InitDstTensor(paddle::lite_api::Tensor* dst,
+                   const framework::LoDTensor& src) {
   // Currently, Lite needs to explicitly specify the target type of
   // the input tensor.
   constexpr int empty_size = 0;
-  dst->mutable_data(GetLiteTargetType(src.place()), empty_size);
-  dst->set_precision(GetLitePrecisionType(src.type()));
-  SetLoD(dst->mutable_lod(), src.lod());
+  dst->Resize({empty_size});
+  GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()),
+                       GetLiteTargetType(src.place()));
+  dst->SetPrecision(GetLitePrecisionType(src.type()));
+  paddle::lite::LoD lite_lod;
+  SetLoD(&lite_lod, src.lod());
+  dst->SetLoD(lite_lod);
 }
 
-void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) {
+void InitDstTensor(framework::LoDTensor* dst,
+                   const paddle::lite_api::Tensor& src) {
   constexpr framework::proto::VarType::Type dtype =
       framework::proto::VarType_Type_FP32;
   dst->mutable_data(inference::lite::utils::GetNativePlace(src.target()),
@@ -162,7 +202,8 @@ void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) {
 }
 
 template <>
-void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src,
+void TensorCopyAsync(paddle::lite_api::Tensor* dst,
+                     const framework::LoDTensor& src,
                      const platform::DeviceContext& ctx) {
   InitDstTensor(dst, src);
   const platform::Place& src_place = src.place();
@@ -171,52 +212,56 @@ void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src,
       static_cast<size_t>(src.numel()) * framework::SizeOfType(src.type());
   dst->Resize(framework::vectorize(src.dims()));
   const void* src_data = src.data<void>();
-  void* dst_data = dst->mutable_data(bytes);
+  void* dst_data{nullptr};
+  dst_data = GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()),
+                                  GetLiteTargetType(src.place()));
   VLOG(3) << "[CopyAsync fluid -> lite] Bytes = " << bytes << ", src = " << &src
           << ", dst = " << dst << ", src_type = " << src.type();
   MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx);
-  VLOG(3) << "[Lite memory size] Bytes = " << dst->memory_size();
+  VLOG(3) << "[Lite memory size] Bytes = " << bytes;
 }
 
 template <>
-void TensorCopyAsync(framework::LoDTensor* dst, const paddle::lite::Tensor& src,
+void TensorCopyAsync(framework::LoDTensor* dst,
+                     const paddle::lite_api::Tensor& src,
                      const platform::DeviceContext& ctx) {
-  dst->Resize(paddle::framework::make_ddim(src.dims().Vectorize()));
+  dst->Resize(paddle::framework::make_ddim(src.shape()));
   InitDstTensor(dst, src);
   const platform::Place& src_place = GetNativePlace(src.target());
   const platform::Place& dst_place = dst->place();
-  const size_t bytes =
-      static_cast<size_t>(src.numel()) * framework::SizeOfType(dst->type());
-  const void* src_data = src.raw_data();
+  int64_t src_numel = GetLiteTensorNumel(src);
+  const size_t bytes = src_numel * framework::SizeOfType(dst->type());
+  const void* src_data = src.data<void>();
   // When Lite is ready, the source type needs to be modified here.
   void* dst_data = dst->mutable_data(dst_place, dst->type());
   VLOG(3) << "[CopyAsync lite -> fluid] Bytes = " << bytes << ", src = " << &src
           << ", dst = " << dst << ", src_type = " << dst->type();
   MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx);
-  VLOG(3) << "[Lite memory size] Bytes = " << src.memory_size();
+  VLOG(3) << "[Lite memory size] Bytes = " << bytes;
 }
 
 template <>
-void TensorDataShare(paddle::lite::Tensor* dst, framework::LoDTensor* src) {
-  const size_t bytes =
-      static_cast<size_t>(src->numel()) * framework::SizeOfType(src->type());
-  auto buf = std::make_shared<paddle::lite::Buffer>(paddle::lite::Buffer(
-      src->data<void>(), GetLiteTargetType(src->place()), src->memory_size()));
+void TensorDataShare(paddle::lite_api::Tensor* dst, framework::LoDTensor* src) {
   dst->Resize(framework::vectorize(src->dims()));
-  dst->set_precision(GetLitePrecisionType(src->type()));
-  SetLoD(dst->mutable_lod(), src->lod());
-  dst->ResetBuffer(buf, bytes);
+  dst->ShareExternalMemory(src->data<void>(), src->memory_size(),
+                           GetLiteTargetType(src->place()));
+  dst->SetPrecision(GetLitePrecisionType(src->type()));
+  paddle::lite::LoD lite_lod;
+  SetLoD(&lite_lod, src->lod());
+  dst->SetLoD(lite_lod);
 }
 
 template <>
-void TensorDataShare(framework::LoDTensor* dst, paddle::lite::Tensor* src) {
+void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) {
   constexpr framework::proto::VarType::Type dtype =
       framework::proto::VarType_Type_FP32;
-  void* src_raw_data = src->raw_data();
+  void* src_raw_data =
+      GetLiteTensorDataPtr(src, GetLitePrecisionType(dtype), src->target());
+  size_t memory_size = GetLiteTensorNumel(*src) * sizeof(float);
   std::shared_ptr<memory::allocation::Allocation> holder(
-      new memory::allocation::Allocation(src_raw_data, src->memory_size(),
+      new memory::allocation::Allocation(src_raw_data, memory_size,
                                          GetNativePlace(src->target())));
-  dst->Resize(paddle::framework::make_ddim(src->dims().Vectorize()));
+  dst->Resize(paddle::framework::make_ddim(src->shape()));
   SetLoD(dst->mutable_lod(), src->lod());
   dst->ResetHolderWithType(holder, dtype);
 }
diff --git a/paddle/fluid/inference/lite/test_engine.cc b/paddle/fluid/inference/lite/test_engine.cc
index d29bcb76be7..e505af19d53 100644
--- a/paddle/fluid/inference/lite/test_engine.cc
+++ b/paddle/fluid/inference/lite/test_engine.cc
@@ -102,10 +102,10 @@ TEST(EngineManager, engine) {
   config.model_from_memory = true;
   config.valid_places = {
 #ifdef PADDLE_WITH_CUDA
-      paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}),
+      paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}),
 #endif
-      paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}),
-      paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}),
+      paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}),
+      paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}),
   };
 
   LOG(INFO) << "Create EngineManager";
@@ -118,7 +118,7 @@ TEST(EngineManager, engine) {
   ASSERT_EQ(inference::Singleton<inference::lite::EngineManager>::Global().Has(
                 unique_key),
             true);
-  paddle::lite::Predictor* engine_0 =
+  paddle::lite_api::PaddlePredictor* engine_0 =
       inference::Singleton<inference::lite::EngineManager>::Global().Get(
           unique_key);
   CHECK_NOTNULL(engine_0);
diff --git a/paddle/fluid/inference/lite/test_tensor_utils.cc b/paddle/fluid/inference/lite/test_tensor_utils.cc
index eef7bfb68fe..a792fb77d6a 100644
--- a/paddle/fluid/inference/lite/test_tensor_utils.cc
+++ b/paddle/fluid/inference/lite/test_tensor_utils.cc
@@ -73,6 +73,33 @@ TEST(LiteEngineOp, GetNativeLayoutType) {
   EXPECT_ANY_THROW(GetNativeLayoutType(DataLayoutType::kNHWC));
 }
 
+template <typename T>
+void test_lite_tensor_data_ptr(PrecisionType precision_type) {
+  void* GetLiteTensorDataPtr(paddle::lite_api::Tensor * src,
+                             PrecisionType precision_type,
+                             TargetType target_type);
+  const int count = 4;
+  paddle::lite::Tensor lite_tensor;
+  lite_tensor.Resize({count});
+  auto* lite_tensor_data = lite_tensor.mutable_data<T>();
+  for (size_t i = 0; i < count; ++i) {
+    lite_tensor_data[i] = i;
+  }
+  paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
+  T* data = static_cast<T*>(GetLiteTensorDataPtr(
+      &lite_api_tensor, precision_type, TargetType::kHost));
+  for (size_t i = 0; i < count; ++i) {
+    CHECK_EQ(data[i], static_cast<T>(i)) << "the i-th num is not correct.";
+  }
+}
+
+TEST(LiteEngineOp, GetLiteTensorDataPtr) {
+  test_lite_tensor_data_ptr<int64_t>(PrecisionType::kInt64);
+  test_lite_tensor_data_ptr<int32_t>(PrecisionType::kInt32);
+  test_lite_tensor_data_ptr<int8_t>(PrecisionType::kInt8);
+  EXPECT_ANY_THROW(test_lite_tensor_data_ptr<double>(PrecisionType::kUnk));
+}
+
 void test_tensor_copy(const platform::DeviceContext& ctx) {
   // Create LoDTensor.
   std::vector<float> vector({1, 2, 3, 4});
@@ -83,10 +110,11 @@ void test_tensor_copy(const platform::DeviceContext& ctx) {
   lod_tensor.set_lod(lod);
   // Create lite::Tensor and copy.
   paddle::lite::Tensor lite_tensor;
-  TensorCopyAsync(&lite_tensor, lod_tensor, ctx);
+  paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
+  TensorCopyAsync(&lite_api_tensor, lod_tensor, ctx);
   // Copy to LoDTensor.
   framework::LoDTensor lod_tensor_n;
-  TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx);
+  TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
 #ifdef PADDLE_WITH_CUDA
   if (platform::is_gpu_place(ctx.GetPlace())) {
     platform::GpuStreamSync(
@@ -108,10 +136,11 @@ void test_tensor_share(const platform::DeviceContext& ctx) {
   lod_tensor.set_lod(lod);
   // Create lite::Tensor and share.
   paddle::lite::Tensor lite_tensor;
-  TensorDataShare(&lite_tensor, &lod_tensor);
+  paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
+  TensorDataShare(&lite_api_tensor, &lod_tensor);
   // Copy to LoDTensor.
   framework::LoDTensor lod_tensor_n;
-  TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx);
+  TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
   std::vector<float> result;
   TensorToVector(lod_tensor_n, ctx, &result);
   ASSERT_EQ(result, vector);
diff --git a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
index 31701c59ec3..9ae073e9e5b 100644
--- a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
+++ b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
@@ -27,7 +27,7 @@ TEST(AnalysisPredictor, use_gpu) {
   AnalysisConfig config;
   config.EnableUseGpu(100, 0);
   config.SetModel(model_dir + "/model", model_dir + "/params");
-  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
+  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32, true);
 
   std::vector<PaddleTensor> inputs;
   auto predictor = CreatePaddlePredictor(config);
diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h
index a920bf7c3f5..f6d65704388 100644
--- a/paddle/fluid/operators/lite/lite_engine_op.h
+++ b/paddle/fluid/operators/lite/lite_engine_op.h
@@ -39,7 +39,7 @@ class LiteEngineOp : public framework::OperatorBase {
  private:
   std::vector<std::string> in_names_;
   std::vector<std::string> out_names_;
-  paddle::lite::Predictor *engine_;
+  paddle::lite_api::PaddlePredictor *engine_;
   framework::proto::VarType::Type precision_;
   bool use_gpu_;
   bool zero_copy_;
@@ -78,10 +78,10 @@ class LiteEngineOp : public framework::OperatorBase {
       framework::LoDTensor src_t =
           inference::analysis::GetFromScope<framework::LoDTensor>(scope,
                                                                   in_names_[i]);
-      paddle::lite::Tensor *dst_t = engine_->GetInput(i);
+      paddle::lite_api::Tensor dst_t = *(engine_->GetInput(i));
       VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> "
               << engine_->GetInputNames()[i] << ")";
-      inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_);
+      inference::lite::utils::TensorCopy(&dst_t, &src_t, *ctx, zero_copy_);
     }
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(dev_place)) {
@@ -93,7 +93,7 @@ class LiteEngineOp : public framework::OperatorBase {
     engine_->Run();
     VLOG(3) << "lite engine run done";
     for (size_t i = 0; i < out_names_.size(); i++) {
-      paddle::lite::Tensor src_t = *(engine_->GetOutput(i));
+      paddle::lite_api::Tensor src_t = *(engine_->GetOutput(i));
       framework::LoDTensor *dst_t =
           &inference::analysis::GetFromScope<framework::LoDTensor>(
               scope, out_names_[i]);
diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc
index fb5c0dcb351..76c963ac652 100644
--- a/paddle/fluid/operators/lite/lite_engine_op_test.cc
+++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc
@@ -84,10 +84,10 @@ TEST(LiteEngineOp, engine_op) {
   inference::lite::EngineConfig config;
   config.valid_places = {
 #ifdef PADDLE_WITH_CUDA
-      paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}),
+      paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}),
 #endif
-      paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}),
-      paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}),
+      paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}),
+      paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}),
   };
   serialize_params(&(config.param), &scope, repetitive_params);
   config.model = program.Proto()->SerializeAsString();
-- 
GitLab


From 5e0dde02b23c9cc2c952daa7c563e9f712b039f4 Mon Sep 17 00:00:00 2001
From: Aurelius84 <zhangliujie@baidu.com>
Date: Fri, 11 Sep 2020 11:11:09 +0800
Subject: [PATCH 044/261] [Dy2stat] support usage: to_static(model) (#27040)

* support to_static(model)

* add warning and unittest
---
 python/paddle/fluid/dygraph/jit.py            | 11 +++++++-
 .../dygraph_to_static/test_declarative.py     | 26 +++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py
index ec96bdd9786..57864efec8a 100644
--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
@@ -212,7 +212,16 @@ def declarative(function=None, input_spec=None):
 
     # for usage: `declarative(foo, ...)`
     if function is not None:
-        return decorated(function)
+        if isinstance(function, Layer):
+            if isinstance(function.forward, StaticLayer):
+                class_name = function.__class__.__name__
+                warnings.warn(
+                    "`{}.forward` has already been decorated somewhere. It will be redecorated to replace previous one.".
+                    format(class_name))
+            function.forward = decorated(function.forward)
+            return function
+        else:
+            return decorated(function)
 
     # for usage: `@declarative`
     return decorated
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
index eed02ea655e..5582a65304d 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
@@ -332,5 +332,31 @@ class TestDeclarativeAPI(unittest.TestCase):
             func(np.ones(5).astype("int32"))
 
 
+class TestDecorateModelDirectly(unittest.TestCase):
+    def setUp(self):
+        paddle.disable_static()
+        program_trans.enable(True)
+        self.x = to_variable(np.ones([4, 10]).astype('float32'))
+
+    def test_fake_input(self):
+        net = SimpleNet()
+        net = declarative(net)
+        y = net(self.x)
+        self.assertTrue(len(net.forward.program_cache) == 1)
+
+    def test_input_spec(self):
+        net = SimpleNet()
+        net = declarative(net, input_spec=[InputSpec([None, 8, 10])])
+        self.assertTrue(len(net.forward.inputs) == 1)
+        self.assertTrue(len(net.forward.program_cache) == 1)
+        input_shape = net.forward.inputs[0].shape
+        self.assertListEqual(list(input_shape), [-1, 8, 10])
+
+        # redecorate
+        net = declarative(net, input_spec=[InputSpec([None, 16, 10])])
+        input_shape = net.forward.inputs[0].shape
+        self.assertListEqual(list(input_shape), [-1, 16, 10])
+
+
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab


From ac8afe184e1c079bf7f5ee9a6858cf5cdbbcecb7 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 11 Sep 2020 11:14:10 +0800
Subject: [PATCH 045/261] use structured name in loaded dict (#27242)

---
 python/paddle/fluid/dygraph/checkpoint.py         | 15 ++++++++++++++-
 .../fluid/tests/unittests/test_jit_save_load.py   |  7 +++++--
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py
index 30ded1f7eda..9876fc620b8 100644
--- a/python/paddle/fluid/dygraph/checkpoint.py
+++ b/python/paddle/fluid/dygraph/checkpoint.py
@@ -25,7 +25,7 @@ import warnings
 from .. import core
 from .base import guard
 from paddle.fluid.dygraph.jit import SaveLoadConfig, deprecate_save_load_configs
-from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers
+from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers, EXTRA_VAR_INFO_FILENAME
 
 __all__ = [
     'save_dygraph',
@@ -233,6 +233,19 @@ def load_dygraph(model_path, config=None):
             para_dict = dict()
             for var_name in persistable_var_dict:
                 para_dict[var_name] = persistable_var_dict[var_name].numpy()
+
+            # if __variables.info__ exists, we can recover structured_name
+            var_info_path = os.path.join(model_prefix, EXTRA_VAR_INFO_FILENAME)
+            if os.path.exists(var_info_path):
+                with open(var_info_path, 'rb') as f:
+                    extra_var_info = pickle.load(f)
+                structured_para_dict = dict()
+                for var_name in para_dict:
+                    structured_name = extra_var_info[var_name].get(
+                        'structured_name', None)
+                    assert structured_name is not None, "Cannot find saved variable (%s)'s structured name in saved model." % var_name
+                    structured_para_dict[structured_name] = para_dict[var_name]
+                para_dict = structured_para_dict
     else:
         # Load state dict by `save_dygraph` save format
         para_dict = {}
diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
index 7bf806bab55..f0680206de2 100644
--- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
@@ -255,8 +255,11 @@ class TestJitSaveLoad(unittest.TestCase):
         train_layer.eval()
         # construct new model
         new_layer = LinearNet(784, 1)
-        model_dict, _ = fluid.dygraph.load_dygraph(self.model_path)
-        new_layer.set_dict(model_dict)
+        orig_state_dict = new_layer.state_dict()
+        load_state_dict, _ = fluid.dygraph.load_dygraph(self.model_path)
+        for structured_name in orig_state_dict:
+            self.assertTrue(structured_name in load_state_dict)
+        new_layer.set_state_dict(load_state_dict)
         new_layer.eval()
         # inference & compare
         x = fluid.dygraph.to_variable(
-- 
GitLab


From 3e20ddf73d627a2e63900d65815cc9e5bc800f84 Mon Sep 17 00:00:00 2001
From: liym27 <33742067+liym27@users.noreply.github.com>
Date: Fri, 11 Sep 2020 11:42:59 +0800
Subject: [PATCH 046/261] [Dy2Stat - Error Handling] Fix bug and optimize
 dy2stat error.  (#27225)

---
 .../fluid/dygraph/dygraph_to_static/error.py  | 33 ++++++-
 .../dygraph_to_static/program_translator.py   | 85 ++++++++++---------
 .../unittests/dygraph_to_static/test_error.py | 78 ++++++++++++++---
 3 files changed, 140 insertions(+), 56 deletions(-)

diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/error.py b/python/paddle/fluid/dygraph/dygraph_to_static/error.py
index 5aba7ca0fdc..be21ab6d539 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/error.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/error.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+import six
 import sys
 import traceback
 
@@ -20,6 +21,14 @@ from paddle.fluid.dygraph.dygraph_to_static.origin_info import Location, OriginI
 
 ERROR_DATA = "Error data about original source code information and traceback."
 
+# A flag to set whether to open the dygraph2static error reporting module
+SIMPLIFY_ERROR_ENV_NAME = "TRANSLATOR_SIMPLIFY_NEW_ERROR"
+DEFAULT_SIMPLIFY_NEW_ERROR = 1
+
+# A flag to set whether to display the simplified error stack
+DISABLE_ERROR_ENV_NAME = "TRANSLATOR_DISABLE_NEW_ERROR"
+DEFAULT_DISABLE_NEW_ERROR = 0
+
 
 def attach_error_data(error, in_runtime=False):
     """
@@ -103,7 +112,10 @@ class ErrorData(object):
 
         # Simplify error value to improve readability if error is raised in runtime
         if self.in_runtime:
-            self._simplify_error_value()
+            if int(
+                    os.getenv(SIMPLIFY_ERROR_ENV_NAME,
+                              DEFAULT_SIMPLIFY_NEW_ERROR)):
+                self._simplify_error_value()
             message_lines.append(str(self.error_value))
             return '\n'.join(message_lines)
 
@@ -150,3 +162,22 @@ class ErrorData(object):
 
         error_value_str = '\n'.join(error_value_lines)
         self.error_value = self.error_type(error_value_str)
+
+    def raise_new_exception(self):
+
+        # Raises the origin error if disable dygraph2static error module,
+        if int(os.getenv(DISABLE_ERROR_ENV_NAME, DEFAULT_DISABLE_NEW_ERROR)):
+            raise
+
+        new_exception = self.create_exception()
+        if six.PY3:
+            # NOTE(liym27):
+            # 1. Why `raise new_exception from None`?
+            #   In Python 3, by default, an new exception is raised with trace information of the caught exception.
+            #   This only raises new_exception and hides unwanted implementation details from tracebacks of the
+            #   caught exception.
+            # 2. Use exec to bypass syntax error checking in Python 2.
+
+            six.exec_("raise new_exception from None")
+        else:
+            raise new_exception
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
index 3d27810f1db..e5fce3e6ede 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
@@ -32,8 +32,7 @@ from paddle.fluid.layers.utils import flatten
 from paddle.fluid.dygraph.base import param_guard
 from paddle.fluid.dygraph.base import switch_to_static_graph
 from paddle.fluid.dygraph.dygraph_to_static import DygraphToStaticAst
-from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA
-from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data
+from paddle.fluid.dygraph.dygraph_to_static import error
 from paddle.fluid.dygraph.dygraph_to_static import logging_utils
 from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info
 from paddle.fluid.dygraph.dygraph_to_static.origin_info import create_and_update_origin_info_map
@@ -315,6 +314,7 @@ class StaticLayer(object):
 
         # 2. trace ops from dygraph layers and cache the generated program.
         args, kwargs = self._function_spec.unified_args_and_kwargs(args, kwargs)
+
         try:
             concrete_program, partial_program_layer = self.get_concrete_program(
                 *args, **kwargs)
@@ -324,27 +324,22 @@ class StaticLayer(object):
                 partial_program_layer.training = self._class_instance.training
 
             # 4. return outputs.
-            return partial_program_layer(args)
+            try:
+                return partial_program_layer(args)
+            except Exception as e:
+                if not hasattr(e, error.ERROR_DATA):
+                    # runtime error
+                    error.attach_error_data(e, in_runtime=True)
+                    raise
         except Exception as e:
-            if not hasattr(e, ERROR_DATA):
-                # runtime error
-                attach_error_data(e, in_runtime=True)
-            error_data = getattr(e, ERROR_DATA, None)
+            error_data = getattr(e, error.ERROR_DATA, None)
             if error_data:
-                new_exception = error_data.create_exception()
-                if six.PY3:
-                    # NOTE(liym27):
-                    # 1. Why `raise new_exception from None`?
-                    #   In Python 3, by default, an new exception is raised with trace information of the caught exception.
-                    #   This only raises new_exception and hides unwanted implementation details from tracebacks of the
-                    #   caught exception.
-                    # 2. Use exec to bypass syntax error checking in Python 2.
-
-                    six.exec_("raise new_exception from None")
-                else:
-                    raise new_exception
+                error_data.raise_new_exception()
             else:
-                raise
+                logging_utils.warn(
+                    "Please file an issue at 'https://github.com/PaddlePaddle/Paddle/issues'"
+                    " if you can't handle this {} yourself.".format(type(e)))
+                raise e
 
     def _call_dygraph_function(self, *args, **kwargs):
         """
@@ -593,7 +588,7 @@ class ConcreteProgram(object):
                         outputs = static_func(*inputs)
                     except BaseException as e:
                         # NOTE: If e is raised in compile time, e should be attached to ERROR_DATA here.
-                        attach_error_data(e)
+                        error.attach_error_data(e)
                         raise
 
                 if not isinstance(outputs,
@@ -813,28 +808,36 @@ class ProgramTranslator(object):
                 "The ProgramTranslator.get_output doesn't work when setting ProgramTranslator.enable = False. "
                 "We will just return dygraph output.")
             return dygraph_func(*args, **kwargs)
-
-        function_spec = FunctionSpec(dygraph_func)
-        cache_key = CacheKey.from_func_and_args(function_spec, args, kwargs,
-                                                getattr(dygraph_func,
-                                                        '__self__', None))
-        _, partial_program_layer = self._program_cache[cache_key]
-
-        if args and isinstance(args[0], layers.Layer):
-            # Synchronize self.training attribute.
-            partial_program_layer.training = args[0].training
-            args = args[1:]
         try:
-            return partial_program_layer(args)
-
+            function_spec = FunctionSpec(dygraph_func)
+            cache_key = CacheKey.from_func_and_args(function_spec, args, kwargs,
+                                                    getattr(dygraph_func,
+                                                            '__self__', None))
+            _, partial_program_layer = self._program_cache[cache_key]
+
+            if args and isinstance(args[0], layers.Layer):
+                # Synchronize self.training attribute.
+                partial_program_layer.training = args[0].training
+                args = args[1:]
+            try:
+                return partial_program_layer(args)
+            except BaseException as e:
+                # NOTE:
+                # 1. If e is raised in compile time, e should have been attached to ERROR_DATA before;
+                # 2. If e raised in runtime, e should be attached to ERROR_DATA here.
+                if not hasattr(e, error.ERROR_DATA):
+                    # runtime error
+                    error.attach_error_data(e, in_runtime=True)
+                raise
         except BaseException as e:
-            # NOTE:
-            # 1. If e is raised in compile time, e should have been attached to ERROR_DATA before;
-            # 2. If e raised in runtime, e should be attached to ERROR_DATA here.
-            if not hasattr(e, ERROR_DATA):
-                # runtime error
-                attach_error_data(e, in_runtime=True)
-            raise
+            error_data = getattr(e, error.ERROR_DATA, None)
+            if error_data:
+                error_data.raise_new_exception()
+            else:
+                logging_utils.warn(
+                    "Please file an issue at 'https://github.com/PaddlePaddle/Paddle/issues'"
+                    " if you can't handle this {} yourself.".format(type(e)))
+                raise e
 
     def get_func(self, dygraph_func):
         """
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py
index 586020d4345..2998ba85757 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py
@@ -14,15 +14,15 @@
 
 from __future__ import print_function
 
+import os
 import inspect
 import unittest
-
 import numpy as np
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.core import EnforceNotMet
-from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA, ErrorData
+from paddle.fluid.dygraph.dygraph_to_static import error
 from paddle.fluid.dygraph.dygraph_to_static.origin_info import unwrap
-from paddle.fluid.dygraph.jit import declarative
 
 
 def inner_func():
@@ -30,7 +30,7 @@ def inner_func():
     return
 
 
-@declarative
+@paddle.jit.to_static
 def func_error_in_compile_time(x):
     x = fluid.dygraph.to_variable(x)
     inner_func()
@@ -41,14 +41,14 @@ def func_error_in_compile_time(x):
     return x_v
 
 
-@declarative
+@paddle.jit.to_static
 def func_error_in_compile_time_2(x):
     x = fluid.dygraph.to_variable(x)
     x = fluid.layers.reshape(x, shape=[1, 2])
     return x
 
 
-@declarative
+@paddle.jit.to_static
 def func_error_in_runtime(x, iter_num=3):
     x = fluid.dygraph.to_variable(x)
     two = fluid.layers.fill_constant(shape=[1], value=2, dtype="int32")
@@ -61,6 +61,9 @@ class TestErrorInCompileTime(unittest.TestCase):
         self.set_func()
         self.set_input()
         self.set_exception_type()
+        self.prog_trans = paddle.jit.ProgramTranslator()
+        self.simplify_error = 1
+        self.disable_error = 0
 
     def set_func(self):
         self.func = func_error_in_compile_time
@@ -88,14 +91,38 @@ class TestErrorInCompileTime(unittest.TestCase):
         for m in self.expected_message:
             self.assertIn(m, error_message)
 
-    def test(self):
-        with fluid.dygraph.guard():
-            with self.assertRaises(self.exception_type) as cm:
-                self.func(self.input)
-            exception = cm.exception
-            error_data = getattr(exception, ERROR_DATA)
-            self.assertIsInstance(error_data, ErrorData)
-            self._test_create_message(error_data)
+    def _test_attach_and_raise_new_exception(self, func_call):
+        paddle.disable_static()
+        with self.assertRaises(self.exception_type) as cm:
+            func_call()
+        exception = cm.exception
+
+        error_data = getattr(exception, error.ERROR_DATA, None)
+
+        self.assertIsInstance(error_data, error.ErrorData)
+        self._test_create_message(error_data)
+
+    def test_static_layer_call(self):
+        # NOTE: self.func(self.input) is the StaticLayer().__call__(self.input)
+        call_dy2static = lambda: self.func(self.input)
+
+        self.set_flags(0)
+        self._test_attach_and_raise_new_exception(call_dy2static)
+
+    def test_program_translator_get_output(self):
+        call_dy2static = lambda : self.prog_trans.get_output(unwrap(self.func), self.input)
+
+        self.set_flags(0)
+        self._test_attach_and_raise_new_exception(call_dy2static)
+
+    def set_flags(self, disable_error=0, simplify_error=1):
+        os.environ[error.DISABLE_ERROR_ENV_NAME] = str(disable_error)
+        self.disable_error = int(os.getenv(error.DISABLE_ERROR_ENV_NAME, 0))
+        self.assertEqual(self.disable_error, disable_error)
+
+        os.environ[error.SIMPLIFY_ERROR_ENV_NAME] = str(simplify_error)
+        self.simplify_error = int(os.getenv(error.SIMPLIFY_ERROR_ENV_NAME, 1))
+        self.assertEqual(self.simplify_error, simplify_error)
 
 
 class TestErrorInCompileTime2(TestErrorInCompileTime):
@@ -143,5 +170,28 @@ class TestErrorInRuntime(TestErrorInCompileTime):
             self.assertIn(m, error_message)
 
 
+@unwrap
+@paddle.jit.to_static()
+def func_decorated_by_other_1():
+    return 1
+
+
+@paddle.jit.to_static()
+@unwrap
+def func_decorated_by_other_2():
+    return 1
+
+
+class TestErrorInOther(unittest.TestCase):
+    def test(self):
+        paddle.disable_static()
+        prog_trans = paddle.jit.ProgramTranslator()
+        with self.assertRaises(NotImplementedError):
+            prog_trans.get_output(func_decorated_by_other_1)
+
+        with self.assertRaises(NotImplementedError):
+            func_decorated_by_other_2()
+
+
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab


From f1ab2882018fa0e59a2c2baa277bb37630c5a4d7 Mon Sep 17 00:00:00 2001
From: Wilber <jiweibo@baidu.com>
Date: Fri, 11 Sep 2020 12:46:09 +0800
Subject: [PATCH 047/261] enhance inference error info. (#27251)

---
 .../inference/api/details/zero_copy_tensor.cc | 63 ++++++++++++-------
 1 file changed, 41 insertions(+), 22 deletions(-)

diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
index a5a0405ac88..46755eeda66 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -21,15 +21,21 @@
 namespace paddle {
 
 void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {
-  PADDLE_ENFORCE(!name_.empty(),
-                 "Need to SetName first, so that the corresponding tensor can "
-                 "be retrieved.");
-  PADDLE_ENFORCE(input_or_output_,
-                 "Can't reshape the output tensor, it is readonly");
-  PADDLE_ENFORCE(scope_);
+  PADDLE_ENFORCE_EQ(
+      name_.empty(), false,
+      platform::errors::PreconditionNotMet(
+          "Need to SetName first, so that the corresponding tensor can "
+          "be retrieved."));
+  PADDLE_ENFORCE_EQ(input_or_output_, true,
+                    platform::errors::PermissionDenied(
+                        "Can't reshape the output tensor, it is readonly"));
+  PADDLE_ENFORCE_NOT_NULL(scope_, platform::errors::PreconditionNotMet(
+                                      "The scope should not be nullptr."));
   auto *scope = static_cast<framework::Scope *>(scope_);
   auto *var = scope->FindVar(name_);
-  PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::PreconditionNotMet(
+               "No tensor called [%s] in the runtime scope", name_));
   auto *tensor = var->GetMutable<framework::LoDTensor>();
   tensor->Resize(framework::make_ddim(shape));
 }
@@ -45,8 +51,10 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
   EAGER_GET_TENSOR;
   PADDLE_ENFORCE_GT(
       tensor->numel(), 0,
-      "You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
-      "function before retrieving mutable_data from input tensor.");
+      platform::errors::PreconditionNotMet(
+          "You should call ZeroCopyTensor::Reshape(const std::vector<int> "
+          "&shape)"
+          "function before retrieving mutable_data from input tensor."));
   switch (static_cast<int>(place)) {
     case static_cast<int>(PaddlePlace::kCPU): {
       return tensor->mutable_data<T>(platform::CPUPlace());
@@ -55,7 +63,8 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
       return tensor->mutable_data<T>(platform::CUDAPlace(device_));
     }
     default:
-      PADDLE_THROW("Unsupported place: %d", static_cast<int>(place));
+      PADDLE_THROW(platform::errors::Unavailable("Unsupported place: %d",
+                                                 static_cast<int>(place)));
       break;
   }
   return nullptr;
@@ -96,10 +105,11 @@ PaddleDType ZeroCopyTensor::type() const {
 template <typename T>
 void ZeroCopyTensor::copy_from_cpu(const T *data) {
   EAGER_GET_TENSOR;
-  PADDLE_ENFORCE_GE(
-      tensor->numel(), 0,
-      "You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
-      "function before copying data from cpu.");
+  PADDLE_ENFORCE_GE(tensor->numel(), 0,
+                    platform::errors::PreconditionNotMet(
+                        "You should call ZeroCopyTensor::Reshape(const "
+                        "std::vector<int> &shape)"
+                        "function before copying data from cpu."));
   size_t ele_size = tensor->numel() * sizeof(T);
 
   if (place_ == PaddlePlace::kCPU) {
@@ -116,7 +126,8 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) {
     memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
                  data, ele_size, dev_ctx->stream());
 #else
-    PADDLE_THROW("Not compiled with CUDA, should not reach here.");
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Not compiled with CUDA, should not reach here."));
 #endif
   }
 }
@@ -141,7 +152,8 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
 
     cudaStreamSynchronize(dev_ctx->stream());
 #else
-    PADDLE_THROW("Not compile with CUDA, should not reach here.");
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Not compile with CUDA, should not reach here."));
 #endif
   }
 }
@@ -176,20 +188,27 @@ template PD_INFER_DECL uint8_t *ZeroCopyTensor::mutable_data<uint8_t>(
     PaddlePlace place);
 
 void *ZeroCopyTensor::FindTensor() const {
-  PADDLE_ENFORCE(!name_.empty(),
-                 "Need to SetName first, so that the corresponding tensor can "
-                 "be retrieved.");
-  PADDLE_ENFORCE(scope_);
+  PADDLE_ENFORCE_EQ(
+      name_.empty(), false,
+      platform::errors::PreconditionNotMet(
+          "Need to SetName first, so that the corresponding tensor can "
+          "be retrieved."));
+  PADDLE_ENFORCE_NOT_NULL(scope_, platform::errors::PreconditionNotMet(
+                                      "The scope should not be nullptr."));
   auto *scope = static_cast<framework::Scope *>(scope_);
   auto *var = scope->FindVar(name_);
-  PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::PreconditionNotMet(
+               "No tensor called [%s] in the runtime scope", name_));
   auto *tensor = var->GetMutable<framework::LoDTensor>();
   return tensor;
 }
 
 std::vector<int> ZeroCopyTensor::shape() const {
   EAGER_GET_TENSOR;
-  PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_);
+  PADDLE_ENFORCE_NOT_NULL(
+      tensor_, platform::errors::PreconditionNotMet(
+                   "Not found tensor called %s in the scope", name_));
   return framework::vectorize<int>(tensor->dims());
 }
 
-- 
GitLab


From 33ff833af25849eb6f5d31bdb2742ccdbe337e02 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 11 Sep 2020 14:09:14 +0800
Subject: [PATCH 048/261] fix loaded no params layer run error (#27241)

---
 paddle/fluid/operators/run_program_op.cc      | 10 +--
 paddle/fluid/operators/run_program_op.h       |  7 ++-
 python/paddle/fluid/dygraph/io.py             | 14 +++--
 .../tests/unittests/test_jit_save_load.py     | 62 ++++++++++++++++++-
 4 files changed, 78 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/operators/run_program_op.cc b/paddle/fluid/operators/run_program_op.cc
index 04559a93c86..2d599716443 100644
--- a/paddle/fluid/operators/run_program_op.cc
+++ b/paddle/fluid/operators/run_program_op.cc
@@ -27,9 +27,6 @@ class RunProgramOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(ctx->HasInputs("X"), true,
                       platform::errors::NotFound(
                           "Input(X) of RunProgramOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInputs("Params"), true,
-                      platform::errors::NotFound(
-                          "Input(Params) of RunProgramOp should not be null."));
     PADDLE_ENFORCE_EQ(ctx->HasOutputs("Out"), true,
                       platform::errors::NotFound(
                           "Output(Out) of RunProgramOp should not be null."));
@@ -73,7 +70,8 @@ class RunProgramOpMaker : public framework::OpProtoAndCheckerMaker {
              "(vector<LoDTensor or SelecetedRows>)"
              "The input parameter of RunProgram operator, also the parameters "
              "of the loaded program.")
-        .AsDuplicable();
+        .AsDuplicable()
+        .AsDispensable();
     AddOutput("Out",
               "(vector<LoDTensor>)"
               "The output tensors of RunProgram operator, also the fetch "
@@ -121,10 +119,6 @@ class RunProgramGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(ctx->HasInputs("X"), true,
                       platform::errors::NotFound(
                           "Input(X) of RunProgramGradOp should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInputs("Params"), true,
-        platform::errors::NotFound(
-            "Input(Params) of RunProgramGradOp should not be null."));
     PADDLE_ENFORCE_EQ(
         ctx->HasInputs(framework::GradVarName("Out")), true,
         platform::errors::NotFound(
diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h
index 1c493fc6be0..5afe25cf687 100644
--- a/paddle/fluid/operators/run_program_op.h
+++ b/paddle/fluid/operators/run_program_op.h
@@ -209,9 +209,14 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
     auto output_vars = ctx.MultiOutputVar("Out");
 
     auto input_var_names = ctx.InputNames("X");
-    auto param_names = ctx.InputNames("Params");
     auto output_var_names = ctx.OutputNames("Out");
 
+    // current program may not hold parameters
+    std::vector<std::string> param_names;
+    if (!param_vars.empty()) {
+      param_names = ctx.InputNames("Params");
+    }
+
     auto *block = ctx.Attr<BlockDesc *>("global_block");
     auto *program = block->Program();
     auto start_op_index = ctx.Attr<int64_t>("start_op_index");
diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py
index 1d2ea142c7d..335ac500c89 100644
--- a/python/paddle/fluid/dygraph/io.py
+++ b/python/paddle/fluid/dygraph/io.py
@@ -479,11 +479,15 @@ def _load_persistable_vars(model_path,
             var_file_path = os.path.join(model_path, params_filename)
         else:
             var_file_path = os.path.join(model_path, VARIABLE_FILENAME)
-        framework._dygraph_tracer().trace_op(
-            type='load_combine',
-            inputs={},
-            outputs={'Out': load_var_list},
-            attrs={'file_path': var_file_path})
+        if not os.path.exists(var_file_path):
+            if len(extra_var_info) != 0:
+                raise ValueError("The model to be loaded is incomplete.")
+        else:
+            framework._dygraph_tracer().trace_op(
+                type='load_combine',
+                inputs={},
+                outputs={'Out': load_var_list},
+                attrs={'file_path': var_file_path})
 
     return load_var_dict
 
diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
index f0680206de2..7e6ca8076de 100644
--- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
@@ -23,7 +23,7 @@ from paddle.static import InputSpec
 import paddle.fluid as fluid
 from paddle.fluid.dygraph import Linear
 from paddle.fluid.dygraph import declarative, ProgramTranslator
-from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME
+from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME
 
 BATCH_SIZE = 32
 BATCH_NUM = 10
@@ -153,6 +153,24 @@ class LinearNetReturnHidden(fluid.dygraph.Layer):
         return y, loss
 
 
+class EmptyLayer(paddle.nn.Layer):
+    def __init__(self):
+        super(EmptyLayer, self).__init__()
+
+    @paddle.jit.to_static
+    def forward(self, x):
+        return x
+
+
+class NoParamLayer(paddle.nn.Layer):
+    def __init__(self):
+        super(NoParamLayer, self).__init__()
+
+    @paddle.jit.to_static
+    def forward(self, x, y):
+        return x + y
+
+
 def train(layer, input_size=784, label_size=1):
     # create optimizer
     sgd = fluid.optimizer.SGDOptimizer(
@@ -273,6 +291,15 @@ class TestJitSaveLoad(unittest.TestCase):
         with self.assertRaises(ValueError):
             model_dict, _ = fluid.dygraph.load_dygraph(model_path)
 
+    def test_jit_load_model_incomplete(self):
+        model_path = "model.test_jit_save_load.remove_variables"
+        self.train_and_save_model(model_path=model_path)
+        # remove `__variables__`	
+        var_path = os.path.join(model_path, VARIABLE_FILENAME)
+        os.remove(var_path)
+        with self.assertRaises(ValueError):
+            paddle.jit.load(model_path)
+
 
 class TestSaveLoadWithInputSpec(unittest.TestCase):
     def setUp(self):
@@ -695,5 +722,38 @@ class TestJitSaveMultiCases(unittest.TestCase):
                 configs=configs)
 
 
+class TestJitSaveLoadEmptyLayer(unittest.TestCase):
+    def setUp(self):
+        self.model_path = "model.jit_save_load_empty_layer"
+        # enable dygraph mode
+        paddle.disable_static()
+
+    def test_save_load_empty_layer(self):
+        layer = EmptyLayer()
+        x = paddle.to_variable(np.random.random((10)).astype('float32'))
+        out = layer(x)
+        paddle.jit.save(layer, self.model_path)
+        load_layer = paddle.jit.load(self.model_path)
+        load_out = load_layer(x)
+        self.assertTrue(np.array_equal(out, load_out))
+
+
+class TestJitSaveLoadNoParamLayer(unittest.TestCase):
+    def setUp(self):
+        self.model_path = "model.jit_save_load_no_param_layer"
+        # enable dygraph mode
+        paddle.disable_static()
+
+    def test_save_load_no_param_layer(self):
+        layer = NoParamLayer()
+        x = paddle.to_variable(np.random.random((5)).astype('float32'))
+        y = paddle.to_variable(np.random.random((5)).astype('float32'))
+        out = layer(x, y)
+        paddle.jit.save(layer, self.model_path)
+        load_layer = paddle.jit.load(self.model_path)
+        load_out = load_layer(x, y)
+        self.assertTrue(np.array_equal(out, load_out))
+
+
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab


From 20a84820fdd4d4f06179a33c393896ebed8bf480 Mon Sep 17 00:00:00 2001
From: Aurelius84 <zhangliujie@baidu.com>
Date: Fri, 11 Sep 2020 15:49:09 +0800
Subject: [PATCH 049/261] fix unused var with zero gradient bug in
 fluid.gradient (#27246)

* fix calcu_gradients

* fix code place

* fix embedding interface usage
---
 python/paddle/fluid/backward.py               |  6 ++++
 .../dygraph_to_static/test_partial_program.py | 29 +++++++++++++++++++
 .../tests/unittests/test_calc_gradient.py     | 19 +++++++++++-
 3 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 898c7d29564..d51cacd1a5c 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -1756,6 +1756,12 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
     op_path_dict = dict()
     op_path = _find_op_path_(block, targets, inputs, block_no_grad_set,
                              op_path_dict)
+
+    # find no grad var by op_path
+    no_grad_vars = _find_no_grad_vars(block, op_path, targets,
+                                      block_no_grad_set)
+    block_no_grad_set.update(no_grad_vars)
+
     no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set)))
     grad_to_var = dict()
     grad_info_map = dict()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py
index f0fbe54f9db..91067f36099 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import numpy as np
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.layers.utils import flatten
 from paddle.fluid.dygraph import declarative, ProgramTranslator
@@ -151,5 +152,33 @@ class TestWithTrainAndEval(unittest.TestCase):
                              partial_layer._train_program)
 
 
+class GPT2LMHeadModel(fluid.dygraph.Layer):
+    def __init__(self):
+        super(GPT2LMHeadModel, self).__init__()
+        self.embedding0 = paddle.nn.Embedding(20, 16)
+        self.embedding1 = paddle.nn.Embedding(20, 32)
+        self.lm_head_weight = paddle.to_tensor(
+            np.random.rand(2, 3).astype('float32'))
+
+    @declarative
+    def forward(self, x):
+        x = fluid.layers.reshape(x, shape=[-1, 6])
+        x1, x2, x3 = fluid.layers.split(input=x, dim=1, num_or_sections=3)
+        return x1
+
+
+class TestPruneUnusedParamInProgram(unittest.TestCase):
+    def test_prune(self):
+        input_ids = np.array([[15, 11, 6, 3, 18, 13]]).astype("float32")
+
+        place = fluid.CPUPlace()
+        with fluid.dygraph.guard(place):
+            model = GPT2LMHeadModel()
+            model.eval()
+            input_ids = paddle.to_tensor(input_ids)
+            out = model(input_ids)
+            self.assertTrue(np.array_equal(out.numpy(), [[15, 11]]))
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_calc_gradient.py b/python/paddle/fluid/tests/unittests/test_calc_gradient.py
index 3e8c449d899..fdfaf6a3113 100644
--- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py
@@ -15,7 +15,7 @@
 from __future__ import print_function
 
 import unittest
-
+import numpy as np
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid.backward import calc_gradient
@@ -81,5 +81,22 @@ class TestDoubleGrad(unittest.TestCase):
         self.assertEqual(12, out[0])
 
 
+class TestGradientWithPrune(unittest.TestCase):
+    def test_prune(self):
+        x = fluid.data(name='x', shape=[3], dtype='float32')
+        x.stop_gradient = False
+        x1, x2, x3 = fluid.layers.split(x, dim=0, num_or_sections=3)
+        y = x1 * 2
+        x1_grad = fluid.gradients(y, x)
+
+        exe = fluid.Executor(fluid.CPUPlace())
+        main = fluid.default_main_program()
+        exe.run(fluid.default_startup_program())
+        out = exe.run(main,
+                      feed={'x': np.ones([3]).astype('float32')},
+                      fetch_list=[x1_grad])
+        self.assertTrue(np.array_equal(out[0], [2., 0., 0.]))
+
+
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab


From f402d8d8c0239f39f67342e09ba865d4d419a0ce Mon Sep 17 00:00:00 2001
From: Leo Chen <chenqiuliang@baidu.com>
Date: Fri, 11 Sep 2020 18:22:57 +0800
Subject: [PATCH 050/261] fix bug when axis is a tensor with more than 1
 element (#27263)

---
 python/paddle/fluid/layers/nn.py                         | 2 +-
 python/paddle/fluid/tests/unittests/test_unsqueeze_op.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 5a14b9fdc7b..bc9f182d95e 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -6309,7 +6309,7 @@ def unsqueeze(input, axes, name=None):
         if isinstance(axes, int):
             axes = [axes]
         elif isinstance(axes, Variable):
-            axes = [axes.numpy().item(0)]
+            axes = axes.numpy().tolist()
         elif isinstance(axes, (list, tuple)):
             axes = [
                 item.numpy().item(0) if isinstance(item, Variable) else item
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
index 6f713172f1b..1975e430602 100644
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
@@ -167,8 +167,9 @@ class API_TestDyUnsqueezeAxisTensor(unittest.TestCase):
         with fluid.dygraph.guard():
             input1 = np.random.random([5, 10]).astype("int32")
             out1 = np.expand_dims(input1, axis=1)
+            out1 = np.expand_dims(out1, axis=2)
             input = fluid.dygraph.to_variable(input1)
-            output = paddle.unsqueeze(input, axis=paddle.to_tensor([1]))
+            output = paddle.unsqueeze(input, axis=paddle.to_tensor([1, 2]))
             out_np = output.numpy()
             self.assertTrue(np.array_equal(out1, out_np))
             self.assertEqual(out1.shape, out_np.shape)
-- 
GitLab


From 19228bd14292ede64821565e6f1c36b7aef2024c Mon Sep 17 00:00:00 2001
From: Leo Chen <chenqiuliang@baidu.com>
Date: Fri, 11 Sep 2020 18:57:06 +0800
Subject: [PATCH 051/261] Temporally disable zero_copy (#27248)

* temporally disable zero_copy

* add test

* follow comments
---
 python/paddle/fluid/dygraph/base.py           | 18 +++++++++++++----
 .../unittests/test_imperative_numpy_bridge.py | 20 +++++++++++++------
 python/paddle/tensor/creation.py              |  9 ++++-----
 3 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py
index c548bdfeba1..2f95c2b9007 100644
--- a/python/paddle/fluid/dygraph/base.py
+++ b/python/paddle/fluid/dygraph/base.py
@@ -25,6 +25,7 @@ from .tracer import Tracer
 import logging
 import objgraph
 from ..data_feeder import convert_dtype
+import warnings
 
 __all__ = [
     'no_grad', 'no_grad_', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph',
@@ -609,10 +610,10 @@ def to_variable(value, name=None, zero_copy=None, dtype=None):
             uint8, uint16, complex64, complex128}.
         name(str, optional): The default value is None. Normally there is no 
             need for user to set this property. For more information, please 
-            refer to :ref:`api_guide_Name` .
+            refer to :ref:`api_guide_Name` . 
         zero_copy(bool, optional): Whether to share memory with the input numpy 
             array. This parameter only works with CPUPlace and will be set to 
-            True when it is None. Default: None.
+            True when it is None. Default: None. (Note: zero_copy is discarded temporally for some reason.)
         dtype(str, optional): The desired data type of returned ``Variable`` .
             Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , 
             'int32' , 'int64' , 'uint8' . Default: None.
@@ -665,8 +666,17 @@ def to_variable(value, name=None, zero_copy=None, dtype=None):
     else:
         if isinstance(framework._current_expected_place(),
                       framework.core.CPUPlace):
-            if zero_copy is None:
-                zero_copy = True
+            #TODO(zhiqiu): we found two problems when enable zero_copy on CPUPlace.
+            # (1): eigen requires 16-bytes alignments, but the data of numpy array may not statisfy. 
+            # Details: https://eigen.tuxfamily.org/dox/group__TopicUnalignedArrayAssert.html
+            # (2): when used in flask framework, it may result in hang.
+            # Details: https://github.com/PaddlePaddle/Paddle/issues/26635
+            # So, we temporally diable the zero_copy strategy.
+            if zero_copy == True:
+                warnings.warn(
+                    "Currently, zero_copy is not supported, and it will be discarded."
+                )
+                zero_copy = False
         else:
             assert not zero_copy, "zero_copy mode can only be used with CPUPlace"
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py
index da01be8159a..772dd913e4d 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py
@@ -15,18 +15,26 @@
 import unittest
 import numpy as np
 import paddle.fluid as fluid
+import warnings
 
 
 class TestImperativeNumpyBridge(unittest.TestCase):
     def test_tensor_from_numpy(self):
         data_np = np.array([[2, 3, 1]]).astype('float32')
         with fluid.dygraph.guard(fluid.CPUPlace()):
-            var = fluid.dygraph.to_variable(data_np, zero_copy=True)
-            self.assertTrue(np.array_equal(var.numpy(), data_np))
-            data_np[0][0] = 4
-            self.assertEqual(data_np[0][0], 4)
-            self.assertEqual(var[0][0].numpy()[0], 4)
-            self.assertTrue(np.array_equal(var.numpy(), data_np))
+            with warnings.catch_warnings(record=True) as w:
+                warnings.simplefilter("always")
+                var = fluid.dygraph.to_variable(data_np, zero_copy=True)
+                assert "Currently, zero_copy is not supported, and it will be discarded." in str(
+                    w[-1].message)
+            # Temporally diable zero_copy
+            # var = fluid.dygraph.to_variable(data_np, zero_copy=True)
+            # self.assertTrue(np.array_equal(var.numpy(), data_np))
+            # data_np[0][0] = 4
+            # self.assertEqual(data_np[0][0], 4)
+            # self.assertEqual(var[0][0].numpy()[0], 4)
+            # self.assertTrue(np.array_equal(var.numpy(), data_np))
+
             var2 = fluid.dygraph.to_variable(data_np, zero_copy=False)
             self.assertTrue(np.array_equal(var2.numpy(), data_np))
             data_np[0][0] = -1
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index b75e2a8851f..8011b92964b 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -63,8 +63,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
 
     If the ``data`` is already a tensor, and ``dtype`` or ``place`` does't change, no copy 
     will be performed and return origin tensor, otherwise a new tensor will be constructed
-    and returned. Similarly, if the data is an numpy\.ndarray of with the same ``dtype`` 
-    and the current place is cpu, no copy will be performed.
+    and returned. 
 
     The ``ComplexTensor`` is a unique type of paddle. If x is ``ComplexTensor``, then 
     ``x.real`` is the real part, and ``x.imag`` is the imaginary part.
@@ -209,20 +208,20 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
             value=data,
             place=place,
             persistable=False,
-            zero_copy=True,
+            zero_copy=False,
             stop_gradient=stop_gradient)
     else:
         name = unique_name.generate('generated_tensor')
         real_tensor = paddle.Tensor(
             value=data.real,
             place=place,
-            zero_copy=True,
+            zero_copy=False,
             name=name + ".real",
             stop_gradient=stop_gradient)
         imag_tensor = paddle.Tensor(
             value=data.imag,
             place=place,
-            zero_copy=True,
+            zero_copy=False,
             name=name + ".imag",
             stop_gradient=stop_gradient)
         return paddle.ComplexTensor(real_tensor, imag_tensor)
-- 
GitLab


From 7745ad55ede759f5fe26fb709af450b88c406f0b Mon Sep 17 00:00:00 2001
From: LoveAn <mr.avin0323@gmail.com>
Date: Sat, 12 Sep 2020 13:51:41 +0800
Subject: [PATCH 052/261] Add details to the summary for show more error
 informations (#27165)

* Add details to the summary and test it, test=document_fix

* Add set +e before example, test=document_fix

* Remove test code, test=document_fix

* Optimize summary information and test it, test=document_fix

* Remove test code, test=document_fix
---
 paddle/scripts/paddle_build.sh | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 88d9e6e55d5..ec07565c5af 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -296,13 +296,13 @@ function check_style() {
     commit_files=on
     for file_name in `git diff --numstat upstream/$BRANCH |awk '{print $NF}'`;do
         if ! pre-commit run --files $file_name ; then
-            git diff
             commit_files=off
         fi
     done 
     
     if [ $commit_files == 'off' ];then
         echo "code format error"
+        git diff 2>&1
         exit 4
     fi
     trap : 0
@@ -1447,7 +1447,7 @@ function example() {
     cd ${PADDLE_ROOT}/tools
     python sampcd_processor.py cpu;example_error=$?
     if [ "$example_error" != "0" ];then
-      echo "Code instance execution failed"
+      echo "Code instance execution failed" >&2
       exit 5
     fi
 }
@@ -1456,15 +1456,25 @@ function summary_check_problems() {
     set +x
     local check_style_code=$1
     local example_code=$2
+    local check_style_info=$3
+    local example_info=$4
     if [ $check_style_code -ne 0 -o $example_code -ne 0 ];then
       echo "========================================"
       echo "summary problems:"
+      if [ $check_style_code -ne 0 -a $example_code -ne 0 ];then
+        echo "There are 2 errors: Code format error and Example code error."
+      else
+        [ $check_style_code -ne 0 ] && echo "There is 1 error: Code format error."
+        [ $example_code -ne 0 ] && echo "There is 1 error: Example code error."
+      fi
       echo "========================================"
       if [ $check_style_code -ne 0 ];then
-        echo "- Check code style failed! Please check the log and fix problems."
+        echo "*****Code format error***** Please fix it according to the diff information:"
+        echo "$check_style_info" | grep "code format error" -A $(echo "$check_style_info" | wc -l)
       fi
       if [ $example_code -ne 0 ];then
-        echo "- Check example code failed! Please check the log and fix problems."
+        echo "*****Example code error***** Please fix the error listed in the information:"
+        echo "$example_info" | grep "API check -- Example Code" -A $(echo "$example_info" | wc -l)
       fi
       [ $check_style_code -ne 0 ] && exit $check_style_code
       [ $example_code -ne 0 ] && exit $example_code
@@ -1486,15 +1496,16 @@ function main() {
         ;;
       build_and_check)
         set +e
-        $(check_style >&2)
+        check_style_info=$(check_style)
         check_style_code=$?
         generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number}
         cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
         check_sequence_op_unittest
         generate_api_spec ${PYTHON_ABI:-""} "PR"
-        $(example >&2)
+        set +e
+        example_info=$(example)
         example_code=$?
-        summary_check_problems $check_style_code $example_code
+        summary_check_problems $check_style_code $example_code "$check_style_info" "$example_info"
         assert_api_spec_approvals
         ;;
       build)
-- 
GitLab


From 5c4eed66fd9ee485fa0dd47fc319aaa0830f3e17 Mon Sep 17 00:00:00 2001
From: lidanqing <danqing.li@intel.com>
Date: Sat, 12 Sep 2020 12:58:16 +0200
Subject: [PATCH 053/261] Fix GRU mkldnn kernel fail on look_table_v2 (#27198)

* Fix the lookup_table_v2 failed on GRU mkldnn kernel issue
test=develop

* fix according to reviews, removed x_num_col_dims
test=develop

* update gru model. change according to reviews
test=develop

* change according to reviews
test=develop
---
 .../fluid/inference/tests/api/CMakeLists.txt  |  4 +-
 paddle/fluid/operators/fused/fusion_gru_op.cc | 59 +++++++++++--------
 .../fused/mkldnn/fusion_gru_mkldnn_op.cc      | 11 ++--
 3 files changed, 42 insertions(+), 32 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index fd4b1a54d2b..b3ec4b5714e 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -342,9 +342,9 @@ if(WITH_MKLDNN)
   ### Lexcial analysis GRU model
   set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru")
   download_GRU_data("${GRU_PATH}" "GRU_eval_data.tar.gz")
-  download_GRU_data("${GRU_PATH}" "GRU_eval_model.tar.gz")
+  download_GRU_data("${GRU_PATH}" "GRU_eval_model_v2.tar.gz")
   set(GRU_DATA_PATH "${GRU_PATH}/GRU_eval_data.bin")
-  set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model")
+  set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model_v2")
   set(LEXICAL_TEST_APP "test_analyzer_lexical_analysis")
   set(LEXICAL_TEST_APP_SRC "analyzer_lexical_analysis_gru_tester.cc")
 
diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc
index d0920098f60..f731a78f77b 100644
--- a/paddle/fluid/operators/fused/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fused/fusion_gru_op.cc
@@ -30,16 +30,18 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
   OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fusion_gru");
   OP_INOUT_CHECK(ctx->HasInput("WeightX"), "Input", "WeightX", "fusion_gru");
   OP_INOUT_CHECK(ctx->HasInput("WeightH"), "Input", "WeightH", "fusion_gru");
-
   OP_INOUT_CHECK(ctx->HasOutput("XX"), "Output", "XX", "fusion_gru");
   OP_INOUT_CHECK(ctx->HasOutput("Hidden"), "Output", "Hidden", "fusion_gru");
-
   auto x_dims = ctx->GetInputDim("X");
-  PADDLE_ENFORCE_EQ(x_dims.size(), 2,
-                    platform::errors::InvalidArgument(
-                        "Input(X)'s rank must be 2, but received input dim "
-                        "size is:%d, input dim is:[%s]",
-                        x_dims.size(), x_dims));
+  auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1)
+                        ? framework::flatten_to_2d(x_dims, 1)
+                        : x_dims;
+  PADDLE_ENFORCE_EQ(
+      x_mat_dims.size(), 2,
+      platform::errors::InvalidArgument("The size of input X dims should be 2, "
+                                        "or 3 with second dimension equal to "
+                                        "1, but now Input X dim is:[%s] ",
+                                        x_dims));
 
   auto wx_dims = ctx->GetInputDim("WeightX");
   PADDLE_ENFORCE_EQ(wx_dims.size(), 2,
@@ -47,12 +49,14 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
                         "The rank of Input(WeightX) should be 2, but received "
                         "WeightX dim size is:%d, WeightX dim is:[%s] ",
                         wx_dims.size(), wx_dims));
-  PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1],
-                    platform::errors::InvalidArgument(
-                        "The first dimension of Input(WeightX) "
-                        "should equal to second dimension of input x, but "
-                        "received WeightX dimension is:%d, x dimension is:%d",
-                        wx_dims[0], x_dims[1]));
+  PADDLE_ENFORCE_EQ(
+      wx_dims[0], x_mat_dims[1],
+      platform::errors::InvalidArgument(
+          "The first dimension of flattened WeightX"
+          "should equal to last dimension of flattened input X, but "
+          "received fattened WeightX dimension is:%d, flattened X dimension "
+          "is:%d",
+          wx_dims[0], x_mat_dims[1]));
 
   int frame_size = wx_dims[1] / 3;
   auto wh_dims = ctx->GetInputDim("WeightH");
@@ -102,24 +106,24 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
                           "received bias dim is:[%s], frame size is:%d",
                           b_dims, frame_size));
   }
-  framework::DDim out_dims({x_dims[0], frame_size});
+  framework::DDim out_dims({x_mat_dims[0], frame_size});
   ctx->SetOutputDim("Hidden", out_dims);
   ctx->ShareLoD("X", "Hidden");
   int xx_width;
   if (ctx->Attrs().Get<bool>("use_seq")) {
     xx_width = wx_dims[1];
   } else {
-    xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
+    xx_width = x_mat_dims[1] > wx_dims[1] ? wx_dims[1] : x_mat_dims[1];
     OP_INOUT_CHECK(ctx->HasOutput("ReorderedH0"), "Output", "ReorderedH0",
                    "fusion_gru");
     OP_INOUT_CHECK(ctx->HasOutput("BatchedInput"), "Output", "BatchedInput",
                    "fusion_gru");
     OP_INOUT_CHECK(ctx->HasOutput("BatchedOut"), "Output", "BatchedOut",
                    "fusion_gru");
-    ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
+    ctx->SetOutputDim("BatchedInput", {x_mat_dims[0], wx_dims[1]});
     ctx->SetOutputDim("BatchedOut", out_dims);
   }
-  ctx->SetOutputDim("XX", {x_dims[0], xx_width});
+  ctx->SetOutputDim("XX", {x_mat_dims[0], xx_width});
   ctx->ShareLoD("X", "XX");
 }
 
@@ -220,14 +224,17 @@ class FusionGRUKernel : public framework::OpKernel<T> {
     }
   }
 
-#define INIT_BASE_DEFINES                  \
-  auto* x = ctx.Input<LoDTensor>("X");     \
-  auto* wh = ctx.Input<Tensor>("WeightH"); \
-  auto* xx = ctx.Output<LoDTensor>("XX");  \
-  auto x_lod = x->lod();                   \
-  auto x_dims = x->dims();   /* T x M*/    \
-  auto wh_dims = wh->dims(); /* D x 3D*/   \
-  const int total_T = x_dims[0];           \
+#define INIT_BASE_DEFINES                                     \
+  auto* x = ctx.Input<LoDTensor>("X");                        \
+  auto* wh = ctx.Input<Tensor>("WeightH");                    \
+  auto* xx = ctx.Output<LoDTensor>("XX");                     \
+  auto x_lod = x->lod();                                      \
+  auto x_dims = x->dims(); /* T x M*/                         \
+  auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1)    \
+                        ? framework::flatten_to_2d(x_dims, 1) \
+                        : x_dims;                             \
+  auto wh_dims = wh->dims(); /* D x 3D*/                      \
+  const int total_T = x_mat_dims[0];                          \
   const int D3 = wh_dims[1]
 
 #define INIT_OTHER_DEFINES                                                   \
@@ -236,7 +243,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
   auto* bias = ctx.Input<Tensor>("Bias");                                    \
   auto* hidden_out = ctx.Output<LoDTensor>("Hidden");                        \
   bool is_reverse = ctx.Attr<bool>("is_reverse");                            \
-  const int M = x_dims[1];                                                   \
+  const int M = x_mat_dims[1];                                               \
   const int D = wh_dims[0];                                                  \
   const int D2 = D * 2;                                                      \
   const jit::gru_attr_t attr(                                                \
diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
index 3940aae53b8..a31fe168439 100644
--- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
@@ -364,13 +364,16 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
     const auto* weight_h = ctx.Input<Tensor>("WeightH");
     const auto* bias = ctx.Input<Tensor>("Bias");
     auto* hidden = ctx.Output<LoDTensor>("Hidden");
-
+    auto x_dims = input->dims();
+    auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1)
+                          ? framework::flatten_to_2d(x_dims, 1)
+                          : x_dims;
     // Get attributes
     const bool is_reverse = ctx.Attr<bool>("is_reverse");
     const bool origin_mode = ctx.Attr<bool>("origin_mode");
 
     // Get tensor dimensions
-    const auto x_dims = framework::vectorize(input->dims());
+    const auto x_mat_dims_vec = framework::vectorize(x_mat_dims);
     const auto weight_h_dims = framework::vectorize(weight_h->dims());
     const auto& input_lod = input->lod()[0];
 
@@ -384,8 +387,8 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
           }
           return res;
         }();
-    const int64_t IC = x_dims[1];         // Input channels
-    const int64_t OC = weight_h_dims[0];  // Output channels
+    const int64_t IC = x_mat_dims_vec[1];  // Input channels
+    const int64_t OC = weight_h_dims[0];   // Output channels
 
     GRUMKLDNNHandler<T> handler(ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(),
                                 input, weight_h, h0, is_reverse, N, Ti, IC, OC,
-- 
GitLab


From 5c1bafbbc61633dcb4bbf541d8fa4dcebcfa8c12 Mon Sep 17 00:00:00 2001
From: Zhang Ting <zhangting_2017@163.com>
Date: Sun, 13 Sep 2020 22:25:07 +0800
Subject: [PATCH 054/261] use eval to improve performance, test=develop
 (#25459)

---
 paddle/fluid/operators/dist_op.h | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h
index ca03400cfd1..a2279e40623 100644
--- a/paddle/fluid/operators/dist_op.h
+++ b/paddle/fluid/operators/dist_op.h
@@ -176,14 +176,26 @@ static void DistGradFunction(const framework::ExecutionContext& context) {
   } else if (p == INFINITY || p == -INFINITY) {
     // p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if
     // j!=i, or equals to sign(z_i) * dout if j=i.
-    grad_t.device(place) =
-        (x_minux_y_abs == out_t.broadcast(out_bcast_dims)).template cast<T>() *
-        sign * out_grad_t.broadcast(out_bcast_dims);
+    if (platform::is_cpu_place(context.GetPlace())) {
+      grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims))
+                                 .template cast<T>() *
+                             sign.eval() * out_grad_t.broadcast(out_bcast_dims);
+    } else {
+      grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims))
+                                 .template cast<T>() *
+                             sign * out_grad_t.broadcast(out_bcast_dims);
+    }
   } else {
     // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout
-    grad_t.device(place) =
-        (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign *
-        out_grad_t.broadcast(out_bcast_dims);
+    if (platform::is_cpu_place(context.GetPlace())) {
+      grad_t.device(place) =
+          (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) *
+          sign.eval() * out_grad_t.broadcast(out_bcast_dims);
+    } else {
+      grad_t.device(place) =
+          (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign *
+          out_grad_t.broadcast(out_bcast_dims);
+    }
   }
 
   Eigen::DSizes<int, Rank * 2> x_reshape_dims;
-- 
GitLab


From 9437ce36c42d740942c83429442ecfb2b89eba4f Mon Sep 17 00:00:00 2001
From: Jack Zhou <136876878@qq.com>
Date: Mon, 14 Sep 2020 09:38:41 +0800
Subject: [PATCH 055/261] Error description optimize for math dir

Error description optimize for math dir
---
 paddle/fluid/operators/math/math_function.cc  |  18 ++-
 paddle/fluid/operators/math/math_function.cu  |  33 ++++-
 .../fluid/operators/math/math_function_impl.h |  65 +++++++--
 .../operators/math/math_function_test.cc      |   6 +-
 .../operators/math/math_function_test.cu      |   7 +-
 paddle/fluid/operators/math/padding.h         |  10 +-
 paddle/fluid/operators/math/sampler.h         |   7 +-
 paddle/fluid/operators/math/vol2col.cc        | 124 +++++++++++-------
 paddle/fluid/operators/math/vol2col.cu        | 124 +++++++++++-------
 9 files changed, 271 insertions(+), 123 deletions(-)

diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index 824e66b1eb4..f44b33fcf2f 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -128,9 +128,23 @@ struct RowwiseAdd<platform::CPUDeviceContext, T> {
                   const framework::Tensor& input,
                   const framework::Tensor& vector, framework::Tensor* output) {
     auto in_dims = input.dims();
+    auto out_dims = output->dims();
     auto size = input.numel() / in_dims[0];
-    PADDLE_ENFORCE_EQ(vector.numel(), size);
-    PADDLE_ENFORCE_EQ(output->dims(), in_dims);
+    PADDLE_ENFORCE_EQ(
+        vector.numel(), size,
+        platform::errors::InvalidArgument(
+            "The input vector size"
+            " should be equal to the size of each row of input tensor."
+            " Expected vector size=%d, but received %d",
+            size, vector.numel()));
+    const char* in_dims_cstr = in_dims.to_str().c_str();
+    const char* out_dims_cstr = out_dims.to_str().c_str();
+    PADDLE_ENFORCE_EQ(out_dims, in_dims,
+                      platform::errors::InvalidArgument(
+                          "The output tensor shape should be same as the input"
+                          " tensor shape. Expected output tensor shape: %s,"
+                          " but received %s",
+                          in_dims_cstr, out_dims_cstr));
 
     auto in = framework::EigenMatrix<T>::From(input);
     auto vec = framework::EigenVector<T>::Flatten(vector);
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index fba143d017d..1c519d226eb 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -88,9 +88,24 @@ struct RowwiseAdd<platform::CUDADeviceContext, T> {
                   const framework::Tensor& input,
                   const framework::Tensor& vector, framework::Tensor* output) {
     auto in_dims = input.dims();
+    auto out_dims = output->dims();
     auto size = input.numel() / in_dims[0];
-    PADDLE_ENFORCE_EQ(vector.numel(), size);
-    PADDLE_ENFORCE_EQ(output->dims(), in_dims);
+    PADDLE_ENFORCE_EQ(
+        vector.numel(), size,
+        platform::errors::InvalidArgument(
+            "The input vector size"
+            " should be equal to the size of each row of input tensor."
+            " Expected vector size=%d, but received %d",
+            size, vector.numel()));
+    const char* in_dims_cstr = in_dims.to_str().c_str();
+    const char* out_dims_cstr = out_dims.to_str().c_str();
+    PADDLE_ENFORCE_EQ(
+        out_dims, in_dims,
+        platform::errors::InvalidArgument(
+            "The output tensor shape should be same as the input tensor"
+            " shape. Expected output tensor shape: %s,"
+            " but received %s",
+            in_dims_cstr, out_dims_cstr));
     int blocks = 512;
     int grids = (input.numel() + blocks - 1) / blocks;
     RowwiseAddKernel<T><<<grids, blocks, 0, context.stream()>>>(
@@ -113,7 +128,12 @@ void ColwiseSum<platform::CUDADeviceContext, double>::operator()(
     framework::Tensor* vector) {
   auto in_dims = input.dims();
   auto size = input.numel() / in_dims[0];
-  PADDLE_ENFORCE_EQ(vector->numel(), size);
+  PADDLE_ENFORCE_EQ(vector->numel(), size,
+                    platform::errors::InvalidArgument(
+                        "The size of input vector"
+                        " should be equal to the size of input tensor column"
+                        " dimension. Expected vector size=%d, but received %d",
+                        size, vector->numel()));
   framework::Tensor one;
   one.mutable_data<double>({in_dims[0]}, context.GetPlace());
   SetConstant<platform::CUDADeviceContext, double> set;
@@ -134,7 +154,12 @@ void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
     framework::Tensor* vector) {
   auto in_dims = input.dims();
   auto size = input.numel() / in_dims[0];
-  PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]);
+  PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0],
+                    platform::errors::InvalidArgument(
+                        "The size of input vector"
+                        " should be equal to the size of input tensor row"
+                        " dimension. Expected vector size=%d, but received %d",
+                        in_dims[0], vector->numel()));
   framework::Tensor one;
   one.mutable_data<double>({size}, context.GetPlace());
   SetConstant<platform::CUDADeviceContext, double> set;
diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h
index 693d5620460..869a3054598 100644
--- a/paddle/fluid/operators/math/math_function_impl.h
+++ b/paddle/fluid/operators/math/math_function_impl.h
@@ -59,7 +59,12 @@ void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
                                               framework::Tensor* out) {
   auto in_dims = input.dims();
   auto size = input.numel() / in_dims[0];
-  PADDLE_ENFORCE_EQ(out->numel(), size);
+  PADDLE_ENFORCE_EQ(out->numel(), size,
+                    platform::errors::InvalidArgument(
+                        "The size of output tensor "
+                        "should be equal to the size of input tensor column"
+                        " dimension. Expected output size=%d, but received %d",
+                        size, out->numel()));
 
   auto in = framework::EigenMatrix<T>::From(input);
   auto vec = framework::EigenVector<T>::Flatten(*out);
@@ -78,7 +83,13 @@ class ColwiseSum<platform::CPUDeviceContext, T> {
     auto& in_dims = input.dims();
     auto height = in_dims[0];
     auto size = in_dims[1];
-    PADDLE_ENFORCE_EQ(out->numel(), size);
+    PADDLE_ENFORCE_EQ(
+        out->numel(), size,
+        platform::errors::InvalidArgument(
+            "The size of output tensor "
+            "should be equal to the size of input tensor column"
+            " dimension. Expected output size=%d, but received %d",
+            size, out->numel()));
 
     T* out_buf = out->mutable_data<T>(out->place());
     const T* in_buf = input.data<T>();
@@ -100,8 +111,16 @@ void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context,
                                                const framework::Tensor& input,
                                                framework::Tensor* out) {
   auto in_dims = input.dims();
-  PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
-  PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
+  PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
+                                            "The rank of input tensor "
+                                            "should be 2, but received %d",
+                                            in_dims.size()));
+  PADDLE_ENFORCE_EQ(out->numel(), in_dims[0],
+                    platform::errors::InvalidArgument(
+                        "The size of output tensor "
+                        "should be equal to the size of input tensor row"
+                        " dimension. Expected output size=%d, but received %d",
+                        in_dims[0], out->numel()));
 
   auto in = framework::EigenMatrix<T>::From(input);
   auto vec = framework::EigenVector<T>::Flatten(*out);
@@ -118,10 +137,19 @@ class RowwiseMean<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input, framework::Tensor* out) {
     auto& in_dims = input.dims();
-    PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
+    PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
+                                              "The rank of input tensor "
+                                              "should be 2, but received %d",
+                                              in_dims.size()));
     auto height = in_dims[0];
     auto size = in_dims[1];
-    PADDLE_ENFORCE_EQ(out->numel(), height);
+    PADDLE_ENFORCE_EQ(
+        out->numel(), height,
+        platform::errors::InvalidArgument(
+            "The size of output tensor "
+            "should be equal to the size of input tensor row"
+            " dimension. Expected output size=%d, but received %d",
+            height, out->numel()));
     auto inv_size = 1.0 / size;
     T* out_buf = out->mutable_data<T>(out->place());
     const T* in_buf = input.data<T>();
@@ -141,8 +169,16 @@ void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
                                               const framework::Tensor& input,
                                               framework::Tensor* out) {
   auto in_dims = input.dims();
-  PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
-  PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]);
+  PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
+                                            "The rank of input tensor "
+                                            "should be 2, but received %d",
+                                            in_dims.size()));
+  PADDLE_ENFORCE_EQ(out->numel(), in_dims[0],
+                    platform::errors::InvalidArgument(
+                        "The size of output tensor "
+                        "should be equal to the size of input tensor row"
+                        " dimension. Expected output size=%d, but received %d",
+                        in_dims[0], out->numel()));
 
   auto in = framework::EigenMatrix<T>::From(input);
   auto vec = framework::EigenVector<T>::Flatten(*out);
@@ -159,10 +195,19 @@ class RowwiseSum<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input, framework::Tensor* out) {
     auto& in_dims = input.dims();
-    PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
+    PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
+                                              "The rank of input tensor "
+                                              "should be 2, but received %d",
+                                              in_dims.size()));
     auto height = in_dims[0];
     auto size = in_dims[1];
-    PADDLE_ENFORCE_EQ(out->numel(), height);
+    PADDLE_ENFORCE_EQ(
+        out->numel(), height,
+        platform::errors::InvalidArgument(
+            "The size of output tensor "
+            "should be equal to the size of input tensor row"
+            " dimension. Expected output size=%d, but received %d",
+            height, out->numel()));
 
     T* out_buf = out->mutable_data<T>(out->place());
     const T* in_buf = input.data<T>();
diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc
index 2343e0ee965..587823e535a 100644
--- a/paddle/fluid/operators/math/math_function_test.cc
+++ b/paddle/fluid/operators/math/math_function_test.cc
@@ -224,7 +224,11 @@ TEST(math_funciton, set_constant) {
   auto* ctx = new paddle::platform::CPUDeviceContext();
   paddle::operators::math::set_constant(*ctx, &t, 10);
   for (int64_t i = 0; i < t.numel(); ++i) {
-    PADDLE_ENFORCE_EQ(10, t.data<int>()[i]);
+    PADDLE_ENFORCE_EQ(10, t.data<int>()[i],
+                      paddle::platform::errors::InvalidArgument(
+                          "Each value of input"
+                          "tensor should be 10, but received %d.",
+                          t.data<int>()[i]));
   }
   delete ctx;
 }
diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu
index bcbb4a8274f..44b1ee45a4f 100644
--- a/paddle/fluid/operators/math/math_function_test.cu
+++ b/paddle/fluid/operators/math/math_function_test.cu
@@ -18,7 +18,12 @@
 
 void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
                     const std::vector<float>& data) {
-  PADDLE_ENFORCE_EQ(size, data.size());
+  PADDLE_ENFORCE_EQ(
+      size, data.size(),
+      paddle::platform::errors::InvalidArgument(
+          "The size of argument data should"
+          " be equal to the argument size. Expected %d, but received %d.",
+          size, data.size()));
   for (size_t i = 0; i < data.size(); ++i) {
     in_ptr[i] = paddle::platform::float16(data[i]);
   }
diff --git a/paddle/fluid/operators/math/padding.h b/paddle/fluid/operators/math/padding.h
index 63f793433de..379b21c3c18 100644
--- a/paddle/fluid/operators/math/padding.h
+++ b/paddle/fluid/operators/math/padding.h
@@ -85,8 +85,9 @@ void PaddingFunctor(int rank, const framework::ExecutionContext& context,
       PadFunction<DeviceContext, T, 6>(context, pads, src, pad_value, out);
       break;
     default:
-      PADDLE_THROW(
-          "PadOp only support tensors with no more than 6 dimensions.");
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "PadOp only support tensors with no more"
+          " than 6 dimensions currently."));
   }
 }
 
@@ -114,8 +115,9 @@ void PaddingGradFunctor(int rank, const framework::ExecutionContext& context,
       PadGradFunction<DeviceContext, T, 6>(context, pads, src, out);
       break;
     default:
-      PADDLE_THROW(
-          "PadOp only support tensors with no more than 6 dimensions.");
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "PadOp only support tensors with no more"
+          " than 6 dimensions currently."));
   }
 }
 
diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h
index 480576ef9dc..de9113f2bb6 100644
--- a/paddle/fluid/operators/math/sampler.h
+++ b/paddle/fluid/operators/math/sampler.h
@@ -19,6 +19,8 @@ limitations under the License. */
 #include <random>
 #include <vector>
 
+#include "paddle/fluid/platform/enforce.h"
+
 namespace paddle {
 namespace operators {
 namespace math {
@@ -31,7 +33,10 @@ namespace math {
 class Sampler {
  public:
   explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) {
-    //    PADDLE_ENFORCE_GT(range, 0, "Range should be greater than 0.");
+    PADDLE_ENFORCE_GT(range, 0, platform::errors::InvalidArgument(
+                                    "Range should be"
+                                    " greater than 0, but recevied %d.",
+                                    range));
     if (seed == 0) {
       std::random_device r;
       seed_ = r();
diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc
index 01f50727b44..c05da0062f2 100644
--- a/paddle/fluid/operators/math/vol2col.cc
+++ b/paddle/fluid/operators/math/vol2col.cc
@@ -34,10 +34,16 @@ class Vol2ColFunctor<platform::CPUDeviceContext, T> {
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, framework::Tensor* col,
                   const DataLayout data_layout) const {
-    PADDLE_ENFORCE_EQ(vol.dims().size(), 4,
-                      "The dimension of vol should be 4.");
-    PADDLE_ENFORCE_EQ(col->dims().size(), 7,
-                      "The dimension of col should be 7.");
+    PADDLE_ENFORCE_EQ(
+        vol.dims().size(), 4,
+        platform::errors::InvalidArgument("The dimension of"
+                                          " vol should be 4, but received %d.",
+                                          vol.dims().size()));
+    PADDLE_ENFORCE_EQ(
+        col->dims().size(), 7,
+        platform::errors::InvalidArgument("The dimension of"
+                                          "col should be 7, but received %d.",
+                                          col->dims().size()));
 
     int input_channels =
         (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]);
@@ -65,27 +71,33 @@ class Vol2ColFunctor<platform::CPUDeviceContext, T> {
     int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
     int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
 
-    PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back -
-                       ((dilations[0] * (filter_depth - 1) + 1))) /
-                              strides[0] +
-                          1,
-                      output_depth,
-                      "input_depth and output_depth are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down -
-                       ((dilations[1] * (filter_height - 1) + 1))) /
-                              strides[1] +
-                          1,
-                      output_height,
-                      "input_height and output_height are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right -
-                       ((dilations[2] * (filter_width - 1) + 1))) /
-                              strides[2] +
-                          1,
-                      output_width,
-                      "input_width and output_width are "
-                      "mismatching.");
+    auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
+                            ((dilations[0] * (filter_depth - 1) + 1))) /
+                               strides[0] +
+                           1;
+    PADDLE_ENFORCE_EQ(
+        input_depth_tmp, output_depth,
+        platform::errors::InvalidArgument(
+            "input_depth(%d) and output_depth(%d) are mismatching.",
+            input_depth_tmp, output_depth));
+    auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
+                             ((dilations[1] * (filter_height - 1) + 1))) /
+                                strides[1] +
+                            1;
+    PADDLE_ENFORCE_EQ(
+        input_height_tmp, output_height,
+        platform::errors::InvalidArgument(
+            "input_height(%d) and output_height(%d) are mismatching.",
+            input_height_tmp, output_height));
+    auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
+                            ((dilations[2] * (filter_width - 1) + 1))) /
+                               strides[2] +
+                           1;
+    PADDLE_ENFORCE_EQ(
+        input_width_tmp, output_width,
+        platform::errors::InvalidArgument(
+            "input_width(%d) and output_width(%d) are mismatching.",
+            input_width_tmp, output_width));
     const T* vol_data = vol.data<T>();
     T* col_data = col->data<T>();
 
@@ -140,10 +152,16 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> {
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, framework::Tensor* vol,
                   const DataLayout data_layout) const {
-    PADDLE_ENFORCE_EQ(vol->dims().size(), 4,
-                      "The dimension of vol should be 4.");
-    PADDLE_ENFORCE_EQ(col.dims().size(), 7,
-                      "The dimension of col should be 7.");
+    PADDLE_ENFORCE_EQ(
+        vol->dims().size(), 4,
+        platform::errors::InvalidArgument("The dimension of vol"
+                                          " should be 4, but received %d.",
+                                          vol->dims().size()));
+    PADDLE_ENFORCE_EQ(
+        col.dims().size(), 7,
+        platform::errors::InvalidArgument("The dimension of col"
+                                          " should be 7, but received %d.",
+                                          col.dims().size()));
 
     int input_channels =
         (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]);
@@ -170,27 +188,33 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> {
     int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
     int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
 
-    PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back -
-                       ((dilations[0] * (filter_depth - 1) + 1))) /
-                              strides[0] +
-                          1,
-                      output_depth,
-                      "input_depth and output_depth are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down -
-                       ((dilations[1] * (filter_height - 1) + 1))) /
-                              strides[1] +
-                          1,
-                      output_height,
-                      "input_height and output_height are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right -
-                       ((dilations[2] * (filter_width - 1) + 1))) /
-                              strides[2] +
-                          1,
-                      output_width,
-                      "input_width and output_width are "
-                      "mismatching.");
+    auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
+                            ((dilations[0] * (filter_depth - 1) + 1))) /
+                               strides[0] +
+                           1;
+    PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth,
+                      platform::errors::InvalidArgument(
+                          "input_depth(%d)"
+                          " and output_depth(%d) are mismatching.",
+                          input_depth_tmp, output_depth));
+    auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
+                             ((dilations[1] * (filter_height - 1) + 1))) /
+                                strides[1] +
+                            1;
+    PADDLE_ENFORCE_EQ(input_height_tmp, output_height,
+                      platform::errors::InvalidArgument(
+                          "input_height(%d)"
+                          " and output_height(%d) are mismatching.",
+                          input_height_tmp, output_height));
+    auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
+                            ((dilations[2] * (filter_width - 1) + 1))) /
+                               strides[2] +
+                           1;
+    PADDLE_ENFORCE_EQ(input_width_tmp, output_width,
+                      platform::errors::InvalidArgument(
+                          "input_width(%d)"
+                          " and output_width(%d) are mismatching.",
+                          input_width_tmp, output_width));
     T* vol_data = vol->data<T>();
     const T* col_data = col.data<T>();
 
diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu
index 9de9051f512..fe5a6009098 100644
--- a/paddle/fluid/operators/math/vol2col.cu
+++ b/paddle/fluid/operators/math/vol2col.cu
@@ -90,10 +90,16 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> {
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, framework::Tensor* col,
                   const DataLayout data_layout) const {
-    PADDLE_ENFORCE_EQ(vol.dims().size(), 4,
-                      "The dimension of vol should be 4.");
-    PADDLE_ENFORCE_EQ(col->dims().size(), 7,
-                      "The dimension of col should be 7.");
+    PADDLE_ENFORCE_EQ(
+        vol.dims().size(), 4,
+        platform::errors::InvalidArgument("The dimension of"
+                                          " vol should be 4, but received %d.",
+                                          vol.dims().size()));
+    PADDLE_ENFORCE_EQ(
+        col->dims().size(), 7,
+        platform::errors::InvalidArgument("The dimension of"
+                                          "col should be 7, but received %d.",
+                                          col->dims().size()));
 
     int input_channels =
         (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]);
@@ -117,27 +123,33 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> {
     int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
     int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
     int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
-    PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back -
-                       ((dilations[0] * (filter_depth - 1) + 1))) /
-                              strides[0] +
-                          1,
-                      output_depth,
-                      "input_depth and output_depth are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down -
-                       ((dilations[1] * (filter_height - 1) + 1))) /
-                              strides[1] +
-                          1,
-                      output_height,
-                      "input_height and output_height are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right -
-                       ((dilations[2] * (filter_width - 1) + 1))) /
-                              strides[2] +
-                          1,
-                      output_width,
-                      "input_width and output_width are "
-                      "mismatching.");
+    auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
+                            ((dilations[0] * (filter_depth - 1) + 1))) /
+                               strides[0] +
+                           1;
+    PADDLE_ENFORCE_EQ(
+        input_depth_tmp, output_depth,
+        platform::errors::InvalidArgument(
+            "input_depth(%d) and output_depth(%d) are mismatching.",
+            input_depth_tmp, output_depth));
+    auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
+                             ((dilations[1] * (filter_height - 1) + 1))) /
+                                strides[1] +
+                            1;
+    PADDLE_ENFORCE_EQ(
+        input_height_tmp, output_height,
+        platform::errors::InvalidArgument(
+            "input_height(%d) and output_height(%d) are mismatching.",
+            input_height_tmp, output_height));
+    auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
+                            ((dilations[2] * (filter_width - 1) + 1))) /
+                               strides[2] +
+                           1;
+    PADDLE_ENFORCE_EQ(
+        input_width_tmp, output_width,
+        platform::errors::InvalidArgument(
+            "input_width(%d) and output_width(%d) are mismatching.",
+            input_width_tmp, output_width));
 
     int num_outputs =
         input_channels * output_depth * output_height * output_width;
@@ -241,10 +253,16 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, framework::Tensor* vol,
                   const DataLayout data_layout) const {
-    PADDLE_ENFORCE_EQ(vol->dims().size(), 4,
-                      "The dimension of vol should be 4.");
-    PADDLE_ENFORCE_EQ(col.dims().size(), 7,
-                      "The dimension of col should be 7.");
+    PADDLE_ENFORCE_EQ(
+        vol->dims().size(), 4,
+        platform::errors::InvalidArgument("The dimension of vol"
+                                          " should be 4, but received %d.",
+                                          vol->dims().size()));
+    PADDLE_ENFORCE_EQ(
+        col.dims().size(), 7,
+        platform::errors::InvalidArgument("The dimension of col"
+                                          " should be 7, but received %d.",
+                                          col.dims().size()));
 
     int input_channels =
         (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]);
@@ -269,27 +287,33 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
     int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
     int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
 
-    PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back -
-                       ((dilations[0] * (filter_depth - 1) + 1))) /
-                              strides[0] +
-                          1,
-                      output_depth,
-                      "input_depth and output_depth are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down -
-                       ((dilations[1] * (filter_height - 1) + 1))) /
-                              strides[1] +
-                          1,
-                      output_height,
-                      "input_height and output_height are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right -
-                       ((dilations[2] * (filter_width - 1) + 1))) /
-                              strides[2] +
-                          1,
-                      output_width,
-                      "input_width and output_width are "
-                      "mismatching.");
+    auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
+                            ((dilations[0] * (filter_depth - 1) + 1))) /
+                               strides[0] +
+                           1;
+    PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth,
+                      platform::errors::InvalidArgument(
+                          "input_depth(%d)"
+                          " and output_depth(%d) are mismatching.",
+                          input_depth_tmp, output_depth));
+    auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
+                             ((dilations[1] * (filter_height - 1) + 1))) /
+                                strides[1] +
+                            1;
+    PADDLE_ENFORCE_EQ(input_height_tmp, output_height,
+                      platform::errors::InvalidArgument(
+                          "input_height(%d)"
+                          " and output_height(%d) are mismatching.",
+                          input_height_tmp, output_height));
+    auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
+                            ((dilations[2] * (filter_width - 1) + 1))) /
+                               strides[2] +
+                           1;
+    PADDLE_ENFORCE_EQ(input_width_tmp, output_width,
+                      platform::errors::InvalidArgument(
+                          "input_width(%d)"
+                          " and output_width(%d) are mismatching.",
+                          input_width_tmp, output_width));
 
     int num_kernels = input_channels * input_depth * input_height * input_width;
 
-- 
GitLab


From 255e0cf9780152b6a44f4b5413aad66dc46f78a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?=
 <39303645+Shixiaowei02@users.noreply.github.com>
Date: Mon, 14 Sep 2020 10:49:19 +0800
Subject: [PATCH 056/261] error messages of inference/capi, test=develop
 (#27258)

---
 paddle/fluid/inference/capi/c_api.cc         |  21 +-
 paddle/fluid/inference/capi/c_api_internal.h |   1 -
 paddle/fluid/inference/capi/pd_config.cc     | 208 +++++++++++++++----
 paddle/fluid/inference/capi/pd_predictor.cc  |  20 +-
 paddle/fluid/inference/capi/pd_tensor.cc     |  33 ++-
 5 files changed, 221 insertions(+), 62 deletions(-)

diff --git a/paddle/fluid/inference/capi/c_api.cc b/paddle/fluid/inference/capi/c_api.cc
index 821dff2f036..07493c742c4 100644
--- a/paddle/fluid/inference/capi/c_api.cc
+++ b/paddle/fluid/inference/capi/c_api.cc
@@ -16,6 +16,7 @@
 #include <vector>
 #include "paddle/fluid/inference/capi/c_api_internal.h"
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
+#include "paddle/fluid/platform/enforce.h"
 
 using paddle::ConvertToACPrecision;
 using paddle::ConvertToPaddleDType;
@@ -34,27 +35,37 @@ void PD_DeletePaddleBuf(PD_PaddleBuf* buf) {
 }
 
 void PD_PaddleBufResize(PD_PaddleBuf* buf, size_t length) {
-  PADDLE_ENFORCE_NOT_NULL(buf);
+  PADDLE_ENFORCE_NOT_NULL(buf,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of Buffer shouldn't be nullptr"));
   buf->buf.Resize(length);
 }
 
 void PD_PaddleBufReset(PD_PaddleBuf* buf, void* data, size_t length) {
-  PADDLE_ENFORCE_NOT_NULL(buf);
+  PADDLE_ENFORCE_NOT_NULL(buf,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of Buffer shouldn't be nullptr"));
   buf->buf.Reset(data, length);
 }
 
 bool PD_PaddleBufEmpty(PD_PaddleBuf* buf) {
-  PADDLE_ENFORCE_NOT_NULL(buf);
+  PADDLE_ENFORCE_NOT_NULL(buf,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of Buffer shouldn't be nullptr"));
   return buf->buf.empty();
 }
 
 void* PD_PaddleBufData(PD_PaddleBuf* buf) {
-  PADDLE_ENFORCE_NOT_NULL(buf);
+  PADDLE_ENFORCE_NOT_NULL(buf,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of Buffer shouldn't be nullptr"));
   return buf->buf.data();
 }
 
 size_t PD_PaddleBufLength(PD_PaddleBuf* buf) {
-  PADDLE_ENFORCE_NOT_NULL(buf);
+  PADDLE_ENFORCE_NOT_NULL(buf,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of Buffer shouldn't be nullptr"));
   return buf->buf.length();
 }
 
diff --git a/paddle/fluid/inference/capi/c_api_internal.h b/paddle/fluid/inference/capi/c_api_internal.h
index 2dd82722977..7e69b721076 100644
--- a/paddle/fluid/inference/capi/c_api_internal.h
+++ b/paddle/fluid/inference/capi/c_api_internal.h
@@ -18,7 +18,6 @@
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/api/paddle_api.h"
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
-#include "paddle/fluid/platform/enforce.h"
 
 using PD_PaddleDType = paddle::PaddleDType;
 using PD_ACPrecision = paddle::AnalysisConfig::Precision;
diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc
index b99abc06b27..af8d4a69ecf 100644
--- a/paddle/fluid/inference/capi/pd_config.cc
+++ b/paddle/fluid/inference/capi/pd_config.cc
@@ -20,6 +20,7 @@
 #include <vector>
 #include "paddle/fluid/inference/capi/c_api_internal.h"
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
+#include "paddle/fluid/platform/enforce.h"
 
 using paddle::ConvertToACPrecision;
 using paddle::ConvertToPaddleDType;
@@ -40,7 +41,10 @@ void PD_DeleteAnalysisConfig(PD_AnalysisConfig* config) {
 void PD_SetModel(PD_AnalysisConfig* config, const char* model_dir,
                  const char* params_path) {
   LOG(INFO) << model_dir;
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   LOG(INFO) << std::string(model_dir);
   if (!params_path) {
     config->config.SetModel(std::string(model_dir));
@@ -50,104 +54,164 @@ void PD_SetModel(PD_AnalysisConfig* config, const char* model_dir,
 }
 
 void PD_SetProgFile(PD_AnalysisConfig* config, const char* x) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SetProgFile(std::string(x));
 }
 
 void PD_SetParamsFile(PD_AnalysisConfig* config, const char* x) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SetParamsFile(std::string(x));
 }
 
 void PD_SetOptimCacheDir(PD_AnalysisConfig* config, const char* opt_cache_dir) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SetOptimCacheDir(std::string(opt_cache_dir));
 }
 
 const char* PD_ModelDir(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.model_dir().c_str();
 }
 
 const char* PD_ProgFile(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.prog_file().c_str();
 }
 
 const char* PD_ParamsFile(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.params_file().c_str();
 }
 
 void PD_EnableUseGpu(PD_AnalysisConfig* config, int memory_pool_init_size_mb,
                      int device_id) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableUseGpu(static_cast<uint64_t>(memory_pool_init_size_mb),
                               device_id);
 }
 
 void PD_DisableGpu(PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.DisableGpu();
 }
 
 bool PD_UseGpu(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.use_gpu();
 }
 
 int PD_GpuDeviceId(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.gpu_device_id();
 }
 
 int PD_MemoryPoolInitSizeMb(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.memory_pool_init_size_mb();
 }
 
 float PD_FractionOfGpuMemoryForPool(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.fraction_of_gpu_memory_for_pool();
 }
 
 void PD_EnableCUDNN(PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableCUDNN();
 }
 
 bool PD_CudnnEnabled(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.cudnn_enabled();
 }
 
 void PD_SwitchIrOptim(PD_AnalysisConfig* config, bool x) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SwitchIrOptim(x);
 }
 
 bool PD_IrOptim(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.ir_optim();
 }
 
 void PD_SwitchUseFeedFetchOps(PD_AnalysisConfig* config, bool x) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SwitchUseFeedFetchOps(x);
 }
 
 bool PD_UseFeedFetchOpsEnabled(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.use_feed_fetch_ops_enabled();
 }
 
 void PD_SwitchSpecifyInputNames(PD_AnalysisConfig* config, bool x) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SwitchSpecifyInputNames(x);
 }
 
 bool PD_SpecifyInputName(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.specify_input_name();
 }
 
@@ -155,110 +219,168 @@ void PD_EnableTensorRtEngine(PD_AnalysisConfig* config, int workspace_size,
                              int max_batch_size, int min_subgraph_size,
                              Precision precision, bool use_static,
                              bool use_calib_mode) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableTensorRtEngine(
       workspace_size, max_batch_size, min_subgraph_size,
       paddle::ConvertToACPrecision(precision), use_static, use_calib_mode);
 }
 
 bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.tensorrt_engine_enabled();
 }
 
 void PD_SwitchIrDebug(PD_AnalysisConfig* config, bool x) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SwitchIrDebug(x);
 }
 
 void PD_EnableMKLDNN(PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableMKLDNN();
 }
 
 void PD_SetMkldnnCacheCapacity(PD_AnalysisConfig* config, int capacity) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SetMkldnnCacheCapacity(capacity);
 }
 
 bool PD_MkldnnEnabled(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.mkldnn_enabled();
 }
 
 void PD_SetCpuMathLibraryNumThreads(PD_AnalysisConfig* config,
                                     int cpu_math_library_num_threads) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SetCpuMathLibraryNumThreads(cpu_math_library_num_threads);
 }
 
 int PD_CpuMathLibraryNumThreads(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.cpu_math_library_num_threads();
 }
 
 void PD_EnableMkldnnQuantizer(PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableMkldnnQuantizer();
 }
 
 bool PD_MkldnnQuantizerEnabled(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.mkldnn_quantizer_enabled();
 }
 
 void PD_EnableMkldnnBfloat16(PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
-                                      "PD_AnalysisConfig should not be null"));
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableMkldnnBfloat16();
 }
 
 bool PD_MkldnnBfloat16Enabled(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
-                                      "PD_AnalysisConfig should not be null"));
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.mkldnn_bfloat16_enabled();
 }
 
 void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer,
                        size_t prog_buffer_size, const char* params_buffer,
                        size_t params_buffer_size) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SetModelBuffer(prog_buffer, prog_buffer_size, params_buffer,
                                 params_buffer_size);
 }
 
 bool PD_ModelFromMemory(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.model_from_memory();
 }
 
 void PD_EnableMemoryOptim(PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableMemoryOptim();
 }
 
 bool PD_MemoryOptimEnabled(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.enable_memory_optim();
 }
 
 void PD_EnableProfile(PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableProfile();
 }
 
 bool PD_ProfileEnabled(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.profile_enabled();
 }
 
 void PD_SetInValid(PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SetInValid();
 }
 
 bool PD_IsValid(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.is_valid();
 }
 
diff --git a/paddle/fluid/inference/capi/pd_predictor.cc b/paddle/fluid/inference/capi/pd_predictor.cc
index 8aa1e2a7b7f..0509a619021 100644
--- a/paddle/fluid/inference/capi/pd_predictor.cc
+++ b/paddle/fluid/inference/capi/pd_predictor.cc
@@ -22,6 +22,7 @@
 #include "paddle/fluid/inference/api/paddle_api.h"
 #include "paddle/fluid/inference/capi/c_api_internal.h"
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
+#include "paddle/fluid/platform/enforce.h"
 
 using paddle::ConvertToACPrecision;
 using paddle::ConvertToPaddleDType;
@@ -81,7 +82,10 @@ extern "C" {
 bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs,
                      int in_size, PD_Tensor** output_data, int* out_size,
                      int batch_size) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   VLOG(3) << "Predoctor: PD_PredictorRun. ";
   static std::map<std::string, std::unique_ptr<paddle::PaddlePredictor>>
       predictors;
@@ -111,7 +115,10 @@ bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs,
 bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config,
                              PD_ZeroCopyData* inputs, int in_size,
                              PD_ZeroCopyData** output, int* out_size) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   static std::map<std::string, std::unique_ptr<paddle::PaddlePredictor>>
       predictors;
   if (!predictors.count(config->config.model_dir())) {
@@ -144,7 +151,8 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config,
         input_t->copy_from_cpu(static_cast<uint8_t*>(inputs[i].data));
         break;
       default:
-        CHECK(false) << "Unsupport data type.";
+        PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+            "Unsupported data type."));
         break;
     }
   }
@@ -227,7 +235,8 @@ void PD_SetZeroCopyInput(PD_Predictor* predictor,
       input->copy_from_cpu(static_cast<uint8_t*>(tensor->data.data));
       break;
     default:
-      CHECK(false) << "Unsupport data type.";
+      PADDLE_THROW(
+          paddle::platform::errors::InvalidArgument("Unsupported data type."));
       break;
   }
 
@@ -294,7 +303,8 @@ void PD_GetZeroCopyOutput(PD_Predictor* predictor, PD_ZeroCopyTensor* tensor) {
       output->copy_to_cpu(reinterpret_cast<uint8_t*>(tensor->data.data));
       break;
     default:
-      CHECK(false) << "Unsupport data type.";
+      PADDLE_THROW(
+          paddle::platform::errors::InvalidArgument("Unsupported data type."));
       break;
   }
 }
diff --git a/paddle/fluid/inference/capi/pd_tensor.cc b/paddle/fluid/inference/capi/pd_tensor.cc
index b4811f1d6ff..9b1eedd7c5a 100644
--- a/paddle/fluid/inference/capi/pd_tensor.cc
+++ b/paddle/fluid/inference/capi/pd_tensor.cc
@@ -19,6 +19,7 @@
 #include <vector>
 #include "paddle/fluid/inference/capi/c_api_internal.h"
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
+#include "paddle/fluid/platform/enforce.h"
 
 using paddle::ConvertToACPrecision;
 using paddle::ConvertToPaddleDType;
@@ -37,44 +38,60 @@ void PD_DeletePaddleTensor(PD_Tensor* tensor) {
 }
 
 void PD_SetPaddleTensorName(PD_Tensor* tensor, char* name) {
-  PADDLE_ENFORCE_NOT_NULL(tensor);
+  PADDLE_ENFORCE_NOT_NULL(tensor,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of tensor shouldn't be nullptr"));
   tensor->tensor.name = std::string(name);
 }
 
 void PD_SetPaddleTensorDType(PD_Tensor* tensor, PD_DataType dtype) {
-  PADDLE_ENFORCE_NOT_NULL(tensor);
+  PADDLE_ENFORCE_NOT_NULL(tensor,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of tensor shouldn't be nullptr"));
   tensor->tensor.dtype = paddle::ConvertToPaddleDType(dtype);
 }
 
 void PD_SetPaddleTensorData(PD_Tensor* tensor, PD_PaddleBuf* buf) {
-  PADDLE_ENFORCE_NOT_NULL(tensor);
+  PADDLE_ENFORCE_NOT_NULL(tensor,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of tensor shouldn't be nullptr"));
   tensor->tensor.data = buf->buf;
 }
 
 void PD_SetPaddleTensorShape(PD_Tensor* tensor, int* shape, int size) {
-  PADDLE_ENFORCE_NOT_NULL(tensor);
+  PADDLE_ENFORCE_NOT_NULL(tensor,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of tensor shouldn't be nullptr"));
   tensor->tensor.shape.assign(shape, shape + size);
 }
 
 const char* PD_GetPaddleTensorName(const PD_Tensor* tensor) {
-  PADDLE_ENFORCE_NOT_NULL(tensor);
+  PADDLE_ENFORCE_NOT_NULL(tensor,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of tensor shouldn't be nullptr"));
   return tensor->tensor.name.c_str();
 }
 
 PD_DataType PD_GetPaddleTensorDType(const PD_Tensor* tensor) {
-  PADDLE_ENFORCE_NOT_NULL(tensor);
+  PADDLE_ENFORCE_NOT_NULL(tensor,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of tensor shouldn't be nullptr"));
   return ConvertToPDDataType(tensor->tensor.dtype);
 }
 
 PD_PaddleBuf* PD_GetPaddleTensorData(const PD_Tensor* tensor) {
-  PADDLE_ENFORCE_NOT_NULL(tensor);
+  PADDLE_ENFORCE_NOT_NULL(tensor,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of tensor shouldn't be nullptr"));
   PD_PaddleBuf* ret = PD_NewPaddleBuf();
   ret->buf = tensor->tensor.data;
   return ret;
 }
 
 const int* PD_GetPaddleTensorShape(const PD_Tensor* tensor, int* size) {
-  PADDLE_ENFORCE_NOT_NULL(tensor);
+  PADDLE_ENFORCE_NOT_NULL(tensor,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of tensor shouldn't be nullptr"));
   const std::vector<int>& shape = tensor->tensor.shape;
   *size = shape.size();
   return shape.data();
-- 
GitLab


From cc3f4b813a0b065661e04071f1ada1a211d37080 Mon Sep 17 00:00:00 2001
From: Adam <38704900+grygielski@users.noreply.github.com>
Date: Mon, 14 Sep 2020 05:28:11 +0200
Subject: [PATCH 057/261] Add int8 GRU kernel (#27220)

* Add int8 GRU kernel with UTs

* Lint fixes

* More lint fixes
---
 cmake/external/mkldnn.cmake                   |   4 +-
 paddle/fluid/operators/fused/fusion_gru_op.cc |  21 +++
 .../fused/mkldnn/fusion_gru_mkldnn_op.cc      | 150 ++++++++++++------
 .../mkldnn/test_fusion_gru_int8_mkldnn_op.py  | 145 +++++++++++++++++
 4 files changed, 271 insertions(+), 49 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py

diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index ad7b7c2c2ab..c0adda0da31 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -19,8 +19,8 @@ SET(MKLDNN_PREFIX_DIR     ${THIRD_PARTY_PATH}/mkldnn)
 SET(MKLDNN_SOURCE_DIR     ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn)
 SET(MKLDNN_INSTALL_DIR    ${THIRD_PARTY_PATH}/install/mkldnn)
 SET(MKLDNN_INC_DIR        "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
-SET(MKLDNN_REPOSITORY     https://github.com/intel/mkl-dnn.git)
-SET(MKLDNN_TAG            4c05c181b40cf7132f8943411fb3fab1786df0f7)
+SET(MKLDNN_REPOSITORY     https://github.com/oneapi-src/oneDNN.git)
+SET(MKLDNN_TAG            64a48f9565aa72f6359917b3406328075a409939)
 
 # Introduce variables:
 # * CMAKE_INSTALL_LIBDIR
diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc
index f731a78f77b..40139066096 100644
--- a/paddle/fluid/operators/fused/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fused/fusion_gru_op.cc
@@ -206,6 +206,27 @@ void FusionGRUOpMaker::Make() {
   AddAttr<bool>("use_mkldnn",
                 "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
+  AddAttr<std::string>(
+      "mkldnn_data_type",
+      "(string, default \"float32\"). Data type of mkldnn kernel")
+      .SetDefault("float32")
+      .InEnum({"float32", "int8", "bfloat16"});
+  AddAttr<float>("Scale_data",
+                 "Scale to be used for int8 input/output data."
+                 "Only used with MKL-DNN INT8.")
+      .SetDefault(1.0f);
+  AddAttr<float>("Shift_data",
+                 "Shift to be used for int8 input/output data."
+                 "Only used with MKL-DNN INT8.")
+      .SetDefault(0.0f);
+  AddAttr<std::vector<float>>("Scale_weights",
+                              "Scale_weights to be used for int8 weights data."
+                              "Only used with MKL-DNN INT8.")
+      .SetDefault({1.0f});
+  AddAttr<bool>("force_fp32_output",
+                "(bool, default false) Force INT8 kernel output FP32, only "
+                "used in MKL-DNN INT8")
+      .SetDefault(false);
   AddComment(R"DOC(
 The Fusion complete GRU Operator.
 This operator fuse the fully-connected operator into GRU, 
diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
index a31fe168439..5fad1b116de 100644
--- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
@@ -21,11 +21,12 @@ namespace operators {
 using paddle::framework::LoDTensor;
 using paddle::framework::Tensor;
 using paddle::platform::CPUDeviceContext;
+using paddle::platform::CreateKey;
 using paddle::platform::MKLDNNGetDataType;
 using paddle::platform::MKLDNNMemDesc;
 using platform::to_void_cast;
 
-template <typename T>
+template <typename T, typename T_out = T>
 class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
  public:
   GRUMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
@@ -38,7 +39,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
                    const std::string& unique_name)
       : platform::MKLDNNHandlerT<T, dnnl::gru_forward>(
             dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            platform::CreateKey(unique_name, Ti)),
+            CreateKey(unique_name, MKLDNNGetDataType<T>(), Ti)),
         N(N),
         Ti(Ti),
         IC(IC),
@@ -47,9 +48,29 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
     // do not depend on Ti size but primitive and input/output memory do
     if (platform::MKLDNNDeviceContext::tls().get_cur_mkldnn_session_id() !=
         platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default) {
-      memory_key_ = unique_name;
+      memory_key_ = CreateKey(unique_name, MKLDNNGetDataType<T>());
     } else {
-      memory_key_ = unique_name + "-t:" + platform::ThreadIDasStr();
+      memory_key_ = CreateKey(unique_name, MKLDNNGetDataType<T>(), "-t:",
+                              platform::ThreadIDasStr());
+    }
+
+    // Is it int8 kernel
+    const bool is_INT8 = std::is_same<T, uint8_t>::value;
+
+    if (is_INT8) {
+      // Int8 attributes
+      const float scale_data = ctx.Attr<float>("Scale_data");
+      const float shift_data = ctx.Attr<float>("Shift_data");
+      const auto scale_weights = ctx.Attr<std::vector<float>>("Scale_weights");
+
+      const int weights_scale_mask =
+          0 +
+          (1 << 3)  // bit, indicating the unique scales for `g` dim in `ldigo`
+          +
+          (1 << 4);  // bit, indicating the unique scales for `o` dim in `ldigo`
+
+      attr_.set_rnn_data_qparams(scale_data, shift_data);
+      attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights);
     }
 
     if (!this->isCached()) {
@@ -63,6 +84,10 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
           platform::errors::Unimplemented(
               "oneDNN fusion_gru supports only tanh as an activation."));
 
+      // Weights for int8 kernel are of a type s8
+      const auto weights_dt =
+          is_INT8 ? dnnl::memory::data_type::s8 : dnnl::memory::data_type::f32;
+
       // oneDNN RNN dimensions
       const int64_t D = 1;  // Directions
       const int64_t L = 1;  // Layers (PP supports only 1 stacked layer)
@@ -71,19 +96,16 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
       // Create memory descriptors
       auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType<T>(),
                                     MKLDNNMemoryFormat::any);
-      auto weight_x_md = MKLDNNMemDesc(
-          {L, D, IC, G, OC}, MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any);
-      auto weight_h_md = MKLDNNMemDesc(
-          {L, D, OC, G, OC}, MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any);
+      auto weight_x_md =
+          MKLDNNMemDesc({L, D, IC, G, OC}, weights_dt, MKLDNNMemoryFormat::any);
+      auto weight_h_md =
+          MKLDNNMemDesc({L, D, OC, G, OC}, weights_dt, MKLDNNMemoryFormat::any);
       auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType<float>(),
                                    MKLDNNMemoryFormat::ldgo);
-      auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T>(),
+      auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T_out>(),
                                      MKLDNNMemoryFormat::any);
-      auto h0_md = dnnl::memory::desc();
-      if (h0) {
-        h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
-                              MKLDNNMemoryFormat::ldnc);
-      }
+      auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
+                                 MKLDNNMemoryFormat::ldnc);
 
       // Create GRU oneDNN primitive
       const auto direction =
@@ -91,7 +113,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
                      : dnnl::rnn_direction::unidirectional_left2right;
 
       this->AcquireForwardPrimitiveDescriptor(
-          dnnl::prop_kind::forward_inference, direction, input_md, h0_md,
+          attr_, dnnl::prop_kind::forward_inference, direction, input_md, h0_md,
           weight_x_md, weight_h_md, bias_md, hidden_md, dnnl::memory::desc());
     }
   }
@@ -101,29 +123,31 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
             dnnl::memory::format_tag::ntc);
   }
 
-  void reorderRNNdata(const T* input_data, T* output_data,
+  void reorderRNNdata(void* input_data, void* output_data,
                       std::vector<size_t> lod, const bool is_reverse,
                       platform::RNNReorderType reorder_type) {
     switch (reorder_type) {
       // Reorder input memory [WORDS, C] + LoD -> [N, T, C]
       case platform::RNNReorderType::PP_NTC: {
-        auto* input_data_iter = input_data;
+        auto* input_data_iter = reinterpret_cast<T*>(input_data);
+        auto* output_data_iter = reinterpret_cast<T*>(output_data);
         for (int n = 0; n < N; ++n) {
           const auto num_elements = (lod[n + 1] - lod[n]) * IC;
           const auto offset = is_reverse ? (Ti * IC - num_elements) : 0;
-          memcpy(output_data + n * Ti * IC + offset, input_data_iter,
+          memcpy(output_data_iter + n * Ti * IC + offset, input_data_iter,
                  sizeof(T) * num_elements);
           input_data_iter += num_elements;
         }
       } break;
       // Reorder input memory [WORDS, C] + LoD -> [T, N, C]
       case platform::RNNReorderType::PP_TNC: {
-        auto* input_data_iter = input_data;
+        auto* input_data_iter = reinterpret_cast<T*>(input_data);
+        auto* output_data_iter = reinterpret_cast<T*>(output_data);
         for (int n = 0; n < N; ++n) {
           const auto num_elements = (lod[n + 1] - lod[n]);
           const auto offset = is_reverse ? (Ti - num_elements) : 0;
           for (size_t t = 0; t < num_elements; ++t) {
-            memcpy(output_data + (t + offset) * N * IC + n * IC,
+            memcpy(output_data_iter + (t + offset) * N * IC + n * IC,
                    input_data_iter, sizeof(T) * IC);
             input_data_iter += IC;
           }
@@ -131,24 +155,27 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
       } break;
       // Reorder output values to PP format [N, T, C] -> [WORDS, C]
       case platform::RNNReorderType::NTC_PP: {
-        auto* output_data_iter = output_data;
+        auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
+        auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
         for (int n = 0; n < N; ++n) {
           const auto num_elements = (lod[n + 1] - lod[n]) * OC;
           const auto offset = is_reverse ? (Ti * OC - num_elements) : 0;
-          memcpy(output_data_iter, input_data + n * Ti * OC + offset,
-                 sizeof(T) * num_elements);
+          memcpy(output_data_iter, input_data_iter + n * Ti * OC + offset,
+                 sizeof(T_out) * num_elements);
           output_data_iter += num_elements;
         }
       } break;
       // Reorder output values to PP format [T, N, C] -> [WORDS, C]
       case platform::RNNReorderType::TNC_PP: {
-        auto* output_data_iter = output_data;
+        auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
+        auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
         for (int n = 0; n < N; ++n) {
           const auto num_elements = lod[n + 1] - lod[n];
           const auto offset = is_reverse ? (Ti - num_elements) : 0;
           for (size_t t = 0; t < num_elements; ++t) {
             memcpy(output_data_iter,
-                   input_data + (t + offset) * N * OC + n * OC, sizeof(T) * OC);
+                   input_data_iter + (t + offset) * N * OC + n * OC,
+                   sizeof(T_out) * OC);
             output_data_iter += OC;
           }
         }
@@ -169,9 +196,9 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
     }
 
     const auto& input_lod = input->lod()[0];
-    auto* x_data = input->data<T>();
+    auto* x_data = to_void_cast(input->data<T>());
 
-    auto* x_onednn_data = reinterpret_cast<T*>(memory_p->get_data_handle());
+    auto* x_onednn_data = memory_p->get_data_handle();
     memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC);
 
     if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) ==
@@ -198,19 +225,35 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
     return memory_p;
   }
 
+  // TODO(grygielski) H0 is for now persistable
   std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) {
     const std::string h0_key = memory_key_ + "@h0";
     auto memory_p =
         std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(h0_key));
 
-    auto* h0_data = to_void_cast(h0->data<T>());
-
     if (!memory_p) {
-      memory_p = std::make_shared<dnnl::memory>(
-          this->fwd_pd_->weights_layer_desc(), this->engine_, h0_data);
+      auto user_h0_memory = dnnl::memory();
+      if (h0) {
+        user_h0_memory =
+            dnnl::memory({{1, 1, N, OC},
+                          MKLDNNGetDataType<float>(),
+                          MKLDNNMemoryFormat::ldnc},
+                         this->engine_, to_void_cast(h0->data<float>()));
+      } else {
+        user_h0_memory = dnnl::memory({{1, 1, N, OC},
+                                       MKLDNNGetDataType<float>(),
+                                       MKLDNNMemoryFormat::ldnc},
+                                      this->engine_);
+        memset(user_h0_memory.get_data_handle(), 0, sizeof(float) * N * OC);
+      }
+      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
+                                                this->engine_);
+
+      dnnl::stream astream(this->engine_);
+      dnnl::reorder(user_h0_memory, *memory_p, attr_)
+          .execute(astream, user_h0_memory, *memory_p);
+
       this->dev_ctx_.SetBlob(h0_key, memory_p);
-    } else {
-      memory_p->set_data_handle(h0_data);
     }
     return memory_p;
   }
@@ -245,7 +288,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
           this->fwd_pd_->weights_layer_desc(), this->engine_);
 
       dnnl::stream astream(this->engine_);
-      dnnl::reorder(user_memory, *memory_p)
+      dnnl::reorder(user_memory, *memory_p, attr_)
           .execute(astream, user_memory, *memory_p);
 
       this->dev_ctx_.SetBlob(wx_key, memory_p);
@@ -298,7 +341,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
           this->fwd_pd_->weights_iter_desc(), this->engine_);
 
       dnnl::stream astream(this->engine_);
-      dnnl::reorder(user_memory, *memory_p)
+      dnnl::reorder(user_memory, *memory_p, attr_)
           .execute(astream, user_memory, *memory_p);
 
       this->dev_ctx_.SetBlob(wh_key, memory_p);
@@ -347,12 +390,26 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
   // Memory size of weights, bias and h0 does not depend
   // on Ti size, thus we need another key to cache them
   std::string memory_key_;
+  dnnl::primitive_attr attr_;
 };
 
 template <typename T>
 class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    const bool is_INT8 = std::is_same<T, uint8_t>::value;
+    const bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
+
+    // TODO(grygielski) Add option for bfloat
+    if (!is_INT8 || force_fp32_output) {
+      RunKernel<float>(ctx);
+    } else {
+      RunKernel<uint8_t>(ctx);
+    }
+  }
+
+  template <typename Tout = T>
+  void RunKernel(const framework::ExecutionContext& ctx) const {
     auto& dev_ctx =
         ctx.template device_context<platform::MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();
@@ -390,12 +447,14 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
     const int64_t IC = x_mat_dims_vec[1];  // Input channels
     const int64_t OC = weight_h_dims[0];   // Output channels
 
-    GRUMKLDNNHandler<T> handler(ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(),
-                                input, weight_h, h0, is_reverse, N, Ti, IC, OC,
-                                ctx.InputName("X") + ctx.InputName("WeightH"));
+    GRUMKLDNNHandler<T, Tout> handler(
+        ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0,
+        is_reverse, N, Ti, IC, OC,
+        ctx.InputName("X") + ctx.InputName("WeightH"));
 
     auto input_memory_p =
         handler.AcquireInputMemoryWithReorder(input, is_reverse);
+    auto h0_memory_p = handler.AcquireH0Memory(h0);
     auto weight_x_memory_p =
         handler.AcquireWeightXMemory(weight_x, origin_mode);
     auto weight_h_memory_p =
@@ -405,25 +464,21 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
 
     std::unordered_map<int, dnnl::memory> gru_args = {
         {DNNL_ARG_SRC_LAYER, *input_memory_p},
+        {DNNL_ARG_SRC_ITER, *h0_memory_p},
         {DNNL_ARG_WEIGHTS_LAYER, *weight_x_memory_p},
         {DNNL_ARG_WEIGHTS_ITER, *weight_h_memory_p},
         {DNNL_ARG_BIAS, *bias_memory_p},
         {DNNL_ARG_DST_LAYER, *hidden_onednn_memory_p}};
 
-    if (h0) {
-      auto h0_memory_p = handler.AcquireH0Memory(h0);
-      gru_args.insert({DNNL_ARG_SRC_ITER, *h0_memory_p});
-    }
-
     auto gru_forward_p = handler.AcquireForwardPrimitive();
 
     dnnl::stream astream(mkldnn_engine);
     gru_forward_p->execute(astream, gru_args);
     astream.wait();
 
-    auto* hidden_onednn_data =
-        reinterpret_cast<T*>(hidden_onednn_memory_p->get_data_handle());
-    auto* hidden_data = hidden->mutable_data<T>(ctx.GetPlace());
+    auto* hidden_onednn_data = hidden_onednn_memory_p->get_data_handle();
+    auto* hidden_data =
+        to_void_cast(hidden->mutable_data<Tout>(ctx.GetPlace()));
     if (handler.is_NTC()) {
       handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod,
                              is_reverse, platform::RNNReorderType::NTC_PP);
@@ -439,4 +494,5 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 REGISTER_OP_KERNEL(fusion_gru, MKLDNN, paddle::platform::CPUPlace,
-                   ops::FusionGRUMKLDNNKernel<float>);
+                   ops::FusionGRUMKLDNNKernel<float>,
+                   ops::FusionGRUMKLDNNKernel<uint8_t>);
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py
new file mode 100644
index 00000000000..ff4531f0e25
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py
@@ -0,0 +1,145 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from paddle.fluid.tests.unittests.op_test import OpTest
+from paddle.fluid.tests.unittests.test_fusion_gru_op import fusion_gru
+from paddle.fluid.tests.unittests.test_fusion_lstm_op import fc, ACTIVATION
+
+
+class TestFusionGRUINT8MKLDNNOp(OpTest):
+    def set_confs(self):
+        pass
+
+    def setUp(self):
+        self.op_type = "fusion_gru"
+        self.lod = [[2, 4, 3]]
+        self.IC = 3
+        self.OC = 5
+        self.is_reverse = False
+        self.with_h0 = False
+        self.with_bias = True
+        self.act_state = 'tanh'
+        self.act_gate = 'sigmoid'
+        self.origin_mode = True
+        self.use_mkldnn = True
+        self.force_fp32_output = True
+        self.error_margin = 1e-5
+        self.set_confs()
+
+        # RNN dimensions
+        T = sum(self.lod[0])
+        N = len(self.lod[0])
+
+        # Input data
+        x_f32 = np.random.rand(T, self.IC).astype('float32') * 2 - 1
+        scale_data = 63
+        shift_data = 64
+        x_u8 = (x_f32 * scale_data + shift_data).astype(np.uint8)
+
+        # WeightX/WeightH data
+        wx = np.random.rand(self.IC, 3 * self.OC).astype('float32') * 2 - 1
+        wh = np.random.rand(self.OC, 3 * self.OC).astype('float32') * 2 - 1
+
+        # Calculating weight scales
+        # scales = 63 / max(abs(channel_wise(weightsX + weightsH)))
+        # WeightX data shape in PP: [IC, 3 * OC]
+        # WeightH data shape in PP: [OC, 2 * OC] + [OC, OC]
+        # Scales shape in oneDNN:   [3, OC]
+        scale_ur = 63 / np.max(np.abs(
+            np.concatenate(
+                [
+                    wx[:, :2 * self.OC], wh.flatten()[:2 * self.OC * self.OC]
+                    .reshape(self.OC, 2 * self.OC)
+                ],
+                axis=0)),
+                               axis=0)
+        scale_o = 63 / np.max(np.abs(
+            np.concatenate(
+                [
+                    wx[:, 2 * self.OC:], wh.flatten()[2 * self.OC * self.OC:]
+                    .reshape(self.OC, self.OC)
+                ],
+                axis=0)),
+                              axis=0)
+
+        scale_weights = np.concatenate([scale_ur, scale_o]).astype('float')
+
+        bias = np.random.rand(
+            1, 3 * self.OC).astype('float32') if self.with_bias else np.zeros(
+                (1, 3 * self.OC), dtype='float32')
+        h0 = np.random.rand(
+            N, self.OC).astype('float32') if self.with_h0 else np.zeros(
+                (N, self.OC), dtype='float32')
+
+        _, _, _, hidden_f32 = fusion_gru(x_f32, self.lod, h0, wx, wh, bias,
+                                         self.is_reverse, self.origin_mode,
+                                         ACTIVATION[self.act_state],
+                                         ACTIVATION[self.act_gate])
+
+        self.inputs = {'X': (x_u8, self.lod), 'WeightX': wx, 'WeightH': wh}
+
+        if self.with_bias:
+            self.inputs['Bias'] = bias
+
+        if self.with_h0:
+            self.inputs['H0'] = h0
+
+        if self.force_fp32_output:
+            self.error_margin = 1e-1
+            self.outputs = {'Hidden': (hidden_f32, self.lod)}
+        else:
+            self.error_margin = 1
+            hidden_u8 = (hidden_f32 * scale_data + shift_data).astype(np.uint8)
+            self.outputs = {'Hidden': (hidden_u8, self.lod)}
+
+        self.attrs = {
+            'activation': self.act_state,
+            'gate_activation': self.act_gate,
+            'is_reverse': self.is_reverse,
+            'origin_mode': self.origin_mode,
+            'use_mkldnn': self.use_mkldnn,
+            'force_fp32_output': self.force_fp32_output,
+            'Scale_data': scale_data,
+            'Shift_data': shift_data,
+            'Scale_weights': scale_weights
+        }
+
+    def test_check_output(self):
+        self.check_output(check_dygraph=False, atol=self.error_margin)
+
+
+class TestFusionGRUINT8MKLDNNOp2(TestFusionGRUINT8MKLDNNOp):
+    def set_confs(self):
+        self.force_fp32_output = False
+
+
+class TestFusionGRUINT8MKLDNNOp3(TestFusionGRUINT8MKLDNNOp):
+    def set_confs(self):
+        self.origin_mode = False
+
+
+class TestFusionGRUINT8MKLDNNOp4(TestFusionGRUINT8MKLDNNOp):
+    def set_confs(self):
+        self.with_bias = False
+
+
+class TestFusionGRUINT8MKLDNNOp5(TestFusionGRUINT8MKLDNNOp):
+    def set_confs(self):
+        self.with_h0 = False
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab


From 2b6a5793fe89cc52dbd0eba2b8f8deb2efdae1eb Mon Sep 17 00:00:00 2001
From: ShenLiang <shenliang03@baidu.com>
Date: Mon, 14 Sep 2020 11:30:39 +0800
Subject: [PATCH 058/261] remove auto mode from localsgd optimizer (#27237)

* rm auto from localsgd
---
 .../framework/distributed_strategy.proto      |  5 +-
 .../fleet/base/distributed_strategy.py        |  9 +--
 .../meta_optimizers/localsgd_optimizer.py     | 62 ++++++-------------
 .../test_fleet_distributed_strategy.py        |  5 +-
 .../test_fleet_localsgd_meta_optimizer.py     |  1 +
 5 files changed, 29 insertions(+), 53 deletions(-)
 mode change 100755 => 100644 paddle/fluid/framework/distributed_strategy.proto

diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
old mode 100755
new mode 100644
index 8d0093388b4..edd1700ae72
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -36,7 +36,10 @@ message AMPConfig {
   repeated string custom_black_varnames = 9;
 }
 
-message LocalSGDConfig { optional int32 k_steps = 1 [ default = 4 ]; }
+message LocalSGDConfig {
+  optional int32 k_steps = 1 [ default = 1 ];
+  optional int32 begin_step = 2 [ default = 1 ];
+}
 
 message GradientMergeConfig {
   optional int32 k_steps = 1 [ default = 1 ];
diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index d65be0dd4b1..1b86056c004 100755
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -707,11 +707,7 @@ class DistributedStrategy(object):
 
         **Notes**:
             k_steps(int) The local steps for training before parameter synchronization. Default 1.
-
-            If strategy.auto is set True, the local steps will be calculated automatically during training.
-            The algorithm is referenced in this paper: 
-            `Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD <https://arxiv.org/pdf/1810.08313.pdf>`_.
-            In this case, k_steps indicates the first local steps which is suggested setting to 1.
+            begin_step(int) The step of begining training by localsgd. Default 1.
 
         Examples:
           .. code-block:: python
@@ -719,7 +715,8 @@ class DistributedStrategy(object):
             import paddle.distributed.fleet as fleet
             strategy = fleet.DistributedStrategy()
             strategy.localsgd = True
-            strategy.localsgd_configs = {"k_steps": 4}
+            strategy.localsgd_configs = {"k_steps": 4,
+                                         "begin_step": 30}
         """
 
         return get_msg_dict(self.strategy.localsgd_configs)
diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
index 4d33dfe7456..6fa34d8d28a 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
@@ -49,7 +49,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
 
     def _enable_strategy(self, dist_strategy, context):
         dist_strategy.localsgd = True
-        dist_strategy.localsgd_configs = {"k_steps": 1}
+        dist_strategy.localsgd_configs = {"k_steps": 1, "begin_step": 1}
 
     def snapshot_name(self, param_name):
         return param_name + self.snapshot_key
@@ -86,8 +86,9 @@ class LocalSGDOptimizer(MetaOptimizerBase):
         minimized = self.inner_opt.minimize(
             loss, startup_program=startup_program)
 
-        init_k_steps = self.user_defined_strategy.localsgd_configs['k_steps']
-        auto_steps = self.user_defined_strategy.auto
+        k_steps_value = self.user_defined_strategy.localsgd_configs['k_steps']
+        begin_step_value = self.user_defined_strategy.localsgd_configs[
+            'begin_step']
 
         if startup_program is None:
             startup_program = default_startup_program()
@@ -101,45 +102,28 @@ class LocalSGDOptimizer(MetaOptimizerBase):
 
         p2s = self.create_snapshot_vars(main_block.program)
         with program_guard(main_block.program, startup_program):
-            step = layers.autoincreased_step_counter(begin=0)
+            step = layers.autoincreased_step_counter(begin=1)
             k_steps = layers.create_global_var(
                 name="k_steps",
                 shape=[1],
-                value=init_k_steps,
+                value=k_steps_value,
                 dtype='int64',
                 persistable=True)
+
+            begin_step = layers.create_global_var(
+                name="begin_step",
+                shape=[1],
+                value=begin_step_value,
+                dtype='int64',
+                persistable=True)
+
             last_step = layers.create_global_var(
                 name="last_step",
                 shape=[1],
-                value=int(0),
+                value=begin_step_value,
                 dtype='int64',
                 persistable=True)
 
-            if auto_steps:
-                avg_loss = layers.collective._c_allreduce(
-                    loss) / self.role_maker.worker_num()
-
-                lr_0 = layers.create_global_var(
-                    name="lr_0",
-                    shape=[1],
-                    value=float(0),
-                    dtype='float32',
-                    persistable=True)
-                loss_0 = layers.create_global_var(
-                    name="loss_0",
-                    shape=[1],
-                    value=float(0),
-                    dtype='float32',
-                    persistable=True)
-
-                global_lr = self.inner_opt._global_learning_rate()
-
-                def initialize():
-                    layers.assign(loss, loss_0)
-                    layers.assign(global_lr, lr_0)
-
-                layers.cond(step == 0, initialize)
-
             def communicate():
                 sub_block = default_main_program().current_block()
                 ring_id = -1
@@ -195,20 +179,10 @@ class LocalSGDOptimizer(MetaOptimizerBase):
                         inputs={'X': [param]},
                         outputs={'Out': [snapshot]},
                         attrs={OP_ROLE_KEY: OpRole.Optimize})
-
-                if auto_steps:
-                    next_local_steps = layers.cast(
-                        layers.ceil(
-                            layers.sqrt(lr_0 * loss / (global_lr * loss_0) *
-                                        float(init_k_steps))),
-                        dtype='int64')
-                    max_local_steps = layers.fill_constant(
-                        shape=[1], dtype='int64', value=16)
-                    next_local_steps = layers.elementwise_min(next_local_steps,
-                                                              max_local_steps)
-                    layers.assign(next_local_steps, k_steps)
                 layers.assign(step, last_step)
 
-            layers.cond(step - last_step == k_steps, communicate)
+            def begin_localsgd():
+                layers.cond(step - last_step == k_steps, communicate)
 
+            layers.cond(step > begin_step, begin_localsgd, communicate)
         return minimized
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
index 83db1b33551..6f8af3017ef 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
@@ -81,9 +81,10 @@ class TestStrategyConfig(unittest.TestCase):
 
     def test_localsgd_configs(self):
         strategy = paddle.distributed.fleet.DistributedStrategy()
-        configs = {"k_steps": 4}
+        configs = {"k_steps": 4, "begin_step": 120}
         strategy.localsgd_configs = configs
         self.assertEqual(strategy.localsgd_configs["k_steps"], 4)
+        self.assertEqual(strategy.localsgd_configs["begin_step"], 120)
 
     def test_dgc(self):
         strategy = paddle.distributed.fleet.DistributedStrategy()
@@ -230,7 +231,7 @@ class TestStrategyConfig(unittest.TestCase):
         strategy.a_sync = True
         strategy.localsgd = True
         strategy.dgc = True
-        localsgd_configs = {"k_steps": 5}
+        localsgd_configs = {"k_steps": 5, "begin_step": 1}
         strategy.localsgd_configs = localsgd_configs
         build_strategy = paddle.fluid.BuildStrategy()
         build_strategy.enable_sequential_execution = True
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
index 07b988bf875..945f5ae5745 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
@@ -44,6 +44,7 @@ class TestFleetLocalSGDMetaOptimizer(unittest.TestCase):
         strategy.auto = True
         config = strategy.localsgd_configs
         config['k_steps'] = 1
+        config['begin_step'] = 1
         strategy.localsgd_configs = config
 
         optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
-- 
GitLab


From d708b21074400147ae6e57b8af4f6cdc05f92998 Mon Sep 17 00:00:00 2001
From: Zhen Wang <wangzhen31@baidu.com>
Date: Mon, 14 Sep 2020 12:39:54 +0800
Subject: [PATCH 059/261] Update amp_check_finite_and_scale_op and add an
 updating_loss_scaling op for static graph amp training. (#26240)

* update amp_check_finite_and_scale_op for static_amp.

* use amp_check_finite_and_scale in static graph amp.

* update grads to zero when grads own infinite values(as for amp_checkout_finite_and_scale op).

* add update_loss_scaling op in cpp.

* add update_loss_scaling_op unit test.

* update the doc of the check_finite_and_unscale op

* Update the process of gradients updating skipping if the gradients have infinite values.

* update the way to zero grads.

* update test_update_loss_scaling_op.py

* add log info when find infinite grads.

* add the unit test for UpdateLossScaling Layer.
---
 .../amp/amp_check_finite_and_scale_op.cc      | 104 --------
 .../amp/amp_check_finite_and_scale_op.h       |  66 -----
 .../amp/check_finite_and_unscale_op.cc        | 141 ++++++++++
 ...e_op.cu => check_finite_and_unscale_op.cu} |  40 +--
 .../amp/check_finite_and_unscale_op.h         |  31 +++
 .../operators/amp/update_loss_scaling_op.cc   | 170 ++++++++++++
 .../operators/amp/update_loss_scaling_op.cu   |  84 ++++++
 .../operators/amp/update_loss_scaling_op.h    | 123 +++++++++
 paddle/fluid/pybind/op_function_generator.cc  |   4 +-
 .../fluid/contrib/mixed_precision/amp_nn.py   | 124 +++++++++
 .../contrib/mixed_precision/decorator.py      |  67 +++--
 .../contrib/mixed_precision/fp16_utils.py     |  74 ------
 .../paddle/fluid/dygraph/amp/loss_scaler.py   |   5 +-
 .../test_amp_check_finite_and_scale_op.py     |  14 +-
 .../test_fleet_amp_meta_optimizer.py          |   2 +-
 .../unittests/test_update_loss_scaling_op.py  | 250 ++++++++++++++++++
 .../white_list/no_check_set_white_list.py     |   3 +-
 17 files changed, 993 insertions(+), 309 deletions(-)
 delete mode 100644 paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc
 delete mode 100644 paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h
 create mode 100644 paddle/fluid/operators/amp/check_finite_and_unscale_op.cc
 rename paddle/fluid/operators/amp/{amp_check_finite_and_scale_op.cu => check_finite_and_unscale_op.cu} (63%)
 create mode 100644 paddle/fluid/operators/amp/check_finite_and_unscale_op.h
 create mode 100644 paddle/fluid/operators/amp/update_loss_scaling_op.cc
 create mode 100644 paddle/fluid/operators/amp/update_loss_scaling_op.cu
 create mode 100644 paddle/fluid/operators/amp/update_loss_scaling_op.h
 create mode 100644 python/paddle/fluid/contrib/mixed_precision/amp_nn.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py

diff --git a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc
deleted file mode 100644
index 7f0ca1493f7..00000000000
--- a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h"
-
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-class AmpCheckFiniteAndScaleOp : public framework::OperatorWithKernel {
- public:
-  AmpCheckFiniteAndScaleOp(const std::string &type,
-                           const framework::VariableNameMap &inputs,
-                           const framework::VariableNameMap &outputs,
-                           const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X",
-                   "amp_check_finite_and_unscale");
-    OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
-                   "amp_check_finite_and_unscale");
-    PADDLE_ENFORCE_EQ(
-        ctx->Inputs("X").size(), ctx->Outputs("Out").size(),
-        platform::errors::InvalidArgument(
-            "The input(X) and output(Out) should have same size in "
-            "Operator(amp_check_finite_and_unscale), size of input(X) is %d "
-            "and size of output(Out) is %d.",
-            ctx->Inputs("X").size(), ctx->Outputs("Out").size()));
-    auto x_dims = ctx->GetInputsDim("X");
-    ctx->SetOutputsDim("Out", x_dims);
-    ctx->SetOutputDim("FoundInfinite", {1});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
-  }
-};
-
-class AmpCheckFiniteAndScaleOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(
-        "X",
-        "(Tensors) The input tensors of amp_check_finite_and_scale operator.")
-        .AsDuplicable();
-    AddInput("Scale",
-             "(Tensor) 1-dim tensor, the scale of amp_check_finite_and_scale "
-             "operator.");
-    AddOutput("Out",
-              "(Tensors) The scaled output tensor of "
-              "amp_check_finite_and_unscale operator.")
-        .AsDuplicable();
-    AddOutput("FoundInfinite",
-              "(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
-              "if there there is infinite or nan item in input X.");
-    AddComment(R"DOC(
-amp_check_finite_and_scale operator.
-Check if input X contains all finite data, if yes, scale it by input Scale.
-
-$$Out = X * scale$$
-
-If any tensor in X contains Inf or Nan, the Out will generate a indicator.
-FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of 
-Out should not be used, and its data may not be deterministic. 
-Otherwise, FoundInfinite will be 0 (False).
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(
-    amp_check_finite_and_scale, ops::AmpCheckFiniteAndScaleOp,
-    ops::AmpCheckFiniteAndScaleOpMaker,
-    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-
-REGISTER_OP_CPU_KERNEL(
-    amp_check_finite_and_scale,
-    ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CPUDeviceContext,
-                                      float>,
-    ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CPUDeviceContext,
-                                      double>);
diff --git a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h
deleted file mode 100644
index 6c2c4eb8a61..00000000000
--- a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/operators/isfinite_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class AmpCheckFiniteAndScaleKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    const auto xs = ctx.MultiInput<framework::Tensor>("X");
-    const auto* scale = ctx.Input<framework::Tensor>("Scale");
-    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
-    auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");
-
-    const T* scale_data = scale->data<T>();
-    bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
-
-    *found_inf_data = false;
-    framework::Tensor is_finite =
-        ctx.AllocateTmpTensor<bool, DeviceContext>({1}, dev_ctx);
-    bool* is_finite_data = is_finite.template data<bool>();
-
-    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
-    for (size_t i = 0; i < xs.size(); ++i) {
-      const auto* x = xs[i];
-      auto* out = outs[i];
-      out->mutable_data<T>(dev_ctx.GetPlace());
-      if (!(*found_inf_data)) {
-        framework::TensorIsfinite(*x, &is_finite);
-        if (*is_finite_data) {
-          auto eigen_out = framework::EigenVector<T>::Flatten(*out);
-          auto eigen_in = framework::EigenVector<T>::Flatten(*x);
-          eigen_out.device(dev) = (*scale_data) * eigen_in;
-        } else {
-          *found_inf_data = true;
-          break;
-        }
-      }
-    }
-    return;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc
new file mode 100644
index 00000000000..51c659d5db1
--- /dev/null
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc
@@ -0,0 +1,141 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
+#include "paddle/fluid/framework/tensor_util.h"
+
+namespace paddle {
+namespace operators {
+
+class CheckFiniteAndUnscaleOp : public framework::OperatorWithKernel {
+ public:
+  CheckFiniteAndUnscaleOp(const std::string& type,
+                          const framework::VariableNameMap& inputs,
+                          const framework::VariableNameMap& outputs,
+                          const framework::AttributeMap& attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X",
+                   "check_finite_and_unscale");
+    OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
+                   "check_finite_and_unscale");
+    PADDLE_ENFORCE_EQ(
+        ctx->Inputs("X").size(), ctx->Outputs("Out").size(),
+        platform::errors::InvalidArgument(
+            "The input(X) and output(Out) should have same size in "
+            "Operator(check_finite_and_unscale), size of input(X) is %d "
+            "and size of output(Out) is %d.",
+            ctx->Inputs("X").size(), ctx->Outputs("Out").size()));
+    auto x_dims = ctx->GetInputsDim("X");
+    ctx->SetOutputsDim("Out", x_dims);
+    ctx->SetOutputDim("FoundInfinite", {1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+class CheckFiniteAndUnscaleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "X",
+        "(Tensors) The input tensors of check_finite_and_unscale operator.")
+        .AsDuplicable();
+    AddInput("Scale",
+             "(Tensor) 1-dim tensor, the scale of check_finite_and_unscale "
+             "operator.");
+    AddOutput("Out",
+              "(Tensors) The scaled output tensor of "
+              "check_finite_and_unscale operator.")
+        .AsDuplicable();
+    AddOutput("FoundInfinite",
+              "(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
+              "if there there is infinite or nan item in input X.");
+    AddComment(R"DOC(
+check_finite_and_unscale operator.
+Check if input X contains all finite data, if yes, scale it by input Scale.
+
+$$Out = X / scale$$
+
+If any tensor in X contains Inf or Nan, the Out will generate a indicator.
+FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of 
+Out should not be used, and its data may not be deterministic. 
+Otherwise, FoundInfinite will be 0 (False).
+
+)DOC");
+  }
+};
+
+template <typename T>
+class CheckFiniteAndUnscaleCpuKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
+    const auto xs = ctx.MultiInput<framework::Tensor>("X");
+    const auto* scale = ctx.Input<framework::Tensor>("Scale");
+    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
+    auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");
+
+    const T* scale_data = scale->data<T>();
+    bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
+
+    *found_inf_data = false;
+    framework::Tensor is_finite =
+        ctx.AllocateTmpTensor<bool, platform::CPUDeviceContext>({1}, dev_ctx);
+    bool* is_finite_data = is_finite.template data<bool>();
+
+    auto& dev = *ctx.template device_context<platform::CPUDeviceContext>()
+                     .eigen_device();
+
+    T inverse_scale = Inverse<T>(*scale_data);
+    for (size_t i = 0; i < xs.size(); ++i) {
+      const auto* x = xs[i];
+      auto* out = outs[i];
+      out->mutable_data<T>(dev_ctx.GetPlace());
+      if (!(*found_inf_data)) {
+        framework::TensorIsfinite(*x, &is_finite);
+        *found_inf_data = !(*is_finite_data);
+      }
+      auto eigen_out = framework::EigenVector<T>::Flatten(*out);
+      auto eigen_in = framework::EigenVector<T>::Flatten(*x);
+      if (!(*found_inf_data)) {
+        eigen_out.device(dev) = eigen_in * inverse_scale;
+      } else {
+        eigen_out.device(dev) = eigen_in * static_cast<T>(0);
+      }
+    }
+    return;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(
+    check_finite_and_unscale, ops::CheckFiniteAndUnscaleOp,
+    ops::CheckFiniteAndUnscaleOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+
+REGISTER_OP_CPU_KERNEL(check_finite_and_unscale,
+                       ops::CheckFiniteAndUnscaleCpuKernel<float>,
+                       ops::CheckFiniteAndUnscaleCpuKernel<double>);
diff --git a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu
similarity index 63%
rename from paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu
rename to paddle/fluid/operators/amp/check_finite_and_unscale_op.cu
index ee00d7c5f44..cf9df34a246 100644
--- a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu
@@ -14,28 +14,31 @@ limitations under the License. */
 
 #include <cuda.h>
 
-#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
 
 namespace paddle {
 namespace operators {
 
 template <typename T>
-__global__ void AmpCheckFiniteAndScale(const T* in, const T* scale, int num,
-                                       bool* found_inf, T* out) {
+__global__ void GpuInverse(const T* s, T* o) {
+  *o = Inverse<T>(*s);
+}
+
+template <typename T>
+__global__ void CheckFiniteAndUnscale(const T* in, const T* scale, int num,
+                                      bool* found_inf, T* out) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
 
   if (idx < num) {
     if (!isfinite(in[idx])) {
-      *found_inf = 1;
+      *found_inf = true;
     }
-    out[idx] = *found_inf ? in[idx] : in[idx] * scale[0];
+    out[idx] = *found_inf ? in[idx] : in[idx] * (*scale);
   }
 }
 
 template <typename T>
-class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
+class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
@@ -48,6 +51,12 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
     bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
     cudaMemset(found_inf_data, false, found_inf->numel() * sizeof(bool));
 
+    framework::Tensor inverse_scale =
+        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({1}, dev_ctx);
+    T* inverse_scale_v = inverse_scale.template data<T>();
+
+    GpuInverse<T><<<1, 1, 0, dev_ctx.stream()>>>(scale_data, inverse_scale_v);
+
     for (size_t i = 0; i < xs.size(); ++i) {
       const auto* x = xs[i];
       auto* out = outs[i];
@@ -55,11 +64,11 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
       T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
 
       int num = x->numel();
-      int block = 512;
+      int block = 1024;
       int grid = (num + block - 1) / block;
       VLOG(3) << "launch kernel";
-      AmpCheckFiniteAndScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
-          x_data, scale_data, num, found_inf_data, out_data);
+      CheckFiniteAndUnscale<T><<<grid, block, 0, dev_ctx.stream()>>>(
+          x_data, inverse_scale_v, num, found_inf_data, out_data);
       VLOG(3) << "finish kernel";
     }
   }
@@ -68,9 +77,6 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    amp_check_finite_and_scale,
-    ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CUDADeviceContext,
-                                      float>,
-    ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CUDADeviceContext,
-                                      double>);
+REGISTER_OP_CUDA_KERNEL(check_finite_and_unscale,
+                        ops::CheckFiniteAndUnscaleGpuKernel<float>,
+                        ops::CheckFiniteAndUnscaleGpuKernel<double>);
diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.h b/paddle/fluid/operators/amp/check_finite_and_unscale_op.h
new file mode 100644
index 00000000000..4fb8744d0ee
--- /dev/null
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.h
@@ -0,0 +1,31 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/operators/isfinite_op.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+inline HOSTDEVICE T Inverse(T s) {
+  return 1.0 / s;
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc
new file mode 100644
index 00000000000..fca3c531b40
--- /dev/null
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc
@@ -0,0 +1,170 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
+#include <cstring>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class UpdateLossScalingOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasInput("FoundInfinite"), "Input", "FoundInfinite",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasInput("PrevLossScaling"), "Input", "PrevLossScaling",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasInput("InGoodSteps"), "Input", "InGoodSteps",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasInput("InBadSteps"), "Input", "InBadSteps",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasOutput("LossScaling"), "Output", "LossScaling",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasOutput("OutGoodSteps"), "Output", "OutGoodSteps",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasOutput("OutBadSteps"), "Output", "OutBadSteps",
+                   "update_loss_scaling");
+    auto x_dims = ctx->GetInputsDim("X");
+    ctx->SetOutputsDim("Out", x_dims);
+    ctx->SetOutputDim("LossScaling", {1});
+    ctx->SetOutputDim("OutGoodSteps", {1});
+    ctx->SetOutputDim("OutBadSteps", {1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "PrevLossScaling"),
+        ctx.device_context());
+  }
+};
+
+class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensors) The input tensors of update_loss_scaling operator.")
+        .AsDuplicable();
+    AddInput("FoundInfinite",
+             "(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
+             "whether there is any infinite gradient.");
+    AddInput("PrevLossScaling",
+             "(Tensor) 1-dim tensor, previous loss scaling.");
+    AddInput("InGoodSteps",
+             "(Tensor) 1-dim tensor, accumulates good steps in which all "
+             "gradients are finite.");
+    AddInput("InBadSteps",
+             "(Tensor) 1-dim tensor, accumulates bad steps in which some "
+             "gradients are infinite.");
+    AddOutput("Out",
+              "(Tensors) The output tensor of update_loss_scaling operator.")
+        .AsDuplicable();
+    AddOutput("LossScaling", "(Tensor) 1-dim tensor, updated loss scaling.");
+    AddOutput("OutGoodSteps", "(Tensor) 1-dim tensor, pdated good steps.");
+    AddOutput("OutBadSteps", "(Tensor) 1-dim tensor, updated bad steps.");
+    AddAttr<int>("incr_every_n_steps",
+                 "A value represents increasing loss scaling every n "
+                 "consecutive steps with finite gradients.");
+    AddAttr<int>("decr_every_n_nan_or_inf",
+                 "A value represents decreasing loss scaling every n "
+                 "accumulated steps with nan or inf gradients.");
+    AddAttr<float>("incr_ratio",
+                   "The multiplier to use when increasing the loss scaling.")
+        .AddCustomChecker([](float incr_ratio) {
+          PADDLE_ENFORCE_EQ(incr_ratio > 1.0f, true,
+                            platform::errors::InvalidArgument(
+                                "'incr_ratio' should be greater than 1, but "
+                                "the received is %f",
+                                incr_ratio));
+        });
+    AddAttr<float>(
+        "decr_ratio",
+        "The less-than-one-multiplier to use when decreasing loss scaling.")
+        .AddCustomChecker([](float decr_ratio) {
+          PADDLE_ENFORCE_EQ(decr_ratio > 0.0f && decr_ratio < 1.0f, true,
+                            platform::errors::InvalidArgument(
+                                "'incr_ratio' should be between 0 and 1, but "
+                                "the received is %f",
+                                decr_ratio));
+        });
+    AddComment(R"DOC(
+Update loss scaling according to overall gradients. If all gradients is 
+finite after incr_every_n_steps, loss scaling will increase by incr_ratio. 
+Otherwise, loss scaling will decrease by decr_ratio after
+decr_every_n_nan_or_inf steps and each step some gradients are infinite.
+
+)DOC");
+  }
+};
+
+template <typename T>
+class UpdateLossScalingFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& ctx,
+                  const bool* found_inf_data, const T* pre_loss_scaling_data,
+                  const int* good_in_data, const int* bad_in_data,
+                  const int incr_every_n_steps,
+                  const int decr_every_n_nan_or_inf, const float incr_ratio,
+                  const float decr_ratio, T* updated_loss_scaling_data,
+                  int* good_out_data, int* bad_out_data) const {
+    Update<T>(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
+              incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
+              decr_ratio, updated_loss_scaling_data, good_out_data,
+              bad_out_data);
+  }
+};
+
+template <typename T>
+class LazyZeroInputs<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& dev_ctx,
+                  const bool* found_inf_data,
+                  const std::vector<const framework::Tensor*>& xs,
+                  const std::vector<framework::Tensor*>& outs) const {
+    if (*found_inf_data) {
+      VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --";
+      for (size_t i = 0; i < xs.size(); ++i) {
+        auto* out = outs[i];
+        T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
+        int num = out->numel();
+        std::memset(out_data, 0, num * sizeof(T));
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+using CPU = paddle::platform::CPUDeviceContext;
+
+REGISTER_OPERATOR(
+    update_loss_scaling, ops::UpdateLossScalingOp,
+    ops::UpdateLossScalingOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+
+REGISTER_OP_CPU_KERNEL(update_loss_scaling,
+                       ops::UpdateLossScalingKernel<CPU, float>,
+                       ops::UpdateLossScalingKernel<CPU, double>);
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu
new file mode 100644
index 00000000000..2bc60423d24
--- /dev/null
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu
@@ -0,0 +1,84 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void GpuUpdateLossScaling(
+    const bool* found_inf_data, const T* pre_loss_scaling_data,
+    const int* good_in_data, const int* bad_in_data,
+    const int incr_every_n_steps, const int decr_every_n_nan_or_inf,
+    const float incr_ratio, const float decr_ratio,
+    T* updated_loss_scaling_data, int* good_out_data, int* bad_out_data) {
+  Update<T>(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
+            incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio,
+            updated_loss_scaling_data, good_out_data, bad_out_data);
+}
+
+template <typename T>
+class UpdateLossScalingFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& dev_ctx,
+                  const bool* found_inf_data, const T* pre_loss_scaling_data,
+                  const int* good_in_data, const int* bad_in_data,
+                  const int incr_every_n_steps,
+                  const int decr_every_n_nan_or_inf, const float incr_ratio,
+                  const float decr_ratio, T* updated_loss_scaling_data,
+                  int* good_out_data, int* bad_out_data) const {
+    GpuUpdateLossScaling<T><<<1, 1, 0, dev_ctx.stream()>>>(
+        found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
+        incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio,
+        updated_loss_scaling_data, good_out_data, bad_out_data);
+  }
+};
+
+template <typename T>
+class LazyZeroInputs<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& dev_ctx,
+                  const bool* found_inf_data,
+                  const std::vector<const framework::Tensor*>& xs,
+                  const std::vector<framework::Tensor*>& outs) const {
+    const auto gpu_place =
+        BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
+    bool has_inf{false};
+    memory::Copy(platform::CPUPlace(), &has_inf, gpu_place, found_inf_data,
+                 sizeof(bool), dev_ctx.stream());
+    if (has_inf) {
+      VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --";
+      for (size_t i = 0; i < xs.size(); ++i) {
+        auto* out = outs[i];
+        T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
+        int num = out->numel();
+        cudaMemset(out_data, 0, num * sizeof(T));
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+using GPU = paddle::platform::CUDADeviceContext;
+
+REGISTER_OP_CUDA_KERNEL(update_loss_scaling,
+                        ops::UpdateLossScalingKernel<GPU, float>,
+                        ops::UpdateLossScalingKernel<GPU, double>);
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.h b/paddle/fluid/operators/amp/update_loss_scaling_op.h
new file mode 100644
index 00000000000..ca23b72eff0
--- /dev/null
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op.h
@@ -0,0 +1,123 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cmath>
+#include <vector>
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/errors.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+HOSTDEVICE void Update(const bool* found_inf_data,
+                       const T* pre_loss_scaling_data, const int* good_in_data,
+                       const int* bad_in_data, const int incr_every_n_steps,
+                       const int decr_every_n_nan_or_inf,
+                       const float incr_ratio, const float decr_ratio,
+                       T* updated_loss_scaling_data, int* good_out_data,
+                       int* bad_out_data) {
+  if (*found_inf_data) {
+    *good_out_data = 0;
+    *bad_out_data = *bad_in_data + 1;
+    if (*bad_out_data == decr_every_n_nan_or_inf) {
+      T new_loss_scaling = *pre_loss_scaling_data * decr_ratio;
+      *updated_loss_scaling_data = new_loss_scaling < static_cast<T>(1)
+                                       ? static_cast<T>(1)
+                                       : new_loss_scaling;
+      *bad_out_data = 0;
+    }
+  } else {
+    *bad_out_data = 0;
+    *good_out_data = *good_in_data + 1;
+    if (*good_out_data == incr_every_n_steps) {
+      T new_loss_scaling = *pre_loss_scaling_data * incr_ratio;
+      *updated_loss_scaling_data = std::isfinite(new_loss_scaling)
+                                       ? new_loss_scaling
+                                       : *pre_loss_scaling_data;
+      *good_out_data = 0;
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+class UpdateLossScalingFunctor {
+ public:
+  void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
+                  const T* pre_loss_scaling_data, const int* good_in_data,
+                  const int* bad_in_data, const int incr_every_n_steps,
+                  const int decr_every_n_nan_or_inf, const float incr_ratio,
+                  const float decr_ratio, T* updated_loss_scaling_data,
+                  int* good_out_data, int* bad_out_data) const;
+};
+
+template <typename DeviceContext, typename T>
+class LazyZeroInputs {
+ public:
+  void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
+                  const std::vector<const framework::Tensor*>& xs,
+                  const std::vector<framework::Tensor*>& outs) const;
+};
+
+template <typename DeviceContext, typename T>
+class UpdateLossScalingKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto xs = ctx.MultiInput<framework::Tensor>("X");
+    const auto* found_inf = ctx.Input<Tensor>("FoundInfinite");
+    const auto* pre_loss_scaling = ctx.Input<Tensor>("PrevLossScaling");
+    const auto* good_in = ctx.Input<Tensor>("InGoodSteps");
+    const auto* bad_in = ctx.Input<Tensor>("InBadSteps");
+    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
+    auto* updated_loss_scaling = ctx.Output<Tensor>("LossScaling");
+    auto* good_out = ctx.Output<Tensor>("OutGoodSteps");
+    auto* bad_out = ctx.Output<Tensor>("OutBadSteps");
+
+    PADDLE_ENFORCE_EQ(found_inf->numel(), 1,
+                      platform::errors::InvalidArgument(
+                          "FoundInfinite must has only one element."));
+
+    const bool* found_inf_data = found_inf->data<bool>();
+    const T* pre_loss_scaling_data = pre_loss_scaling->data<T>();
+    const int* good_in_data = good_in->data<int>();
+    const int* bad_in_data = bad_in->data<int>();
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    T* updated_loss_scaling_data =
+        updated_loss_scaling->mutable_data<T>(dev_ctx.GetPlace());
+    int* good_out_data = good_out->mutable_data<int>(dev_ctx.GetPlace());
+    int* bad_out_data = bad_out->mutable_data<int>(dev_ctx.GetPlace());
+
+    const int incr_every_n_steps = ctx.Attr<int>("incr_every_n_steps");
+    const int decr_every_n_nan_or_inf =
+        ctx.Attr<int>("decr_every_n_nan_or_inf");
+    const float incr_ratio = ctx.Attr<float>("incr_ratio");
+    const float decr_ratio = ctx.Attr<float>("decr_ratio");
+    UpdateLossScalingFunctor<DeviceContext, T>{}(
+        dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data,
+        bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
+        decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data);
+    LazyZeroInputs<DeviceContext, T>{}(dev_ctx, found_inf_data, xs, outs);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index 178ecaff7e8..f751136640c 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -111,7 +111,9 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
     {"fake_quantize_dequantize_moving_average_abs_max",
      {"Out", "OutScale", "OutAccum", "OutState"}},
     {"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}},
-    {"amp_check_finite_and_scale", {"Out", "FoundInfinite"}},
+    {"check_finite_and_unscale", {"Out", "FoundInfinite"}},
+    {"update_loss_scaling",
+     {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}},
 };
 
 // clang-format off
diff --git a/python/paddle/fluid/contrib/mixed_precision/amp_nn.py b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py
new file mode 100644
index 00000000000..d4dc968ca0d
--- /dev/null
+++ b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py
@@ -0,0 +1,124 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.fluid.data_feeder import check_variable_and_dtype, check_type
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.framework import Variable
+
+__all__ = ['check_finite_and_unscale', 'update_loss_scaling']
+
+
+def check_finite_and_unscale(x, scale, name=None):
+    """
+    Check if input X contains all finite data, if yes, scale it by input Scale.
+
+    $$Out = X / scale$$
+
+    If any tensor in X contains Inf or Nan, the Out will generate a indicator.
+    FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of 
+    Out should not be used, and its data may not be deterministic. 
+    Otherwise, FoundInfinite will be 0 (False).
+    Args:
+        x(list|tuple): The input tensors of check_finite_and_unscale operator.
+        scale: The scale of check_finite_and_unscale operator.
+    """
+    check_type(x, 'x', (tuple, list), 'check_finite_and_unscale')
+    for e in x:
+        check_variable_and_dtype(e, "x", ['float32', 'float64'],
+                                 'check_finite_and_unscale')
+
+    helper = LayerHelper("check_finite_and_unscale", **locals())
+    found_inf = helper.create_variable_for_type_inference(dtype='bool')
+
+    inputs = {'X': x, 'Scale': scale}
+    outputs = {'Out': x, 'FoundInfinite': found_inf}
+    helper.append_op(
+        type='check_finite_and_unscale', inputs=inputs, outputs=outputs)
+
+    return x, found_inf
+
+
+def update_loss_scaling(x,
+                        found_inf,
+                        prev_loss_scaling,
+                        num_good_steps,
+                        num_bad_steps,
+                        incr_every_n_steps,
+                        decr_every_n_nan_or_inf,
+                        incr_ratio,
+                        decr_ratio,
+                        name=None):
+    """
+    Update loss scaling according to overall gradients. If all gradients is 
+    finite after incr_every_n_steps, loss scaling will increase by incr_ratio. 
+    Otherwise, loss scaling will decrease by decr_ratio after
+    decr_every_n_nan_or_inf steps and each step some gradients are infinite.
+
+    Args:
+        x(list|tuple): The input tensors of update_loss_scaling operator.
+        found_inf (Variable): A boolean variable indicates whether 
+                                     there is any infinite gradient.
+        prev_loss_scaling (Variable): Previous loss scaling.
+        num_good_steps (Variable): A variable accumulates good steps in which 
+                                   all gradients are finite.
+        num_bad_steps (Variable): A variable accumulates bad steps in which 
+                                  some gradients are infinite.
+        incr_every_n_steps (int): A variable represents increasing loss 
+                                       scaling every n consecutive steps with 
+                                       finite gradients.
+        decr_every_n_nan_or_inf (int): A variable represents decreasing 
+                                            loss scaling every n accumulated 
+                                            steps with nan or inf gradients.
+        incr_ratio(float): The multiplier to use when increasing the loss 
+                           scaling.
+        decr_ratio(float): The less-than-one-multiplier to use when decreasing 
+                           loss scaling.
+    """
+
+    check_variable_and_dtype(prev_loss_scaling, "prev_loss_scaling",
+                             ['float32', 'float64'], "update_loss_scaling")
+    check_type(x, 'x', (tuple, list), 'update_loss_scaling')
+    for e in x:
+        check_variable_and_dtype(e, "x", ['float32', 'float64'],
+                                 'update_loss_scaling')
+        assert prev_loss_scaling.dtype == e.dtype, "The dtype of prev_loss_scaling should be equal to the dtype of x."
+
+    helper = LayerHelper("update_loss_scaling", **locals())
+
+    inputs = {
+        'X': x,
+        'FoundInfinite': found_inf,
+        'PrevLossScaling': prev_loss_scaling,
+        'InGoodSteps': num_good_steps,
+        'InBadSteps': num_bad_steps
+    }
+
+    outputs = {
+        'Out': x,
+        'LossScaling': prev_loss_scaling,
+        'OutGoodSteps': num_good_steps,
+        'OutBadSteps': num_bad_steps
+    }
+
+    attrs = {
+        'incr_every_n_steps': incr_every_n_steps,
+        'decr_every_n_nan_or_inf': decr_every_n_nan_or_inf,
+        'incr_ratio': incr_ratio,
+        'decr_ratio': decr_ratio,
+    }
+
+    helper.append_op(
+        type='update_loss_scaling', inputs=inputs, outputs=outputs, attrs=attrs)
+
+    return x
diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py
index bfbd2700ae1..c9112ac849c 100644
--- a/python/paddle/fluid/contrib/mixed_precision/decorator.py
+++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py
@@ -17,9 +17,11 @@ from ... import default_startup_program
 from ... import layers
 from ... import unique_name
 from . import fp16_utils
-from .fp16_utils import update_loss_scaling, rewrite_program
+from .fp16_utils import rewrite_program
 from .fp16_utils import update_role_var_grad
 from .fp16_lists import AutoMixedPrecisionLists
+from .amp_nn import check_finite_and_unscale
+from .amp_nn import update_loss_scaling
 
 __all__ = ["decorate"]
 
@@ -67,10 +69,8 @@ class OptimizerWithMixedPrecision(object):
             persistable=True)
         self._use_dynamic_loss_scaling = use_dynamic_loss_scaling
         if self._use_dynamic_loss_scaling:
-            self._incr_every_n_steps = layers.fill_constant(
-                shape=[1], dtype='int32', value=incr_every_n_steps)
-            self._decr_every_n_nan_or_inf = layers.fill_constant(
-                shape=[1], dtype='int32', value=decr_every_n_nan_or_inf)
+            self._incr_every_n_steps = incr_every_n_steps
+            self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf
             self._incr_ratio = incr_ratio
             self._decr_ratio = decr_ratio
             self._num_good_steps = layers.create_global_var(
@@ -139,49 +139,46 @@ class OptimizerWithMixedPrecision(object):
         # Change the op_role_var attr for some ops, so that gradients
         # transferred across GPUs can be FP16.
         update_role_var_grad(self._train_program, self._params_grads)
-        scaled_params_grads = []
-        for p, g in self._params_grads:
-            with self._train_program._optimized_guard([p, g]):
-                scaled_g = g / self._loss_scaling
-                scaled_params_grads.append([p, scaled_g])
 
-        return scaled_params_grads
+        return self._params_grads
 
-    def apply_gradients(self, scaled_params_grads):
+    def apply_gradients(self, params_grads):
         """
         Check scaled gradients to determine whether to update loss scaling and update 
         parameters by their scaled gradients, 
   
         Args:
-            scaled_params_grads (list): A list of params and scaled grads.
+            params_grads (list): A list of params and scaled grads.
     
         Returns:
             A list of optimize operators.
         """
 
-        if self._use_dynamic_loss_scaling:
+        grads = [g for _, g in params_grads]
+        with self._train_program._optimized_guard(grads):
+            grads, found_inf = check_finite_and_unscale(
+                grads, self._loss_scaling, name="find_infinite_scale")
 
-            grads = [layers.reduce_sum(g) for [_, g] in scaled_params_grads]
-            all_grads = layers.concat(grads)
-            all_grads_sum = layers.reduce_sum(all_grads)
-            is_overall_finite = layers.isfinite(all_grads_sum)
-
-            update_loss_scaling(is_overall_finite, self._loss_scaling,
-                                self._num_good_steps, self._num_bad_steps,
-                                self._incr_every_n_steps,
-                                self._decr_every_n_nan_or_inf, self._incr_ratio,
-                                self._decr_ratio)
-
-            # apply_gradient append all ops in global block, thus we shouldn't
-            # apply gradient in the switch branch.
-            with layers.Switch() as switch:
-                with switch.case(is_overall_finite):
-                    pass
-                with switch.default():
-                    for _, g in scaled_params_grads:
-                        layers.assign(layers.zeros_like(g), g)
-
-        optimize_ops = self._optimizer.apply_gradients(scaled_params_grads)
+        if self._use_dynamic_loss_scaling:
+            with self._train_program._optimized_guard(grads):
+                grads = update_loss_scaling(
+                    grads,
+                    found_inf,
+                    self._loss_scaling,
+                    self._num_good_steps,
+                    self._num_bad_steps,
+                    self._incr_every_n_steps,
+                    self._decr_every_n_nan_or_inf,
+                    self._incr_ratio,
+                    self._decr_ratio,
+                    name="update_loss_scaling")
+
+        params_unscaled_grads = []
+        for pg, new_g in zip(params_grads, grads):
+            params_unscaled_grads.append((pg[0], new_g))
+        # apply_gradient append all ops in global block, thus we shouldn't
+        # apply gradient in the switch branch.
+        optimize_ops = self._optimizer.apply_gradients(params_unscaled_grads)
 
         return optimize_ops
 
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
index 328dafe6219..0b142ff33de 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
@@ -328,77 +328,3 @@ def update_role_var_grad(main_prog, params_grads):
                 raise ValueError("The op {0} is not in program".format(op))
             block.desc._remove_op(op_idx, op_idx + 1)
         block._sync_with_cpp()
-
-
-def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps,
-                        num_bad_steps, incr_every_n_steps,
-                        decr_every_n_nan_or_inf, incr_ratio, decr_ratio):
-    """
-    Update loss scaling according to overall gradients. If all gradients is 
-    finite after incr_every_n_steps, loss scaling will increase by incr_ratio. 
-    Otherwise, loss scaling will decrease by decr_ratio after
-    decr_every_n_nan_or_inf steps and each step some gradients are infinite.
-
-    Args:
-        is_overall_finite (Variable): A boolean variable indicates whether 
-                                     all gradients are finite.
-        prev_loss_scaling (Variable): Previous loss scaling.
-        num_good_steps (Variable): A variable accumulates good steps in which 
-                                   all gradients are finite.
-        num_bad_steps (Variable): A variable accumulates bad steps in which 
-                                  some gradients are infinite.
-        incr_every_n_steps (Variable): A variable represents increasing loss 
-                                       scaling every n consecutive steps with 
-                                       finite gradients.
-        decr_every_n_nan_or_inf (Variable): A variable represents decreasing 
-                                            loss scaling every n accumulated 
-                                            steps with nan or inf gradients.
-        incr_ratio(float): The multiplier to use when increasing the loss 
-                           scaling.
-        decr_ratio(float): The less-than-one-multiplier to use when decreasing 
-                           loss scaling.
-    """
-    zero_steps = layers.fill_constant(shape=[1], dtype='int32', value=0)
-    with layers.Switch() as switch:
-        with switch.case(is_overall_finite):
-            should_incr_loss_scaling = layers.less_than(incr_every_n_steps,
-                                                        num_good_steps + 1)
-            with layers.Switch() as switch1:
-                with switch1.case(should_incr_loss_scaling):
-                    new_loss_scaling = prev_loss_scaling * incr_ratio
-                    loss_scaling_is_finite = layers.isfinite(new_loss_scaling)
-                    with layers.Switch() as switch2:
-                        with switch2.case(loss_scaling_is_finite):
-                            layers.assign(new_loss_scaling, prev_loss_scaling)
-                        with switch2.default():
-                            pass
-                    layers.assign(zero_steps, num_good_steps)
-                    layers.assign(zero_steps, num_bad_steps)
-
-                with switch1.default():
-                    layers.increment(num_good_steps)
-                    layers.assign(zero_steps, num_bad_steps)
-
-        with switch.default():
-            should_decr_loss_scaling = layers.less_than(decr_every_n_nan_or_inf,
-                                                        num_bad_steps + 1)
-            with layers.Switch() as switch3:
-                with switch3.case(should_decr_loss_scaling):
-                    new_loss_scaling = prev_loss_scaling * decr_ratio
-                    static_loss_scaling = \
-                        layers.fill_constant(shape=[1],
-                                             dtype='float32',
-                                             value=1.0)
-                    less_than_one = layers.less_than(new_loss_scaling,
-                                                     static_loss_scaling)
-                    with layers.Switch() as switch4:
-                        with switch4.case(less_than_one):
-                            layers.assign(static_loss_scaling,
-                                          prev_loss_scaling)
-                        with switch4.default():
-                            layers.assign(new_loss_scaling, prev_loss_scaling)
-                    layers.assign(zero_steps, num_good_steps)
-                    layers.assign(zero_steps, num_bad_steps)
-                with switch3.default():
-                    layers.assign(zero_steps, num_good_steps)
-                    layers.increment(num_bad_steps)
diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py
index 8f3ca9ec007..ff57f30dcd2 100644
--- a/python/paddle/fluid/dygraph/amp/loss_scaler.py
+++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py
@@ -210,13 +210,12 @@ class AmpScaler(object):
     def _unscale(self, optimizer):
         if not self._enable:
             return
-        inv_scale = 1.0 / self._scale
         param_grads = [
             param._grad_ivar() for param in optimizer._parameter_list
             if param._grad_ivar() is not None
         ]
-        core.ops.amp_check_finite_and_scale(param_grads, inv_scale, param_grads,
-                                            self._found_inf)
+        core.ops.check_finite_and_unscale(param_grads, self._scale, param_grads,
+                                          self._found_inf)
 
     def _update(self):
         """
diff --git a/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py b/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py
index 70863d3857c..fbacaa3d5ce 100644
--- a/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py
+++ b/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py
@@ -18,9 +18,9 @@ from op_test import OpTest, skip_check_grad_ci
 import paddle.fluid as fluid
 
 
-class TestAmpCheckFiniteAndScaleOp(OpTest):
+class TestCheckFiniteAndUnscaleOp(OpTest):
     def setUp(self):
-        self.op_type = "amp_check_finite_and_scale"
+        self.op_type = "check_finite_and_unscale"
         self.init_dtype()
         x = np.random.random((1024, 1024)).astype(self.dtype)
         scale = np.random.random((1)).astype(self.dtype)
@@ -28,7 +28,7 @@ class TestAmpCheckFiniteAndScaleOp(OpTest):
         self.inputs = {'X': [('x0', x)], 'Scale': scale}
         self.outputs = {
             'FoundInfinite': np.array([0]),
-            'Out': [('out0', x * scale)],
+            'Out': [('out0', x / scale)],
         }
 
     def init_dtype(self):
@@ -38,9 +38,9 @@ class TestAmpCheckFiniteAndScaleOp(OpTest):
         self.check_output()
 
 
-class TestAmpCheckFiniteAndScaleOpWithNan(OpTest):
+class TestCheckFiniteAndUnscaleOpWithNan(OpTest):
     def setUp(self):
-        self.op_type = "amp_check_finite_and_scale"
+        self.op_type = "check_finite_and_unscale"
         self.init_dtype()
         x = np.random.random((1024, 1024)).astype(self.dtype)
         x[128][128] = np.nan
@@ -61,9 +61,9 @@ class TestAmpCheckFiniteAndScaleOpWithNan(OpTest):
         self.check_output(no_check_set=['Out'])
 
 
-class TestAmpCheckFiniteAndScaleOpWithInf(OpTest):
+class TestCheckFiniteAndUnscaleOpWithInf(OpTest):
     def setUp(self):
-        self.op_type = "amp_check_finite_and_scale"
+        self.op_type = "check_finite_and_unscale"
         self.init_dtype()
         x = np.random.random((1024, 1024)).astype(self.dtype)
         x[128][128] = np.inf
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
index 38c3903306e..73e014b3500 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
@@ -57,7 +57,7 @@ class TestFleetAMPOptimizer(unittest.TestCase):
 
         ops = [op.type for op in avg_cost.block.ops]
         self.assertIn('cast', ops)
-        self.assertIn('isfinite', ops)
+        self.assertIn('check_finite_and_unscale', ops)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py
new file mode 100644
index 00000000000..fb93334415c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py
@@ -0,0 +1,250 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid as fluid
+import paddle.fluid.contrib.mixed_precision.amp_nn as amp_nn
+
+
+class TestUpdateLossScalingOp(OpTest):
+    def setUp(self):
+        self.op_type = "update_loss_scaling"
+        self.init()
+        found_inf = np.array([False], dtype=np.bool)
+        x = np.random.random((1024, 1024)).astype(self.dtype)
+
+        self.inputs = {
+            'X': [('x0', x)],
+            'FoundInfinite': found_inf,
+            'PrevLossScaling': self.prev_loss_scaling,
+            'InGoodSteps': self.num_good_steps,
+            'InBadSteps': self.num_bad_steps
+        }
+
+        self.outputs = {
+            'Out': [('out0', np.zeros_like(x))],
+            'LossScaling': self.prev_loss_scaling * self.incr_ratio,
+            'OutGoodSteps': self.zero_steps,
+            'OutBadSteps': self.zero_steps
+        }
+
+    def init(self):
+        self.incr_ratio = 2.0
+        self.decr_ratio = 0.8
+        self.dtype = np.float32
+        self.prev_loss_scaling = np.array([2048]).astype(self.dtype)
+        self.num_good_steps = np.array([999], dtype=np.int32)
+        self.num_bad_steps = np.array([1], dtype=np.int32)
+        self.zero_steps = np.array([0], dtype=np.int32)
+        self.attrs = {
+            'incr_every_n_steps': 1000,
+            'decr_every_n_nan_or_inf': 2,
+            'incr_ratio': self.incr_ratio,
+            'decr_ratio': self.decr_ratio,
+        }
+
+    def test_check_output(self):
+        self.check_output(no_check_set=['Out'])
+
+
+class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp):
+    def setUp(self):
+        self.op_type = "update_loss_scaling"
+        self.init()
+        found_inf = np.array([True], dtype=np.bool)
+        x = np.random.random((1024, 1024)).astype(self.dtype)
+        i = np.random.randint(0, 1024, 1)
+        j = np.random.randint(0, 1024, 1)
+        x[i[0]][j[0]] = np.inf
+
+        self.inputs = {
+            'X': [('x0', x)],
+            'FoundInfinite': found_inf,
+            'PrevLossScaling': self.prev_loss_scaling,
+            'InGoodSteps': self.num_good_steps,
+            'InBadSteps': self.num_bad_steps
+        }
+
+        self.outputs = {
+            'Out': [('out0', np.zeros_like(x))],
+            'LossScaling': self.prev_loss_scaling * self.decr_ratio,
+            'OutGoodSteps': self.zero_steps,
+            'OutBadSteps': self.zero_steps
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestUpdateLossScalingLayer(unittest.TestCase):
+    def loss_scaling_check(self, use_cuda=True, scope=fluid.Scope()):
+        a = fluid.data(name="a", shape=[1024, 1024], dtype='float32')
+        b = fluid.data(name="b", shape=[512, 128], dtype='float32')
+        x = [a, b]
+        found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
+        prev_loss_scaling = fluid.data(
+            name="prev_loss_scaling", shape=[1], dtype='float32')
+        num_good_steps = fluid.data(
+            name="num_good_steps", shape=[1], dtype='int32')
+        num_bad_steps = fluid.data(
+            name="num_bad_steps", shape=[1], dtype='int32')
+
+        a_v = np.random.random([1024, 1024]).astype('float32')
+        b_v = np.random.random([512, 128]).astype('float32')
+        found_inf_v = np.array([False]).astype('bool')
+        prev_loss_scaling_v = np.array([2048]).astype('float32')
+        num_good_steps_v = np.array([999], dtype=np.int32)
+        num_bad_steps_v = np.array([1], dtype=np.int32)
+
+        incr_every_n_steps = 1000
+        decr_every_n_nan_or_inf = 2
+        incr_ratio = 2
+        decr_ratio = 0.8
+
+        result = amp_nn.update_loss_scaling(
+            x,
+            found_inf,
+            prev_loss_scaling,
+            num_good_steps,
+            num_bad_steps,
+            incr_every_n_steps,
+            decr_every_n_nan_or_inf,
+            incr_ratio,
+            decr_ratio,
+            name="update_loss_scaling")
+
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        with fluid.scope_guard(scope):
+            exe.run(fluid.default_startup_program())
+            result_v = exe.run(feed={
+                'a': a_v,
+                'b': b_v,
+                'found_inf': found_inf_v,
+                'prev_loss_scaling': prev_loss_scaling_v,
+                'num_good_steps': num_good_steps_v,
+                'num_bad_steps': num_bad_steps_v
+            },
+                               fetch_list=[
+                                   result, x, found_inf, prev_loss_scaling,
+                                   num_good_steps, num_bad_steps
+                               ])
+        assert np.array_equal(result_v[0], a_v)
+        assert np.array_equal(result_v[1], b_v)
+        assert np.array_equal(result_v[0], result_v[2])
+        assert np.array_equal(result_v[1], result_v[3])
+        assert np.array_equal(result_v[4], found_inf_v)
+        assert np.array_equal(result_v[5], prev_loss_scaling_v * incr_ratio)
+        assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v))
+        assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v))
+
+    def loss_scaling_check_inf(self, use_cuda=True, scope=fluid.Scope()):
+        a = fluid.data(name="a", shape=[1024, 1024], dtype='float32')
+        b = fluid.data(name="b", shape=[512, 128], dtype='float32')
+        x = [a, b]
+        found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
+        prev_loss_scaling = fluid.data(
+            name="prev_loss_scaling", shape=[1], dtype='float32')
+        num_good_steps = fluid.data(
+            name="num_good_steps", shape=[1], dtype='int32')
+        num_bad_steps = fluid.data(
+            name="num_bad_steps", shape=[1], dtype='int32')
+
+        a_v = np.random.random([1024, 1024]).astype('float32')
+        b_v = np.random.random([512, 128]).astype('float32')
+        i = np.random.randint(0, 1024, 1)
+        j = np.random.randint(0, 1024, 1)
+        a_v[i[0]][j[0]] = np.inf
+        found_inf_v = np.array([True]).astype('bool')
+        prev_loss_scaling_v = np.array([2048]).astype('float32')
+        num_good_steps_v = np.array([999], dtype=np.int32)
+        num_bad_steps_v = np.array([1], dtype=np.int32)
+
+        incr_every_n_steps = 1000
+        decr_every_n_nan_or_inf = 2
+        incr_ratio = 2
+        decr_ratio = 0.8
+
+        result = amp_nn.update_loss_scaling(
+            x,
+            found_inf,
+            prev_loss_scaling,
+            num_good_steps,
+            num_bad_steps,
+            incr_every_n_steps,
+            decr_every_n_nan_or_inf,
+            incr_ratio,
+            decr_ratio,
+            name="update_loss_scaling")
+
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        with fluid.scope_guard(scope):
+            exe.run(fluid.default_startup_program())
+            result_v = exe.run(feed={
+                'a': a_v,
+                'b': b_v,
+                'found_inf': found_inf_v,
+                'prev_loss_scaling': prev_loss_scaling_v,
+                'num_good_steps': num_good_steps_v,
+                'num_bad_steps': num_bad_steps_v
+            },
+                               fetch_list=[
+                                   result, x, found_inf, prev_loss_scaling,
+                                   num_good_steps, num_bad_steps
+                               ])
+        assert np.array_equal(result_v[0], np.zeros_like(a_v))
+        assert np.array_equal(result_v[1], np.zeros_like(b_v))
+        assert np.array_equal(result_v[2], np.zeros_like(a_v))
+        assert np.array_equal(result_v[3], np.zeros_like(b_v))
+        assert np.array_equal(result_v[4], found_inf_v)
+        assert np.array_equal(result_v[5], prev_loss_scaling_v * decr_ratio)
+        assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v))
+        assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v))
+
+    def test_loss_scaling_cpu(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, startup):
+                self.loss_scaling_check(use_cuda=False)
+
+    def test_loss_scaling_cpu_inf(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, startup):
+                self.loss_scaling_check_inf(use_cuda=False)
+
+    def test_loss_scaling_gpu(self):
+        if fluid.core.is_compiled_with_cuda():
+            main = fluid.Program()
+            startup = fluid.Program()
+            with fluid.unique_name.guard():
+                with fluid.program_guard(main, startup):
+                    self.loss_scaling_check(use_cuda=True)
+
+    def test_loss_scaling_gpu_inf(self):
+        if fluid.core.is_compiled_with_cuda():
+            main = fluid.Program()
+            startup = fluid.Program()
+            with fluid.unique_name.guard():
+                with fluid.program_guard(main, startup):
+                    self.loss_scaling_check_inf(use_cuda=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
index 0de0eeb464a..afd3414943e 100644
--- a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
+++ b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
@@ -25,6 +25,7 @@ no_check_set_white_list = [
     'unsqueeze2',
     'cross_entropy2',
     'seed',
-    'amp_check_finite_and_scale',
+    'check_finite_and_unscale',
+    'update_loss_scaling',
     'cudnn_lstm',
 ]
-- 
GitLab


From aae41c6fca67be6a090d4f83bdf6160737d15162 Mon Sep 17 00:00:00 2001
From: Pei Yang <peiyang@baidu.com>
Date: Mon, 14 Sep 2020 12:55:22 +0800
Subject: [PATCH 060/261] refine error message related to paddle-TRT (#27256)

---
 paddle/fluid/inference/tensorrt/engine.cc     | 84 ++++++++++++++-----
 paddle/fluid/inference/tensorrt/engine.h      | 16 +++-
 .../tensorrt/plugin/elementwise_op_plugin.cu  | 44 ++++++++--
 .../fluid/inference/tensorrt/test_engine.cc   | 16 +++-
 .../inference/tensorrt/trt_int8_calibrator.cc |  5 +-
 .../operators/tensorrt/tensorrt_engine_op.h   | 20 +++--
 6 files changed, 138 insertions(+), 47 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 22be8774932..754979f77ac 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -63,11 +63,13 @@ void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
 void TensorRTEngine::FreezeNetwork() {
   freshDeviceId();
   VLOG(3) << "TRT to freeze network";
-  PADDLE_ENFORCE(infer_builder_ != nullptr,
-                 "Call InitNetwork first to initialize network.");
-  PADDLE_ENFORCE_EQ(network() != nullptr, true,
-                    platform::errors::InvalidArgument(
-                        "Call InitNetwork first to initialize network."));
+  PADDLE_ENFORCE_NOT_NULL(infer_builder_,
+                          platform::errors::InvalidArgument(
+                              "Inference builder of TRT is null. Please make "
+                              "sure you call InitNetwork first."));
+  PADDLE_ENFORCE_NOT_NULL(network(),
+                          platform::errors::InvalidArgument(
+                              "Call InitNetwork first to initialize network."));
   // build engine.
   infer_builder_->setMaxBatchSize(max_batch_);
   infer_builder_->setMaxWorkspaceSize(max_workspace_);
@@ -210,7 +212,10 @@ void TensorRTEngine::FreezeNetwork() {
   } else {
     infer_engine_.reset(infer_builder_->buildCudaEngine(*network()));
   }
-  PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");
+  PADDLE_ENFORCE_NOT_NULL(
+      infer_engine_, platform::errors::Fatal(
+                         "Build TensorRT cuda engine failed! Please recheck "
+                         "you configurations related to paddle-TensorRT."));
 }
 
 nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
@@ -220,8 +225,16 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
                     platform::errors::InvalidArgument(
                         "The TRT network should be initialized first."));
   auto *input = network()->addInput(name.c_str(), dtype, dims);
-  PADDLE_ENFORCE(input, "infer network add input %s failed", name);
-  PADDLE_ENFORCE(input->isNetworkInput());
+  PADDLE_ENFORCE_NOT_NULL(
+      input, platform::errors::InvalidArgument("Adding input %s failed in "
+                                               "TensorRT inference network. "
+                                               "Please recheck your input.",
+                                               name));
+  PADDLE_ENFORCE_EQ(input->isNetworkInput(), true,
+                    platform::errors::InvalidArgument(
+                        "Input %s is not the input of TRT inference network. "
+                        "Please recheck your input.",
+                        name));
   TensorRTEngine::SetITensor(name, input);
   return input;
 }
@@ -230,31 +243,53 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
                                    const std::string &name) {
   auto *output = layer->getOutput(offset);
   SetITensor(name, output);
-  PADDLE_ENFORCE(output != nullptr);
+  PADDLE_ENFORCE_NOT_NULL(
+      output, platform::errors::InvalidArgument(
+                  "The output %s of TRT engine should not be null.", name));
   output->setName(name.c_str());
-  PADDLE_ENFORCE(!output->isNetworkInput());
+  PADDLE_ENFORCE_EQ(output->isNetworkInput(), false,
+                    platform::errors::InvalidArgument(
+                        "The output %s of TRT engine should not be the input "
+                        "of the network at the same time.",
+                        name));
   network()->markOutput(*output);
-  PADDLE_ENFORCE(output->isNetworkOutput());
+  PADDLE_ENFORCE_EQ(
+      output->isNetworkOutput(), true,
+      platform::errors::InvalidArgument(
+          "The output %s of TRT engine should be the output of the network.",
+          name));
 }
 
 void TensorRTEngine::DeclareOutput(const std::string &name) {
   auto *output = TensorRTEngine::GetITensor(name);
-  PADDLE_ENFORCE(output != nullptr);
+  PADDLE_ENFORCE_NOT_NULL(
+      output, platform::errors::InvalidArgument(
+                  "The output %s of TRT engine should not be null.", name));
   output->setName(name.c_str());
-  PADDLE_ENFORCE(!output->isNetworkInput());
+  PADDLE_ENFORCE_EQ(output->isNetworkInput(), false,
+                    platform::errors::InvalidArgument(
+                        "The output %s of TRT engine should not be the input "
+                        "of the network at the same time.",
+                        name));
   network()->markOutput(*output);
 }
 
 void TensorRTEngine::SetITensor(const std::string &name,
                                 nvinfer1::ITensor *tensor) {
-  PADDLE_ENFORCE(tensor != nullptr);
-  PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
-                    name);
+  PADDLE_ENFORCE_NOT_NULL(
+      tensor, platform::errors::InvalidArgument(
+                  "Tensor named %s of TRT engine should not be null.", name));
+  PADDLE_ENFORCE_EQ(
+      0, itensor_map_.count(name),
+      platform::errors::InvalidArgument(
+          "Tensor named %s of TRT engine should not be duplicated", name));
   itensor_map_[name] = tensor;
 }
 
 nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
-  PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name);
+  PADDLE_ENFORCE_EQ(itensor_map_.count(name), true,
+                    platform::errors::NotFound(
+                        "Tensor named %s is not found in TRT engine", name));
   return itensor_map_[name];
 }
 
@@ -271,11 +306,11 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name,
   std::string splitter = "__";
   std::string name_with_suffix = name + splitter + name_suffix;
   platform::CPUPlace cpu_place;
-  PADDLE_ENFORCE_EQ(
-      weight_map.count(name_with_suffix), 0,
-      "During TRT Op converter: We set weight %s with the same name "
-      "twice into the weight_map",
-      name_with_suffix);
+  PADDLE_ENFORCE_EQ(weight_map.count(name_with_suffix), 0,
+                    platform::errors::AlreadyExists(
+                        "The weight named %s is set into the weight map "
+                        "twice in TRT OP converter.",
+                        name_with_suffix));
   weight_map[name_with_suffix].reset(new framework::Tensor());
   weight_map[name_with_suffix]->Resize(weight_tensor->dims());
   TensorCopySync(*weight_tensor, cpu_place, weight_map[name_with_suffix].get());
@@ -297,7 +332,10 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
 void TensorRTEngine::freshDeviceId() {
   int count;
   cudaGetDeviceCount(&count);
-  PADDLE_ENFORCE_LT(device_id_, count);
+  PADDLE_ENFORCE_LT(device_id_, count,
+                    platform::errors::OutOfRange(
+                        "Device id %d exceeds the current device count: %d.",
+                        device_id_, count));
   cudaSetDevice(device_id_);
 }
 
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 1a3413657ce..a85ed483c1d 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -196,8 +196,10 @@ class TensorRTEngine {
   }
 
   nvinfer1::IHostMemory* Serialize() {
-    PADDLE_ENFORCE(infer_engine_ != nullptr,
-                   "You should build engine first and then serialize");
+    PADDLE_ENFORCE_NOT_NULL(
+        infer_engine_,
+        platform::errors::InvalidArgument(
+            "The TensorRT engine must be built first before serialization"));
     ihost_memory_.reset(infer_engine_->serialize());
     return ihost_memory_.get();
   }
@@ -222,8 +224,14 @@ class TensorRTEngine {
           engine_serialized_data.c_str(), engine_serialized_data.size(),
           &inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
     }
-    PADDLE_ENFORCE(infer_engine_ != nullptr,
-                   "build cuda engine failed when deserialize engine info.!");
+    PADDLE_ENFORCE_NOT_NULL(
+        infer_engine_,
+        platform::errors::Fatal(
+            "Building TRT cuda engine failed when deserializing engine info. "
+            "Please check:\n1. Your TRT serialization is generated and loaded "
+            "on the same GPU architecture;\n2. The Paddle Inference version of "
+            "generating serialization file and doing inference are "
+            "consistent."));
   }
 
   void SetRuntimeBatch(size_t batch_size);
diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
index 0ec803fe64a..457d9dd8737 100644
--- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
@@ -56,14 +56,27 @@ __global__ void elementwise_kernel(const size_t total, const T *x_data,
 
 nvinfer1::Dims ElementWisePlugin::getOutputDimensions(
     int index, const nvinfer1::Dims *input_dims, int num_inputs) {
-  PADDLE_ENFORCE_EQ(index, 0);
-  PADDLE_ENFORCE_EQ(num_inputs, 2);
-  PADDLE_ENFORCE_NOT_NULL(input_dims);
+  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
+                                  "There is only one output in TRT elementwise "
+                                  "op plugin, but got output index: %d.",
+                                  index));
+  PADDLE_ENFORCE_EQ(num_inputs, 2, platform::errors::InvalidArgument(
+                                       "There are 2 inputs in TRT elementwise "
+                                       "op plugin, but got input number: %d.",
+                                       num_inputs));
+  PADDLE_ENFORCE_NOT_NULL(
+      input_dims,
+      platform::errors::InvalidArgument(
+          "The input dims of TRT elementwise op plugin should not be null."));
   return input_dims[0];
 }
 
 int ElementWisePlugin::initialize() {
-  PADDLE_ENFORCE_GT(dims_y_.nbDims, 0);
+  PADDLE_ENFORCE_GT(dims_y_.nbDims, 0,
+                    platform::errors::InvalidArgument(
+                        "The dimension of input Y of TRT elementwise op plugin "
+                        "should be greater than 0, but got %d.",
+                        dims_y_.nbDims));
 
   axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_;
   int trimed_nb_dims = dims_y_.nbDims;
@@ -74,8 +87,18 @@ int ElementWisePlugin::initialize() {
   }
   dims_y_.nbDims = trimed_nb_dims;
 
-  PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_);
-  PADDLE_ENFORCE_LT(axis_, dims_x_.nbDims);
+  PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_,
+                    platform::errors::InvalidArgument(
+                        "We expect [number of x dims] >= [number of y dims + "
+                        "axis] in TRT elementwise op plugin, but got [number "
+                        "of x dims] = %d, [number of y dims + axis] = %d.",
+                        dims_x_.nbDims, dims_y_.nbDims + axis_));
+  PADDLE_ENFORCE_LT(
+      axis_, dims_x_.nbDims,
+      platform::errors::InvalidArgument("We expect [axis] < [number of x dims] "
+                                        "in TRT elementwise op plugin, but got "
+                                        "[axis] = %d, [number of x dims] = %d.",
+                                        axis_, dims_x_.nbDims));
 
   prev_size_ = 1;
   midd_size_ = 1;
@@ -86,7 +109,9 @@ int ElementWisePlugin::initialize() {
 
   for (int i = 0; i < dims_y_.nbDims; ++i) {
     PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i],
-                      "Broadcast dimension mismatch.");
+                      platform::errors::InvalidArgument(
+                          "Broadcast dimension mismatch. The dims of input Y "
+                          "should be a subsequence of X."));
     midd_size_ *= dims_y_.d[i];
   }
 
@@ -221,7 +246,10 @@ int ElementwisePluginDynamic::enqueue(
     elementwise_kernel<<<block, thread, 0, stream>>>(
         num, x, y, out, prev_size, midd_size, post_size, details::Mul<float>());
   } else {
-    PADDLE_THROW("Not implemented.");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Paddle-TRT only support elementwise operation: {add, mul} currently, "
+        "but got %s.",
+        type_));
   }
 
   return cudaGetLastError() != cudaSuccess;
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index a03dd45db0f..72962c733ec 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -74,7 +74,9 @@ TEST_F(TensorRTEngineTest, add_layer) {
                                   nvinfer1::DimsCHW{1, 1, 1});
   auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
                                         weight.get(), bias.get());
-  PADDLE_ENFORCE(fc_layer != nullptr);
+  PADDLE_ENFORCE_NOT_NULL(fc_layer,
+                          platform::errors::InvalidArgument(
+                              "TRT fully connected layer building failed."));
 
   engine_->DeclareOutput(fc_layer, 0, "y");
   LOG(INFO) << "freeze network";
@@ -116,7 +118,9 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
                                   nvinfer1::DimsCHW{1, 2, 1});
   auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
                                         weight.get(), bias.get());
-  PADDLE_ENFORCE(fc_layer != nullptr);
+  PADDLE_ENFORCE_NOT_NULL(fc_layer,
+                          platform::errors::InvalidArgument(
+                              "TRT fully connected layer building failed."));
 
   engine_->DeclareOutput(fc_layer, 0, "y");
   engine_->FreezeNetwork();
@@ -160,7 +164,9 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
   auto *conv_layer =
       TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
                            weight.get(), bias.get());
-  PADDLE_ENFORCE(conv_layer != nullptr);
+  PADDLE_ENFORCE_NOT_NULL(conv_layer,
+                          platform::errors::InvalidArgument(
+                              "TRT convolution layer building failed."));
   conv_layer->setStride(nvinfer1::DimsHW{1, 1});
   conv_layer->setPadding(nvinfer1::DimsHW{1, 1});
 
@@ -199,7 +205,9 @@ TEST_F(TensorRTEngineTest, test_pool2d) {
   auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t,
                                           nvinfer1::DimsHW{2, 2});
 
-  PADDLE_ENFORCE(pool_layer != nullptr);
+  PADDLE_ENFORCE_NOT_NULL(
+      pool_layer,
+      platform::errors::InvalidArgument("TRT pooling layer building failed."));
   pool_layer->setStride(nvinfer1::DimsHW{1, 1});
   pool_layer->setPadding(nvinfer1::DimsHW{0, 0});
 
diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc
index 34b7072b2ee..743f7740e5f 100644
--- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc
+++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc
@@ -83,9 +83,8 @@ bool TRTInt8Calibrator::setBatch(
           engine_name_, it.first));
     }
     const auto& d = dataptr->second;
-    PADDLE_ENFORCE(
-        cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice),
-        "Fail to cudaMemcpy %s for %s", engine_name_, it.first);
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice));
   }
 
   data_is_set_ = true;
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index cc6ee7b19ea..9cfe47da5db 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -208,8 +208,11 @@ class TensorRTEngineOp : public framework::OperatorBase {
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
 
-    PADDLE_ENFORCE_EQ(input_names_.empty(), false,
-                      "should pass at least one input");
+    PADDLE_ENFORCE_EQ(
+        input_names_.empty(), false,
+        platform::errors::PreconditionNotMet(
+            "TensorRT engine needs at least one input, but no input is found. "
+            "Please check if you set the input correctly."));
 
     std::vector<std::string> output_maps =
         Attr<std::vector<std::string>>("output_name_mapping");
@@ -295,12 +298,19 @@ class TensorRTEngineOp : public framework::OperatorBase {
 #endif
       }
       auto *fluid_v = scope.FindVar(y);
-      PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
+      PADDLE_ENFORCE_NOT_NULL(
+          fluid_v,
+          platform::errors::NotFound(
+              "Output variable %s is not found in TensorRT subgraph.", y));
       auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
       fluid_t->Resize(framework::make_ddim(ddim));
 
-      PADDLE_ENFORCE(bind_index < num_bindings,
-                     "The bind index should be less than num_bindings");
+      PADDLE_ENFORCE_LT(bind_index, num_bindings,
+                        platform::errors::InvalidArgument(
+                            "The binding index in TRT engine should be less "
+                            "than the number of bindings, but got binding "
+                            "index = %d, number of bindings = %d.",
+                            bind_index, num_bindings));
       buffers[bind_index] = static_cast<void *>(fluid_t->mutable_data<float>(
           BOOST_GET_CONST(platform::CUDAPlace, dev_place)));
 
-- 
GitLab


From 8d531727943f44861b17c522fbdebf3e659a905a Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Mon, 14 Sep 2020 13:51:31 +0800
Subject: [PATCH 061/261] move DataLoader._worker_loop to top level (#27247)

* move worker loop to top level

* move reader process loop to top level

* fix failed unittests
---
 .../fluid/dataloader/dataloader_iter.py       | 174 +++++++++---------
 python/paddle/fluid/reader.py                 |  49 ++---
 .../test_imperative_data_loader_process.py    |   5 +-
 .../test_multiprocess_dataloader_exception.py |  15 +-
 4 files changed, 126 insertions(+), 117 deletions(-)

diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py
index 6a996493e4d..1ef0d494e07 100644
--- a/python/paddle/fluid/dataloader/dataloader_iter.py
+++ b/python/paddle/fluid/dataloader/dataloader_iter.py
@@ -347,6 +347,92 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
         return self.__next__()
 
 
+# NOTE(chenweihang): _worker_loop must be top level method to be pickled
+def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event,
+                 collate_fn, init_fn, worker_id, num_workers,
+                 use_shared_memory):
+    try:
+        # NOTE: [ mmap files clear ] When the child process exits unexpectedly,
+        # some shared memory objects may have been applied for but have not yet
+        # been put into the inter-process Queue. This part of the object needs
+        # to be cleaned up when the process ends.
+        CleanupFuncRegistrar.register(_cleanup_mmap)
+
+        # set signal handler
+        core._set_process_signal_handler()
+
+        global _worker_info
+        _worker_info = WorkerInfo(
+            id=worker_id, num_workers=num_workers, dataset=dataset)
+
+        init_exception = None
+        try:
+            if init_fn is not None:
+                init_fn(worker_id)
+            fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset,
+                                                  collate_fn, True)
+        except:
+            init_exception = Exception("init_fn failed in worker {}: " \
+                                    "{}".format(worker_id, sys.exc_info()))
+
+        iterator_drained = False
+        parent_watch_dog = ParentWatchDog()
+
+        while parent_watch_dog.is_alive():
+            try:
+                data = indices_queue.get(MP_INDICES_CHECK_INTERVAL)
+            except queue.Empty:
+                continue
+
+            # None as poison piil, so worker event should be set
+            if data is None:
+                assert done_event.is_set() or iterator_drained, \
+                        "get None when worker done_event set"
+                break
+            # If worker done event is set but get still get data in
+            # indices_queue, remaining data should be get and skipped.
+            if done_event.is_set() or iterator_drained:
+                continue
+
+            idx, indices = data
+            try:
+                if init_exception is not None:
+                    batch = init_exception
+                    init_exception = None
+                else:
+                    batch = fetcher.fetch(indices)
+            except Exception as e:
+                if isinstance(
+                        e, StopIteration) and dataset_kind == _DatasetKind.ITER:
+                    out_queue.put(_IterableDatasetStopIteration(worker_id))
+                    iterator_drained = True
+                else:
+                    out_queue.put((idx, e))
+            else:
+                if use_shared_memory:
+                    # FIXME(dkp): _convert_to_tensor_list only support np.array
+                    #             list now, should support paddle.Tensor list
+                    if isinstance(batch[0][0], paddle.Tensor):
+                        np_batch = []
+                        for sample in batch:
+                            np_batch.append([s.numpy() for s in sample])
+                        batch = np_batch
+
+                    tensor_list = core._convert_to_tensor_list(batch)
+                    out_queue.put((idx, tensor_list))
+                    core._remove_tensor_list_mmap_fds(tensor_list)
+                else:
+                    out_queue.put((idx, batch))
+    except KeyboardInterrupt:
+        # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process
+        pass
+    except:
+        six.reraise(*sys.exc_info())
+    finally:
+        if use_shared_memory:
+            _cleanup_mmap()
+
+
 class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
     def __init__(self, loader):
         super(_DataLoaderIterMultiProcess, self).__init__(loader)
@@ -404,11 +490,11 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
             indices_queue = multiprocessing.Queue()
             self._indices_queues.append(indices_queue)
             worker = multiprocessing.Process(
-                target=self._worker_loop,
+                target=_worker_loop,
                 args=(self._dataset, self._dataset_kind, indices_queue,
                       self._data_queue, self._workers_done_event,
                       self._collate_fn, self._worker_init_fn, i,
-                      self._num_workers))
+                      self._num_workers, self._use_shared_memory))
             worker.daemon = True
             worker.start()
             self._workers.append(worker)
@@ -483,90 +569,6 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
         self._blocking_queue.kill()
         logging.error("DataLoader reader thread raised an exception!")
 
-    def _worker_loop(self, dataset, dataset_kind, indices_queue, out_queue,
-                     done_event, collate_fn, init_fn, worker_id, num_workers):
-        try:
-            # NOTE: [ mmap files clear ] When the child process exits unexpectedly,
-            # some shared memory objects may have been applied for but have not yet
-            # been put into the inter-process Queue. This part of the object needs
-            # to be cleaned up when the process ends.
-            CleanupFuncRegistrar.register(_cleanup_mmap)
-
-            # set signal handler
-            core._set_process_signal_handler()
-
-            global _worker_info
-            _worker_info = WorkerInfo(
-                id=worker_id, num_workers=num_workers, dataset=dataset)
-
-            init_exception = None
-            try:
-                if init_fn is not None:
-                    init_fn(worker_id)
-                fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset,
-                                                      collate_fn, True)
-            except:
-                init_exception = Exception("init_fn failed in worker {}: " \
-                                     "{}".format(worker_id, sys.exc_info()))
-
-            iterator_drained = False
-            parent_watch_dog = ParentWatchDog()
-
-            while parent_watch_dog.is_alive():
-                try:
-                    data = indices_queue.get(MP_INDICES_CHECK_INTERVAL)
-                except queue.Empty:
-                    continue
-
-                # None as poison piil, so worker event should be set
-                if data is None:
-                    assert done_event.is_set() or iterator_drained, \
-                            "get None when worker done_event set"
-                    break
-                # If worker done event is set but get still get data in
-                # indices_queue, remaining data should be get and skipped.
-                if done_event.is_set() or iterator_drained:
-                    continue
-
-                idx, indices = data
-                try:
-                    if init_exception is not None:
-                        batch = init_exception
-                        init_exception = None
-                    else:
-                        batch = fetcher.fetch(indices)
-                except Exception as e:
-                    if isinstance(
-                            e,
-                            StopIteration) and dataset_kind == _DatasetKind.ITER:
-                        out_queue.put(_IterableDatasetStopIteration(worker_id))
-                        iterator_drained = True
-                    else:
-                        out_queue.put((idx, e))
-                else:
-                    if self._use_shared_memory:
-                        # FIXME(dkp): _convert_to_tensor_list only support np.array
-                        #             list now, should support paddle.Tensor list
-                        if isinstance(batch[0][0], paddle.Tensor):
-                            np_batch = []
-                            for sample in batch:
-                                np_batch.append([s.numpy() for s in sample])
-                            batch = np_batch
-
-                        tensor_list = core._convert_to_tensor_list(batch)
-                        out_queue.put((idx, tensor_list))
-                        core._remove_tensor_list_mmap_fds(tensor_list)
-                    else:
-                        out_queue.put((idx, batch))
-        except KeyboardInterrupt:
-            # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process
-            pass
-        except:
-            six.reraise(*sys.exc_info())
-        finally:
-            if self._use_shared_memory:
-                _cleanup_mmap()
-
     def _thread_loop(self):
         while not self._thread_done_event.is_set():
             batch = self._get_data()
diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
index 76c95be75d6..f2bb567b95b 100644
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -85,6 +85,30 @@ def _convert_places(places):
     return ret
 
 
+# NOTE(chenweihang): _reader_process_loop must be top level method to be pickled
+def _reader_process_loop(batch_reader, data_queue):
+    try:
+        # set signal handler
+        core._set_process_signal_handler()
+
+        # NOTE: [ mmap files clear ] When the child process exits unexpectedly,
+        # some shared memory objects may have been applied for but have not yet
+        # been put into the inter-process Queue. This part of the object needs
+        # to be cleaned up when the process ends.
+        CleanupFuncRegistrar.register(_cleanup_mmap)
+
+        for batch in batch_reader():
+            tensor_list = core._convert_to_tensor_list(batch)
+            data_queue.put(tensor_list)
+            core._remove_tensor_list_mmap_fds(tensor_list)
+        data_queue.put(None)
+    except KeyboardInterrupt:
+        # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process
+        pass
+    except:
+        six.reraise(*sys.exc_info())
+
+
 class DataLoaderBase(object):
     def __init__(self):
         self._places = None
@@ -811,7 +835,8 @@ class DygraphGeneratorLoader(DataLoaderBase):
             global multiprocess_queue_set
             multiprocess_queue_set.add(self._data_queue)
             self._process = multiprocessing.Process(
-                target=self._reader_process_loop)
+                target=_reader_process_loop,
+                args=(self._batch_reader, self._data_queue))
             self._process.daemon = True
             self._process.start()
 
@@ -867,28 +892,6 @@ class DygraphGeneratorLoader(DataLoaderBase):
         self._blocking_queue.kill()
         logging.error("DataLoader reader thread raised an exception!")
 
-    def _reader_process_loop(self):
-        try:
-            # set signal handler
-            core._set_process_signal_handler()
-
-            # NOTE: [ mmap files clear ] When the child process exits unexpectedly,
-            # some shared memory objects may have been applied for but have not yet
-            # been put into the inter-process Queue. This part of the object needs
-            # to be cleaned up when the process ends.
-            CleanupFuncRegistrar.register(_cleanup_mmap)
-
-            for batch in self._batch_reader():
-                tensor_list = core._convert_to_tensor_list(batch)
-                self._data_queue.put(tensor_list)
-                core._remove_tensor_list_mmap_fds(tensor_list)
-            self._data_queue.put(None)
-        except KeyboardInterrupt:
-            # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process
-            pass
-        except:
-            six.reraise(*sys.exc_info())
-
     def _reader_thread_loop_for_multiprocess(self):
         while not self._thread_done_event.is_set():
             try:
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py
index 7fb2cb0090d..9b2d71c9f90 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py
@@ -18,6 +18,7 @@ import multiprocessing
 import numpy as np
 import paddle.fluid as fluid
 from paddle.fluid import core
+from paddle.fluid.reader import _reader_process_loop
 
 if sys.version_info[0] == 2:
     import Queue as queue
@@ -66,7 +67,7 @@ class TestDygraphDataLoaderProcess(unittest.TestCase):
                 batch_generator_creator(self.batch_size, self.batch_num),
                 places=fluid.CPUPlace())
             loader._data_queue = queue.Queue(self.batch_num + 1)
-            loader._reader_process_loop()
+            _reader_process_loop(loader._batch_reader, loader._data_queue)
             # For clean memory mapped files
             util_queue = multiprocessing.Queue(self.batch_num + 1)
             for _ in range(self.batch_num):
@@ -94,7 +95,7 @@ class TestDygraphDataLoaderProcess(unittest.TestCase):
             loader._data_queue = queue.Queue(self.batch_num + 1)
             exception = None
             try:
-                loader._reader_process_loop()
+                _reader_process_loop(loader._batch_reader, loader._data_queue)
             except core.EnforceNotMet as ex:
                 exception = ex
             self.assertIsNotNone(exception)
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py
index 3a8867f6bd2..6fd14b40bc9 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py
@@ -27,6 +27,7 @@ import paddle.fluid.core as core
 from paddle.io import Dataset, IterableDataset, BatchSampler, DataLoader
 from paddle.fluid.dygraph.nn import Linear
 from paddle.fluid.dygraph.base import to_variable
+from paddle.fluid.dataloader.dataloader_iter import _worker_loop
 
 
 class RandomDataset(Dataset):
@@ -185,9 +186,10 @@ class TestDataLoaderWorkerLoop(unittest.TestCase):
                 for i in range(10):
                     indices_queue.put([i, i + 10])
                 indices_queue.put(None)
-                loader._worker_loop(
-                    loader._dataset, 0, indices_queue, loader._data_queue,
-                    loader._workers_done_event, _collate_fn, _init_fn, 0, 1)
+                _worker_loop(loader._dataset, 0, indices_queue,
+                             loader._data_queue, loader._workers_done_event,
+                             _collate_fn, _init_fn, 0, 1,
+                             loader._use_shared_memory)
                 self.assertTrue(False)
         except AssertionError:
             pass
@@ -228,9 +230,10 @@ class TestDataLoaderWorkerLoop(unittest.TestCase):
                     indices_queue.put([i, i + 10])
                 indices_queue.put(None)
                 loader._workers_done_event.set()
-                loader._worker_loop(
-                    loader._dataset, 0, indices_queue, loader._data_queue,
-                    loader._workers_done_event, _collate_fn, _init_fn, 0, 1)
+                _worker_loop(loader._dataset, 0, indices_queue,
+                             loader._data_queue, loader._workers_done_event,
+                             _collate_fn, _init_fn, 0, 1,
+                             loader._use_shared_memory)
                 self.assertTrue(True)
         except AssertionError:
             pass
-- 
GitLab


From 79149c8ee6dd16cc8d4e748db0df8fa090c9752c Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Mon, 14 Sep 2020 13:52:29 +0800
Subject: [PATCH 062/261] polish framework error message part 8 (#27269)

---
 paddle/fluid/framework/c/c_api.cc            |  3 +-
 paddle/fluid/framework/fleet/nccl_wrapper.cc | 11 +--
 paddle/fluid/framework/threadpool.cc         |  6 +-
 paddle/fluid/framework/threadpool.h          |  3 +-
 paddle/fluid/framework/var_desc.cc           | 88 ++++++++++++--------
 paddle/fluid/framework/var_type.h            |  6 +-
 paddle/fluid/framework/var_type_traits.cc    | 24 +++---
 paddle/fluid/framework/variable_helper.cc    |  9 +-
 8 files changed, 88 insertions(+), 62 deletions(-)

diff --git a/paddle/fluid/framework/c/c_api.cc b/paddle/fluid/framework/c/c_api.cc
index ab987fb5668..0dd2768ccb9 100644
--- a/paddle/fluid/framework/c/c_api.cc
+++ b/paddle/fluid/framework/c/c_api.cc
@@ -49,7 +49,8 @@ std::vector<std::string> PD_GetGradOpDescStrs(
     for (size_t i = 0; i < op_num; ++i) {
       PADDLE_ENFORCE_EQ(
           grad_op_descs[i]->Proto()->SerializePartialToString(&ret[i]), true,
-          "Cannot serialize message.");
+          paddle::platform::errors::Unavailable(
+              "Cannot serialize operator desc message."));
     }
   }
   return ret;
diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc
index d5a25605cf8..33a91388fd8 100644
--- a/paddle/fluid/framework/fleet/nccl_wrapper.cc
+++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc
@@ -25,7 +25,7 @@ bool NCCLWrapper::is_initialized_ = false;
 
 void NCCLWrapper::InitNCCL() {
 #if defined(PADDLE_WITH_NCCL)
-  PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
+  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
       &(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_,
       nccl_info_.my_global_rank_));
 #endif
@@ -41,7 +41,8 @@ void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) {
 
 NCCLInfo NCCLWrapper::GetNCCLId() {
 #if defined(PADDLE_WITH_NCCL)
-  PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
 #endif
   return nccl_info_;
 }
@@ -52,8 +53,8 @@ void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank,
   nccl_info_.local_rank_ = local_rank;
   nccl_info_.my_global_rank_ = global_rank;
   nccl_info_.global_ranks_ = ranks;
-  PADDLE_ENFORCE(cudaSetDevice(local_rank));
-  PADDLE_ENFORCE(cudaStreamCreate(&(nccl_info_.stream_)));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(local_rank));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_)));
 #endif
   return;
 }
@@ -65,7 +66,7 @@ void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope,
     auto var = scope.FindVar(name);
     LoDTensor* tensor = var->GetMutable<LoDTensor>();
     int32_t total_size = tensor->numel();
-    PADDLE_ENFORCE(platform::dynload::ncclBcast(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
         reinterpret_cast<void*>(tensor->data<float>()), total_size, ncclFloat,
         root_rank, nccl_info_.comm_, nccl_info_.stream_));
     cudaStreamSynchronize(nccl_info_.stream_);
diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc
index 7f7f426d0e2..4682bfc264b 100644
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@@ -42,7 +42,8 @@ void ThreadPool::Init() {
       num_threads = FLAGS_dist_threadpool_size;
       VLOG(1) << "set dist_threadpool_size to " << num_threads;
     }
-    PADDLE_ENFORCE_GT(num_threads, 0);
+    PADDLE_ENFORCE_GT(num_threads, 0, platform::errors::InvalidArgument(
+                                          "The number of threads is 0."));
     threadpool_.reset(new ThreadPool(num_threads));
   }
 }
@@ -83,7 +84,8 @@ void ThreadPool::TaskLoop() {
       }
 
       if (tasks_.empty()) {
-        PADDLE_THROW("This thread has no task to Run");
+        PADDLE_THROW(platform::errors::Unavailable(
+            "Current thread has no task to Run."));
       }
 
       // pop a task from the task queue
diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h
index 654d81116b2..09528b6fc35 100644
--- a/paddle/fluid/framework/threadpool.h
+++ b/paddle/fluid/framework/threadpool.h
@@ -91,7 +91,8 @@ class ThreadPool {
     {
       std::unique_lock<std::mutex> lock(mutex_);
       if (!running_) {
-        PADDLE_THROW("enqueue on stopped ThreadPool");
+        PADDLE_THROW(platform::errors::Unavailable(
+            "Task is enqueued into stopped ThreadPool."));
       }
       tasks_.push(std::move(task));
     }
diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc
index f3ea1f624ee..2ee0b17b64b 100644
--- a/paddle/fluid/framework/var_desc.cc
+++ b/paddle/fluid/framework/var_desc.cc
@@ -43,8 +43,9 @@ void VarDesc::SetTensorDescNum(size_t num) {
     } break;
     default:
       PADDLE_THROW(
-          "Setting 'sub_tensor_number' is not supported by the type of var %s.",
-          this->Name());
+          platform::errors::Unavailable("Setting 'sub_tensor_number' is not "
+                                        "supported by the %s type variable.",
+                                        this->Name()));
   }
 }
 
@@ -55,8 +56,9 @@ size_t VarDesc::GetTensorDescNum() const {
       break;
     default:
       PADDLE_THROW(
-          "Getting 'sub_tensor_number' is not supported by the type of var %s.",
-          this->Name());
+          platform::errors::Unavailable("Getting 'sub_tensor_number' is not "
+                                        "supported by the %s type variable.",
+                                        this->Name()));
   }
 }
 
@@ -133,9 +135,9 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
       desc_.mutable_type()->mutable_tensor_array()->set_lod_level(lod_level);
       break;
     default:
-      PADDLE_THROW(
-          "Setting 'lod_level' is not supported by the type of var %s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Setting 'lod_level' is not supported by the %s type variable.",
+          this->Name()));
   }
 }
 
@@ -157,9 +159,9 @@ void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
       }
     } break;
     default:
-      PADDLE_THROW(
-          "Setting 'lod_levels' is not supported by the type of var %s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Setting 'lod_levels' is not supported by the %s type variable",
+          this->Name()));
   }
 }
 
@@ -170,9 +172,9 @@ int32_t VarDesc::GetLoDLevel() const {
     case proto::VarType::LOD_TENSOR_ARRAY:
       return desc_.type().tensor_array().lod_level();
     default:
-      PADDLE_THROW(
-          "Getting 'lod_level' is not supported by the type of var %s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Getting 'lod_level' is not supported by the %s type variable.",
+          this->Name()));
   }
 }
 
@@ -187,15 +189,19 @@ std::vector<int32_t> VarDesc::GetLoDLevels() const {
       return res;
       break;
     default:
-      PADDLE_THROW(
-          "Getting 'lod_levels' is not supported by the type of var %s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Getting 'lod_levels' is not supported by the %s type variable.",
+          this->Name()));
   }
 }
 
 const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
-  PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
-  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
+  PADDLE_ENFORCE_EQ(
+      desc_.has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
+  PADDLE_ENFORCE_EQ(
+      desc_.type().has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
   switch (desc_.type().type()) {
     case proto::VarType::SELECTED_ROWS:
       return desc_.type().selected_rows();
@@ -204,14 +210,16 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
     case proto::VarType::LOD_TENSOR_ARRAY:
       return desc_.type().tensor_array().tensor();
     default:
-      PADDLE_THROW(
-          "Getting 'tensor_desc' is not supported by the type of var %s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Getting 'tensor_desc' is not supported by the %s type variable.",
+          this->Name()));
   }
 }
 
 std::vector<proto::VarType::TensorDesc> VarDesc::tensor_descs() const {
-  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
+  PADDLE_ENFORCE_EQ(
+      desc_.has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
   std::vector<proto::VarType::TensorDesc> res;
   res.reserve(GetTensorDescNum());
   switch (desc_.type().type()) {
@@ -221,16 +229,19 @@ std::vector<proto::VarType::TensorDesc> VarDesc::tensor_descs() const {
       }
       return res;
     default:
-      PADDLE_THROW(
-          "Getting 'tensor_descs' is not supported by the type of var "
-          "%s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Getting 'tensor_descs' is not supported by the %s type variable.",
+          this->Name()));
   }
 }
 
 proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() {
-  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
-  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
+  PADDLE_ENFORCE_EQ(
+      desc_.has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
+  PADDLE_ENFORCE_EQ(
+      desc_.type().has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
   switch (desc_.type().type()) {
     case proto::VarType::SELECTED_ROWS:
       return desc_.mutable_type()->mutable_selected_rows();
@@ -240,15 +251,19 @@ proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() {
       return desc_.mutable_type()->mutable_tensor_array()->mutable_tensor();
     default:
       PADDLE_THROW(
-          "Getting 'mutable_tensor_desc' is not supported by the type of var "
-          "%s.",
-          this->Name());
+          platform::errors::Unavailable("Getting 'mutable_tensor_desc' is not "
+                                        "supported by the %s type variable.",
+                                        this->Name()));
   }
 }
 
 std::vector<proto::VarType::TensorDesc *> VarDesc::mutable_tensor_descs() {
-  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
-  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
+  PADDLE_ENFORCE_EQ(
+      desc_.has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
+  PADDLE_ENFORCE_EQ(
+      desc_.type().has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
   std::vector<proto::VarType::TensorDesc *> res;
   res.reserve(GetTensorDescNum());
   switch (desc_.type().type()) {
@@ -259,10 +274,9 @@ std::vector<proto::VarType::TensorDesc *> VarDesc::mutable_tensor_descs() {
       }
       return res;
     default:
-      PADDLE_THROW(
-          "Getting 'tensor_descs' is not supported by the type of var "
-          "%s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Getting 'tensor_descs' is not supported by the %s type variable.",
+          this->Name()));
   }
 }
 
diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h
index 43e9ed553be..8affeda67b3 100644
--- a/paddle/fluid/framework/var_type.h
+++ b/paddle/fluid/framework/var_type.h
@@ -40,7 +40,8 @@ inline proto::VarType::Type ToVarType(int type) {
     case proto::VarType::READER:
       return static_cast<proto::VarType::Type>(type);
     default:
-      PADDLE_THROW("ToVarType:Unsupported type %d", type);
+      PADDLE_THROW(platform::errors::Unavailable(
+          "ToVarType method Unsupported type %d.", type));
   }
 }
 
@@ -66,7 +67,8 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
       visitor(var.Get<FetchList>());
       return;
     default:
-      PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type()));
+      PADDLE_THROW(platform::errors::Unavailable("Not supported visit type %s.",
+                                                 ToTypeName(var.Type())));
   }
 }
 
diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc
index 5c90b07149e..1e5e8d65755 100644
--- a/paddle/fluid/framework/var_type_traits.cc
+++ b/paddle/fluid/framework/var_type_traits.cc
@@ -46,12 +46,14 @@ struct VarIdToTypeIndexMapInitializerImpl {
     static_assert(!std::is_same<Type, void>::value, "Type cannot be void");
     constexpr int kId = VarTypeTrait<Type>::kId;
     auto type = std::type_index(typeid(Type));
-    PADDLE_ENFORCE(id_to_type->count(kId) == 0,
-                   "Registered duplicate type id %d for type %s", kId,
-                   type.name());
-    PADDLE_ENFORCE(type_to_id->count(type) == 0,
-                   "Registered duplicate type_index %s for id %d", type.name(),
-                   kId);
+    PADDLE_ENFORCE_EQ(
+        id_to_type->count(kId), 0,
+        platform::errors::AlreadyExists(
+            "Registered duplicate type id %d for type %s.", kId, type.name()));
+    PADDLE_ENFORCE_EQ(
+        type_to_id->count(type), 0,
+        platform::errors::AlreadyExists(
+            "Registered duplicate type index %s for id %d.", type.name(), kId));
     id_to_type->emplace(kId, type);
     type_to_id->emplace(type, kId);
     VarIdToTypeIndexMapInitializerImpl<kStart + 1, kEnd,
@@ -79,15 +81,17 @@ struct VarIdToTypeIndexMapHolder {
  public:
   static const std::type_index &ToTypeIndex(int var_id) {
     auto it = Instance().id_to_type_map_.find(var_id);
-    PADDLE_ENFORCE(it != Instance().id_to_type_map_.end(),
-                   "VarId %d is not registered.", var_id);
+    PADDLE_ENFORCE_NE(it, Instance().id_to_type_map_.end(),
+                      platform::errors::NotFound(
+                          "Variable Id %d is not registered.", var_id));
     return it->second;
   }
 
   static int ToTypeId(const std::type_index &type) {
     auto it = Instance().type_to_id_map_.find(type);
-    PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(),
-                   "VarType %s is not registered.", type.name());
+    PADDLE_ENFORCE_NE(it, Instance().type_to_id_map_.end(),
+                      platform::errors::NotFound(
+                          "Variable Type %s is not registered.", type.name()));
     return it->second;
   }
 
diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc
index 67e17410a29..ec42aa30e5a 100644
--- a/paddle/fluid/framework/variable_helper.cc
+++ b/paddle/fluid/framework/variable_helper.cc
@@ -50,11 +50,11 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
   } else if (var_type == proto::VarType::RAW) {
     // GetMutable will be called in operator
   } else {
-    PADDLE_THROW(
+    PADDLE_THROW(platform::errors::Unavailable(
         "Variable type %d is not in "
         "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
-        "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
-        var_type);
+        "LOD_RANK_TABLE, PLACE_LIST, READER, RAW].",
+        var_type));
   }
 }
 
@@ -76,7 +76,8 @@ void CopyVariable(const Variable &src_var, Variable *dst_var) {
     auto *dst_t = tmp_grad_slr->mutable_value();
     framework::TensorCopy(src_t, cpu_place, dst_t);
   } else {
-    PADDLE_THROW("unknown var type to copy");
+    PADDLE_THROW(
+        platform::errors::Unavailable("Unknown variable type to copy."));
   }
 }
 
-- 
GitLab


From 9166307315d2bc19c02e28537b19ca9c51c95429 Mon Sep 17 00:00:00 2001
From: MRXLT <xlt2024@gmail.com>
Date: Mon, 14 Sep 2020 14:13:43 +0800
Subject: [PATCH 063/261] add check for sparse parameters with weight_decay
 (#27141)

* add check for sparse parameters with weight_decay

* move sparse check to adam.py
---
 .../fluid/tests/unittests/test_adam_op.py     | 15 ++++++-
 python/paddle/optimizer/adam.py               | 44 +++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index 14e83fccd65..47bf8f49e39 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -448,7 +448,6 @@ class TestAdamOpV2(unittest.TestCase):
 
     def test_adam_op_with_state_dict(self):
 
-        import paddle
         paddle.disable_static()
         emb = paddle.nn.Embedding(10, 10)
 
@@ -517,6 +516,20 @@ class TestAdamOpV2(unittest.TestCase):
             adam = paddle.optimizer.Adam(
                 0.1, epsilon=-1, parameters=linear.parameters())
 
+    def test_adam_op_with_sparse_input_and_weight_decay(self):
+
+        paddle.disable_static()
+        x_data = np.arange(0, 10).reshape((10, 1)).astype(np.int64)
+        x = paddle.to_tensor(x_data, stop_gradient=False)
+        emb = paddle.nn.Embedding(10, 10, sparse=True)
+        adam = paddle.optimizer.Adam(
+            0.001, parameters=emb.parameters(), weight_decay=0.01)
+
+        with self.assertRaises(RuntimeError):
+            out = emb(x)
+            out.backward()
+            adam.step()
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index 3150b8c2d03..708aaa788f6 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -250,3 +250,47 @@ class Adam(Optimizer):
             stop_gradient=True)
 
         return adam_op
+
+    @framework.dygraph_only
+    def step(self):
+        """
+        Execute the optimizer and update parameters once.
+        
+        Returns:
+            None
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                import numpy as np
+                paddle.disable_static()
+                value = np.arange(26).reshape(2, 13).astype("float32")
+                a = paddle.to_tensor(value)
+                linear = paddle.nn.Linear(13, 5)
+                # This can be any optimizer supported by dygraph.
+                adam = paddle.optimizer.Adam(learning_rate = 0.01, 
+                                            parameters = linear.parameters())
+                out = linear(a)
+                out.backward()
+                adam.step()
+                adam.clear_grad()
+        """
+        parameter_list = self._parameter_list
+        self._dtype = None
+        params_grads = []
+        for param in self._parameter_list:
+            if not param.trainable:
+                continue
+            if hasattr(
+                    param, "_is_sparse"
+            ) and param._is_sparse and self.regularization is not None:
+                raise RuntimeError(
+                    "Adam don't support weight_decay with sparse parameters, please set it to None."
+                )
+            if param._grad_ivar() is not None:
+                grad_var = param._grad_ivar()
+                params_grads.append((param, grad_var))
+
+        optimize_ops = self._apply_optimize(
+            loss=None, startup_program=None, params_grads=params_grads)
-- 
GitLab


From d4f03dfb71157f8d5a67b78f1bac6548c558feca Mon Sep 17 00:00:00 2001
From: xiaoting <31891223+tink2123@users.noreply.github.com>
Date: Mon, 14 Sep 2020 14:38:31 +0800
Subject: [PATCH 064/261] fix for tuple,test=develop (#27190)

---
 python/paddle/nn/functional/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py
index 014c778eee9..9f7fb018513 100644
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -450,7 +450,7 @@ def interpolate(x,
             for i in range(len(x.shape) - 2):
                 scale_list.append(scale)
             attrs['scale'] = list(map(float, scale_list))
-        elif isinstance(scale, list) or isinstance(scale, float):
+        elif isinstance(scale, list) or isinstance(scale, tuple):
             if len(scale) != len(x.shape) - 2:
                 raise ValueError("scale_shape length should be {} for "
                                  "input {}-D tensor.".format(
-- 
GitLab


From bbad3414e89ab99e15de63da62b81ec4505f6b22 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Mon, 14 Sep 2020 16:40:49 +0800
Subject: [PATCH 065/261] Enhance the error messages for  files in
 operators/math

Enhance the error messages for  files in operators/math
---
 paddle/fluid/operators/math/concat_test.cc    | 86 +++++++++++++++----
 paddle/fluid/operators/math/context_project.h |  5 +-
 paddle/fluid/operators/math/cpu_vec.h         |  5 +-
 paddle/fluid/operators/math/cross_entropy.cu  |  4 +-
 paddle/fluid/operators/math/im2col.cc         | 66 ++++++++++----
 paddle/fluid/operators/math/im2col.cu         | 68 ++++++++++-----
 6 files changed, 175 insertions(+), 59 deletions(-)

diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc
index 411dbca25bb..270a9d3f80a 100644
--- a/paddle/fluid/operators/math/concat_test.cc
+++ b/paddle/fluid/operators/math/concat_test.cc
@@ -79,8 +79,16 @@ void ConcatCase1(DeviceContext* context) {
   concat_functor(*context, input, 0, &out);
 
   // check the dim of input_a, input_b
-  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
-  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
+  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a,
+                    paddle::platform::errors::InvalidArgument(
+                        "The dims of Input tensor should be the same as the "
+                        "declared dims. Tensor dims: [%s], declared dims: [%s]",
+                        input_a.dims(), dim_a));
+  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b,
+                    paddle::platform::errors::InvalidArgument(
+                        "The dims of Input tensor should be the same as the "
+                        "declared dims. Tensor dims: [%s], declared dims: [%s]",
+                        input_b.dims(), dim_b));
 
   int* out_ptr = nullptr;
   if (paddle::platform::is_gpu_place(Place())) {
@@ -95,10 +103,14 @@ void ConcatCase1(DeviceContext* context) {
   int idx_a = 0, idx_b = 0;
   for (int j = 0; j < 5 * 3 * 4; ++j) {
     if (j >= cols) {
-      PADDLE_ENFORCE_EQ(out_ptr[j], b_ptr[idx_b]);
+      PADDLE_ENFORCE_EQ(out_ptr[j], b_ptr[idx_b],
+                        paddle::platform::errors::InvalidArgument(
+                            "Concat test failed, the result should be equal."));
       ++idx_b;
     } else {
-      PADDLE_ENFORCE_EQ(out_ptr[j], a_ptr[idx_a]);
+      PADDLE_ENFORCE_EQ(out_ptr[j], a_ptr[idx_a],
+                        paddle::platform::errors::InvalidArgument(
+                            "Concat test failed, the result should be equal."));
       ++idx_a;
     }
   }
@@ -166,8 +178,16 @@ void ConcatCase2(DeviceContext* context) {
   concat_functor(*context, input, 1, &out);
 
   // check the dim of input_a, input_b
-  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
-  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
+  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a,
+                    paddle::platform::errors::InvalidArgument(
+                        "The dims of Input tensor should be the same as the "
+                        "declared dims. Tensor dims: [%s], declared dims: [%s]",
+                        input_a.dims(), dim_a));
+  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b,
+                    paddle::platform::errors::InvalidArgument(
+                        "The dims of Input tensor should be the same as the "
+                        "declared dims. Tensor dims: [%s], declared dims: [%s]",
+                        input_b.dims(), dim_b));
 
   int* out_ptr = nullptr;
   if (paddle::platform::is_gpu_place(Place())) {
@@ -183,10 +203,16 @@ void ConcatCase2(DeviceContext* context) {
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 28; ++j) {
       if (j >= cols) {
-        PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], b_ptr[idx_b]);
+        PADDLE_ENFORCE_EQ(
+            out_ptr[i * 28 + j], b_ptr[idx_b],
+            paddle::platform::errors::InvalidArgument(
+                "Concat test failed, the result should be equal."));
         ++idx_b;
       } else {
-        PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], a_ptr[idx_a]);
+        PADDLE_ENFORCE_EQ(
+            out_ptr[i * 28 + j], a_ptr[idx_a],
+            paddle::platform::errors::InvalidArgument(
+                "Concat test failed, the result should be equal."));
         ++idx_a;
       }
     }
@@ -255,8 +281,16 @@ void ConcatCase3(DeviceContext* context) {
   concat_functor(*context, input, 2, &out);
 
   // check the dim of input_a, input_b
-  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
-  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
+  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a,
+                    paddle::platform::errors::InvalidArgument(
+                        "The dims of Input tensor should be the same as the "
+                        "declared dims. Tensor dims: [%s], declared dims: [%s]",
+                        input_a.dims(), dim_a));
+  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b,
+                    paddle::platform::errors::InvalidArgument(
+                        "The dims of Input tensor should be the same as the "
+                        "declared dims. Tensor dims: [%s], declared dims: [%s]",
+                        input_b.dims(), dim_b));
 
   int* out_ptr = nullptr;
   if (paddle::platform::is_gpu_place(Place())) {
@@ -273,10 +307,16 @@ void ConcatCase3(DeviceContext* context) {
   for (int i = 0; i < 6; ++i) {
     for (int j = 0; j < 9; ++j) {
       if (j >= cols) {
-        PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], b_ptr[idx_b]);
+        PADDLE_ENFORCE_EQ(
+            out_ptr[i * 9 + j], b_ptr[idx_b],
+            paddle::platform::errors::InvalidArgument(
+                "Concat test failed, the result should be equal."));
         ++idx_b;
       } else {
-        PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], a_ptr[idx_a]);
+        PADDLE_ENFORCE_EQ(
+            out_ptr[i * 9 + j], a_ptr[idx_a],
+            paddle::platform::errors::InvalidArgument(
+                "Concat test failed, the result should be equal."));
         ++idx_a;
       }
     }
@@ -347,8 +387,16 @@ void ConcatCase4(DeviceContext* context) {
   context->Wait();
 
   // check the dim of input_a, input_b
-  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
-  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
+  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a,
+                    paddle::platform::errors::InvalidArgument(
+                        "The dims of Input tensor should be the same as the "
+                        "declared dims. Tensor dims: [%s], declared dims: [%s]",
+                        input_a.dims(), dim_a));
+  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b,
+                    paddle::platform::errors::InvalidArgument(
+                        "The dims of Input tensor should be the same as the "
+                        "declared dims. Tensor dims: [%s], declared dims: [%s]",
+                        input_b.dims(), dim_b));
 
   int* out_ptr = nullptr;
   if (paddle::platform::is_gpu_place(Place())) {
@@ -365,10 +413,16 @@ void ConcatCase4(DeviceContext* context) {
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 24; ++j) {
       if (j >= cols) {
-        PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], b_ptr[idx_b]);
+        PADDLE_ENFORCE_EQ(
+            out_ptr[i * 24 + j], b_ptr[idx_b],
+            paddle::platform::errors::InvalidArgument(
+                "Concat test failed, the result should be equal."));
         ++idx_b;
       } else {
-        PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], a_ptr[idx_a]);
+        PADDLE_ENFORCE_EQ(
+            out_ptr[i * 24 + j], a_ptr[idx_a],
+            paddle::platform::errors::InvalidArgument(
+                "Concat test failed, the result should be equal."));
         ++idx_a;
       }
     }
diff --git a/paddle/fluid/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h
index e9019c6d2fe..051c6019d74 100644
--- a/paddle/fluid/operators/math/context_project.h
+++ b/paddle/fluid/operators/math/context_project.h
@@ -134,7 +134,10 @@ class ContextProjectFunctor {
       }
     }
     if (padding_trainable) {
-      PADDLE_ENFORCE_NOT_NULL(padding_data);
+      PADDLE_ENFORCE_NOT_NULL(
+          padding_data,
+          platform::errors::InvalidArgument(
+              "The input tensor 'padding_data' should not be NULL."));
       for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
         if (lod_level_0[i] == lod_level_0[i + 1]) continue;
 
diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index 8940a41424b..925f3b6161a 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -621,7 +621,10 @@ class VecActivations {
     } else if (type == "identity" || type == "") {
       return vec_identity<T, isa>;
     }
-    PADDLE_THROW("Not support type: %s", type);
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Expected type should be one of sigmod, relu, tanh, identity. But got "
+        "not support type: %s.",
+        type));
   }
 };
 
diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu
index c7fac60dd3e..84fa0d6af99 100644
--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
@@ -27,8 +27,8 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
                                    const int ignore_index) {
   CUDA_KERNEL_LOOP(i, N) {
     PADDLE_ENFORCE(label[i] >= 0 && label[i] < D || label[i] == ignore_index,
-                   "label[%d] expected >= 0 and < %ld, or == %ld, but got "
-                   "%ld. Please check input value.",
+                   "The value of label[%d] expected >= 0 and < %ld, or == %ld, "
+                   "but got %ld. Please check input value.",
                    i, D, ignore_index, label[i]);
     Y[i] = ignore_index == label[i]
                ? static_cast<T>(0)
diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc
index 094a7237826..6fb393d791c 100644
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -34,9 +34,16 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* col,
                   const DataLayout data_layout) {
-    PADDLE_ENFORCE_EQ(im.dims().size(), 3, "The dimension of im should be 3.");
+    PADDLE_ENFORCE_EQ(im.dims().size(), 3,
+                      platform::errors::InvalidArgument(
+                          "The dimension of tensor 'im' should be 3. But got "
+                          "the dims of tensor 'im' is [%s].",
+                          im.dims()));
     PADDLE_ENFORCE_EQ(col->dims().size(), 5,
-                      "The dimension of col should be 5.");
+                      platform::errors::InvalidArgument(
+                          "The dimension of tensor 'col' should be 5. But got "
+                          "the dims of tensor 'col' is [%s].",
+                          col->dims()));
 
     if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 &&
         dilation[1] == 1) {
@@ -70,9 +77,16 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* im,
                   const DataLayout data_layout) {
-    PADDLE_ENFORCE_EQ(im->dims().size(), 3, "The dimension of im should be 3.");
+    PADDLE_ENFORCE_EQ(im->dims().size(), 3,
+                      platform::errors::InvalidArgument(
+                          "The dimension of tensor 'im' should be 3. But got "
+                          "the dims of tensor 'im' is [%s].",
+                          im->dims()));
     PADDLE_ENFORCE_EQ(col.dims().size(), 5,
-                      "The dimension of col should be 5.");
+                      platform::errors::InvalidArgument(
+                          "The dimension of tensor 'col' should be 5. But got "
+                          "the dims of tensor 'col' is [%s].",
+                          col.dims()));
     int im_channels =
         (data_layout != DataLayout::kNHWC ? im->dims()[0] : im->dims()[2]);
     int im_height =
@@ -88,16 +102,16 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
                        ((dilation[0] * (filter_height - 1) + 1))) /
                               stride[0] +
                           1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
+                      col_height, platform::errors::InvalidArgument(
+                                      "Output_height and padding(padding_up, "
+                                      "padding_down) are inconsistent."));
     PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
                        ((dilation[1] * (filter_width - 1) + 1))) /
                               stride[1] +
                           1,
-                      col_width,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
+                      col_width, platform::errors::InvalidArgument(
+                                     "Output_height and padding(padding_up, "
+                                     "padding_down) are inconsistent."));
 
     int channels_col = im_channels * filter_height * filter_width;
 
@@ -154,9 +168,16 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* col,
                   const DataLayout data_layout) {
-    PADDLE_ENFORCE_EQ(im.dims().size(), 3, "The dimension of im should be 3.");
+    PADDLE_ENFORCE_EQ(im.dims().size(), 3,
+                      platform::errors::InvalidArgument(
+                          "The dimension of tensor 'im' should be 3. But got "
+                          "the dims of tensor 'im' is [%s].",
+                          im.dims()));
     PADDLE_ENFORCE_EQ(col->dims().size(), 5,
-                      "The dimension of col should be 5.");
+                      platform::errors::InvalidArgument(
+                          "The dimension of tensor 'col' should be 5. But got "
+                          "the dims of tensor 'col' is [%s].",
+                          col->dims()));
     int im_channels = im.dims()[0];
     int im_height = im.dims()[1];
     int im_width = im.dims()[2];
@@ -218,9 +239,16 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* im,
                   const DataLayout data_layout) {
-    PADDLE_ENFORCE_EQ(im->dims().size(), 3, "The dimension of im should be 3.");
+    PADDLE_ENFORCE_EQ(im->dims().size(), 3,
+                      platform::errors::InvalidArgument(
+                          "The dimension of tensor 'im' should be 3. But got "
+                          "the dims of tensor 'im' is [%s].",
+                          im->dims()));
     PADDLE_ENFORCE_EQ(col.dims().size(), 5,
-                      "The dimension of col should be 5.");
+                      platform::errors::InvalidArgument(
+                          "The dimension of tensor 'col' should be 5. But got "
+                          "the dims of tensor 'col' is [%s].",
+                          col.dims()));
     int im_channels = im->dims()[0];
     int im_height = im->dims()[1];
     int im_width = im->dims()[2];
@@ -231,14 +259,14 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
 
     PADDLE_ENFORCE_EQ(
         (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
-        col_height,
-        "Output_height and padding(padding_up, padding_down) are "
-        "inconsistent.");
+        col_height, platform::errors::InvalidArgument(
+                        "Output_height and padding(padding_up, padding_down) "
+                        "are inconsistent."));
     PADDLE_ENFORCE_EQ(
         (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
         col_width,
-        "col_width and padding(padding_left, padding_right) are "
-        "inconsistent.");
+        platform::errors::InvalidArgument("col_width and padding(padding_left, "
+                                          "padding_right) are inconsistent."));
 
     T* im_data = im->data<T>();
     const T* col_data = col.data<T>();
diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu
index 97719300dae..f2a2148ba69 100644
--- a/paddle/fluid/operators/math/im2col.cu
+++ b/paddle/fluid/operators/math/im2col.cu
@@ -81,9 +81,16 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* col,
                   const DataLayout data_layout) {
-    PADDLE_ENFORCE_EQ(im.dims().size(), 3, "The dimension of im should be 3.");
+    PADDLE_ENFORCE_EQ(im.dims().size(), 3,
+                      platform::errors::InvalidArgument(
+                          "The dimension of tensor 'im' should be 3. But got "
+                          "the dims of tensor 'im' is [%s].",
+                          im.dims()));
     PADDLE_ENFORCE_EQ(col->dims().size(), 5,
-                      "The dimension of col should be 5.");
+                      platform::errors::InvalidArgument(
+                          "The dimension of tensor 'col' should be 5. But got "
+                          "the dims of tensor 'col' is [%s].",
+                          col->dims()));
 
     int im_channels =
         (data_layout != DataLayout::kNHWC ? im.dims()[0] : im.dims()[2]);
@@ -182,9 +189,16 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* im,
                   const DataLayout data_layout) {
-    PADDLE_ENFORCE_EQ(im->dims().size(), 3, "The dimension of im should be 3.");
+    PADDLE_ENFORCE_EQ(im->dims().size(), 3,
+                      platform::errors::InvalidArgument(
+                          "The dimension of tensor 'im' should be 3. But got "
+                          "the dims of tensor 'im' is [%s].",
+                          im->dims()));
     PADDLE_ENFORCE_EQ(col.dims().size(), 5,
-                      "The dimension of col should be 5.");
+                      platform::errors::InvalidArgument(
+                          "The dimension of tensor 'col' should be 5. But got "
+                          "the dims of tensor 'col' is [%s].",
+                          col.dims()));
 
     int im_channels =
         (data_layout != DataLayout::kNHWC ? im->dims()[0] : im->dims()[2]);
@@ -201,16 +215,16 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
                        (dilation[0] * (filter_height - 1) + 1)) /
                               stride[0] +
                           1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
+                      col_height, platform::errors::InvalidArgument(
+                                      "Output_height and padding(padding_up, "
+                                      "padding_down) are inconsistent."));
     PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
                        (dilation[1] * (filter_width - 1) + 1)) /
                               stride[1] +
                           1,
-                      col_width,
-                      "col_width and padding(padding_left, padding_right) are "
-                      "inconsistent.");
+                      col_width, platform::errors::InvalidArgument(
+                                     "col_width and padding(padding_left, "
+                                     "padding_right) are inconsistent."));
 
     size_t num_kernels = im_channels * im_height * im_width;
 
@@ -285,9 +299,16 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* col,
                   const DataLayout data_layout) {
-    PADDLE_ENFORCE_EQ(im.dims().size(), 3, "The dimension of im should be 3.");
+    PADDLE_ENFORCE_EQ(im.dims().size(), 3,
+                      platform::errors::InvalidArgument(
+                          "The dimension of tensor 'im' should be 3. But got "
+                          "the dims of tensor 'im' is [%s].",
+                          im.dims()));
     PADDLE_ENFORCE_EQ(col->dims().size(), 5,
-                      "The dimension of col should be 5.");
+                      platform::errors::InvalidArgument(
+                          "The dimension of tensor 'col' should be 5. But got "
+                          "the dims of tensor 'col' is [%s].",
+                          col->dims()));
 
     int im_channels = im.dims()[0];
     int im_height = im.dims()[1];
@@ -370,9 +391,16 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* im,
                   const DataLayout data_layout) {
-    PADDLE_ENFORCE_EQ(im->dims().size(), 3, "The dimension of im should be 3.");
+    PADDLE_ENFORCE_EQ(im->dims().size(), 3,
+                      platform::errors::InvalidArgument(
+                          "The dimension of tensor 'im' should be 3. But got "
+                          "the dims of tensor 'im' is [%s].",
+                          im->dims()));
     PADDLE_ENFORCE_EQ(col.dims().size(), 5,
-                      "The dimension of col should be 5.");
+                      platform::errors::InvalidArgument(
+                          "The dimension of tensor 'col' should be 5. But got "
+                          "the dims of tensor 'col' is [%s].",
+                          col.dims()));
 
     int im_channels = im->dims()[0];
     int im_height = im->dims()[1];
@@ -386,16 +414,16 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                        (dilation[0] * (filter_height - 1) + 1)) /
                               stride[0] +
                           1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
+                      col_height, platform::errors::InvalidArgument(
+                                      "Output_height and padding(padding_up, "
+                                      "padding_down) are inconsistent."));
     PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
                        (dilation[1] * (filter_width - 1) + 1)) /
                               stride[1] +
                           1,
-                      col_width,
-                      "col_width and padding(padding_left, padding_right) are "
-                      "inconsistent.");
+                      col_width, platform::errors::InvalidArgument(
+                                     "col_width and padding(padding_left, "
+                                     "padding_right) are inconsistent."));
 
     int block_dim_x = 0;
     int block_dim_y = 0;
-- 
GitLab


From a685435962575734d9ba1a0dc830584cd8f7352b Mon Sep 17 00:00:00 2001
From: LielinJiang <50691816+LielinJiang@users.noreply.github.com>
Date: Mon, 14 Sep 2020 17:08:00 +0800
Subject: [PATCH 066/261] fix conv depthwise bug (#27278)

Fix conv deepwise bug when in_channels=1.
---
 python/paddle/nn/functional/conv.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py
index 3c1482e69c3..5cf49539332 100644
--- a/python/paddle/nn/functional/conv.py
+++ b/python/paddle/nn/functional/conv.py
@@ -267,8 +267,8 @@ def conv1d(x,
     dilation = utils.convert_to_list(dilation, 1, 'dilation') + [1]
 
     l_type = "conv2d"
-    if (num_channels == groups and num_filters % num_channels == 0 and
-            not use_cudnn):
+    if (num_channels == groups and num_channels != 1 and
+            num_filters % num_channels == 0 and not use_cudnn):
         l_type = 'depthwise_conv2d'
         use_cudnn = False
 
@@ -491,7 +491,8 @@ def conv2d(x,
     dilation = utils.convert_to_list(dilation, 2, 'dilation')
 
     l_type = "conv2d"
-    if (num_channels == groups and num_filters % num_channels == 0):
+    if (num_channels == groups and num_channels != 1 and
+            num_filters % num_channels == 0):
         l_type = 'depthwise_conv2d'
         use_cudnn = False
 
@@ -761,7 +762,8 @@ def conv_transpose1d(x,
 
     op_type = 'conv2d_transpose'
     num_filters = weight.shape[1]
-    if (num_channels == groups and num_filters == 1 and not use_cudnn):
+    if (num_channels == groups and num_channels != 1 and num_filters == 1 and
+            not use_cudnn):
         op_type = 'depthwise_conv2d_transpose'
         use_cudnn = False
 
@@ -1010,7 +1012,7 @@ def conv_transpose2d(x,
 
     op_type = 'conv2d_transpose'
     num_filters = weight.shape[1]
-    if (num_channels == groups and num_filters == 1):
+    if (num_channels == groups and num_channels != 1 and num_filters == 1):
         op_type = 'depthwise_conv2d_transpose'
         use_cudnn = False
 
-- 
GitLab


From bc3e9ba1c60407c3a7d33c2eafaa341944c2b6b8 Mon Sep 17 00:00:00 2001
From: lilong12 <lilong12@baidu.com>
Date: Mon, 14 Sep 2020 19:59:42 +0800
Subject: [PATCH 067/261] check the validation of parameters for expand and
 tile apis (#26816)

* bug fix, test=develop
---
 python/paddle/tensor/manipulation.py | 39 +++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index db1222fa421..9de407841fb 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -21,6 +21,7 @@ from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_t
 from ..fluid.layers.tensor import fill_constant
 from ..fluid.layers import utils
 import numpy as np
+import six
 # TODO: define functions to manipulate a tensor  
 from ..fluid.layers import cast  #DEFINE_ALIAS
 from ..fluid.layers import slice  #DEFINE_ALIAS
@@ -1056,10 +1057,25 @@ def tile(x, repeat_times, name=None):
     """
     if in_dygraph_mode():
         return core.ops.tile(x, 'repeat_times', repeat_times)
+    check_type(repeat_times, 'repeat_times', (list, tuple, Variable), 'tile')
+    if isinstance(repeat_times, Variable):
+        assert len(repeat_times.shape) == 1, (
+            'repeat_times must be an 1-D Tensor.')
+    else:
+        for elem in repeat_times:
+            if isinstance(elem, Variable):
+                assert len(elem.shape) == 1, (
+                    'Elements in repeat_times must be 1-D Tensors or integers.')
+            else:
+                if six.PY3:
+                    type_tuple = (int, np.int32, np.int64)
+                elif six.PY2:
+                    type_tuple = (int, long, np.int32, np.int64)
+                assert isinstance(elem, type_tuple), (
+                    'Elements in repeat_times must be 1-D Tensors or integers.')
 
     check_variable_and_dtype(
         x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'tile')
-    check_type(repeat_times, 'repeat_times', (list, tuple, Variable), 'tile')
     if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False:
         raise ValueError(
             "When the date type is bool for the input 'x' of tile op, you "
@@ -1181,18 +1197,33 @@ def expand(x, shape, name=None):
     if in_dygraph_mode():
         return core.ops.expand_v2(x, 'shape', shape)
 
+    if isinstance(shape, Variable):
+        assert len(shape.shape) == 1, ('shape must be an 1-D Tensor.')
+    else:
+        for elem in shape:
+            if isinstance(elem, Variable):
+                assert len(elem.shape) == 1, (
+                    'Elements in shape must be 1-D Tensors or integers.')
+            else:
+                if six.PY3:
+                    type_tuple = (int, np.int32, np.int64)
+                elif six.PY2:
+                    type_tuple = (int, long, np.int32, np.int64)
+                assert isinstance(elem, type_tuple), (
+                    'Elements in shape must be 1-D Tensors or integers.')
+
     check_variable_and_dtype(
         x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'expand')
     check_type(shape, 'shape', (list, tuple, Variable), 'expand')
-
-    inputs = {"X": [x]}
-    attrs = {}
     if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False:
         raise ValueError("When the data type of input 'x' for expand is bool, "
                          "you must set its stop_gradient to be False by "
                          "some_var.stop_gradient = True, supporting "
                          "some_var as the input.")
 
+    inputs = {"X": [x]}
+    attrs = {}
+
     helper = LayerHelper('expand', **locals())
 
     def get_attr_expand_shape(list_expand_shape):
-- 
GitLab


From bf461fa5247173fcec902beb0b89e8f90434019d Mon Sep 17 00:00:00 2001
From: lilong12 <lilong12@baidu.com>
Date: Mon, 14 Sep 2020 20:03:33 +0800
Subject: [PATCH 068/261] Improving error report message for sequence_expand op
 (#27245)

* improve err report, test=develop
---
 paddle/fluid/operators/sequence_ops/sequence_expand_op.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h
index 013170199a6..1186ed891e8 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h
@@ -92,9 +92,11 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
     auto& x_lod = x->lod();
     auto& y_lod = y->lod();
 
-    PADDLE_ENFORCE_EQ(y_lod.empty(), false,
-                      "Input(Y) Tensor of SequenceExpandOp does not contain "
-                      "LoD information.");
+    PADDLE_ENFORCE_EQ(
+        y_lod.empty(), false,
+        platform::errors::InvalidArgument(
+            "Input(Y) Tensor of SequenceExpandOp does not contain "
+            "LoD information."));
 
     if (ref_level == -1) ref_level = y_lod.size() - 1;
 
-- 
GitLab


From ac9afa024b37ad00bc9d80df8a3f50fdf00a7911 Mon Sep 17 00:00:00 2001
From: zhupengyang <zhu_py@qq.com>
Date: Mon, 14 Sep 2020 20:09:37 +0800
Subject: [PATCH 069/261] paddle.nn.functional.logsigmoid -> log_sigmoid
 (#27277)

---
 python/paddle/fluid/dygraph/math_op_patch.py  |  2 +-
 python/paddle/fluid/layers/ops.py             |  7 ++++--
 .../tests/unittests/test_activation_op.py     | 25 +++++++++++++------
 .../fluid/tests/unittests/test_layers.py      |  7 ------
 .../unittests/test_math_op_patch_var_base.py  |  2 +-
 python/paddle/nn/functional/__init__.py       |  2 +-
 python/paddle/nn/functional/activation.py     | 17 ++++++-------
 python/paddle/nn/layer/activation.py          |  5 ++--
 8 files changed, 35 insertions(+), 32 deletions(-)

diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py
index f9fe4198fec..3aa7b9dfc26 100644
--- a/python/paddle/fluid/dygraph/math_op_patch.py
+++ b/python/paddle/fluid/dygraph/math_op_patch.py
@@ -285,7 +285,7 @@ def monkey_patch_math_varbase():
         ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None)),
         ('__array_ufunc__', None),
         ('sigmoid', _method_creator_('sigmoid', 'name=None')),
-        ('logsigmoid', _method_creator_('logsigmoid', 'name=None')),
+        ('log_sigmoid', _method_creator_('logsigmoid', 'name=None')),
         ('exp', _method_creator_('exp', 'name=None')),
         ('tanh', _method_creator_('tanh', 'name=None')),
         ('atan', _method_creator_('atan', 'name=None')),
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 1efae3ddf1f..6cdc617a0dc 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -20,7 +20,10 @@ from ..framework import convert_np_dtype_to_dtype_, Variable
 from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
 from paddle.utils import deprecated
 
-__deprecated_func_name__ = {'tanh_shrink': 'tanhshrink', }
+__deprecated_func_name__ = {
+    'tanh_shrink': 'tanhshrink',
+    'logsigmoid': 'log_sigmoid'
+}
 
 __activations_noattr__ = [
     'sigmoid',
@@ -106,7 +109,7 @@ Examples:
         paddle.disable_static()
 
         x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
-        out = F.logsigmoid(x)
+        out = F.log_sigmoid(x)
         print(out.numpy())
         # [-0.91301525 -0.79813887 -0.64439666 -0.55435524]
 
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index ab61a5b3cfc..f6ba03194aa 100755
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -128,7 +128,7 @@ class TestLogSigmoid(TestActivation):
 
 
 class TestLogSigmoidAPI(unittest.TestCase):
-    # test paddle.nn.LogSigmoid, paddle.nn.functional.logsigmoid
+    # test paddle.nn.LogSigmoid, paddle.nn.functional.log_sigmoid
     def setUp(self):
         self.x_np = np.random.uniform(-1, 1, [11, 17]).astype('float32')
         self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
@@ -137,36 +137,45 @@ class TestLogSigmoidAPI(unittest.TestCase):
     def test_static_api(self):
         with paddle.static.program_guard(paddle.static.Program()):
             x = paddle.data('X', [11, 17])
-            out1 = F.logsigmoid(x)
+            out1 = F.log_sigmoid(x)
             m = paddle.nn.LogSigmoid()
             out2 = m(x)
             exe = paddle.static.Executor(self.place)
             res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2])
         out_ref = np.log(1 / (1 + np.exp(-self.x_np)))
         for r in res:
-            self.assertEqual(np.allclose(out_ref, r), True)
+            self.assertTrue(np.allclose(out_ref, r))
 
     def test_dygraph_api(self):
         paddle.disable_static(self.place)
         x = paddle.to_tensor(self.x_np)
-        out1 = F.logsigmoid(x)
+        out1 = F.log_sigmoid(x)
         m = paddle.nn.LogSigmoid()
         out2 = m(x)
         out_ref = np.log(1 / (1 + np.exp(-self.x_np)))
         for r in [out1, out2]:
-            self.assertEqual(np.allclose(out_ref, r.numpy()), True)
+            self.assertTrue(np.allclose(out_ref, r.numpy()))
         paddle.enable_static()
 
+    def test_fluid_api(self):
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.data('X', [11, 17])
+            out = paddle.fluid.layers.logsigmoid(x)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
+        out_ref = np.log(1 / (1 + np.exp(-self.x_np)))
+        self.assertTrue(np.allclose(out_ref, res[0]))
+
     def test_errors(self):
         with paddle.static.program_guard(paddle.static.Program()):
             # The input type must be Variable.
-            self.assertRaises(TypeError, F.logsigmoid, 1)
+            self.assertRaises(TypeError, F.log_sigmoid, 1)
             # The input dtype must be float16, float32, float64.
             x_int32 = paddle.data(name='x_int32', shape=[11, 17], dtype='int32')
-            self.assertRaises(TypeError, F.logsigmoid, x_int32)
+            self.assertRaises(TypeError, F.log_sigmoid, x_int32)
             # support the input dtype is float16
             x_fp16 = paddle.data(name='x_fp16', shape=[11, 17], dtype='float16')
-            F.logsigmoid(x_fp16)
+            F.log_sigmoid(x_fp16)
 
 
 class TestTanh(TestActivation, TestParameter):
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 89e9f7aad85..26073f49bdd 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -2677,13 +2677,6 @@ class TestBook(LayerTest):
             out = layers.sigmoid(input, name='sigmoid')
             return (out)
 
-    def make_logsigmoid(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = self._get_data(name="input", shape=[16], dtype="float32")
-            out = layers.logsigmoid(input, name='logsigmoid')
-            return (out)
-
     def make_exp(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
index 9bb12d54655..a70862f4019 100644
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
@@ -307,7 +307,7 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
             np.array_equal(x.sigmoid().numpy(), fluid.layers.sigmoid(x).numpy(
             )))
         self.assertTrue(
-            np.array_equal(x.logsigmoid().numpy(),
+            np.array_equal(x.log_sigmoid().numpy(),
                            fluid.layers.logsigmoid(x).numpy()))
         self.assertTrue(np.array_equal(x.exp().numpy(), paddle.exp(x).numpy()))
         self.assertTrue(
diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py
index f3cc8c610ff..163c249ab37 100644
--- a/python/paddle/nn/functional/__init__.py
+++ b/python/paddle/nn/functional/__init__.py
@@ -39,7 +39,7 @@ from .activation import hard_sigmoid  #DEFINE_ALIAS
 from .activation import hard_swish  #DEFINE_ALIAS
 from .activation import hsigmoid  #DEFINE_ALIAS
 from .activation import leaky_relu  #DEFINE_ALIAS
-from .activation import logsigmoid  #DEFINE_ALIAS
+from .activation import log_sigmoid  #DEFINE_ALIAS
 from .activation import maxout  #DEFINE_ALIAS
 from .activation import prelu  #DEFINE_ALIAS
 from .activation import relu  #DEFINE_ALIAS
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index ffedb027330..f7bbe0c94e0 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -35,7 +35,7 @@ __all__ = [
     'hard_swish',
     'hsigmoid',
     'leaky_relu',
-    'logsigmoid',
+    'log_sigmoid',
     'maxout',
     'prelu',
     'relu',
@@ -552,13 +552,13 @@ def relu(x, name=None):
     return out
 
 
-def logsigmoid(x, name=None):
+def log_sigmoid(x, name=None):
     """
-    logsigmoid activation.
+    log_sigmoid activation.
 
     .. math::
 
-        logsigmoid(x) = log \\frac{1}{1 + e^{-x}}
+        log\\_sigmoid(x) = log \\frac{1}{1 + e^{-x}}
     
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
@@ -573,20 +573,19 @@ def logsigmoid(x, name=None):
 
             import paddle
             import paddle.nn.functional as F
-            import numpy as np
 
             paddle.disable_static()
 
-            x = paddle.to_tensor(np.array([1.0, 2.0, 3.0, 4.0]))
-            out = F.logsigmoid(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499]
+            x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0])
+            out = F.log_sigmoid(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499]
     """
 
     if in_dygraph_mode():
         return core.ops.logsigmoid(x)
 
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                             'logsigmoid')
-    helper = LayerHelper("logsigmoid", **locals())
+                             'log_sigmoid')
+    helper = LayerHelper("log_sigmoid", **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
     helper.append_op(type='logsigmoid', inputs={'X': x}, outputs={'Out': out})
     return out
diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py
index c38d6018a25..585d369c607 100644
--- a/python/paddle/nn/layer/activation.py
+++ b/python/paddle/nn/layer/activation.py
@@ -860,11 +860,10 @@ class LogSigmoid(layers.Layer):
         .. code-block:: python
 
             import paddle
-            import numpy as np
 
             paddle.disable_static()
 
-            x = paddle.to_tensor(np.array([1.0, 2.0, 3.0, 4.0]))
+            x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0])
             m = paddle.nn.LogSigmoid()
             out = m(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499]
     """
@@ -874,7 +873,7 @@ class LogSigmoid(layers.Layer):
         self._name = name
 
     def forward(self, x):
-        return F.logsigmoid(x, self._name)
+        return F.log_sigmoid(x, self._name)
 
 
 class Softmax(layers.Layer):
-- 
GitLab


From 6947a58a1f52bed347341d373ce86452b17c3366 Mon Sep 17 00:00:00 2001
From: YUNSHEN XIE <1084314248@qq.com>
Date: Mon, 14 Sep 2020 20:14:26 +0800
Subject: [PATCH 070/261] disable three unittests,test=document_fix (#27299)

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 102bacff963..8f3945a48e3 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -461,11 +461,11 @@ if(WITH_DISTRIBUTE)
     	   py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS})
 	   py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS})
 	   py_test_modules(test_fleet_distributed_strategy MODULES test_fleet_distributed_strategy)
-	   py_test_modules(test_fleet_auto MODULES test_fleet_auto ENVS ${dist_ENVS})
+	   #py_test_modules(test_fleet_auto MODULES test_fleet_auto ENVS ${dist_ENVS})
         if(NOT WIN32)
             py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS})
-            py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS})
-            py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS})
+            #py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS})
+            #py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS})
         endif(NOT WIN32)
     endif(NOT APPLE)
     if(WITH_DGC)
-- 
GitLab


From 1483ea2304f6b26aeba0953fde2a187dd163d82f Mon Sep 17 00:00:00 2001
From: "joanna.wozna.intel" <joanna.wozna@intel.com>
Date: Mon, 14 Sep 2020 16:00:39 +0200
Subject: [PATCH 071/261] Add bfloat16 passes (#26999)

---
 paddle/fluid/framework/ir/CMakeLists.txt      |   4 +
 .../framework/ir/graph_pattern_detector.cc    |  76 +++++++++
 .../framework/ir/graph_pattern_detector.h     |  41 +++++
 .../framework/ir/mkldnn/cpu_bfloat16_pass.cc  | 159 ++++++++++++++++++
 .../framework/ir/mkldnn/cpu_bfloat16_pass.h   |  34 ++++
 .../ir/mkldnn/cpu_bfloat16_pass_tester.cc     | 145 ++++++++++++++++
 .../ir/mkldnn/cpu_bfloat16_placement_pass.cc  |  91 ++++++++++
 .../ir/mkldnn/cpu_bfloat16_placement_pass.h   |  38 +++++
 .../cpu_bfloat16_placement_pass_tester.cc     | 132 +++++++++++++++
 .../inference/api/paddle_pass_builder.cc      |   4 +
 .../operators/mkldnn/quantize_mkldnn_op.cc    |  12 +-
 paddle/fluid/operators/quantize_op.cc         |   2 +
 paddle/fluid/platform/mkldnn_helper.h         |   7 +
 paddle/fluid/pybind/protobuf.cc               |   1 +
 14 files changed, 744 insertions(+), 2 deletions(-)
 create mode 100644 paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc
 create mode 100644 paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h
 create mode 100644 paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc
 create mode 100644 paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc
 create mode 100644 paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h
 create mode 100644 paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc

diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 8787aa8a94a..5bb833f6135 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -102,6 +102,8 @@ if(WITH_MKLDNN)
     pass_library(conv_concat_relu_mkldnn_fuse_pass inference DIR mkldnn)
     pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn)
     pass_library(scale_matmul_fuse_pass inference DIR mkldnn)
+    pass_library(cpu_bfloat16_placement_pass inference DIR mkldnn)
+    pass_library(cpu_bfloat16_pass inference DIR mkldnn)
     pass_library(fc_mkldnn_pass inference DIR mkldnn)
     pass_library(cpu_quantize_placement_pass base DIR mkldnn)
     pass_library(cpu_quantize_pass inference DIR mkldnn)
@@ -162,4 +164,6 @@ endif()
     cc_test(test_cpu_quantize_squash_pass SRCS mkldnn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
     cc_test(test_reshape_transpose_matmul_mkldnn_fuse_pass SRCS mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc DEPS reshape_transpose_matmul_mkldnn_fuse_pass)
     cc_test(test_matmul_transpose_reshape_fuse_pass SRCS mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc DEPS matmul_transpose_reshape_fuse_pass)
+    cc_test(test_cpu_bfloat16_placement_pass SRCS mkldnn/cpu_bfloat16_placement_pass_tester.cc DEPS cpu_bfloat16_placement_pass)
+    cc_test(test_cpu_bfloat16_pass SRCS mkldnn/cpu_bfloat16_pass_tester.cc DEPS cpu_bfloat16_pass)
 endif ()
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 3d65fe59537..9c1eaa99a3c 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1892,6 +1892,82 @@ PDNode *patterns::QuantizePlacement::operator()(
   return op;
 }
 
+PDNode *patterns::Bfloat16Placement::operator()(
+    const std::unordered_set<std::string> &bfloat16_enabled_op_types) {
+  std::unordered_set<std::string> supported_op_types =
+      std::unordered_set<std::string>();
+  if (!bfloat16_enabled_op_types.empty()) {
+    supported_op_types = bfloat16_enabled_op_types;
+  }
+  auto *op = pattern->NewNode(op_repr())->assert_is_ops(supported_op_types);
+  return op;
+}
+
+PDNode *patterns::OrphanedBfloat16::operator()() {
+  auto *prev_op = pattern->NewNode(prev_op_repr())->assert_is_op();
+  prev_op->assert_more([&](Node *node) {
+    return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
+           "float32";
+  });
+  auto *prev_out = pattern->NewNode(prev_out_repr())->AsOutput();
+
+  auto *op = pattern->NewNode(op_repr())->assert_is_op();
+  op->assert_more([&](Node *node) {
+    return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
+           "bfloat16";
+  });
+  auto *op_out = pattern->NewNode(op_out_repr())->AsOutput();
+
+  auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();
+  next_op->assert_more([&](Node *node) {
+    return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
+           "float32";
+  });
+
+  prev_op->LinksTo({prev_out});
+  op->LinksFrom({prev_out}).LinksTo({op_out});
+  next_op->LinksFrom({op_out});
+  return next_op;
+}
+
+PDNode *patterns::LastBfloat16Ops::operator()() {
+  auto *op = pattern->NewNode(op_repr())->assert_is_op();
+  op->assert_more([&](Node *node) {
+    return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
+           "bfloat16";
+  });
+  auto *op_out = pattern->NewNode(op_out_repr())->AsOutput();
+
+  auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();
+  next_op->assert_more([&](Node *node) {
+    return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") !=
+           "bfloat16";
+  });
+
+  op->LinksTo({op_out});
+  next_op->LinksFrom({op_out});
+  return next_op;
+}
+
+PDNode *patterns::FirstBfloat16Ops::operator()() {
+  auto *prev_op = pattern->NewNode(prev_op_repr())->assert_is_op();
+  prev_op->assert_more([&](Node *node) {
+    return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") !=
+           "bfloat16";
+  });
+  auto *op_in = pattern->NewNode(op_in_repr())->AsOutput();
+
+  auto *op = pattern->NewNode(op_repr())->assert_is_op();
+  op->assert_more([&](Node *node) {
+    return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
+           "bfloat16";
+  });
+
+  prev_op->LinksTo({op_in});
+  op->LinksFrom({op_in});
+  return op;
+}
+
 PDNode *patterns::MKLDNNInPlace::operator()() {
   const std::unordered_set<std::string> &supported_op_types = {
       "abs",
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 08032658841..053c1fe832b 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -1129,6 +1129,47 @@ struct QuantizePlacement : public PatternBase {
   PATTERN_DECL_NODE(op);
 };
 
+struct Bfloat16Placement : public PatternBase {
+  Bfloat16Placement(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "bfloat16_placement") {}
+  PDNode* operator()(
+      const std::unordered_set<std::string>& bfloat16_enabled_op_types);
+
+  PATTERN_DECL_NODE(op);
+};
+
+struct OrphanedBfloat16 : public PatternBase {
+  OrphanedBfloat16(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "orphaned_bfloat16") {}
+  PDNode* operator()();
+
+  PATTERN_DECL_NODE(prev_op);
+  PATTERN_DECL_NODE(prev_out);
+  PATTERN_DECL_NODE(op);
+  PATTERN_DECL_NODE(op_out);
+  PATTERN_DECL_NODE(next_op);
+};
+
+struct LastBfloat16Ops : public PatternBase {
+  LastBfloat16Ops(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "last_bfloat16_ops") {}
+  PDNode* operator()();
+
+  PATTERN_DECL_NODE(op);
+  PATTERN_DECL_NODE(op_out);
+  PATTERN_DECL_NODE(next_op);
+};
+
+struct FirstBfloat16Ops : public PatternBase {
+  FirstBfloat16Ops(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "first_bfloat16_ops") {}
+  PDNode* operator()();
+
+  PATTERN_DECL_NODE(prev_op);
+  PATTERN_DECL_NODE(op_in);
+  PATTERN_DECL_NODE(op);
+};
+
 // Pattern used for enforcing inplace computation for in-place computation
 // supporting DNNL ops. softmax, batch_norm and layer_norm
 struct MKLDNNInPlace : public PatternBase {
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc
new file mode 100644
index 00000000000..df498865245
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc
@@ -0,0 +1,159 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h"
+
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/string/pretty_log.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+using string::PrettyLogDetail;
+
+void UnlinkNodes(ir::Node* a, ir::Node* b) {
+  a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b),
+                   a->outputs.end());
+  b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a),
+                  b->inputs.end());
+}
+
+void CPUBFloat16Pass::SetInputDataType(ir::Graph* graph) const {
+  GraphPatternDetector gpd;
+  patterns::FirstBfloat16Ops bfloat16_ops{gpd.mutable_pattern(),
+                                          "first_bfloat16_ops"};
+  bfloat16_ops();
+  int quantize_counter = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, bfloat16_ops);
+    GET_IR_NODE_FROM_SUBGRAPH(op_in, op_in, bfloat16_ops);
+    GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops);
+
+    if (op->Op()->Type() != "conv2d" && prev_op->Op()->Type() != "quantize") {
+      VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out"));
+      auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc);
+
+      // create a quantize op node
+      OpDesc q_desc;
+      q_desc.SetType("quantize");
+      q_desc.SetInput("Input", std::vector<std::string>({op_in->Name()}));
+      q_desc.SetOutput("Output",
+                       std::vector<std::string>({quantize_out_node->Name()}));
+      q_desc.SetAttr("Scale", 1.f);
+      q_desc.SetAttr("bfloat16", true);
+      q_desc.SetAttr("output_format", Has("data_layout")
+                                          ? Get<std::string>("data_layout")
+                                          : "NCHW");
+      auto quantize_op = g->CreateOpNode(&q_desc);  // OpDesc will be copied.
+
+      std::string op_input_name;
+      for (auto name : op->Op()->InputNames()) {
+        for (auto input_name : op->Op()->Input(name)) {
+          if (input_name == op_in->Name()) op_input_name = name;
+        }
+      }
+
+      PADDLE_ENFORCE_NE(
+          op_input_name.empty(), true,
+          platform::errors::NotFound(
+              "Operator before operator should have input as op output"));
+
+      op->Op()->SetInput(op_input_name,
+                         std::vector<std::string>({quantize_out_node->Name()}));
+
+      UnlinkNodes(op_in, op);
+      IR_NODE_LINK_TO(op_in, quantize_op);
+      IR_NODE_LINK_TO(quantize_op, quantize_out_node);
+      IR_NODE_LINK_TO(quantize_out_node, op);
+      quantize_counter++;
+    }
+  };
+  gpd(graph, handler);
+  PrettyLogDetail("---    added %d quantize op before bfloat16 op",
+                  quantize_counter);
+}
+
+void CPUBFloat16Pass::SetOutputDataType(ir::Graph* graph) const {
+  GraphPatternDetector gpd;
+  patterns::LastBfloat16Ops bfloat16_ops{gpd.mutable_pattern(),
+                                         "last_bfloat16_ops"};
+  bfloat16_ops();
+  int force_fp32_counter = 0, dequantize_counter = 0;
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops);
+    GET_IR_NODE_FROM_SUBGRAPH(op_out, op_out, bfloat16_ops);
+    GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, bfloat16_ops);
+
+    if ((op->Op()->HasAttr("force_fp32_output") ||
+         op->Op()->HasProtoAttr("force_fp32_output")) &&
+        !op->Op()->GetAttrIfExists<bool>("fuse_residual_connection")) {
+      op->Op()->SetAttr("force_fp32_output", true);
+      force_fp32_counter++;
+    } else if (op->Op()->Type() != "prior_box") {
+      // Create dequantize input variable
+      VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in"));
+      auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc);
+
+      // create a dequantize op node for output.
+      OpDesc deq_desc;
+      deq_desc.SetType("dequantize");
+      deq_desc.SetInput("Input",
+                        std::vector<std::string>({dequantize_in_node->Name()}));
+      deq_desc.SetOutput("Output", std::vector<std::string>({op_out->Name()}));
+      deq_desc.SetAttr("Scale", 1.0f);
+      auto dequantize_op = g->CreateOpNode(&deq_desc);
+
+      std::string op_output_name;
+      for (auto name : op->Op()->OutputNames()) {
+        for (auto output_name : op->Op()->Output(name)) {
+          if (output_name == op_out->Name()) op_output_name = name;
+        }
+      }
+
+      PADDLE_ENFORCE_NE(
+          op_output_name.empty(), true,
+          platform::errors::NotFound(
+              "Operator after operator should have input as op output"));
+
+      op->Op()->SetOutput(op_output_name, std::vector<std::string>(
+                                              {dequantize_in_node->Name()}));
+
+      UnlinkNodes(op, op_out);
+      IR_NODE_LINK_TO(op, dequantize_in_node);
+      IR_NODE_LINK_TO(dequantize_in_node, dequantize_op);
+      IR_NODE_LINK_TO(dequantize_op, op_out);
+      dequantize_counter++;
+    }
+  };
+  gpd(graph, handler);
+  PrettyLogDetail("---    added %d dequantize op and used %d force_fp32_output",
+                  dequantize_counter, force_fp32_counter);
+}
+
+void CPUBFloat16Pass::ApplyImpl(ir::Graph* graph) const {
+  SetInputDataType(graph);
+  SetOutputDataType(graph);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(cpu_bfloat16_pass, paddle::framework::ir::CPUBFloat16Pass);
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h
new file mode 100644
index 00000000000..3a7271f7ddc
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class CPUBFloat16Pass : public Pass {
+ protected:
+  void SetInputDataType(ir::Graph* graph) const;
+  void SetOutputDataType(ir::Graph* graph) const;
+  void ApplyImpl(ir::Graph* graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc
new file mode 100644
index 00000000000..15109db9832
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc
@@ -0,0 +1,145 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h"
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/imperative/type_defs.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs, bool use_mkldnn,
+           const std::string& mkldnn_data_type = "float32",
+           const bool force_fp32_output = false) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  op->SetAttr("use_mkldnn", use_mkldnn);
+  op->SetAttr("name", name);
+
+  if (type == "conv2d") {
+    op->SetInput("Input", {inputs[0]});
+    op->SetOutput("Output", {outputs[0]});
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
+    op->SetAttr("force_fp32_output", force_fp32_output);
+  } else if (type == "pool2d" || type == "transpose2" || type == "reshape2" ||
+             type == "dropout") {
+    op->SetInput("X", {inputs[0]});
+    op->SetOutput("Out", {outputs[0]});
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
+  } else if (type == "fc") {
+    op->SetInput("Input", {inputs[0]});
+    op->SetOutput("Out", {outputs[0]});
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
+  } else if (type == "concat") {
+    op->SetInput("X", inputs);
+    op->SetOutput("Out", outputs);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
+  } else if (type == "matmul" || type == "elementwise_add") {
+    op->SetInput("X", {inputs[0]});
+    if (inputs.size() > 1) op->SetInput("Y", {inputs[1]});
+    op->SetOutput("Out", {outputs[0]});
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
+  }
+}
+
+void PreparePass(std::unique_ptr<ir::Graph>* graph, const ProgramDesc& prog,
+                 const std::initializer_list<std::string> variable_names,
+                 int* original_nodes_num, int* current_nodes_num) {
+  auto pass = PassRegistry::Instance().Get("cpu_bfloat16_pass");
+
+  graph->reset(pass->Apply(graph->release()));
+
+  *original_nodes_num = (*graph)->Nodes().size();
+  (*graph).reset(pass->Apply((*graph).release()));
+  *current_nodes_num = (*graph)->Nodes().size();
+}
+
+static const std::initializer_list<std::string> variable_names{
+    "z", "a", "b", "c", "d", "e", "f", "g", "h", "i"};
+
+ProgramDesc BuildProgramDesc(bool use_mkldnn) {
+  ProgramDesc prog;
+  for (auto& v : variable_names) {
+    prog.MutableBlock(0)->Var(v);
+  }
+  SetOp(&prog, "dropout", "Dropout1", {"z"}, {"a"}, use_mkldnn, "float32");
+  SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn, "bfloat16");
+  SetOp(&prog, "pool2d", "Pool1", {"b"}, {"c"}, use_mkldnn, "bfloat16");
+  SetOp(&prog, "conv2d", "Conv1", {"c"}, {"d"}, use_mkldnn, "bfloat16");
+  SetOp(&prog, "dropout", "Dropout2", {"d"}, {"e"}, use_mkldnn, "float32");
+  SetOp(&prog, "transpose2", "Transpose1", {"e"}, {"f"}, use_mkldnn,
+        "bfloat16");
+  SetOp(&prog, "reshape2", "Reshape1", {"f"}, {"g"}, use_mkldnn, "bfloat16");
+  SetOp(&prog, "concat", "Concat1", {"g"}, {"h"}, use_mkldnn, "bfloat16");
+  SetOp(&prog, "dropout", "Dropout3", {"h"}, {"i"}, use_mkldnn, "float32");
+
+  return prog;
+}
+
+void MainTest(const ProgramDesc& prog, int conv_count, int pool_count,
+              int transpose_count, int quant_count, int dequant_count,
+              int added_nodes_count) {
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  int original_nodes_num, current_nodes_num;
+  PreparePass(&graph, prog, variable_names, &original_nodes_num,
+              &current_nodes_num);
+
+  int quantize_nodes_count = 0;
+  int dequantize_nodes_count = 0;
+  int conv2d_nodes_count = 0;
+  int pool2d_nodes_count = 0;
+  int transpose2_nodes_count = 0;
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      auto* op = node->Op();
+      if (op->Type() == "conv2d") {
+        conv2d_nodes_count++;
+      } else if (op->Type() == "pool2d") {
+        pool2d_nodes_count++;
+      } else if (op->Type() == "transpose2") {
+        transpose2_nodes_count++;
+      } else if (op->Type() == "quantize") {
+        quantize_nodes_count++;
+      } else if (op->Type() == "dequantize") {
+        dequantize_nodes_count++;
+      }
+    }
+  }
+  EXPECT_EQ(conv2d_nodes_count, conv_count);
+  EXPECT_EQ(pool2d_nodes_count, pool_count);
+  EXPECT_EQ(transpose2_nodes_count, transpose_count);
+  EXPECT_EQ(quantize_nodes_count, quant_count);
+  EXPECT_EQ(dequantize_nodes_count, dequant_count);
+  EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
+}
+
+TEST(CpuQuantizePass, quantize) {
+  bool use_mkldnn = true;
+  // 1 quantize + 1 dequantize
+  int added_nodes = 2;
+  MainTest(BuildProgramDesc(use_mkldnn), 2, 1, 1, 1, 2, added_nodes);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(cpu_bfloat16_pass);
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc
new file mode 100644
index 00000000000..3d7a9c1107b
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc
@@ -0,0 +1,91 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h"
+
+#include <string>
+#include <unordered_set>
+
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/string/pretty_log.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+using string::PrettyLogDetail;
+
+void CPUBfloat16PlacementPass::SetMkldnnDataType(
+    ir::Graph* graph, int* bfloat16_operators) const {
+  const auto& op_types_list =
+      Get<std::unordered_set<std::string>>("bfloat16_enabled_op_types");
+  // set mkldnn_data_type to bfloat16 to all operators that are in
+  // bfloat16_enabled_op_types vector or they are included to Bfloat16Placement
+  // pattern
+  GraphPatternDetector gpd;
+  patterns::Bfloat16Placement bfloat16_placement_pattern{gpd.mutable_pattern(),
+                                                         "bfloat16_placement"};
+  bfloat16_placement_pattern(op_types_list);
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_placement_pattern);
+
+    if ((op->Op()->HasAttr("mkldnn_data_type") ||
+         op->Op()->HasProtoAttr("mkldnn_data_type")) &&
+        !platform::HasOpINT8DataType(op->Op())) {
+      op->Op()->SetAttr("mkldnn_data_type", std::string("bfloat16"));
+      (*bfloat16_operators)++;
+    }
+  };
+  gpd(graph, handler);
+}
+
+void CPUBfloat16PlacementPass::RemoveOrhanedOperators(
+    ir::Graph* graph, int* bfloat16_operators) const {
+  // find orphaned bfloat16 operator that is between two float32 operators
+  // revert mkldnn_data_type attr to float32
+  GraphPatternDetector gpd;
+  patterns::OrphanedBfloat16 orphaned_bfloat16_pattern{gpd.mutable_pattern(),
+                                                       "orphaned_bfloat16"};
+  orphaned_bfloat16_pattern();
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(op, op, orphaned_bfloat16_pattern);
+
+    op->Op()->SetAttr("mkldnn_data_type", std::string("float32"));
+    bfloat16_operators--;
+  };
+  gpd(graph, handler);
+}
+
+void CPUBfloat16PlacementPass::ApplyImpl(ir::Graph* graph) const {
+  int bfloat16_operators = 0;
+  SetMkldnnDataType(graph, &bfloat16_operators);
+  RemoveOrhanedOperators(graph, &bfloat16_operators);
+  PrettyLogDetail("---    marked %d operators to bfloat16 ",
+                  bfloat16_operators);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(cpu_bfloat16_placement_pass,
+              paddle::framework::ir::CPUBfloat16PlacementPass)
+    // a vector of operator type names with bfloat16 support ("conv2d" etc.)
+    // the second param is the default value for this vector
+    .DefaultPassAttr("bfloat16_enabled_op_types",
+                     new std::unordered_set<std::string>());
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h
new file mode 100644
index 00000000000..1911b1a3cb3
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+/*
+ * Specifies which operators should be run on bfloat16.
+ */
+class CPUBfloat16PlacementPass : public Pass {
+ protected:
+  void SetMkldnnDataType(ir::Graph* graph, int* bfloat16_operators) const;
+
+  void RemoveOrhanedOperators(ir::Graph* graph, int* bfloat16_operators) const;
+
+  void ApplyImpl(ir::Graph* graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc
new file mode 100644
index 00000000000..b9797a4bfcc
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc
@@ -0,0 +1,132 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs,
+           const std::string& mkldnn_data_type = "float32") {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+
+  op->SetType(type);
+  op->SetAttr("mkldnn_data_type", mkldnn_data_type);
+
+  if (type == "conv2d") {
+    op->SetAttr("name", name);
+    op->SetInput("Input", {inputs[0]});
+  } else if (type == "relu") {
+    op->SetInput("X", inputs);
+  } else if (type == "concat") {
+    op->SetAttr("axis", 1);
+    op->SetInput("X", {inputs[0], inputs[1]});
+  } else if (type == "pool2d") {
+    op->SetInput("X", {inputs[0]});
+  } else {
+    FAIL() << "Unexpected operator type.";
+  }
+  op->SetOutput("Out", {outputs[0]});
+}
+
+// operator                      mkldnn_data_type
+// ---------------------------------------
+// (a,b)->concat->c              float32
+// c->conv->f                    float32
+// f->relu->g                    float32
+// g->pool->h                    float32
+// h->conv->k                    float32
+// k->pool->l                    float32
+ProgramDesc BuildProgramDesc() {
+  ProgramDesc prog;
+
+  for (auto& v :
+       std::vector<std::string>({"a", "b", "c", "f", "g", "h", "k", "l"})) {
+    prog.MutableBlock(0)->Var(v);
+  }
+
+  SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"});
+  SetOp(&prog, "conv2d", "conv1", {"c"}, {"f"});
+  SetOp(&prog, "relu", "relu1", {"f"}, {"g"});
+  SetOp(&prog, "pool2d", "pool1", {"g"}, {"h"});
+  SetOp(&prog, "conv2d", "conv2", {"h"}, {"k"});
+  SetOp(&prog, "pool2d", "pool2", {"k"}, {"l"});
+
+  return prog;
+}
+
+void MainTest(std::initializer_list<std::string> bfloat16_enabled_op_types,
+              unsigned expected_bfloat16_data_type_count) {
+  auto prog = BuildProgramDesc();
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  auto pass = PassRegistry::Instance().Get("cpu_bfloat16_placement_pass");
+  pass->Set("bfloat16_enabled_op_types",
+            new std::unordered_set<std::string>(bfloat16_enabled_op_types));
+
+  graph.reset(pass->Apply(graph.release()));
+
+  unsigned bfloat16_data_type_count = 0;
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      if (platform::HasOpBFLOAT16DataType(node->Op())) {
+        ++bfloat16_data_type_count;
+      }
+    }
+  }
+
+  EXPECT_EQ(bfloat16_data_type_count, expected_bfloat16_data_type_count);
+}
+
+void DefaultAttrTest(unsigned expected_bfloat16_data_type_count) {
+  auto prog = BuildProgramDesc();
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  auto pass = PassRegistry::Instance().Get("cpu_bfloat16_placement_pass");
+  graph.reset(pass->Apply(graph.release()));
+
+  unsigned bfloat16_data_type_count = 0;
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      if (platform::HasOpBFLOAT16DataType(node->Op())) {
+        ++bfloat16_data_type_count;
+      }
+    }
+  }
+  EXPECT_EQ(bfloat16_data_type_count, expected_bfloat16_data_type_count);
+}
+
+TEST(Bfloat16PlacementPass, enable_all) {
+  MainTest({"conv2d", "pool2d", "relu", "concat"}, 6);
+}
+
+TEST(Bfloat16PlacementPass, enabled_conv_and_pool) {
+  // 2 conv2d + 2 pool2 - 1 orphaned conv2d
+  MainTest({"conv2d", "pool2d"}, 3);
+}
+
+TEST(Bfloat16PlacementPass, default_attr_value) { DefaultAttrTest(0); }
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(cpu_bfloat16_placement_pass);
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 98a36a3308d..c19e77d2714 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -231,6 +231,10 @@ void CpuPassStrategy::EnableMkldnnQuantizer() {
 
 void CpuPassStrategy::EnableMkldnnBfloat16() {
 #ifdef PADDLE_WITH_MKLDNN
+  if (!use_mkldnn_bfloat16_) {
+    passes_.push_back("cpu_bfloat16_placement_pass");
+    passes_.push_back("cpu_bfloat16_pass");
+  }
   use_mkldnn_bfloat16_ = true;
 #else
   use_mkldnn_bfloat16_ = false;
diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
index 29a86a35d7b..a6c8f8656a4 100644
--- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
@@ -48,6 +48,7 @@ class QuantOpKernel : public framework::OpKernel<T> {
     const T* input_data = input->data<T>();
 
     bool is_negative = ctx.Attr<bool>("is_negative_input");
+    bool bfloat16 = ctx.Attr<bool>("bfloat16");
     std::string key =
         platform::CreateKey(platform::ThreadIDasStr(), src_tz, scale_data,
                             is_negative, ctx.OutputName("Output"));
@@ -74,7 +75,10 @@ class QuantOpKernel : public framework::OpKernel<T> {
           src_md, engine, to_void_cast<T>(input_data));
 
       std::shared_ptr<mkldnn::memory::desc> dst_md;
-      if (is_negative) {
+      if (bfloat16) {
+        platform::SetDstMemoryQuantized<paddle::platform::bfloat16>(
+            ctx, output, dst_tz, engine, dst_md, dst_memory, out_format);
+      } else if (is_negative) {
         platform::SetDstMemoryQuantized<int8_t>(ctx, output, dst_tz, engine,
                                                 dst_md, dst_memory, out_format);
       } else {
@@ -96,7 +100,11 @@ class QuantOpKernel : public framework::OpKernel<T> {
       dst_memory = std::static_pointer_cast<mkldnn::memory>(
           dev_ctx.GetBlob(key_dst_mem));
       auto place = ctx.GetPlace();
-      if (is_negative) {
+
+      if (bfloat16) {
+        dst_memory->set_data_handle(
+            output->mutable_data<paddle::platform::bfloat16>(place));
+      } else if (is_negative) {
         dst_memory->set_data_handle(output->mutable_data<int8_t>(place));
       } else {
         dst_memory->set_data_handle(output->mutable_data<uint8_t>(place));
diff --git a/paddle/fluid/operators/quantize_op.cc b/paddle/fluid/operators/quantize_op.cc
index 8924e21b46f..602fdc6ff67 100644
--- a/paddle/fluid/operators/quantize_op.cc
+++ b/paddle/fluid/operators/quantize_op.cc
@@ -40,6 +40,8 @@ void QuantOpMaker::Make() {
   AddAttr<std::string>("output_format",
                        "Convert format to NHWC or NCHW during quantization.")
       .SetDefault("NHWC");
+  AddAttr<bool>("bfloat16", "(bool, default false) Convert to bfloat16")
+      .SetDefault(false);
   AddComment(R"DOC(This op will quantize data from FP32 to INT8)DOC");
 }
 
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 8fb66c6f34b..b012a103ea3 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -443,6 +443,13 @@ inline bool HasOpINT8DataType(const paddle::framework::OpDesc* op) {
           op->GetAttrIfExists<bool>("use_quantizer"));
 }
 
+inline bool HasOpBFLOAT16DataType(const paddle::framework::OpDesc* op) {
+  return op->GetAttrIfExists<std::string>("mkldnn_data_type") == "bfloat16";
+}
+
+inline bool HasOpFLOAT32DataType(const paddle::framework::OpDesc* op) {
+  return op->GetAttrIfExists<std::string>("mkldnn_data_type") == "float32";
+}
 enum class RNNReorderType { PP_NTC, PP_TNC, NTC_PP, TNC_PP };
 
 }  // namespace platform
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index 9950eb9adc2..97056eca411 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -184,6 +184,7 @@ void BindVarDsec(pybind11::module *m) {
       .value("FP16", pd::proto::VarType::FP16)
       .value("FP32", pd::proto::VarType::FP32)
       .value("FP64", pd::proto::VarType::FP64)
+      .value("BF16", pd::proto::VarType::BF16)
       .value("LOD_TENSOR", pd::proto::VarType::LOD_TENSOR)
       .value("SELECTED_ROWS", pd::proto::VarType::SELECTED_ROWS)
       .value("FEED_MINIBATCH", pd::proto::VarType::FEED_MINIBATCH)
-- 
GitLab


From 3ae3b86489225becabe50453159c8b88e0b2d905 Mon Sep 17 00:00:00 2001
From: Pei Yang <peiyang@baidu.com>
Date: Tue, 15 Sep 2020 10:42:30 +0800
Subject: [PATCH 072/261] fix trt_dynamic_shape_ernie_deserialize_test (#27290)

* fix trt_dynamic_shape_ernie_deserialize_test

* support when opt cache dir does not exist
---
 .../fluid/inference/tests/api/CMakeLists.txt  |  7 +++---
 ...rt_dynamic_shape_ernie_deserialize_test.cc | 23 +++++++++++++++++++
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index b3ec4b5714e..a1b43de4695 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -480,10 +480,9 @@ if(WITH_GPU AND TENSORRT_FOUND)
         inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz")
     endif()
 
-    # disable test_trt_dynamic_shape_ernie_ser_deser temporary
-    #inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_deserialize_test.cc
-    #        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
-    #        ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized)
+    inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_deserialize_test.cc
+            EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} 
+            ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized)
 
 endif()
 
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
index 524e08891f4..685f7b6600e 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
@@ -12,15 +12,33 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <dirent.h>
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include <unistd.h>
 
 #include "paddle/fluid/inference/tests/api/trt_test_helper.h"
 
 namespace paddle {
 namespace inference {
 
+int DeleteCache(std::string path) {
+  DIR* dir = opendir(path.c_str());
+  if (dir == NULL) return 0;
+  struct dirent* ptr;
+  while ((ptr = readdir(dir)) != NULL) {
+    if (std::strcmp(ptr->d_name, ".") == 0 ||
+        std::strcmp(ptr->d_name, "..") == 0) {
+      continue;
+    } else if (ptr->d_type == 8) {
+      std::string file_rm = path + "/" + ptr->d_name;
+      return remove(file_rm.c_str());
+    }
+  }
+  return 0;
+}
+
 void run(const AnalysisConfig& config, std::vector<float>* out_data) {
   auto predictor = CreatePaddlePredictor(config);
   auto input_names = predictor->GetInputNames();
@@ -86,6 +104,11 @@ void run(const AnalysisConfig& config, std::vector<float>* out_data) {
 void trt_ernie(bool with_fp16, std::vector<float> result) {
   AnalysisConfig config;
   std::string model_dir = FLAGS_infer_model;
+  // Delete serialization cache to perform serialization first rather than
+  // deserialization.
+  std::string opt_cache_dir = FLAGS_infer_model + "/_opt_cache";
+  DeleteCache(opt_cache_dir);
+
   SetConfig(&config, model_dir, true /* use_gpu */);
 
   config.SwitchUseFeedFetchOps(false);
-- 
GitLab


From 2d8281d5ad26bf0a9acfaab3bef917ee7868e50f Mon Sep 17 00:00:00 2001
From: cc <52520497+juncaipeng@users.noreply.github.com>
Date: Tue, 15 Sep 2020 11:05:48 +0800
Subject: [PATCH 073/261] Remove the cache  in post_traning_quantization,
 test=develop (#26450)

* Remove the cache in post_traning_quantization, test=develop
---
 .../post_training_quantization.py             | 172 +++++++++---------
 1 file changed, 88 insertions(+), 84 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
index 244a6216110..ddbd99e16ce 100644
--- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
+++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
@@ -143,7 +143,7 @@ class PostTrainingQuantization(object):
                  weight_quantize_type='channel_wise_abs_max',
                  optimize_model=False,
                  is_use_cache_file=False,
-                 cache_dir="./temp_post_training"):
+                 cache_dir=None):
         '''
         Constructor.
 
@@ -206,13 +206,8 @@ class PostTrainingQuantization(object):
                 `conv2d/depthwise_conv2d + bn`, the weights scale for all channel will
                 be different. In address this problem, fuse the pattern before
                 quantization. Default False.
-            is_use_cache_file(bool, optional): If set is_use_cache_file as False,
-                all temp data will be saved in memory. If set is_use_cache_file as True,
-                it will save temp data to disk. When the fp32 model is complex or
-                the number of calibrate data is large, we should set is_use_cache_file
-                as True. Defalut is False.
-            cache_dir(str, optional): When is_use_cache_file is True, set cache_dir as
-                the directory for saving temp data. Default is ./temp_post_training.
+            is_use_cache_file(bool, optional): This param is deprecated.
+            cache_dir(str, optional): This param is deprecated.
         Returns:
             None
 
@@ -302,10 +297,6 @@ class PostTrainingQuantization(object):
                 assert op_type in self._support_quantize_op_type, \
                     op_type + " is not supported for quantization."
         self._optimize_model = optimize_model
-        self._is_use_cache_file = is_use_cache_file
-        self._cache_dir = cache_dir
-        if self._is_use_cache_file and not os.path.exists(self._cache_dir):
-            os.mkdir(self._cache_dir)
 
         # Define variables
         self._place = self._executor.place
@@ -317,11 +308,17 @@ class PostTrainingQuantization(object):
         self._out_scale_op_list = _out_scale_op_list
         self._quantized_weight_var_name = set()
         self._quantized_act_var_name = set()
-        self.weight_op_pairs = {}
+        self._weight_op_pairs = {}
+        # The vars for alog = KL
+        self._sampling_act_abs_min_max = {}
+        self._sampling_act_histogram = {}
         self._sampling_data = {}
         self._quantized_var_kl_threshold = {}
+        self._histogram_bins = 2048
+        # The vars for algo = min_max
         self._quantized_var_min = {}
         self._quantized_var_max = {}
+        # The vars for algo = abs_max
         self._quantized_var_abs_max = {}
 
     def quantize(self):
@@ -339,6 +336,25 @@ class PostTrainingQuantization(object):
         self._collect_target_varnames()
         self._set_activation_persistable()
 
+        if self._algo == "KL":
+            _logger.info("Preparation stage ...")
+            batch_id = 0
+            for data in self._data_loader():
+                self._executor.run(program=self._program,
+                                   feed=data,
+                                   fetch_list=self._fetch_list,
+                                   return_numpy=False,
+                                   scope=self._scope)
+                self._collect_activation_abs_min_max()
+                if batch_id % 5 == 0:
+                    _logger.info("Run batch: " + str(batch_id))
+                batch_id += 1
+                if self._batch_nums and batch_id >= self._batch_nums:
+                    break
+            _logger.info("Finish preparation stage, all batch:" + str(batch_id))
+            self._init_sampling_act_histogram()
+
+        _logger.info("Sampling stage ...")
         batch_id = 0
         for data in self._data_loader():
             self._executor.run(program=self._program,
@@ -346,17 +362,13 @@ class PostTrainingQuantization(object):
                                fetch_list=self._fetch_list,
                                return_numpy=False,
                                scope=self._scope)
-            if self._algo == "KL":
-                self._sample_data(batch_id)
-            else:
-                self._sample_threshold()
-
+            self._sampling()
             if batch_id % 5 == 0:
                 _logger.info("Run batch: " + str(batch_id))
             batch_id += 1
             if self._batch_nums and batch_id >= self._batch_nums:
                 break
-        _logger.info("Finish all batch: " + str(batch_id))
+        _logger.info("Finish sampling stage, all batch: " + str(batch_id))
 
         self._reset_activation_persistable()
 
@@ -397,6 +409,7 @@ class PostTrainingQuantization(object):
             target_vars=self._fetch_list,
             executor=self._executor,
             main_program=self._program)
+        _logger.info("The quantized model is saved in " + save_model_path)
 
     def _load_model_data(self):
         '''
@@ -454,7 +467,7 @@ class PostTrainingQuantization(object):
             for var_name in var_name_list:
                 if var_name in persistable_var_names:
                     self._quantized_weight_var_name.add(var_name)
-                    self.weight_op_pairs[var_name] = op_type
+                    self._weight_op_pairs[var_name] = op_type
                 else:
                     self._quantized_act_var_name.add(var_name)
 
@@ -494,20 +507,18 @@ class PostTrainingQuantization(object):
             if var.name in self._quantized_act_var_name:
                 var.persistable = False
 
-    def _sample_threshold(self):
+    def _sampling(self):
         '''
-        Sample the input threshold(min, max, or abs_max) in every iterations.
+        Sample the min/max, abs_max or histogram in every iterations.
         '''
-        assert self._algo in ["abs_max", "min_max"], \
-            "The algo should be abs_max or min_max for _sample_threshold."
         if self._algo == "abs_max":
-            self._sample_threshold_abs_max()
+            self._sample_abs_max()
         elif self._algo == "min_max":
-            self._sample_threshold_min_max()
+            self._sample_min_max()
+        elif self._algo == "KL":
+            self._sample_histogram()
 
-    def _sample_threshold_abs_max(self):
-        assert self._algo == "abs_max", \
-            "The algo should be abs_max for _sample_threshold_abs_max."
+    def _sample_abs_max(self):
         # Only calculate abs_max value for weight for once
         if self._quantized_var_abs_max == {}:
             for var_name in self._quantized_weight_var_name:
@@ -516,7 +527,7 @@ class PostTrainingQuantization(object):
                     abs_max_value = float(np.max(np.abs(var_tensor)))
                 elif self._weight_quantize_type == "channel_wise_abs_max":
                     abs_max_value = []
-                    if self.weight_op_pairs[
+                    if self._weight_op_pairs[
                             var_name] in _channelwise_quant_axis1_ops:
                         for i in range(var_tensor.shape[1]):
                             abs_max_value.append(
@@ -534,9 +545,7 @@ class PostTrainingQuantization(object):
                 (abs_max_value > self._quantized_var_abs_max[var_name]):
                 self._quantized_var_abs_max[var_name] = abs_max_value
 
-    def _sample_threshold_min_max(self):
-        assert self._algo == "min_max", \
-            "The algo should be min_max for _sample_threshold_min_max."
+    def _sample_min_max(self):
         if self._quantized_var_min == {} and self._quantized_var_max == {}:
             for var_name in self._quantized_weight_var_name:
                 var_tensor = _load_variable_data(self._scope, var_name)
@@ -546,7 +555,7 @@ class PostTrainingQuantization(object):
                 elif self._weight_quantize_type == "channel_wise_abs_max":
                     min_value = []
                     max_value = []
-                    if self.weight_op_pairs[
+                    if self._weight_op_pairs[
                             var_name] in _channelwise_quant_axis1_ops:
                         for i in range(var_tensor.shape[1]):
                             min_value.append(float(np.min(var_tensor[:, i])))
@@ -569,6 +578,14 @@ class PostTrainingQuantization(object):
                 (max_value > self._quantized_var_max[var_name]):
                 self._quantized_var_max[var_name] = max_value
 
+    def _sample_histogram(self):
+        for var_name in self._quantized_act_var_name:
+            var_tensor = _load_variable_data(self._scope, var_name)
+            var_tensor_abs = np.abs(var_tensor)
+            bins = self._sampling_act_histogram[var_name][1]
+            hist, _ = np.histogram(var_tensor_abs, bins=bins)
+            self._sampling_act_histogram[var_name][0] += hist
+
     def _save_input_threhold(self):
         '''
         Save input threshold to the quantized op.
@@ -585,27 +602,36 @@ class PostTrainingQuantization(object):
                     op._set_attr(var_name + ".max",
                                  self._quantized_var_max[var_name])
 
-    def _sample_data(self, iter):
+    def _collect_activation_abs_min_max(self):
         '''
-        Sample the tensor data of quantized variables, 
-        applied in every iteration.
+        Collect the abs_min and abs_max for all activation. When algo = KL,
+        get the min and max value, and then calculate the threshold.
         '''
-        assert self._algo == "KL", "The algo should be KL to sample data."
-        if self._is_use_cache_file:
-            for var_name in self._quantized_act_var_name:
-                var_tensor = _load_variable_data(self._scope, var_name)
-                var_tensor = var_tensor.ravel()
-                save_path = os.path.join(
-                    self._cache_dir,
-                    var_name.replace("/", ".") + "_" + str(iter) + ".npy")
-                np.save(save_path, var_tensor)
-        else:
-            for var_name in self._quantized_act_var_name:
-                if var_name not in self._sampling_data:
-                    self._sampling_data[var_name] = []
-                var_tensor = _load_variable_data(self._scope, var_name)
-                var_tensor = var_tensor.ravel()
-                self._sampling_data[var_name].append(var_tensor)
+        for var_name in self._quantized_act_var_name:
+            var_tensor = _load_variable_data(self._scope, var_name)
+            var_tensor = np.abs(var_tensor)
+            min_value = float(np.min(var_tensor))
+            max_value = float(np.max(var_tensor))
+            if var_name not in self._sampling_act_abs_min_max:
+                self._sampling_act_abs_min_max[
+                    var_name] = [min_value, max_value]
+            else:
+                if min_value < self._sampling_act_abs_min_max[var_name][0]:
+                    self._sampling_act_abs_min_max[var_name][0] = min_value
+                if max_value > self._sampling_act_abs_min_max[var_name][1]:
+                    self._sampling_act_abs_min_max[var_name][1] = max_value
+
+    def _init_sampling_act_histogram(self):
+        '''
+        Based on the min/max value, init the sampling_act_histogram.
+        '''
+        for var_name in self._quantized_act_var_name:
+            if var_name not in self._sampling_act_histogram:
+                min_val = self._sampling_act_abs_min_max[var_name][0]
+                max_val = self._sampling_act_abs_min_max[var_name][1]
+                hist, hist_edeges = np.histogram(
+                    [], bins=self._histogram_bins, range=(min_val, max_val))
+                self._sampling_act_histogram[var_name] = [hist, hist_edeges]
 
     def _calculate_kl_threshold(self):
         '''
@@ -621,7 +647,7 @@ class PostTrainingQuantization(object):
                 weight_threshold = float(np.max(np.abs(weight_data)))
             elif self._weight_quantize_type == "channel_wise_abs_max":
                 weight_threshold = []
-                if self.weight_op_pairs[
+                if self._weight_op_pairs[
                         var_name] in _channelwise_quant_axis1_ops:
                     for i in range(weight_data.shape[1]):
                         weight_threshold.append(
@@ -632,25 +658,10 @@ class PostTrainingQuantization(object):
                             float(np.max(np.abs(weight_data[i]))))
             self._quantized_var_kl_threshold[var_name] = weight_threshold
 
-        # KL threshold for activations
-        if self._is_use_cache_file:
-            for var_name in self._quantized_act_var_name:
-                sampling_data = []
-                filenames = [f for f in os.listdir(self._cache_dir) \
-                    if re.match(var_name.replace("/", ".")  + '_[0-9]+.npy', f)]
-                for filename in filenames:
-                    file_path = os.path.join(self._cache_dir, filename)
-                    sampling_data.append(np.load(file_path))
-                    os.remove(file_path)
-                sampling_data = np.concatenate(sampling_data)
-                self._quantized_var_kl_threshold[var_name] = \
-                    self._get_kl_scaling_factor(np.abs(sampling_data))
-        else:
-            for var_name in self._quantized_act_var_name:
-                self._sampling_data[var_name] = np.concatenate(
-                    self._sampling_data[var_name])
-                self._quantized_var_kl_threshold[var_name] = \
-                    self._get_kl_scaling_factor(np.abs(self._sampling_data[var_name]))
+        for var_name in self._quantized_act_var_name:
+            hist, hist_edeges = self._sampling_act_histogram[var_name]
+            self._quantized_var_kl_threshold[var_name] = \
+                self._get_kl_scaling_factor(hist, hist_edeges)
 
     def _update_program(self):
         '''
@@ -765,22 +776,15 @@ class PostTrainingQuantization(object):
                 for var_name in out_var_names:
                     analysis_and_save_info(op, var_name)
 
-    def _get_kl_scaling_factor(self, activation_blob, num_quantized_bins=255):
+    def _get_kl_scaling_factor(self, hist, hist_edeges, num_quantized_bins=255):
         '''
         Using the KL-divergenc method to get the more precise scaling factor.
         '''
-        max_val = np.max(activation_blob)
-        min_val = np.min(activation_blob)
-        if min_val >= 0:
-            hist, hist_edeges = np.histogram(
-                activation_blob, bins=2048, range=(min_val, max_val))
-            ending_iter = 2047
-            starting_iter = int(ending_iter * 0.7)
-        else:
-            _logger.error("Please first apply abs to activation_blob.")
+        ending_iter = self._histogram_bins - 1
+        starting_iter = int(ending_iter * 0.7)
         bin_width = hist_edeges[1] - hist_edeges[0]
 
-        P_sum = len(np.array(activation_blob).ravel())
+        P_sum = np.sum(np.array(hist).ravel())
         min_kl_divergence = 0
         min_kl_index = 0
         kl_inited = False
-- 
GitLab


From ee1ed42c9928c913c87030e24e9b4399cb93a355 Mon Sep 17 00:00:00 2001
From: GaoWei8 <53294385+GaoWei8@users.noreply.github.com>
Date: Tue, 15 Sep 2020 11:24:02 +0800
Subject: [PATCH 074/261] change sequence length attribute to input (#27193)

* replace sequence length attr to input
---
 paddle/fluid/operators/cudnn_lstm_cache.h     | 166 +++++++++++++++
 paddle/fluid/operators/cudnn_lstm_op.cc       |  26 ++-
 paddle/fluid/operators/cudnn_lstm_op.cu.cc    | 195 ++++++++++--------
 paddle/fluid/platform/cudnn_helper.h          | 170 +--------------
 .../tests/unittests/test_lstm_cudnn_op.py     |  11 +-
 .../white_list/check_shape_white_list.py      |   1 +
 6 files changed, 304 insertions(+), 265 deletions(-)
 create mode 100644 paddle/fluid/operators/cudnn_lstm_cache.h

diff --git a/paddle/fluid/operators/cudnn_lstm_cache.h b/paddle/fluid/operators/cudnn_lstm_cache.h
new file mode 100644
index 00000000000..4b46e2b475e
--- /dev/null
+++ b/paddle/fluid/operators/cudnn_lstm_cache.h
@@ -0,0 +1,166 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
+#include "paddle/fluid/platform/dynload/cudnn.h"
+
+namespace paddle {
+namespace operators {
+
+class ScopedRNNBase {
+ public:
+  ScopedRNNBase(int seq_length, int batch_size, int input_size, int hidden_size,
+                int num_layers, float dropout_prob, int seed, int weight_numel,
+                bool initialized, bool is_bidirec)
+      : seq_length_(seq_length),
+        batch_size_(batch_size),
+        input_size_(input_size),
+        hidden_size_(hidden_size),
+        num_layers_(num_layers),
+        dropout_prob_(dropout_prob),
+        seed_(seed),
+        weight_numel_(weight_numel),
+        initialized_(initialized),
+        is_bidirec_(is_bidirec) {}
+
+  template <typename T>
+  void Create(const cudnnHandle_t& handle, const platform::Place& place,
+              const std::vector<int>& sequence_length, size_t* workspace_size,
+              size_t* reserve_size, framework::Tensor* dropout_state) {
+    int numDirections = is_bidirec_ ? 2 : 1;
+    cudnnDataType_t cudnn_type = platform::CudnnDataType<T>::type;
+
+    // ------------------- cudnn x, y descriptors ---------------------
+    std::vector<int> dims_x = {batch_size_, input_size_, 1};
+    std::vector<int> strides_x = {input_size_, 1, 1};
+    std::vector<int> dims_y = {batch_size_, hidden_size_ * numDirections, 1};
+    std::vector<int> strides_y = {hidden_size_ * numDirections, 1, 1};
+    for (int i = 0; i < seq_length_; ++i) {
+      x_descs_.emplace_back(x_desc_.descriptor<T>(dims_x, strides_x));
+      y_descs_.emplace_back(y_desc_.descriptor<T>(dims_y, strides_y));
+    }
+    if (!sequence_length.empty()) {
+      x_seq_desc_.descriptor<T>(seq_length_, batch_size_, input_size_, true,
+                                sequence_length);
+      y_seq_desc_.descriptor<T>(seq_length_, batch_size_,
+                                hidden_size_ * numDirections, true,
+                                sequence_length);
+    }
+
+    // ------------------- cudnn hx, hy, cx, cy descriptors----------
+    std::vector<int> dims_hx = {num_layers_ * numDirections, batch_size_,
+                                hidden_size_};
+    std::vector<int> strides_hx = {hidden_size_ * batch_size_, hidden_size_, 1};
+    init_h_desc_.descriptor<T>(dims_hx, strides_hx);
+    init_c_desc_.descriptor<T>(dims_hx, strides_hx);
+    last_h_desc_.descriptor<T>(dims_hx, strides_hx);
+    last_c_desc_.descriptor<T>(dims_hx, strides_hx);
+
+    // ------------------- cudnn dropout descriptors ---------------------
+    size_t state_size;
+    if (!initialized_) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size));
+      dropout_state->mutable_data<uint8_t>({static_cast<int64_t>(state_size)},
+                                           place);
+    }
+    dropout_desc_.descriptor(handle, place, initialized_, dropout_prob_,
+                             dropout_state, seed_, state_size);
+
+// ------------------- cudnn rnn descriptors ---------------------
+#if CUDNN_VERSION >= 6000
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6(
+        handle, rnn_desc_.desc(), hidden_size_, num_layers_,
+        dropout_desc_.desc(), CUDNN_LINEAR_INPUT,
+        is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
+        CUDNN_RNN_ALGO_STANDARD, cudnn_type));
+#else
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor(
+        rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(),
+        CUDNN_LINEAR_INPUT,
+        is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
+        cudnn_type));
+#endif
+    if (!sequence_length.empty()) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode(
+          rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED));
+    }
+
+    // ------------------- cudnn weights_size ---------------------
+    size_t weights_size_;
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize(
+        handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type));
+    PADDLE_ENFORCE_EQ(
+        weights_size_, sizeof(T) * weight_numel_,
+        platform::errors::InvalidArgument(
+            "The cudnn lstm and setting weight size should be same."));
+    // ------------------- cudnn weight descriptors ---------------------
+    platform::DataLayout layout = platform::DataLayout::kNCHW;
+    int dim_tmp = weights_size_ / sizeof(T);
+    std::vector<int> dim_w = {dim_tmp, 1, 1};
+    weight_desc_.descriptor<T>(layout, dim_w);
+    // ------------------- cudnn workspace, reserve size ---------------------
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize(
+        handle, rnn_desc_.desc(), seq_length_, x_descs_.data(),
+        workspace_size));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetRNNTrainingReserveSize(
+            handle, rnn_desc_.desc(), seq_length_, x_descs_.data(),
+            reserve_size));
+  }
+  cudnnTensorDescriptor_t* x_descs() { return x_descs_.data(); }
+  cudnnTensorDescriptor_t* y_descs() { return y_descs_.data(); }
+  cudnnRNNDataDescriptor_t x_seq_desc() { return x_seq_desc_.desc(); }
+  cudnnRNNDataDescriptor_t y_seq_desc() { return y_seq_desc_.desc(); }
+  cudnnTensorDescriptor_t init_h_desc() { return init_h_desc_.desc(); }
+  cudnnTensorDescriptor_t init_c_desc() { return init_c_desc_.desc(); }
+  cudnnTensorDescriptor_t last_h_desc() { return last_h_desc_.desc(); }
+  cudnnTensorDescriptor_t last_c_desc() { return last_c_desc_.desc(); }
+  cudnnRNNDescriptor_t rnn_desc() { return rnn_desc_.desc(); }
+  cudnnDropoutDescriptor_t dropout_desc() { return dropout_desc_.desc(); }
+  cudnnFilterDescriptor_t weight_desc() { return weight_desc_.desc(); }
+
+ private:
+  int seq_length_;
+  int batch_size_;
+  int input_size_;
+  int hidden_size_;
+  int num_layers_;
+  float dropout_prob_;
+  int seed_;
+  int weight_numel_;
+  bool initialized_;
+  bool is_bidirec_;
+  std::vector<cudnnTensorDescriptor_t> x_descs_;
+  std::vector<cudnnTensorDescriptor_t> y_descs_;
+
+  platform::ScopedTensorDescriptor x_desc_;
+  platform::ScopedTensorDescriptor y_desc_;
+  platform::ScopedRNNTensorDescriptor x_seq_desc_;
+  platform::ScopedRNNTensorDescriptor y_seq_desc_;
+  platform::ScopedTensorDescriptor init_h_desc_;
+  platform::ScopedTensorDescriptor init_c_desc_;
+  platform::ScopedTensorDescriptor last_h_desc_;
+  platform::ScopedTensorDescriptor last_c_desc_;
+  platform::ScopedDropoutDescriptor dropout_desc_;
+  platform::ScopedFilterDescriptor weight_desc_;
+  platform::ScopedRNNDescriptor rnn_desc_;
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc
index cc807f193ed..82954bc109a 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cc
@@ -51,6 +51,16 @@ class CudnnLSTMOp : public framework::OperatorWithKernel {
                           "received InitH's rank is %d.",
                           init_h_dims.size()));
 
+    if (ctx->HasInput("SequenceLength")) {
+      auto seq_dims = ctx->GetInputDim("SequenceLength");
+      PADDLE_ENFORCE_EQ(
+          in_dims[1], seq_dims[0],
+          platform::errors::InvalidArgument(
+              "The size of SequenceLength has to equal the batch_size. But "
+              "received batch_size is %d and the size of SequenceLength is %d.",
+              in_dims[1], seq_dims[0]));
+    }
+
     PADDLE_ENFORCE_EQ(
         in_dims[1], init_h_dims[1],
         platform::errors::InvalidArgument(
@@ -113,6 +123,12 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Tensor) the learnable hidden-hidden weights."
              " The shape is (N), where N is total weight size of the LSTM. "
              " cudnn concatenate all the weight to one Tensor");
+    AddInput("SequenceLength",
+             "(Tensor) When the input data is padding, "
+             "set this parameter. This parameter represents "
+             "the variable sequence lengths in a batch. "
+             "The size of the vector has to equal the batch_size.")
+        .AsDispensable();
     AddOutput("Reserve",
               "(Tensor, a temporary output Tensor to store the reserve_data "
               "of cudnn kernel.")
@@ -155,13 +171,6 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(1);
     AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
     AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(0);
-    AddAttr<std::vector<int>>("sequence_length",
-                              "(vector<int>) When the input data is padding, "
-                              "set this parameter. This parameter represents "
-                              "the variable sequence"
-                              "lengths in a batch. The size of the vector has "
-                              "to equal the batch_size.")
-        .SetDefault({});
     AddComment(R"DOC(
 CUDNN LSTM implementation
 
@@ -243,6 +252,9 @@ class CudnnLSTMGradOpMaker : public framework::SingleGradOpMaker<T> {
     op->SetInput("InitH", this->Input("InitH"));
     op->SetInput("InitC", this->Input("InitC"));
     op->SetInput("W", this->Input("W"));
+    if (this->HasInput("SequenceLength")) {
+      op->SetInput("SequenceLength", this->Input("SequenceLength"));
+    }
     op->SetInput("Reserve", this->Output("Reserve"));
     op->SetInput("StateOut", this->Output("StateOut"));
     op->SetInput("Out", this->Output("Out"));
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
index f60cd41d9a2..6457d9295dc 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/cudnn_rnn_cache.h"
+#include "paddle/fluid/operators/cudnn_lstm_cache.h"
 #include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/utils.h"
 #include "paddle/fluid/platform/cudnn_desc.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 
@@ -24,6 +25,43 @@ namespace operators {
 using LoDTensor = framework::LoDTensor;
 using Tensor = framework::Tensor;
 
+template <typename T>
+void LSTMInferece(const bool &has_seq_length, const cudnnHandle_t &handle,
+                  const int &seq_length, ScopedRNNBase *rnn, const T *x_data,
+                  const T *init_h_data, const T *init_c_data, const T *w_data,
+                  T *out_data, T *last_h_data, T *last_c_data,
+                  framework::Tensor *workspace_data,
+                  const size_t &workspace_size) {
+  if (!has_seq_length) {
+    // for inference
+    // This interface is used when the input/output is unpadded.
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference(
+        handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data,
+        rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data,
+        rnn->weight_desc(), w_data, rnn->y_descs(), out_data,
+        rnn->last_h_desc(), last_h_data, rnn->last_c_desc(), last_c_data,
+        workspace_data->data<uint8_t>(), workspace_size));
+  } else {
+#if CUDNN_VERSION >= 7201
+    // for inference
+    // This interface is used when the input/output is padded.
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInferenceEx(
+        handle, rnn->rnn_desc(), rnn->x_seq_desc(), x_data, rnn->init_h_desc(),
+        init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(),
+        w_data, rnn->y_seq_desc(), out_data, rnn->last_h_desc(), last_h_data,
+        rnn->last_c_desc(), last_c_data, nullptr, nullptr, nullptr, nullptr,
+        nullptr, nullptr, nullptr, nullptr, workspace_data->data<uint8_t>(),
+        workspace_size));
+#else
+    // CUDNN VERSION has to >=7.2.1
+    PADDLE_THROW(platform::errors::Unavailable(
+        "The padded input is supported by "
+        "cudnnRNNForwardInferenceEx, but it only works when "
+        "the version of cudnn is larger than 7.2.1"));
+#endif
+  }
+}
+
 template <typename T>
 class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
  public:
@@ -56,7 +94,13 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
     int num_layers = ctx.Attr<int>("num_layers");
     bool is_test = ctx.Attr<bool>("is_test");
     int seed = ctx.Attr<int>("seed");
-    auto sequence_length = ctx.Attr<std::vector<int>>("sequence_length");
+
+    bool has_seq_length = ctx.HasInput("SequenceLength");
+    std::vector<int> SequenceLength;
+    if (has_seq_length) {
+      auto *sequence_length = ctx.Input<Tensor>("SequenceLength");
+      SequenceLength = operators::GetDataFromTensor<int>(sequence_length);
+    }
 
     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
@@ -70,58 +114,32 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
     size_t workspace_size;
     size_t reserve_size;
 
-    platform::ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size,
-                                num_layers, dropout_prob, seed, weight_numel,
-                                state_initialized, is_bidirec);
-    rnn.Create<T>(handle, ctx.GetPlace(), sequence_length, &workspace_size,
+    ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size,
+                      num_layers, dropout_prob, seed, weight_numel,
+                      state_initialized, is_bidirec);
+    rnn.Create<T>(handle, ctx.GetPlace(), SequenceLength, &workspace_size,
                   &reserve_size, state_out);
 
     framework::Tensor workspace_data_;
-    workspace_data_.Resize({static_cast<int64_t>(workspace_size)});
-    workspace_data_.mutable_data<uint8_t>(ctx.GetPlace());
+    workspace_data_.mutable_data<uint8_t>(
+        {static_cast<int64_t>(workspace_size)}, ctx.GetPlace());
 
     auto *reserve_data = reserve->mutable_data<uint8_t>(
         {static_cast<int64_t>(reserve_size)}, ctx.GetPlace());
 
     if (is_test) {
-      if (sequence_length.empty()) {
-        // for inference
-        // This interface is used when the input/output is unpadded.
-        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference(
-            handle, rnn.rnn_desc(), seq_length, rnn.x_desc(), x_data,
-            rnn.hx_desc(), init_h_data, rnn.cx_desc(), init_c_data,
-            rnn.w_desc(), w_data, rnn.y_desc(), out_data, rnn.hy_desc(),
-            last_h_data, rnn.cy_desc(), last_c_data,
-            workspace_data_.data<uint8_t>(), workspace_size));
-      } else {
-#if CUDNN_VERSION >= 7201
-        // for inference
-        // This interface is used when the input/output is padded.
-        PADDLE_ENFORCE_CUDA_SUCCESS(
-            platform::dynload::cudnnRNNForwardInferenceEx(
-                handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, rnn.hx_desc(),
-                init_h_data, rnn.cx_desc(), init_c_data, rnn.w_desc(), w_data,
-                rnn.y_seq_desc(), out_data, rnn.hy_desc(), last_h_data,
-                rnn.cy_desc(), last_c_data, nullptr, nullptr, nullptr, nullptr,
-                nullptr, nullptr, nullptr, nullptr,
-                workspace_data_.data<uint8_t>(), workspace_size));
-#else
-        PADDLE_ENFORCE_NOT_NULL(
-            nullptr, platform::errors::Unavailable(
-                         "The padded input is supported by "
-                         "cudnnRNNForwardInferenceEx, but it only works when "
-                         "the version of cudnn is larger than 7.2.1"));
-#endif
-      }
+      LSTMInferece<T>(has_seq_length, handle, seq_length, &rnn, x_data,
+                      init_h_data, init_c_data, w_data, out_data, last_h_data,
+                      last_c_data, &workspace_data_, workspace_size);
     } else {
-      if (sequence_length.empty()) {
+      if (!has_seq_length) {
         // for train
         // This interface is used when the input/output is unpadded.
         PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining(
-            handle, rnn.rnn_desc(), seq_length, rnn.x_desc(), x_data,
-            rnn.hx_desc(), init_h_data, rnn.cx_desc(), init_c_data,
-            rnn.w_desc(), w_data, rnn.y_desc(), out_data, rnn.hy_desc(),
-            last_h_data, rnn.cy_desc(), last_c_data,
+            handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data,
+            rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data,
+            rnn.weight_desc(), w_data, rnn.y_descs(), out_data,
+            rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data,
             workspace_data_.data<uint8_t>(), workspace_size, reserve_data,
             reserve_size));
       } else {
@@ -130,19 +148,18 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
         // This interface is used when the input/output is padded.
         PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::cudnnRNNForwardTrainingEx(
-                handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, rnn.hx_desc(),
-                init_h_data, rnn.cx_desc(), init_c_data, rnn.w_desc(), w_data,
-                rnn.y_seq_desc(), out_data, rnn.hy_desc(), last_h_data,
-                rnn.cy_desc(), last_c_data, nullptr, nullptr, nullptr, nullptr,
-                nullptr, nullptr, nullptr, nullptr,
-                workspace_data_.data<uint8_t>(), workspace_size, reserve_data,
-                reserve_size));
+                handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data,
+                rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data,
+                rnn.weight_desc(), w_data, rnn.y_seq_desc(), out_data,
+                rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data,
+                nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
+                nullptr, workspace_data_.data<uint8_t>(), workspace_size,
+                reserve_data, reserve_size));
 #else
-        PADDLE_ENFORCE_NOT_NULL(
-            nullptr, platform::errors::Unavailable(
-                         "The padded input is supported by "
-                         "cudnnRNNForwardTrainingEx, but it only works when "
-                         "the version of cudnn is larger than 7.2.1"));
+        PADDLE_THROW(platform::errors::Unavailable(
+            "The padded input is supported by "
+            "cudnnRNNForwardTrainingEx, but it only works when "
+            "the version of cudnn is larger than 7.2.1"));
 #endif
       }
     }
@@ -203,7 +220,13 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
     int hidden_size = ctx.Attr<int>("hidden_size");
     int num_layers = ctx.Attr<int>("num_layers");
     int seed = ctx.Attr<int>("seed");
-    auto sequence_length = ctx.Attr<std::vector<int>>("sequence_length");
+
+    bool has_seq_length = ctx.HasInput("SequenceLength");
+    std::vector<int> SequenceLength;
+    if (has_seq_length) {
+      auto *sequence_length = ctx.Input<Tensor>("SequenceLength");
+      SequenceLength = operators::GetDataFromTensor<int>(sequence_length);
+    }
 
     int seq_length = input_dims[0];
     int batch_size = input->dims()[1];
@@ -213,33 +236,33 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
     size_t workspace_size;
     size_t reserve_size;
 
-    platform::ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size,
-                                num_layers, dropout_prob, seed, weight_numel,
-                                true, is_bidirec);
+    ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size,
+                      num_layers, dropout_prob, seed, weight_numel, true,
+                      is_bidirec);
 
-    rnn.Create<T>(handle, ctx.GetPlace(), sequence_length, &workspace_size,
+    rnn.Create<T>(handle, ctx.GetPlace(), SequenceLength, &workspace_size,
                   &reserve_size, const_cast<Tensor *>(state_out));
 
     framework::Tensor workspace_data_;
-    workspace_data_.Resize({static_cast<int64_t>(workspace_size)});
-    workspace_data_.mutable_data<uint8_t>(ctx.GetPlace());
+    workspace_data_.mutable_data<uint8_t>(
+        {static_cast<int64_t>(workspace_size)}, ctx.GetPlace());
     const uint8_t *reserve_data = reserve->data<uint8_t>();
 
-    if (sequence_length.empty()) {
+    if (!has_seq_length) {
       // This interface is used when the input/output is unpadded.
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData(
-          handle, rnn.rnn_desc(), seq_length, rnn.y_desc(), out_data,
-          rnn.y_desc(), out_grad_data, rnn.hy_desc(), last_h_grad_data,
-          rnn.cy_desc(), last_c_grad_data, rnn.w_desc(), weight_data,
-          rnn.hx_desc(), init_h_data, rnn.cx_desc(), init_c_data, rnn.x_desc(),
-          in_grad_data, rnn.hx_desc(), init_h_grad_data, rnn.cx_desc(),
-          init_c_grad_data, workspace_data_.data<uint8_t>(), workspace_size,
-          const_cast<uint8_t *>(reserve_data), reserve_size));
+          handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data,
+          rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data,
+          rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data,
+          rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data,
+          rnn.x_descs(), in_grad_data, rnn.init_h_desc(), init_h_grad_data,
+          rnn.init_c_desc(), init_c_grad_data, workspace_data_.data<uint8_t>(),
+          workspace_size, const_cast<uint8_t *>(reserve_data), reserve_size));
 
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights(
-          handle, rnn.rnn_desc(), seq_length, rnn.x_desc(), input->data<T>(),
-          rnn.hx_desc(), init_h->data<T>(), rnn.y_desc(), out->data<T>(),
-          workspace_data_.data<uint8_t>(), workspace_size, rnn.w_desc(),
+          handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data<T>(),
+          rnn.init_h_desc(), init_h->data<T>(), rnn.y_descs(), out->data<T>(),
+          workspace_data_.data<uint8_t>(), workspace_size, rnn.weight_desc(),
           weight_grad->data<T>(), const_cast<uint8_t *>(reserve_data),
           reserve_size));
     } else {
@@ -248,27 +271,25 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
       // This interface is used when the input/output is padded.
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardDataEx(
           handle, rnn.rnn_desc(), rnn.y_seq_desc(), out_data, rnn.y_seq_desc(),
-          out_grad_data, nullptr, nullptr, rnn.hy_desc(), last_h_grad_data,
-          rnn.cy_desc(), last_c_grad_data, rnn.w_desc(), weight_data,
-          rnn.hx_desc(), init_h_data, rnn.cx_desc(), init_c_data,
-          rnn.x_seq_desc(), in_grad_data, rnn.hx_desc(), init_h_grad_data,
-          rnn.cx_desc(), init_c_grad_data, nullptr, nullptr,
+          out_grad_data, nullptr, nullptr, rnn.last_h_desc(), last_h_grad_data,
+          rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data,
+          rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data,
+          rnn.x_seq_desc(), in_grad_data, rnn.init_h_desc(), init_h_grad_data,
+          rnn.init_c_desc(), init_c_grad_data, nullptr, nullptr,
           workspace_data_.data<uint8_t>(), workspace_size,
           const_cast<uint8_t *>(reserve_data), reserve_size));
 
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeightsEx(
           handle, rnn.rnn_desc(), rnn.x_seq_desc(), input->data<T>(),
-          rnn.hx_desc(), init_h->data<T>(), rnn.y_seq_desc(), out->data<T>(),
-          workspace_data_.data<uint8_t>(), workspace_size, rnn.w_desc(),
-          weight_grad->data<T>(), const_cast<uint8_t *>(reserve_data),
-          reserve_size));
+          rnn.init_h_desc(), init_h->data<T>(), rnn.y_seq_desc(),
+          out->data<T>(), workspace_data_.data<uint8_t>(), workspace_size,
+          rnn.weight_desc(), weight_grad->data<T>(),
+          const_cast<uint8_t *>(reserve_data), reserve_size));
 #else
-      PADDLE_ENFORCE_NOT_NULL(
-          nullptr,
-          platform::errors::Unavailable(
-              "The padded input of rnn is supported by cudnnRNNBackwardDataEx, "
-              "cudnnRNNBackwardWeightsEx, but it only works when the version "
-              "of cudnn is larger than 7.2.1"));
+      PADDLE_THROW(platform::errors::Unavailable(
+          "The padded input of rnn is supported by cudnnRNNBackwardDataEx, "
+          "cudnnRNNBackwardWeightsEx, but it only works when the version "
+          "of cudnn is larger than 7.2.1"));
 #endif
     }
   }
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index bbe847e7190..bb4c2a89f6f 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -287,6 +287,8 @@ class ScopedTensorDescriptor {
     return descriptor(CudnnDataType<T>::type, dim, stride);
   }
 
+  inline cudnnTensorDescriptor_t desc() { return desc_; }
+
  private:
   cudnnTensorDescriptor_t desc_;
   DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor);
@@ -329,6 +331,8 @@ class ScopedRNNTensorDescriptor {
                       input_size, time_major, seq_length);
   }
 
+  inline cudnnRNNDataDescriptor_t desc() { return desc_; }
+
  private:
   cudnnRNNDataDescriptor_t desc_;
   DISABLE_COPY_AND_ASSIGN(ScopedRNNTensorDescriptor);
@@ -361,6 +365,7 @@ class ScopedDropoutDescriptor {
     }
     return desc_;
   }
+  inline cudnnDropoutDescriptor_t desc() { return desc_; }
 
  private:
   cudnnDropoutDescriptor_t desc_;
@@ -376,7 +381,7 @@ class ScopedRNNDescriptor {
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyRNNDescriptor(desc_));
   }
 
-  inline cudnnRNNDescriptor_t descriptor() { return desc_; }
+  inline cudnnRNNDescriptor_t desc() { return desc_; }
 
  private:
   cudnnRNNDescriptor_t desc_;
@@ -419,172 +424,13 @@ class ScopedFilterDescriptor {
                       kernel, groups);
   }
 
+  inline cudnnFilterDescriptor_t desc() { return desc_; }
+
  private:
   cudnnFilterDescriptor_t desc_;
   DISABLE_COPY_AND_ASSIGN(ScopedFilterDescriptor);
 };
 
-class ScopedRNNBase {
- public:
-  ScopedRNNBase(int seq_length, int batch_size, int input_size, int hidden_size,
-                int num_layers, float dropout_prob, int seed, int weight_numel,
-                bool initialized, bool is_bidirec)
-      : seq_length_(seq_length),
-        batch_size_(batch_size),
-        input_size_(input_size),
-        hidden_size_(hidden_size),
-        num_layers_(num_layers),
-        dropout_prob_(dropout_prob),
-        seed_(seed),
-        weight_numel_(weight_numel),
-        initialized_(initialized),
-        is_bidirec_(is_bidirec) {}
-
-  template <typename T>
-  void Create(const cudnnHandle_t& handle, const platform::Place& place,
-              std::vector<int> sequence_length, size_t* workspace_size,
-              size_t* reserve_size, framework::Tensor* dropout_state) {
-    int numDirections = is_bidirec_ ? 2 : 1;
-    cudnnDataType_t cudnn_type = platform::CudnnDataType<T>::type;
-
-    // ------------------- cudnn x, y descriptors ---------------------
-    std::vector<int> dims_x = {batch_size_, input_size_, 1};
-    std::vector<int> strides_x = {input_size_, 1, 1};
-
-    std::vector<int> dims_y = {batch_size_, hidden_size_ * numDirections, 1};
-    std::vector<int> strides_y = {hidden_size_ * numDirections, 1, 1};
-
-    for (int i = 0; i < seq_length_; ++i) {
-      x_desc_.emplace_back(x_d.descriptor<T>(dims_x, strides_x));
-      y_desc_.emplace_back(y_d.descriptor<T>(dims_y, strides_y));
-    }
-
-    if (!sequence_length.empty()) {
-      x_seq_desc_ = x_seq_d.descriptor<T>(seq_length_, batch_size_, input_size_,
-                                          true, sequence_length);
-      y_seq_desc_ = y_seq_d.descriptor<T>(seq_length_, batch_size_,
-                                          hidden_size_ * numDirections, true,
-                                          sequence_length);
-    }
-
-    // ------------------- cudnn hx, hy, cx, cy descriptors----------
-    std::vector<int> dims_hx = {num_layers_ * numDirections, batch_size_,
-                                hidden_size_};
-    std::vector<int> strides_hx = {hidden_size_ * batch_size_, hidden_size_, 1};
-
-    hx_desc_ = hx_d.descriptor<T>(dims_hx, strides_hx);
-    cx_desc_ = cx_d.descriptor<T>(dims_hx, strides_hx);
-    hy_desc_ = hy_d.descriptor<T>(dims_hx, strides_hx);
-    cy_desc_ = cy_d.descriptor<T>(dims_hx, strides_hx);
-
-    // ------------------- cudnn dropout descriptors ---------------------
-    size_t state_size;
-    if (!initialized_) {
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          dynload::cudnnDropoutGetStatesSize(handle, &state_size));
-      dropout_state->mutable_data<uint8_t>({static_cast<int64_t>(state_size)},
-                                           place);
-    }
-    dropout_desc_ =
-        dropout_d.descriptor(handle, place, initialized_, dropout_prob_,
-                             dropout_state, seed_, state_size);
-
-    // ------------------- cudnn rnn descriptors ---------------------
-    rnn_desc_ = rnn_d.descriptor();
-
-#if CUDNN_VERSION >= 6000
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6(
-        handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
-        CUDNN_LINEAR_INPUT,
-        is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
-        CUDNN_RNN_ALGO_STANDARD, cudnn_type));
-#else
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor(
-        rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT,
-        is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
-        cudnn_type));
-#endif
-    if (!sequence_length.empty()) {
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode(
-          rnn_desc_, CUDNN_RNN_PADDED_IO_ENABLED));
-    }
-    // ------------------- cudnn weights_size ---------------------
-    size_t weights_size_;
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize(
-        handle, rnn_desc_, x_desc_[0], &weights_size_, cudnn_type));
-
-    PADDLE_ENFORCE_EQ(
-        weights_size_, sizeof(T) * weight_numel_,
-        platform::errors::InvalidArgument(
-            "The cudnn lstm and setting weight size should be same."));
-
-    // ------------------- cudnn weight descriptors ---------------------
-    platform::DataLayout layout = platform::DataLayout::kNCHW;
-    int dim_tmp = weights_size_ / sizeof(T);
-    std::vector<int> dim_w = {dim_tmp, 1, 1};
-    w_desc_ = w_d.descriptor<T>(layout, dim_w);
-
-    // ------------------- cudnn workspace, reserve size ---------------------
-    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize(
-        handle, rnn_desc_, seq_length_, x_desc_.data(), workspace_size));
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        platform::dynload::cudnnGetRNNTrainingReserveSize(
-            handle, rnn_desc_, seq_length_, x_desc_.data(), reserve_size));
-  }
-
-  cudnnTensorDescriptor_t* x_desc() { return x_desc_.data(); }
-  cudnnTensorDescriptor_t* y_desc() { return y_desc_.data(); }
-  cudnnRNNDataDescriptor_t x_seq_desc() { return x_seq_desc_; }
-  cudnnRNNDataDescriptor_t y_seq_desc() { return y_seq_desc_; }
-  cudnnTensorDescriptor_t hx_desc() { return hx_desc_; }
-  cudnnTensorDescriptor_t cx_desc() { return cx_desc_; }
-  cudnnTensorDescriptor_t hy_desc() { return hy_desc_; }
-  cudnnTensorDescriptor_t cy_desc() { return cy_desc_; }
-  cudnnRNNDescriptor_t rnn_desc() { return rnn_desc_; }
-  cudnnDropoutDescriptor_t dropout_desc() { return dropout_desc_; }
-  cudnnFilterDescriptor_t w_desc() { return w_desc_; }
-
- private:
-  int seq_length_;
-  int batch_size_;
-  int input_size_;
-  int hidden_size_;
-  int num_layers_;
-  float dropout_prob_;
-  int seed_;
-  int weight_numel_;
-  bool initialized_;
-  bool is_bidirec_;
-
-  std::vector<cudnnTensorDescriptor_t> x_desc_;
-  std::vector<cudnnTensorDescriptor_t> y_desc_;
-  cudnnRNNDataDescriptor_t x_seq_desc_;
-  cudnnRNNDataDescriptor_t y_seq_desc_;
-  // A tensor descriptor describing the initial hidden state of the RNN.
-  cudnnTensorDescriptor_t hx_desc_;
-  // A tensor descriptor describing the initial cell state for LSTM networks.
-  cudnnTensorDescriptor_t cx_desc_;
-  // A tensor descriptor describing the final hidden state of the RNN.
-  cudnnTensorDescriptor_t hy_desc_;
-  // A tensor descriptor describing the final cell state for LSTM networks.
-  cudnnTensorDescriptor_t cy_desc_;
-  cudnnDropoutDescriptor_t dropout_desc_;
-  cudnnFilterDescriptor_t w_desc_;
-  cudnnRNNDescriptor_t rnn_desc_;
-
-  ScopedTensorDescriptor x_d;
-  ScopedTensorDescriptor y_d;
-  ScopedRNNTensorDescriptor x_seq_d;
-  ScopedRNNTensorDescriptor y_seq_d;
-  ScopedTensorDescriptor hx_d;
-  ScopedTensorDescriptor cx_d;
-  ScopedTensorDescriptor hy_d;
-  ScopedTensorDescriptor cy_d;
-  ScopedDropoutDescriptor dropout_d;
-  ScopedFilterDescriptor w_d;
-  ScopedRNNDescriptor rnn_d;
-};
-
 class ScopedConvolutionDescriptor {
  public:
   ScopedConvolutionDescriptor() {
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
index 1f3dab67f2a..29a0fa55f77 100644
--- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
@@ -400,7 +400,8 @@ class TestCUDNNLstmOp(OpTest):
             'Input': input,
             'W': flat_w,
             'InitH': init_h,
-            'InitC': init_c
+            'InitC': init_c,
+            'SequenceLength': self.sequence_length
         }
         self.attrs = {
             'dropout_prob': 0.0,
@@ -408,7 +409,6 @@ class TestCUDNNLstmOp(OpTest):
             'input_size': input_size,
             'hidden_size': hidden_size,
             'num_layers': 1,
-            'sequence_length': self.sequence_length.tolist()
         }
         self.outputs = {
             'Out': output,
@@ -436,13 +436,6 @@ class TestCUDNNLstmOp(OpTest):
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestCUDNNLstmOp2(TestCUDNNLstmOp):
-    def set_attrs(self):
-        self.sequence_length = np.array([], dtype=np.int32)
-
-
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
-class TestCUDNNLstmOp3(TestCUDNNLstmOp):
     def set_attrs(self):
         self.num_layers = 2
 
diff --git a/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py b/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py
index 227e6cc28fb..e19641e710d 100644
--- a/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py
+++ b/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py
@@ -26,4 +26,5 @@ NEED_TO_FIX_OP_LIST = [
     'squared_l2_distance',
     'tree_conv',
     'cvm',
+    'cudnn_lstm',
 ]
-- 
GitLab


From e6e2e53782b695331710a8a512a1df3efc08fe30 Mon Sep 17 00:00:00 2001
From: Shang Zhizhou <shangzhizhou@baidu.com>
Date: Tue, 15 Sep 2020 11:32:39 +0800
Subject: [PATCH 075/261] Optimize error report (#27254)

* optimize errror report

* add test case for pad op converter

* fix some spelling mistake commented by peiyang
---
 .../inference/tensorrt/convert/concat_op.cc   |  7 ++-
 .../inference/tensorrt/convert/conv2d_op.cc   |  4 +-
 .../tensorrt/convert/elementwise_op.cc        | 46 ++++++++++++---
 .../tensorrt/convert/io_converter.cc          | 59 ++++++++++++++-----
 .../inference/tensorrt/convert/io_converter.h | 16 +++--
 .../inference/tensorrt/convert/op_converter.h | 56 ++++++++++++------
 .../inference/tensorrt/convert/pad_op.cc      | 23 ++++++--
 .../inference/tensorrt/convert/swish_op.cc    | 13 +++-
 .../inference/tensorrt/convert/ut_helper.h    | 15 ++++-
 .../unittests/ir/inference/test_trt_pad_op.py | 53 +++++++++++++++++
 10 files changed, 237 insertions(+), 55 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py

diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
index 28afb87a891..5d63aa2ace8 100644
--- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
@@ -34,8 +34,11 @@ class ConcatOpConverter : public OpConverter {
       itensors.push_back(engine_->GetITensor(input_name));
     }
     int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis"));
-    PADDLE_ENFORCE(axis > 0,
-                   "The axis attr of Concat op should be large than 0 for trt");
+    PADDLE_ENFORCE_GT(axis, 0, platform::errors::InvalidArgument(
+                                   "The axis attr of Concat"
+                                   " op should be larger than 0 for trt. "
+                                   "But received %d.",
+                                   axis));
 
     auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Concatenation, itensors.data(),
                                        itensors.size());
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index 10c212c0b4f..aa03bc44bd6 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -100,7 +100,9 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
   TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
   auto* layer = fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output, n_input,
                            nv_ksize, weight, bias);
-  PADDLE_ENFORCE(layer != nullptr);
+  PADDLE_ENFORCE_NOT_NULL(layer,
+                          platform::errors::Fatal("TensorRT create conv2d"
+                                                  " layer error."));
   layer->setStride(nv_strides);
   layer->setPadding(nv_paddings);
   layer->setNbGroups(groups);
diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
index c4f0855dbb1..dfadb28a652 100644
--- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
@@ -43,13 +43,30 @@ class ElementwiseWeightOpConverter : public OpConverter {
     framework::OpDesc op_desc(op, nullptr);
     VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer";
 
-    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
-    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+    PADDLE_ENFORCE_EQ(
+        op_desc.Input("X").size(), 1,
+        platform::errors::InvalidArgument(
+            "The input op's Input(\"X\").size() "
+            "should equal to 1, but received Input(\"X\").size() = %u.",
+            op_desc.Input("X").size()));
+    PADDLE_ENFORCE_EQ(
+        op_desc.Input("Y").size(), 1,
+        platform::errors::InvalidArgument(
+            "The input op's Input(\"Y\").size() "
+            "should equal to 1, but received Input(\"Y\").size() = %u.",
+            op_desc.Input("Y").size()));  // Y is a weight
+    PADDLE_ENFORCE_EQ(
+        op_desc.Output("Out").size(), 1,
+        platform::errors::InvalidArgument(
+            "The input op's Output(\"Out\").size() "
+            "should equal to 1, but reveceid Output(\"Out\").size() = %u.",
+            op_desc.Output("Out").size()));
 
     auto* X = engine_->GetITensor(op_desc.Input("X").front());
     auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
-    PADDLE_ENFORCE_NOT_NULL(Y_v);
+    PADDLE_ENFORCE_NOT_NULL(
+        Y_v, platform::errors::NotFound("Variable %s not found in scope.",
+                                        op_desc.Input("Y").front().c_str()));
     auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
     float* weight_data = nullptr;
     weight_data =
@@ -176,9 +193,24 @@ class ElementwiseTensorOpConverter : public OpConverter {
     framework::OpDesc op_desc(op, nullptr);
     nvinfer1::ILayer* layer = nullptr;
 
-    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
-    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+    PADDLE_ENFORCE_EQ(
+        op_desc.Input("X").size(), 1,
+        platform::errors::InvalidArgument(
+            "The input op's Input(\"X\").size() "
+            "should equal to 1, but received Input(\"X\").size() = %u.",
+            op_desc.Input("X").size()));
+    PADDLE_ENFORCE_EQ(
+        op_desc.Input("Y").size(), 1,
+        platform::errors::InvalidArgument(
+            "The input op's Input(\"Y\").size() "
+            "should equal to 1, but received Input(\"Y\").size() = %u.",
+            op_desc.Input("Y").size()));  // Y is a weight
+    PADDLE_ENFORCE_EQ(
+        op_desc.Output("Out").size(), 1,
+        platform::errors::InvalidArgument(
+            "The input op's Output(\"Out\").size() "
+            "should equal to 1, but received Output(\"Out\").size() = %u.",
+            op_desc.Output("Out").size()));
 
     auto* X = engine_->GetITensor(op_desc.Input("X").front());
     auto* Y = engine_->GetITensor(op_desc.Input("Y").front());
diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.cc b/paddle/fluid/inference/tensorrt/convert/io_converter.cc
index 854f434d93e..d9cf9e2e860 100644
--- a/paddle/fluid/inference/tensorrt/convert/io_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/io_converter.cc
@@ -29,38 +29,67 @@ class DefaultIOConverter : public EngineIOConverter {
   // NOTE out is GPU memory.
   virtual void operator()(const LoDTensor& in, void* out,
                           size_t max_size) override {
-    PADDLE_ENFORCE(out != nullptr);
-    PADDLE_ENFORCE(stream_ != nullptr);
+    PADDLE_ENFORCE_NOT_NULL(out,
+                            platform::errors::InvalidArgument(
+                                "The input param 'out' must not be nullptr."));
+    PADDLE_ENFORCE_NOT_NULL(stream_,
+                            platform::errors::PreconditionNotMet(
+                                "You should set up stream_ by SetStream() "
+                                "before you call the operator()."));
     const auto& place = in.place();
     size_t size = in.memory_size();
-    PADDLE_ENFORCE_LE(size, max_size);
+    PADDLE_ENFORCE_LE(
+        size, max_size,
+        platform::errors::InvalidArgument(
+            "The input Tensor in's memory_size shoule be less than or equal to "
+            "the input max_size. But in's memory_size = %u, max_size = %u.",
+            size, max_size));
     if (is_cpu_place(place)) {
-      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data<float>(), size,
-                                           cudaMemcpyHostToDevice, *stream_));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
+          out, in.data<float>(), size, cudaMemcpyHostToDevice, *stream_));
     } else if (is_gpu_place(place)) {
-      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data<float>(), size,
-                                           cudaMemcpyDeviceToDevice, *stream_));
+      PADDLE_ENFORCE_EQ(
+          0, cudaMemcpyAsync(out, in.data<float>(), size,
+                             cudaMemcpyDeviceToDevice, *stream_),
+          platform::errors::External(
+              "cudaMemcpyAsync(cudaMemcpyDeviceToDevice) error."));
     } else {
-      PADDLE_THROW("Unknown device for converter");
+      PADDLE_THROW(platform::errors::NotFound("Unknown device for converter"));
     }
     cudaStreamSynchronize(*stream_);
   }
   // NOTE in is GPU memory.
   virtual void operator()(const void* in, LoDTensor* out,
                           size_t max_size) override {
-    PADDLE_ENFORCE(in != nullptr);
-    PADDLE_ENFORCE(stream_ != nullptr);
+    PADDLE_ENFORCE_NOT_NULL(in,
+                            platform::errors::InvalidArgument(
+                                "The input param 'in' must not be nullptr."));
+    PADDLE_ENFORCE_NOT_NULL(stream_,
+                            platform::errors::PreconditionNotMet(
+                                "You should set up stream_ by SetStream() "
+                                "before you call the operator()."));
     const auto& place = out->place();
     size_t size = out->memory_size();
-    PADDLE_ENFORCE_LE(size, max_size);
+    PADDLE_ENFORCE_LE(
+        size, max_size,
+        platform::errors::InvalidArgument(
+            "The input Tensor out's memory_size shoule be less than or equal "
+            "to the input max_size. "
+            "But out's memory_size = %u, max_size = %u.",
+            size, max_size));
     if (is_cpu_place(place)) {
       PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size,
-                                           cudaMemcpyDeviceToHost, *stream_));
+                                           cudaMemcpyDeviceToHost, *stream_),
+                        platform::errors::External(
+                            "cudaMemcpyAsync(cudaMemcpyDeviceToHost) error."));
     } else if (is_gpu_place(place)) {
-      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size,
-                                           cudaMemcpyDeviceToDevice, *stream_));
+      PADDLE_ENFORCE_EQ(
+          0, cudaMemcpyAsync(out->data<float>(), in, size,
+                             cudaMemcpyDeviceToDevice, *stream_),
+          platform::errors::External(
+              "cudaMemcpyAsync(cudaMemcpyDeviceToDevice) error."));
     } else {
-      PADDLE_THROW("Unknown device for converter");
+      PADDLE_THROW(platform::errors::NotFound("Unknown device for converter"));
     }
     cudaStreamSynchronize(*stream_);
   }
diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.h b/paddle/fluid/inference/tensorrt/convert/io_converter.h
index 5daa242f6ab..58c178028b8 100644
--- a/paddle/fluid/inference/tensorrt/convert/io_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/io_converter.h
@@ -44,10 +44,14 @@ class EngineIOConverter {
 
   static void ConvertInput(const std::string& op_type, const LoDTensor& in,
                            void* out, size_t max_size, cudaStream_t* stream) {
-    PADDLE_ENFORCE(stream != nullptr);
+    PADDLE_ENFORCE_NOT_NULL(stream,
+                            platform::errors::InvalidArgument(
+                                "The input stream must not be nullptr."));
     auto* converter = Registry<EngineIOConverter>::Global().Lookup(
         op_type, "default" /* default_type */);
-    PADDLE_ENFORCE_NOT_NULL(converter);
+    PADDLE_ENFORCE_NOT_NULL(
+        converter, platform::errors::Unimplemented(
+                       "The %s in is not supported yet.", op_type.c_str()));
     converter->SetStream(stream);
     (*converter)(in, out, max_size);
   }
@@ -55,10 +59,14 @@ class EngineIOConverter {
   static void ConvertOutput(const std::string& op_type, const void* in,
                             LoDTensor* out, size_t max_size,
                             cudaStream_t* stream) {
-    PADDLE_ENFORCE(stream != nullptr);
+    PADDLE_ENFORCE_NOT_NULL(stream,
+                            platform::errors::InvalidArgument(
+                                "The input stream must not be nullptr."));
     auto* converter = Registry<EngineIOConverter>::Global().Lookup(
         op_type, "default" /* default_type */);
-    PADDLE_ENFORCE_NOT_NULL(converter);
+    PADDLE_ENFORCE_NOT_NULL(
+        converter, platform::errors::Unimplemented(
+                       "The %s in not supported yet.", op_type.c_str()));
     converter->SetStream(stream);
     (*converter)(in, out, max_size);
   }
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index f4b0f5f23d8..ac0a04b9a11 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -53,7 +53,12 @@ class OpConverter {
     OpConverter* it{nullptr};
 
     if (op_desc.Type() == "mul") {
-      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
+      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL,
+                        platform::errors::InvalidArgument(
+                            "The input op mul's Input(\"Y\")."
+                            "size() should equal to 1, but reveceid "
+                            "Input(\"Y\").size() = %u.",
+                            op_desc.Input("Y").size()));
       std::string Y = op_desc.Input("Y")[0];
       if (parameters.count(Y)) {
         it = Registry<OpConverter>::Global().Lookup("fc");
@@ -66,38 +71,51 @@ class OpConverter {
       // static std::unordered_set<std::string> add_weight_op_set {"add", "mul",
       // "sub", "div"};
       static std::unordered_set<std::string> add_weight_op_set{"add", "mul"};
-      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
+      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL,
+                        platform::errors::InvalidArgument(
+                            "The input op's Input(\"Y\")."
+                            "size() should equal to 1, but reveceid "
+                            "Input(\"Y\").size() = %u.",
+                            op_desc.Input("Y").size()));
       int op_type_len = op_desc.Type().size();
       std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len);
       std::string Y = op_desc.Input("Y")[0];
       if (parameters.count(Y)) {
-        PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0,
-                       "Unsupported elementwise type" + op_type);
+        PADDLE_ENFORCE_GT(
+            add_weight_op_set.count(op_type), 0,
+            platform::errors::Unimplemented("Unsupported elementwise type %s",
+                                            op_type.c_str()));
         it = Registry<OpConverter>::Global().Lookup("elementwise_" + op_type +
                                                     "_weight");
-        PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
-                                op_desc.Type());
+        PADDLE_ENFORCE_NOT_NULL(
+            it, platform::errors::Unimplemented(
+                    "no OpConverter for optype [%s]", op_desc.Type()));
       } else {
-        PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0,
-                       "Unsupported elementwise type" + op_type);
+        PADDLE_ENFORCE_GT(
+            add_tensor_op_set.count(op_type), 0,
+            platform::errors::Unimplemented("Unsupported elementwise type %s",
+                                            op_type.c_str()));
         it = Registry<OpConverter>::Global().Lookup("elementwise_" + op_type +
                                                     "_tensor");
       }
-      PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
-                              op_desc.Type());
+      PADDLE_ENFORCE_NOT_NULL(
+          it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
+                                              op_desc.Type()));
     }
 
     if (op_desc.Type() == "depthwise_conv2d") {
       it = Registry<OpConverter>::Global().Lookup("conv2d");
-      PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
-                              op_desc.Type());
+      PADDLE_ENFORCE_NOT_NULL(
+          it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
+                                              op_desc.Type()));
     }
 
     if (!it) {
       it = Registry<OpConverter>::Global().Lookup(op_desc.Type());
     }
-    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
-                            op_desc.Type());
+    PADDLE_ENFORCE_NOT_NULL(
+        it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
+                                            op_desc.Type()));
 
     it->SetEngine(engine);
     (*it)(op, scope, test_mode);
@@ -149,9 +167,13 @@ class OpConverter {
     for (auto& input : inputs) {
       if (parameters.count(input)) continue;
       auto* var = block_desc->FindVar(input);
-      PADDLE_ENFORCE(var, "no variable called %s", input);
-      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
-                        "TensorRT engine only takes LoDTensor as input");
+      PADDLE_ENFORCE_NOT_NULL(
+          var, platform::errors::NotFound("no variable called %s in block.",
+                                          input.c_str()));
+      PADDLE_ENFORCE_EQ(
+          var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
+          platform::errors::InvalidArgument("TensorRT engine only takes "
+                                            "LoDTensor as input"));
       auto var_shape = var->GetShape();
       if (engine->with_dynamic_shape()) {
 #if IS_TRT_VERSION_GE(6000)
diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc
index a1b0f3b4310..dd594404d33 100644
--- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc
@@ -39,9 +39,22 @@ class PadOpConverter : public OpConverter {
     nvinfer1::Dims input_shape = input->getDimensions();
     int nbDims = input_shape.nbDims;
     int pad_size = static_cast<int>(paddings.size());
-    PADDLE_ENFORCE_GE(nbDims, 2);
-    PADDLE_ENFORCE_EQ((nbDims + 1) * 2, pad_size);
-    PADDLE_ENFORCE(pad_value == 0.0, "The pad layer of TRT only support zero.");
+    PADDLE_ENFORCE_GE(
+        nbDims, 2,
+        platform::errors::InvalidArgument(
+            "Input X[0]'s dimension should greater than or equal to 2. "
+            "But received %d.",
+            nbDims));
+    PADDLE_ENFORCE_EQ(
+        (nbDims + 1) * 2, pad_size,
+        platform::errors::InvalidArgument("Input X[0]'s dimension(nbDims for "
+                                          "short) should meet the condition:"
+                                          "(nbDims + 1) * 2 == pad_size. But "
+                                          "received nbDims:%d, pad_size:%d.",
+                                          nbDims, pad_size));
+    PADDLE_ENFORCE_EQ(pad_value, 0.0,
+                      platform::errors::InvalidArgument(
+                          "The pad layer of TRT only support zero."));
 
     nvinfer1::DimsHW pre_pad(paddings[pad_size - 4], paddings[pad_size - 2]);
     nvinfer1::DimsHW post_pad(paddings[pad_size - 3], paddings[pad_size - 1]);
@@ -50,7 +63,9 @@ class PadOpConverter : public OpConverter {
                                        *const_cast<nvinfer1::ITensor*>(input),
                                        pre_pad, post_pad);
 
-    PADDLE_ENFORCE(layer != nullptr);
+    PADDLE_ENFORCE_NOT_NULL(layer,
+                            platform::errors::External(
+                                "add padding layer to tensorrt engine error"));
     auto output_name = op_desc.Output("Out")[0];
     RreplenishLayerAndOutput(layer, "pad", {output_name}, test_mode);
   }
diff --git a/paddle/fluid/inference/tensorrt/convert/swish_op.cc b/paddle/fluid/inference/tensorrt/convert/swish_op.cc
index 4b3e1c9e70a..e220d80f0d7 100644
--- a/paddle/fluid/inference/tensorrt/convert/swish_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/swish_op.cc
@@ -28,11 +28,20 @@ class SwishOpConverter : public OpConverter {
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
     int input_num = op_desc.Input("X").size();
-    PADDLE_ENFORCE(input_num == 1);
+    PADDLE_ENFORCE_EQ(input_num, 1,
+                      platform::errors::InvalidArgument(
+                          "The input X's size must equal to 1 in TRT swish op."
+                          " But received X's size %d.",
+                          input_num));
     auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
     // Get output
     size_t output_num = op_desc.Output("Out").size();
-    PADDLE_ENFORCE(output_num == 1);
+    PADDLE_ENFORCE_EQ(
+        output_num, 1UL,
+        platform::errors::InvalidArgument(
+            "The ouput Out's size must equal to 1 in TRT swish op. "
+            "But received Out's size %u.",
+            output_num));
     // Get attrs
     float beta = BOOST_GET_CONST(float, op_desc.GetAttr("beta"));
 
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index 3c48c8192f6..cfb25eb2ba8 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -49,7 +49,10 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
                      const platform::DeviceContext& ctx) {
   auto dims = tensor->dims();
   size_t num_elements = analysis::AccuDims(dims, dims.size());
-  PADDLE_ENFORCE_GT(num_elements, 0);
+  PADDLE_ENFORCE_GT(
+      num_elements, 0UL,
+      platform::errors::PermissionDenied("RandomizeTensor only can be used for "
+                                         "tensor which dims is not zero."));
 
   platform::CPUPlace cpu_place;
   framework::LoDTensor temp_tensor;
@@ -79,7 +82,8 @@ class TRTConvertValidation {
         scope_(scope),
         if_add_batch_(if_add_batch),
         max_batch_size_(max_batch_size) {
-    PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
+    PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0,
+                      platform::errors::External("cudaStreamCreate error."));
     engine_.reset(new TensorRTEngine(max_batch_size, workspace_size));
     engine_->InitNetwork();
   }
@@ -154,7 +158,12 @@ class TRTConvertValidation {
   void Execute(int batch_size,
                std::unordered_set<std::string> neglected_output = {}) {
     // Execute Fluid Op
-    PADDLE_ENFORCE_LE(batch_size, max_batch_size_);
+    PADDLE_ENFORCE_LE(batch_size, max_batch_size_,
+                      platform::errors::InvalidArgument(
+                          "Runtime batch_size should be less than or equal to "
+                          "max_batch_size_. "
+                          "But received batch_size:%d, max_batch_size_:%d",
+                          batch_size, max_batch_size_));
     platform::CUDADeviceContext ctx(place_);
     op_->Run(scope_, place_);
     cudaStreamSynchronize(stream_);
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py
new file mode 100644
index 00000000000..060f6c6c5f0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import AnalysisConfig
+
+
+class PadOpTRTTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[1, 3, 128, 128], dtype="float32")
+            pad_out = fluid.layers.pad(x=data,
+                                       paddings=[0, 0, 0, 0, 0, 1, 1, 2],
+                                       pad_value=0.0)
+            out = fluid.layers.batch_norm(pad_out, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random((1, 3, 128, 128)).astype("float32")
+        }
+        self.enable_trt = True
+        self.trt_parameters = PadOpTRTTest.TensorRTParam(
+            1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+
+    def test_check_output(self):
+        use_gpu = [False]
+        if core.is_compiled_with_cuda():
+            use_gpu.append(True)
+
+        for i in range(len(use_gpu)):
+            self.check_output_with_option(use_gpu[i])
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab


From dafb0e3bb7ccebd1cf16c09762a1a5b1a2b7db26 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Tue, 15 Sep 2020 11:42:52 +0800
Subject: [PATCH 076/261] Polish framework error message part 6 (#27257)

* polish framework error msg part 6

* polish lossed item

* fix failed unittest

* polish by reviewer comments
---
 paddle/fluid/framework/op_info.h             |  34 ++-
 paddle/fluid/framework/op_kernel_type.cc     |  12 +-
 paddle/fluid/framework/op_proto_maker.cc     |   4 +-
 paddle/fluid/framework/op_registry.h         |   7 +-
 paddle/fluid/framework/op_registry_test.cc   |   3 +-
 paddle/fluid/framework/op_version_registry.h |   8 +-
 paddle/fluid/framework/operator.cc           | 257 ++++++++++++-------
 paddle/fluid/framework/operator_test.cc      |   4 +-
 8 files changed, 212 insertions(+), 117 deletions(-)

diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h
index 171f0839076..89b49997579 100644
--- a/paddle/fluid/framework/op_info.h
+++ b/paddle/fluid/framework/op_info.h
@@ -69,7 +69,8 @@ class OpInfo {
 
   const OpCreator& Creator() const {
     PADDLE_ENFORCE_NOT_NULL(creator_,
-                            "Operator's Creator has not been registered");
+                            platform::errors::NotFound(
+                                "Operator's Creator has not been registered."));
     return creator_;
   }
 
@@ -79,11 +80,12 @@ class OpInfo {
     std::string type = proto_ ? proto_->type() : "unknown";
     PADDLE_ENFORCE_NOT_NULL(
         grad_op_maker_,
-        "Operator %s's GradOpMaker has not been "
-        "registered.\nPlease check whether %s_op has "
-        "grad_op.\nIf not, please set stop_gradient to True "
-        "for its input and output variables using var.stop_gradient=True.",
-        type.c_str(), type.c_str());
+        platform::errors::NotFound(
+            "Operator %s's GradOpMaker has not been "
+            "registered.\nPlease check whether (%s) operator has "
+            "gradient operator.\nIf not, please set stop_gradient to be True "
+            "for its input and output variables using var.stop_gradient=True.",
+            type.c_str(), type.c_str()));
     return grad_op_maker_;
   }
 
@@ -100,11 +102,12 @@ class OpInfo {
     std::string type = proto_ ? proto_->type() : "unknown";
     PADDLE_ENFORCE_NOT_NULL(
         dygraph_grad_op_maker_,
-        "Operator %s's DygraphGradOpMaker has not been "
-        "registered.\nPlease check whether %s_op has "
-        "grad_op.\nIf not, please set stop_gradient to True "
-        "for its input and output variables using var.stop_gradient=True.",
-        type.c_str(), type.c_str());
+        platform::errors::NotFound(
+            "Operator %s's DygraphGradOpMaker has not been "
+            "registered.\nPlease check whether (%s) operator has "
+            "gradient operator.\nIf not, please set stop_gradient to be True "
+            "for its input and output variables using var.stop_gradient=True.",
+            type.c_str(), type.c_str()));
     return dygraph_grad_op_maker_;
   }
 
@@ -130,14 +133,17 @@ class OpInfoMap {
   }
 
   void Insert(const std::string& type, const OpInfo& info) {
-    PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
+    PADDLE_ENFORCE_NE(Has(type), true,
+                      platform::errors::AlreadyExists(
+                          "Operator (%s) has been registered.", type));
     map_.insert({type, info});
   }
 
   const OpInfo& Get(const std::string& type) const {
     auto op_info_ptr = GetNullable(type);
-    PADDLE_ENFORCE_NOT_NULL(op_info_ptr, "Operator %s has not been registered",
-                            type);
+    PADDLE_ENFORCE_NOT_NULL(
+        op_info_ptr,
+        platform::errors::NotFound("Operator (%s) is not registered.", type));
     return *op_info_ptr;
   }
 
diff --git a/paddle/fluid/framework/op_kernel_type.cc b/paddle/fluid/framework/op_kernel_type.cc
index 6d4801e4a0e..e64c3674e74 100644
--- a/paddle/fluid/framework/op_kernel_type.cc
+++ b/paddle/fluid/framework/op_kernel_type.cc
@@ -33,10 +33,18 @@ size_t OpKernelType::Hash::operator()(const OpKernelType& key) const {
   cur_loc += OpKernelType::kLibBits;
 
   int customized_value = key.customized_type_value_;
-  PADDLE_ENFORCE(customized_value < (1 << OpKernelType::kCustomizeBits));
+  PADDLE_ENFORCE_LT(customized_value, (1 << OpKernelType::kCustomizeBits),
+                    platform::errors::Unavailable(
+                        "Too many custom OpKernel attribute values, expected "
+                        "maximum value is %d, received value is %d.",
+                        (1 << OpKernelType::kCustomizeBits), customized_value));
   customized_value = customized_value << cur_loc;
   cur_loc += OpKernelType::kCustomizeBits;
-  PADDLE_ENFORCE(cur_loc < 64);
+  PADDLE_ENFORCE_LT(cur_loc, 64,
+                    platform::errors::Unavailable(
+                        "Too many OpKernel attribute values, expected maximum "
+                        "value is 64, received value is %d.",
+                        cur_loc));
 
   std::hash<int> hasher;
   return hasher(place + data_type + data_layout + library_type +
diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
index 3408ab262c1..357c4fb5e57 100644
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -43,7 +43,9 @@ OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput(
 void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
   std::unordered_set<std::string> names;
   auto checker = [&](const std::string& name) {
-    PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name);
+    PADDLE_ENFORCE_EQ(
+        names.count(name), 0,
+        platform::errors::AlreadyExists("Attribute [%s] is duplicated.", name));
     names.insert(name);
   };
   for (auto& attr : proto_->attrs()) {
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index d8159d6a5c2..6408fadf90a 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -54,9 +54,10 @@ class Registrar {
 template <typename... ARGS>
 struct OperatorRegistrar : public Registrar {
   explicit OperatorRegistrar(const char* op_type) {
-    if (OpInfoMap::Instance().Has(op_type)) {
-      PADDLE_THROW("'%s' is registered more than once.", op_type);
-    }
+    PADDLE_ENFORCE_EQ(
+        OpInfoMap::Instance().Has(op_type), false,
+        platform::errors::AlreadyExists(
+            "Operator '%s' is registered more than once.", op_type));
     static_assert(sizeof...(ARGS) != 0,
                   "OperatorRegistrar should be invoked at least by OpClass");
     OpInfo info;
diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc
index 21d34544676..45fe66d7db3 100644
--- a/paddle/fluid/framework/op_registry_test.cc
+++ b/paddle/fluid/framework/op_registry_test.cc
@@ -58,7 +58,8 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
     AddInput("input", "input of cosine op").AsDuplicable();
     AddOutput("output", "output of cosine op").AsIntermediate();
     auto my_checker = [](int i) {
-      PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
+      PADDLE_ENFORCE_EQ(i % 2, 0, platform::errors::InvalidArgument(
+                                      "'test_attr' must be even!"));
     };
     AddAttr<int>("test_attr", "a simple test attribute")
         .AddCustomChecker(my_checker);
diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h
index 5edd70e035f..fea043a0ff3 100644
--- a/paddle/fluid/framework/op_version_registry.h
+++ b/paddle/fluid/framework/op_version_registry.h
@@ -152,10 +152,10 @@ class OpVersionRegistrar {
     return instance;
   }
   OpVersion& Register(const std::string& op_type) {
-    if (op_version_map_.find(op_type) != op_version_map_.end()) {
-      PADDLE_THROW("'%s' is registered in operator version more than once.",
-                   op_type);
-    }
+    PADDLE_ENFORCE_EQ(
+        op_version_map_.find(op_type), op_version_map_.end(),
+        platform::errors::AlreadyExists(
+            "'%s' is registered in operator version more than once.", op_type));
     op_version_map_.insert({op_type, OpVersion()});
     return op_version_map_[op_type];
   }
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index ca2705f154c..21fc293e841 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -164,15 +164,20 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     VLOG(4) << place << " " << DebugStringEx(&scope);
     if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
-      PADDLE_THROW("Cannot run operator on place %s", place);
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Cannot run operator on place %s, please recompile paddle or "
+          "reinstall Paddle with CUDA support.",
+          place));
 #else
       auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
       platform::SetDeviceId(dev_id);
 #endif
     } else if (platform::is_xpu_place(place)) {
 #ifndef PADDLE_WITH_XPU
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "Cannot run operator on place %s", place));
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Cannot run operator on place %s, please recompile paddle or "
+          "reinstall Paddle with XPU support.",
+          place));
 #else
       auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
       platform::SetXPUDeviceId(dev_id);
@@ -214,7 +219,7 @@ std::string OperatorBase::Input(const std::string& name) const {
   auto& ins = Inputs(name);
   PADDLE_ENFORCE_LE(
       ins.size(), 1UL,
-      platform::errors::AlreadyExists(
+      platform::errors::InvalidArgument(
           "Operator %s's input %s should contain only one variable.", type_,
           name));
   return ins.empty() ? kEmptyVarName : ins[0];
@@ -223,8 +228,10 @@ std::string OperatorBase::Input(const std::string& name) const {
 const std::vector<std::string>& OperatorBase::Inputs(
     const std::string& name) const {
   auto it = inputs_.find(name);
-  PADDLE_ENFORCE(it != inputs_.end(), "Operator %s does not have the input %s.",
-                 type_, name);
+  PADDLE_ENFORCE_NE(
+      it, inputs_.end(),
+      platform::errors::NotFound("Operator %s does not have the input %s.",
+                                 type_, name));
   return it->second;
 }
 
@@ -238,17 +245,21 @@ bool OperatorBase::HasOutputs(const std::string& name) const {
 
 std::string OperatorBase::Output(const std::string& name) const {
   auto& outs = Outputs(name);
-  PADDLE_ENFORCE_LE(outs.size(), 1UL,
-                    "Operator %s's output %s should contain only one variable.",
-                    type_, name);
+  PADDLE_ENFORCE_LE(
+      outs.size(), 1UL,
+      platform::errors::InvalidArgument(
+          "Operator %s's output %s should contain only one variable.", type_,
+          name));
   return outs.empty() ? kEmptyVarName : outs[0];
 }
 
 const std::vector<std::string>& OperatorBase::Outputs(
     const std::string& name) const {
   auto it = outputs_.find(name);
-  PADDLE_ENFORCE(it != outputs_.end(),
-                 "Operator %s does not have an output called %s.", type_, name);
+  PADDLE_ENFORCE_NE(
+      it, outputs_.end(),
+      platform::errors::NotFound(
+          "Operator %s does not have an output called %s.", type_, name));
   return it->second;
 }
 
@@ -391,16 +402,19 @@ void OperatorBase::CheckAllInputOutputSet() const {
 
   for (auto& in : info_->Proto().inputs()) {
     if (!in.dispensable()) {
-      PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(),
-                     "Operator %s's input, %s, is not set", Type(), in.name());
+      PADDLE_ENFORCE_NE(
+          inputs_.find(in.name()), inputs_.end(),
+          platform::errors::NotFound("Operator %s's input (%s) is not set.",
+                                     Type(), in.name()));
     }
   }
 
   for (auto& out : info_->Proto().outputs()) {
     if (!out.dispensable()) {
-      PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(),
-                     "Operator %s's output, %s, is not set", Type(),
-                     out.name());
+      PADDLE_ENFORCE_NE(
+          outputs_.find(out.name()), outputs_.end(),
+          platform::errors::NotFound("Operator %s's output (%s) is not set.",
+                                     Type(), out.name()));
     }
   }
 }
@@ -428,8 +442,9 @@ const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) {
   } else if (var.IsType<SelectedRows>()) {
     return &(var.Get<SelectedRows>().value());
   } else {
-    PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
-                 ToTypeName(var.Type()));
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Variable type is %s, expect LoDTensor or SelectedRows.",
+        ToTypeName(var.Type())));
   }
 }
 
@@ -439,8 +454,9 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) {
   } else if (var->IsType<SelectedRows>()) {
     return var->GetMutable<SelectedRows>()->mutable_value();
   } else {
-    PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
-                 ToTypeName(var->Type()));
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Variable type is %s, expect LoDTensor or SelectedRows.",
+        ToTypeName(var->Type())));
   }
 }
 
@@ -462,7 +478,7 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const {
 
   PADDLE_ENFORCE_LE(
       it->second.size(), 1UL,
-      platform::errors::AlreadyExists(
+      platform::errors::InvalidArgument(
           "Operator %s's input %s should contain only one variable.",
           op_.Type(), name));
   return it->second.empty() ? nullptr : it->second[0];
@@ -472,9 +488,11 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const {
   auto it = ctx_.outputs.find(name);
   if (it == ctx_.outputs.end()) return nullptr;
 
-  PADDLE_ENFORCE_LE(it->second.size(), 1UL,
-                    "Operator %s's output %s should contain only one variable.",
-                    op_.Type(), name);
+  PADDLE_ENFORCE_LE(
+      it->second.size(), 1UL,
+      platform::errors::InvalidArgument(
+          "Operator %s's output %s should contain only one variable.",
+          op_.Type(), name));
   return it->second.empty() ? nullptr : it->second[0];
 }
 
@@ -497,10 +515,11 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
   std::transform(vars.begin(), vars.end(), std::back_inserter(res),
                  [&](const Variable* var) -> const Tensor* {
                    if (var == nullptr) return nullptr;
-                   PADDLE_ENFORCE(
-                       var->IsType<LoDTensor>(),
-                       "should be LoDTensor, but the received type is %s",
-                       ToTypeName(var->Type()));
+                   PADDLE_ENFORCE_EQ(var->IsType<LoDTensor>(), true,
+                                     platform::errors::InvalidArgument(
+                                         "Input variable should be LoDTensor, "
+                                         "but the received type is %s.",
+                                         ToTypeName(var->Type())));
                    return &(var->Get<LoDTensor>());
                  });
   return res;
@@ -558,8 +577,10 @@ class RuntimeInferShapeContext : public InferShapeContext {
     }
     const auto& in = it->second;
     if (in.size() == 0) return false;
-    PADDLE_ENFORCE_EQ(in.size(), 1UL,
-                      "Input %s should not have more than one inputs", name);
+    PADDLE_ENFORCE_EQ(
+        in.size(), 1UL,
+        platform::errors::InvalidArgument(
+            "Input %s should not contain more than one inputs.", name));
     return in[0] != nullptr;
   }
 
@@ -574,8 +595,10 @@ class RuntimeInferShapeContext : public InferShapeContext {
     if (out.size() == 0) {
       return false;
     }
-    PADDLE_ENFORCE_EQ(out.size(), 1UL,
-                      "Output %s should not have more than one outputs", name);
+    PADDLE_ENFORCE_EQ(
+        out.size(), 1UL,
+        platform::errors::InvalidArgument(
+            "Output %s should not contain more than one outputs.", name));
     return out[0] != nullptr;
   }
 
@@ -644,16 +667,31 @@ class RuntimeInferShapeContext : public InferShapeContext {
                 size_t j = 0) override {
     auto in_it = ctx_.inputs.find(in);
     auto out_it = ctx_.outputs.find(out);
-    PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i,
-                   "Inputs %s should have %llu argument", in, i);
-    PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j,
-                   "Outputs %s should have %llu argument", out, j);
+    PADDLE_ENFORCE_NE(
+        in_it, ctx_.inputs.end(),
+        platform::errors::NotFound("Input %s does not exist.", in));
+    PADDLE_ENFORCE_NE(
+        out_it, ctx_.outputs.end(),
+        platform::errors::NotFound("Output %s does not exist.", out));
+    PADDLE_ENFORCE_LT(i, in_it->second.size(),
+                      platform::errors::InvalidArgument(
+                          "The index of input dimension is out of range, "
+                          "excepted index less than %zu, but received %zu.",
+                          in_it->second.size(), i));
+    PADDLE_ENFORCE_LT(j, out_it->second.size(),
+                      platform::errors::InvalidArgument(
+                          "The index of output dimension is out of range, "
+                          "excepted index less than %zu, but received %zu.",
+                          out_it->second.size(), j));
 
     Variable* in_var = in_it->second[i];
     Variable* out_var = out_it->second[j];
 
-    PADDLE_ENFORCE(in_var->Type() == out_var->Type(),
-                   "The type of %s and %s is not the same.", in, out);
+    PADDLE_ENFORCE_EQ(
+        in_var->Type(), out_var->Type(),
+        platform::errors::InvalidArgument(
+            "The type of input (%s) and output (%s) are inconsistent.", in,
+            out));
 
     if (in_var->IsType<framework::SelectedRows>()) {
       auto& in_sele_rows = in_var->Get<framework::SelectedRows>();
@@ -666,9 +704,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
       auto* out_lod_tensor = out_var->GetMutable<framework::LoDTensor>();
       out_lod_tensor->Resize(in_lod_tensor.dims());
     } else {
-      PADDLE_THROW(
+      PADDLE_THROW(platform::errors::Unimplemented(
           "Currently, the input type of ShareDim only can be LoDTensor "
-          "or SelectedRows.");
+          "or SelectedRows."));
     }
   }
 
@@ -721,16 +759,30 @@ class RuntimeInferShapeContext : public InferShapeContext {
                 size_t j = 0) const override {
     auto in_it = ctx_.inputs.find(in);
     auto out_it = ctx_.outputs.find(out);
-    PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i,
-                   "Inputs %s should have %llu argument", in, i);
-    PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j,
-                   "Outputs %s should have %llu argument", out, j);
+    PADDLE_ENFORCE_NE(
+        in_it, ctx_.inputs.end(),
+        platform::errors::NotFound("Input %s does not exist.", in));
+    PADDLE_ENFORCE_NE(
+        out_it, ctx_.outputs.end(),
+        platform::errors::NotFound("Output %s does not exist.", out));
+    PADDLE_ENFORCE_LT(i, in_it->second.size(),
+                      platform::errors::InvalidArgument(
+                          "The index of input dimension is out of range, "
+                          "excepted index less than %zu, but received %zu.",
+                          in_it->second.size(), i));
+    PADDLE_ENFORCE_LT(j, out_it->second.size(),
+                      platform::errors::InvalidArgument(
+                          "The index of output dimension is out of range, "
+                          "excepted index less than %zu, but received %zu.",
+                          out_it->second.size(), j));
 
     Variable* in_var = in_it->second.at(i);
     if (!in_var->IsType<LoDTensor>()) return;
     Variable* out_var = out_it->second.at(j);
-    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
-                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
+    PADDLE_ENFORCE_EQ(
+        out_var->IsType<LoDTensor>(), true,
+        platform::errors::InvalidArgument(
+            "The %zu-th output of Output(%s) must be LoDTensor.", j, out));
     auto& in_tensor = in_var->Get<LoDTensor>();
     auto* out_tensor = out_var->GetMutable<LoDTensor>();
     out_tensor->set_lod(in_tensor.lod());
@@ -757,18 +809,18 @@ class RuntimeInferShapeContext : public InferShapeContext {
   }
 
   int32_t GetLoDLevel(const std::string& in, size_t i = 0) const override {
-    PADDLE_THROW(
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
         "GetLoDLevel is only used in compile time. The calculation of "
         "output's actual lod is different among operators so that should be "
-        "set in the runtime kernel.");
+        "set in the runtime kernel."));
   }
 
   void SetLoDLevel(const std::string& out, int32_t lod_level,
                    size_t j = 0) const override {
-    PADDLE_THROW(
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
         "SetLoDLevel is only used in compile time. The calculation of "
         "output's actual lod is different among operators so that should be "
-        "set in the runtime kernel.");
+        "set in the runtime kernel."));
   }
 
   bool IsRuntime() const override { return true; }
@@ -794,9 +846,11 @@ class RuntimeInferShapeContext : public InferShapeContext {
 
   DDim GetInputDim(const std::string& name) const override {
     const std::vector<Variable*>& vars = InputVars(name);
-    PADDLE_ENFORCE_EQ(vars.size(), 1UL,
-                      "Input(%s) should hold one element, but now it holds %d",
-                      name, vars.size());
+    PADDLE_ENFORCE_EQ(
+        vars.size(), 1UL,
+        platform::errors::InvalidArgument(
+            "Input(%s) should hold one element, but now it holds %zu elements.",
+            name, vars.size()));
     return this->GetDim(vars[0]);
   }
 
@@ -817,9 +871,11 @@ class RuntimeInferShapeContext : public InferShapeContext {
 
   void SetOutputDim(const std::string& name, const DDim& dim) override {
     auto& vars = OutputVars(name);
-    PADDLE_ENFORCE_EQ(vars.size(), 1UL,
-                      "Output(%s) should hold one element, but now it holds %d",
-                      name, vars.size());
+    PADDLE_ENFORCE_EQ(
+        vars.size(), 1UL,
+        platform::errors::InvalidArgument("Output(%s) should hold one element, "
+                                          "but now it holds %zu elements.",
+                                          name, vars.size()));
     SetDim(vars[0], dim);
   }
 
@@ -831,16 +887,17 @@ class RuntimeInferShapeContext : public InferShapeContext {
 
  protected:
   DDim GetDim(Variable* var) const {
-    PADDLE_ENFORCE_NOT_NULL(var);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::InvalidArgument("Input variable is nullptr."));
     if (var->IsType<LoDTensor>()) {
       return var->Get<LoDTensor>().dims();
     } else if (var->IsType<SelectedRows>()) {
       return var->Get<SelectedRows>().GetCompleteDims();
     } else {
-      PADDLE_THROW(
-          "Only LoDTensor/SelectedRows support 'GetDim', but Variables "
-          "type_id is %s.",
-          ToTypeName(var->Type()));
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Only LoDTensor or SelectedRows support 'GetDim', but input "
+          "Variable's type is %s.",
+          ToTypeName(var->Type())));
     }
   }
 
@@ -853,7 +910,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
   }
 
   std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
-    PADDLE_THROW("Only compile time support this method");
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "GetRepeatedDims method only ban be used in compile time."));
   }
 
   void SetDim(Variable* var, const DDim& dim) {
@@ -862,15 +920,22 @@ class RuntimeInferShapeContext : public InferShapeContext {
     } else if (var->IsType<SelectedRows>()) {
       var->GetMutable<SelectedRows>()->set_height(dim[0]);
     } else {
-      PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
-                   ToTypeName(var->Type()));
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Variable type error, expect LoDTensor or SelectedRows, but received "
+          "(%s).",
+          ToTypeName(var->Type())));
     }
   }
 
   void SetDims(const std::vector<Variable*>& vars,
                const std::vector<DDim>& dims) {
     size_t length = vars.size();
-    PADDLE_ENFORCE_EQ(length, dims.size());
+    PADDLE_ENFORCE_EQ(length, dims.size(),
+                      platform::errors::InvalidArgument(
+                          "The number of input variables do not match the "
+                          "number of input dimensions, the number of variables "
+                          "is %zu, the number of dimensions is %zu.",
+                          length, dims.size()));
     for (size_t i = 0; i < length; ++i) {
       if (vars[i] == nullptr) {
         continue;
@@ -881,7 +946,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
 
   void SetRepeatedDims(const std::string& name,
                        const std::vector<DDim>& dims) override {
-    PADDLE_THROW("Only compile time support this method");
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "SetRepeatedDims method only can be used in compile time."));
   }
 
   std::vector<proto::VarType::Type> GetVarTypes(
@@ -901,16 +967,19 @@ class RuntimeInferShapeContext : public InferShapeContext {
  private:
   const std::vector<Variable*>& InputVars(const std::string& name) const {
     auto it = ctx_.inputs.find(name);
-    PADDLE_ENFORCE(it != ctx_.inputs.end(),
-                   "Operator %s does not have the input %s.", op_.Type(), name);
+    PADDLE_ENFORCE_NE(
+        it, ctx_.inputs.end(),
+        platform::errors::NotFound(
+            "Operator (%s) does not have the input (%s).", op_.Type(), name));
     return it->second;
   }
 
   const std::vector<Variable*>& OutputVars(const std::string& name) const {
     auto it = ctx_.outputs.find(name);
-    PADDLE_ENFORCE(it != ctx_.outputs.end(),
-                   "Operator %s does not have the outputs %s.", op_.Type(),
-                   name);
+    PADDLE_ENFORCE_NE(
+        it, ctx_.outputs.end(),
+        platform::errors::NotFound(
+            "Operator (%s) does not have the outputs (%s).", op_.Type(), name));
     return it->second;
   }
 
@@ -928,10 +997,14 @@ static void CheckTensorNANOrInf(const std::string& op_type,
       tensor.type() != proto::VarType::FP64) {
     return;
   }
-  PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
-                 "Operator %s output Tensor %s contains Inf", op_type, name);
-  PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor),
-                 "Operator %s output Tensor %s contains NAN", op_type, name);
+  PADDLE_ENFORCE_NE(
+      framework::TensorContainsInf(tensor), true,
+      platform::errors::Fatal("Operator %s output Tensor %s contains Inf.",
+                              op_type, name));
+  PADDLE_ENFORCE_NE(
+      framework::TensorContainsNAN(tensor), true,
+      platform::errors::Fatal("Operator %s output Tensor %s contains NAN.",
+                              op_type, name));
 }
 
 void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
@@ -1074,10 +1147,11 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
   // check if op[type] has kernel registered.
   auto& all_op_kernels = AllOpKernels();
   auto kernels_iter = all_op_kernels.find(type_);
-  if (kernels_iter == all_op_kernels.end()) {
-    PADDLE_THROW(
-        "There are no kernels which are registered in the %s operator.", type_);
-  }
+  PADDLE_ENFORCE_NE(
+      kernels_iter, all_op_kernels.end(),
+      platform::errors::Unavailable(
+          "There are no kernels which are registered in the %s operator.",
+          type_));
 
   OpKernelMap& kernels = kernels_iter->second;
 
@@ -1131,10 +1205,10 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
     kernel_iter = kernels.find(expected_kernel_key);
   }
 #endif
-  if (kernel_iter == kernels.end()) {
-    PADDLE_THROW("op %s does not have kernel for %s", type_,
-                 KernelTypeToString(expected_kernel_key));
-  }
+  PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
+                    platform::errors::NotFound(
+                        "Operator (%s) does not have kernel for %s.", type_,
+                        KernelTypeToString(expected_kernel_key)));
 
   std::lock_guard<std::mutex> lock(cache_update_mutex_);
   if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) {
@@ -1149,13 +1223,14 @@ void OperatorWithKernel::TransferInplaceVarsBack(
   for (auto& var_name : inplace_vars) {
     VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
     auto* origin_var = scope.FindVar(var_name);
-    PADDLE_ENFORCE_NOT_NULL(origin_var, "The var[%s] should not be nullptr.",
-                            var_name);
+    PADDLE_ENFORCE_NOT_NULL(origin_var,
+                            platform::errors::InvalidArgument(
+                                "The variable[%s] is nullptr.", var_name));
     auto* original_tensor =
         GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var);
     auto* var = transfer_scope.FindVar(var_name);
-    PADDLE_ENFORCE_NOT_NULL(var, "The var[%s] should not be nullptr.",
-                            var_name);
+    PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument(
+                                     "The variable[%s] is nullptr.", var_name));
     auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var);
     auto original_dims = original_tensor->dims();
     original_tensor->ShareDataWith(*transformed_tensor);
@@ -1380,9 +1455,11 @@ proto::VarType::Type OperatorWithKernel::IndicateVarDataType(
   ParseInputDataType(ctx, name, &data_type);
   PADDLE_ENFORCE_NE(
       data_type, dafault_data_type,
-      "The Input Variable(%s) of %s Op used to determine kernel data type "
-      "is empty or not LoDTensor or SelectedRows or LoDTensorArray.",
-      name, Type());
+      platform::errors::InvalidArgument(
+          "The Input Variable(%s) of (%s) Operator used to determine kernel "
+          "data type is empty or not LoDTensor or SelectedRows or "
+          "LoDTensorArray.",
+          name, Type()));
   return data_type;
 }
 
diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc
index c4ce627ff1f..218fc8880bb 100644
--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
@@ -495,9 +495,9 @@ TEST(IndicateVarDataTypeTest, other) {
     EXPECT_TRUE(
         ex_msg.find(
             "The Input Variable(Other) of "
-            "indicate_other_data_type_test Op used to "
+            "(indicate_other_data_type_test) Operator used to "
             "determine kernel data type "
-            "is empty or not LoDTensor or SelectedRows or LoDTensorArray") !=
+            "is empty or not LoDTensor or SelectedRows or LoDTensorArray.") !=
         std::string::npos);
   }
   ASSERT_TRUE(caught);
-- 
GitLab


From bd41c314ba8fd059bc1bbc45e6aaa7bcb535ed15 Mon Sep 17 00:00:00 2001
From: LoveAn <mr.avin0323@gmail.com>
Date: Tue, 15 Sep 2020 12:01:53 +0800
Subject: [PATCH 077/261] Add *.bat file for building compile environment on
 windows, test=develop (#27300)

Add *.bat file for building compile environment on windows
---
 tools/windows/build_compile_environment.bat | 190 ++++++++++++++++++++
 1 file changed, 190 insertions(+)
 create mode 100644 tools/windows/build_compile_environment.bat

diff --git a/tools/windows/build_compile_environment.bat b/tools/windows/build_compile_environment.bat
new file mode 100644
index 00000000000..16665ac4aaf
--- /dev/null
+++ b/tools/windows/build_compile_environment.bat
@@ -0,0 +1,190 @@
+:: Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+::
+:: Licensed under the Apache License, Version 2.0 (the "License");
+:: you may not use this file except in compliance with the License.
+:: You may obtain a copy of the License at
+::
+::     http://www.apache.org/licenses/LICENSE-2.0
+::
+:: Unless required by applicable law or agreed to in writing, software
+:: distributed under the License is distributed on an "AS IS" BASIS,
+:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+:: See the License for the specific language governing permissions and
+:: limitations under the License.
+::
+:: ===============================
+:: Build Paddle compile enviroment
+:: ===============================
+:: Description:
+::   
+::   Install compile enviroment for xly CI.
+::
+::   Include:
+::     1. CMake 3.17.0
+::     2. Git 2.28.0
+::     3. Python 3.7.8
+::     4. Visual Studio 2015 with update 3
+::     5. CUDA 10 [miss cudnn]
+::     6. java jre [not complete]
+::     7. xly agent [not complete]
+
+:: Echo command is not required.
+@echo off
+
+:: ===== start step 0: wget tool =====
+:: Download wget for windows when there is not wget tool.
+echo ">>>>>>>> step [0/7]: wget tool"
+wget --help > nul 2> nul || call:install_wget
+goto cmake
+
+:install_wget
+echo There is not wget in this PC, will download wget 1.20.
+echo Download package from https://eternallybored.org/misc/wget/1.20/64/wget.exe ...
+certutil -urlcache -split -f https://eternallybored.org/misc/wget/1.20/64/wget.exe > nul 2> nul
+if %errorlevel% == 0 (
+  echo Download wget tool into %cd% success.
+) else (
+  echo Error***** Download wget tool failed, please download it before rerun.
+  exit /b 1
+) 
+goto :eof
+:: ===== end step 0: wget tool =====
+
+:: ===== start step 1: cmake =====
+:: Download CMake-3.17.0 and add in PATH when it not installed.
+:: TODO: limit version >= 3.17.0
+:cmake
+echo ">>>>>>>> step [1/7]: CMake 3.17.0"
+cmake --help > nul 2> nul || call :install_cmake
+goto git
+
+:install_cmake
+echo There is not cmake in this PC, will install cmake-3.17.0.
+echo Download package from https://cmake.org/files/v3.17/cmake-3.17.0-win64-x64.msi ...
+wget -O cmake-3.17.0-win64-x64.msi https://cmake.org/files/v3.17/cmake-3.17.0-win64-x64.msi
+echo Install cmake-3.17.0 ...
+:: /passive [silent installation]
+:: /norestart [do not restart]
+:: ADD_CMAKE_TO_PATH = System [add CMake to the system PATH for all users]
+start /wait cmake-3.17.0-win64-x64.msi /passive /norestart ADD_CMAKE_TO_PATH=System
+if %errorlevel% == 0 (
+  echo Install CMake-3.17.0 success!
+) else (
+  echo Error***** Install Cmake-3.17.0 failed, please re-install it manually.
+)
+del cmake-3.17.0-win64-x64.msi
+goto :eof
+:: ===== end step 1: cmake =====
+
+:: ===== start step 2: Git =====
+:: Download Git-2.28.0 and add in PATH when it not installed.
+:: TODO: limit version >= 2.28.0
+:git
+echo ">>>>>>>> step [2/8]: Git 2.28.0"
+git --help > nul 2> nul || call :install_git
+goto python
+
+:install_git
+echo There is not git in this PC, will install Git-2.28.0.
+echo Download package from https://github.com/git-for-windows/git/releases/download/v2.28.0.windows.1/Git-2.28.0-64-bit.exe ...
+wget -O Git-2.28.0-64-bit.exe https://github.com/git-for-windows/git/releases/download/v2.28.0.windows.1/Git-2.28.0-64-bit.exe
+echo Install Git-2.28.0 ...
+:: /SILENT [silent install]
+:: /ALLUSERS [add path for all users]
+:: /NORESTART [do not restart]
+start /wait Git-2.28.0-64-bit.exe /SILENT /ALLUSERS /NORESTART
+if %errorlevel% == 0 (
+  echo Install Git-2.28.0 success!
+) else (
+  echo Error***** Install Git-2.28.0 failed, please re-install it manually.
+)
+del Git-2.28.0-64-bit.exe
+goto :eof
+:: ===== end step 2: Git =====
+
+:: ===== start step 3: Python =====
+:: Download Python-3.7.8 and add in PATH when it not installed.
+:: TODO: limit version >= 3.7.8
+:python
+echo ">>>>>>>> step [3/7]: Python 3.7.8"
+python -V 2>&1 | findstr /C:"Python 3.7.8" > nul 2> nul || call :install_python
+goto vs2015
+
+:install_python
+echo There is not Python in this PC, will install Python-3.7.8.
+echo Download package from https://npm.taobao.org/mirrors/python/3.7.8/python-3.7.8-amd64.exe ...
+wget -O python-3.7.8-amd64.exe https://npm.taobao.org/mirrors/python/3.7.8/python-3.7.8-amd64.exe
+echo Install Python-3.7.8 ...
+:: /passive [silent install]
+:: InstallAllUsers [add path for all users]
+:: PrependPath [add script/install into PATH]
+:: TargetDir [install directory]
+start /wait python-3.7.8-amd64.exe /passive InstallAllUsers=1 PrependPath=1 TargetDir=C:\Python37
+if %errorlevel% == 0 (
+  echo Install python-3.7.8 success!
+) else (
+  echo Error***** Install python-3.7.8 failed, please re-install it manually.
+)
+del python-3.7.8-amd64.exe
+goto :eof
+:: ===== end step 3: Python =====
+
+:: ===== start step 4: Visual Studio 2015 =====
+:: Download Visual Studio 2015 when it not installed.
+:vs2015
+echo ">>>>>>>> step [4/7]: Visual Studio 2015"
+cmd /C "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 > nul 2> nul || call :install_visual_studio
+goto :cuda10
+
+:install_visual_studio
+echo There is not Visual Studio in this PC, will install VS2015.
+echo Download package from "https://download.my.visualstudio.com/pr/en_visual_studio_professional_2015_with_update_3_x86_x64_web_installer_8922978.exe"
+wget -O vs_installer.exe "https://download.my.visualstudio.com/pr/en_visual_studio_professional_2015_with_update_3_x86_x64_web_installer_8922978.exe?t=9ee7a96d-ca80-4b84-af2c-7dd86996a0aa&e=1600103404&h=3cdea1e81c04aa4e846f5314972c46eb&su=1"
+echo Install Visual Studio 2015 ...
+:: /passive [silent install]
+:: /norestart [no restart]
+:: /NoRefresh [no refresh]
+:: /InstallSelectableItems NativeLanguageSupport_Group [select Visual C++ for installing]
+start /wait visual_installer.exe /passive /norestart /NoRefresh /InstallSelectableItems NativeLanguageSupport_Group
+if %errorlevel% == 0 (
+  echo Install Visual Studio 2015 success!
+) else (
+  echo Error***** Install Visual Studio 2015 failed, please re-install it manually.
+)
+del vs_installer.exe
+goto :eof
+:: ===== end step 4: Visual Studio 2015 =====
+
+:: ===== start step 5: CUDA 10 =====
+:cuda10
+echo ">>>>>>>> step [5/7]: CUDA 10.0"
+nvcc --version > nul 2> nul || call :install_cuda
+goto java-jre
+
+:install_cuda
+echo There is not CUDA in this PC, will install CUDA-10.0.
+echo Download package from "https://developer.download.nvidia.cn/compute/cuda/10.0/secure/Prod/network_installers/cuda_10.0.130_win10_network.exe"
+wget -O cuda_installer.exe "https://developer.download.nvidia.cn/compute/cuda/10.0/secure/Prod/network_installers/cuda_10.0.130_win10_network.exe?hG7oBtA2CnxZG7d39onmBdtzrIa2cOukrmW8I0qk3h36vb2Sj0yYGjMElJlxlNhjx8Xu5RlbmdBhCWvP2QcEqMjCoKCXe5lOgr5uIIso_7LqrotgQHbZRZSVBYRT4bIAHPVSPrr4_4KczKvI9Nf3mbO9RJ2Vj6ECD5QphRMJBus0KKNVxO1gsplVL5qaCnE"
+echo Install CUDA-10.0 ...
+:: -s [silent install]
+start /wait cuda_installer.exe -s
+if %errorlevel% == 0 (
+  echo Install CUDA-10.0 success!
+) else (
+  echo Error***** Install CUDA-10.0 failed, please re-install it manually.
+)
+del cuda_installer.exe
+goto :eof
+:: ===== end step 5: CUDA 10 =====
+
+:: ===== start step 6: java jre =====
+:java-jre
+echo ">>>>>>>> step [6/7]: java jre"
+goto xly-agent
+:: ===== end step 6: java jre =====
+
+:: ===== start step 7: xly agent =====
+:xly-agent
+echo ">>>>>>>> step [7/7]: xly agent"
+goto :eof
+:: ===== end step 8: xly agent =====
\ No newline at end of file
-- 
GitLab


From bd77a4258d18328b4e2a9b47e111de803d69933c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?=
 <39303645+Shixiaowei02@users.noreply.github.com>
Date: Tue, 15 Sep 2020 14:45:15 +0800
Subject: [PATCH 078/261] error messages of inference/tests, test=develop
 (#27259)

---
 .../tests/api/analyzer_bert_tester.cc         | 10 +++++--
 .../api/analyzer_capi_pd_tensor_tester.cc     |  8 +++---
 .../tests/api/analyzer_capi_tester.cc         | 10 +++----
 .../tests/api/analyzer_dam_tester.cc          | 16 +++++++++---
 .../analyzer_int8_object_detection_tester.cc  | 10 ++++---
 .../tests/api/analyzer_lac_tester.cc          | 15 ++++++++---
 .../tests/api/analyzer_ner_tester.cc          | 12 ++++++---
 .../tests/api/analyzer_pyramid_dnn_tester.cc  | 12 ++++++---
 .../tests/api/analyzer_rnn2_tester.cc         | 12 ++++++---
 .../tests/api/analyzer_seq_conv1_tester.cc    | 15 ++++++++---
 .../tests/api/analyzer_seq_pool1_tester.cc    | 26 ++++++++++++-------
 .../analyzer_text_classification_tester.cc    |  8 ++++--
 .../tests/api/analyzer_vis_tester.cc          |  8 ++++--
 paddle/fluid/inference/tests/test_helper.h    |  1 +
 14 files changed, 115 insertions(+), 48 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc
index f956c34f23a..2570325c24a 100644
--- a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc
@@ -245,8 +245,14 @@ TEST(Analyzer_bert, transfer_scope_cache) {
   // Since paddle::framework::global_transfer_scope_cache() and
   // paddle::framework::global_transfer_data_cache() are thread_local,
   // their pointer should be different among different thread id.
-  PADDLE_ENFORCE(global_transfer_scope_cache.size(), threads_num);
-  PADDLE_ENFORCE(global_transfer_data_cache.size(), threads_num);
+  PADDLE_ENFORCE_EQ(
+      global_transfer_scope_cache.size(), threads_num,
+      paddle::platform::errors::Fatal(
+          "The size of scope cache is not equal to thread number."));
+  PADDLE_ENFORCE_EQ(
+      global_transfer_data_cache.size(), threads_num,
+      paddle::platform::errors::Fatal(
+          "The size of data cache is not equal to thread number."));
 }
 
 }  // namespace inference
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc
index 0bc67aff7af..a9c24c4503f 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc
@@ -69,11 +69,13 @@ void PD_run() {
   PD_DeletePaddleTensor(input);
   int size;
   const int* out_shape = PD_GetPaddleTensorShape(out_data, &size);
-  CHECK(size == 2) << "The Output shape's size is NOT match.";
+  PADDLE_ENFORCE_EQ(size, 2, paddle::platform::errors::InvalidArgument(
+                                 "The Output shape's size is NOT match."));
   std::vector<int> ref_outshape_size({9, 6});
   for (int i = 0; i < 2; ++i) {
-    CHECK(out_shape[i] == ref_outshape_size[i])
-        << "The Output's shape is NOT match.";
+    PADDLE_ENFORCE_EQ(out_shape[i], ref_outshape_size[i],
+                      paddle::platform::errors::InvalidArgument(
+                          "The Output shape's size is NOT match."));
   }
   PD_DeletePaddleBuf(buf);
 }
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
index d76799a679c..fd20581123c 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
@@ -36,9 +36,9 @@ void zero_copy_run() {
   PD_SwitchIrDebug(config, true);
   PD_SetModel(config, prog_file.c_str(), params_file.c_str());
   bool use_feed_fetch = PD_UseFeedFetchOpsEnabled(config);
-  CHECK(!use_feed_fetch) << "NO";
+  EXPECT_FALSE(use_feed_fetch);
   bool specify_input_names = PD_SpecifyInputName(config);
-  CHECK(specify_input_names) << "NO";
+  EXPECT_TRUE(specify_input_names);
 
   const int batch_size = 1;
   const int channels = 3;
@@ -85,13 +85,13 @@ TEST(PD_AnalysisConfig, profile_mkldnn) {
   PD_SwitchIrDebug(config, true);
   PD_EnableMKLDNN(config);
   bool mkldnn_enable = PD_MkldnnEnabled(config);
-  CHECK(mkldnn_enable) << "NO";
+  EXPECT_TRUE(mkldnn_enable);
   PD_EnableMkldnnQuantizer(config);
   bool quantizer_enable = PD_MkldnnQuantizerEnabled(config);
-  CHECK(quantizer_enable) << "NO";
+  EXPECT_TRUE(quantizer_enable);
   PD_EnableMkldnnBfloat16(config);
   bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config);
-  CHECK(bfloat16_enable) << "NO";
+  EXPECT_TRUE(bfloat16_enable);
   PD_SetMkldnnCacheCapacity(config, 0);
   PD_SetModel(config, prog_file.c_str(), params_file.c_str());
   PD_DeleteAnalysisConfig(config);
diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
index 00a475b6047..d61c28c30d2 100644
--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -126,7 +126,9 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   std::string turn_mask_pre = "turn_mask_";
 
   auto one_batch = data->NextBatch();
-  PADDLE_ENFORCE(!one_batch.response.empty());
+  PADDLE_ENFORCE(
+      !one_batch.response.empty(),
+      paddle::platform::errors::Fatal("The response of one batch is empty."));
   int size = one_batch.response[0].size();
   CHECK_EQ(size, kMaxTurnLen);
   // turn tensor assignment
@@ -214,11 +216,17 @@ void profile(bool use_mkldnn = false) {
                  input_slots_all, &outputs, FLAGS_num_threads);
 
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_GT(outputs.size(), 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of outputs should be greater than 0."));
     auto output = outputs.back();
-    PADDLE_ENFORCE_GT(output.size(), 0);
+    PADDLE_ENFORCE_GT(output.size(), 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     size_t size = GetSize(output[0]);
-    PADDLE_ENFORCE_GT(size, 0);
+    PADDLE_ENFORCE_GT(size, 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     float *result = static_cast<float *>(output[0].data.data());
     for (size_t i = 0; i < size; i++) {
       EXPECT_NEAR(result[i], result_data[i], 1e-3);
diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
index 7f06a3b9023..91a3233b985 100644
--- a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
@@ -146,8 +146,9 @@ std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
   auto iterations = test_data.size();
   PADDLE_ENFORCE_LE(
       static_cast<size_t>(num_images), iterations * test_data_batch_size,
-      "The requested quantization warmup data size " +
-          std::to_string(num_images) + " is bigger than all test data size.");
+      paddle::platform::errors::Fatal(
+          "The requested quantization warmup data size " +
+          std::to_string(num_images) + " is bigger than all test data size."));
 
   PaddleTensor images;
   images.name = "image";
@@ -237,8 +238,9 @@ std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
   }
   PADDLE_ENFORCE_EQ(
       static_cast<size_t>(num_objects), static_cast<size_t>(objects_accum),
-      "The requested num of objects " + std::to_string(num_objects) +
-          " is the same as objects_accum.");
+      paddle::platform::errors::Fatal("The requested num of objects " +
+                                      std::to_string(num_objects) +
+                                      " is the same as objects_accum."));
 
   auto warmup_data = std::make_shared<std::vector<PaddleTensor>>(4);
   (*warmup_data)[0] = std::move(images);
diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
index 142905dcd8d..bd3a1d737af 100644
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -98,7 +98,9 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   input_tensor.name = "word";
   input_tensor.dtype = PaddleDType::INT64;
   TensorAssignData<int64_t>(&input_tensor, {one_batch.data}, one_batch.lod);
-  PADDLE_ENFORCE_EQ(batch_size, static_cast<int>(one_batch.lod.size() - 1));
+  PADDLE_ENFORCE_EQ(
+      batch_size, static_cast<int>(one_batch.lod.size() - 1),
+      paddle::platform::errors::Fatal("The lod size of one batch is invaild."));
   input_slots->assign({input_tensor});
 }
 
@@ -137,12 +139,17 @@ TEST(Analyzer_LAC, profile) {
         24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25,
         44, 24, 25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14,
         15, 44, 38, 39, 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_GT(outputs.size(), 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     auto output = outputs.back();
-    PADDLE_ENFORCE_EQ(output.size(), 1UL);
+    PADDLE_ENFORCE_EQ(output.size(), 1UL,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be equal to 1."));
     size_t size = GetSize(output[0]);
     size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
-    PADDLE_ENFORCE_GE(size, batch1_size);
+    PADDLE_ENFORCE_GE(size, batch1_size, paddle::platform::errors::Fatal(
+                                             "The size of batch is invaild."));
     int64_t *pdata = static_cast<int64_t *>(output[0].data.data());
     for (size_t i = 0; i < batch1_size; ++i) {
       EXPECT_EQ(pdata[i], lac_ref_data[i]);
diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
index 2a862b1395c..50a68361d53 100644
--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -117,11 +117,17 @@ void profile(bool memory_load = false) {
     // the first inference result
     const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
                                            48, 39, 38, 16, 25};
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_GT(outputs.size(), 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     auto output = outputs.back();
-    PADDLE_ENFORCE_EQ(output.size(), 1UL);
+    PADDLE_ENFORCE_EQ(output.size(), 1UL,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be equal to 1."));
     size_t size = GetSize(output[0]);
-    PADDLE_ENFORCE_GT(size, 0);
+    PADDLE_ENFORCE_GT(size, 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     int64_t *result = static_cast<int64_t *>(output[0].data.data());
     for (size_t i = 0; i < std::min<size_t>(11, size); i++) {
       EXPECT_EQ(result[i], chinese_ner_result_data[i]);
diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
index 06a8e01b10c..bb1f0e8cd63 100644
--- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
@@ -136,11 +136,17 @@ TEST(Analyzer_Pyramid_DNN, profile) {
                  input_slots_all, &outputs, FLAGS_num_threads);
 
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data && !FLAGS_zero_copy) {
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_GT(outputs.size(), 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     auto output = outputs.back();
-    PADDLE_ENFORCE_EQ(output.size(), 1UL);
+    PADDLE_ENFORCE_EQ(output.size(), 1UL,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be equal to 1."));
     size_t size = GetSize(output[0]);
-    PADDLE_ENFORCE_GT(size, 0);
+    PADDLE_ENFORCE_GT(size, 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     float *result = static_cast<float *>(output[0].data.data());
     // output is probability, which is in (0, 1).
     for (size_t i = 0; i < size; i++) {
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
index 9ccbf58cbd2..34a0a5f398d 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
@@ -135,11 +135,17 @@ TEST(Analyzer_rnn2, profile) {
 
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     // the first inference result
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_GT(outputs.size(), 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     auto output = outputs.back();
-    PADDLE_ENFORCE_GT(output.size(), 0);
+    PADDLE_ENFORCE_GT(output.size(), 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     size_t size = GetSize(output[0]);
-    PADDLE_ENFORCE_GT(size, 0);
+    PADDLE_ENFORCE_GT(size, 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     float *result = static_cast<float *>(output[0].data.data());
     for (size_t i = 0; i < size; i++) {
       EXPECT_NEAR(result[i], result_data[i], 1e-3);
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
index e3f8b835f78..978aaf1c6a3 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
@@ -47,7 +47,8 @@ struct DataRecord {
       num_lines++;
       std::vector<std::string> data;
       split(line, '\t', &data);
-      PADDLE_ENFORCE(data.size() >= 4);
+      PADDLE_ENFORCE_GT(data.size(), 4, paddle::platform::errors::Fatal(
+                                            "The size of data is invaild."));
       // load title1 data
       std::vector<int64_t> title1_data;
       split_to_int64(data[0], ' ', &title1_data);
@@ -120,11 +121,17 @@ TEST(Analyzer_seq_conv1, profile) {
 
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     // the first inference result
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_GT(outputs.size(), 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     auto output = outputs.back();
-    PADDLE_ENFORCE_EQ(output.size(), 1UL);
+    PADDLE_ENFORCE_EQ(output.size(), 1UL,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be equal to 0."));
     size_t size = GetSize(output[0]);
-    PADDLE_ENFORCE_GT(size, 0);
+    PADDLE_ENFORCE_GT(size, 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     float *result = static_cast<float *>(output[0].data.data());
     // output is probability, which is in (0, 1).
     for (size_t i = 0; i < size; i++) {
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
index 56f706ae56b..9f1556cdb87 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
@@ -56,20 +56,26 @@ struct DataRecord {
       std::vector<float> slot_data;
       split_to_float(data[1], ' ', &slot_data);
       std::string name = data[0];
-      PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0UL,
-                        "line %d, %s should be divisible", num_lines, name);
+      PADDLE_ENFORCE_EQ(
+          slot_data.size() % 11, 0UL,
+          paddle::platform::errors::Fatal("line %d, %s should be divisible",
+                                          num_lines, name));
       datasets[name].emplace_back(std::move(slot_data));
     }
     num_samples = num_lines / num_slots;
-    PADDLE_ENFORCE_EQ(num_samples * num_slots, static_cast<size_t>(num_lines),
-                      "num samples should be divisible");
-    PADDLE_ENFORCE_GT(num_samples, 0UL);
+    PADDLE_ENFORCE_EQ(
+        num_samples * num_slots, static_cast<size_t>(num_lines),
+        paddle::platform::errors::Fatal("num samples should be divisible"));
+    PADDLE_ENFORCE_GT(num_samples, 0UL,
+                      paddle::platform::errors::Fatal(
+                          "The num of samples should be greater than 0."));
   }
 
   void Prepare(int bs) {
     for (auto it = datasets.begin(); it != datasets.end(); ++it) {
-      PADDLE_ENFORCE_EQ(it->second.size(), num_samples,
-                        "size of each slot should be equal");
+      PADDLE_ENFORCE_EQ(
+          it->second.size(), num_samples,
+          paddle::platform::errors::Fatal("size of each slot should be equal"));
     }
     size_t num_batches = num_samples / bs;
     EXPECT_GT(num_batches, 0UL);
@@ -90,8 +96,10 @@ struct DataRecord {
           std::copy(datas[id].begin(), datas[id].end(),
                     std::back_inserter(slot.data[k]));
           size_t len = datas[id].size() / 11;
-          PADDLE_ENFORCE_EQ(len * 11, datas[id].size(),
-                            "%s %d size should be divisible", slot.name, id);
+          PADDLE_ENFORCE_EQ(
+              len * 11, datas[id].size(),
+              paddle::platform::errors::Fatal("%s %d size should be divisible",
+                                              slot.name, id));
           lod[k + 1] = lod[k] + len;
         }
         slot.shape.assign({static_cast<int>(lod[bs]), 11});
diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
index 78e500b2ed5..ae38bcbc20a 100644
--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@@ -22,7 +22,9 @@ struct DataReader {
       : file(new std::ifstream(path)) {}
 
   bool NextBatch(std::vector<PaddleTensor> *input, int batch_size) {
-    PADDLE_ENFORCE_EQ(batch_size, 1);
+    PADDLE_ENFORCE_EQ(batch_size, 1,
+                      paddle::platform::errors::Fatal(
+                          "The size of batch should be equal to 1."));
     std::string line;
     PaddleTensor tensor;
     tensor.dtype = PaddleDType::INT64;
@@ -81,7 +83,9 @@ TEST(Analyzer_Text_Classification, profile) {
 
   if (FLAGS_num_threads == 1) {
     // Get output
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_GT(outputs.size(), 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     LOG(INFO) << "get outputs " << outputs.back().size();
     for (auto &output : outputs.back()) {
       LOG(INFO) << "output.shape: " << to_string(output.shape);
diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
index 65755b7b15a..a2ced21a9ac 100644
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -59,7 +59,9 @@ void SetConfig(AnalysisConfig *cfg) {
 }
 
 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
-  PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
+  PADDLE_ENFORCE_EQ(
+      FLAGS_test_all_data, 0,
+      paddle::platform::errors::Fatal("Only have single batch of data."));
   std::string line;
   std::ifstream file(FLAGS_infer_data);
   std::getline(file, line);
@@ -99,7 +101,9 @@ void profile(bool use_mkldnn = false) {
     auto refer = ProcessALine(line);
     file.close();
 
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_GT(outputs.size(), 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     auto &output = outputs.back().front();
     size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
     CHECK_EQ(numel, refer.data.size());
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index 7183cbac715..d27959aff6f 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -21,6 +21,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.h"
 
-- 
GitLab


From f827665ae650597c680010e77b8ad12db9897a9e Mon Sep 17 00:00:00 2001
From: Wilber <jiweibo@baidu.com>
Date: Tue, 15 Sep 2020 14:50:24 +0800
Subject: [PATCH 079/261] [Pass Compatible] Bind python compatible. (#27262)

---
 .../ir/transpose_flatten_concat_fuse_pass.cc  |  9 +++++
 paddle/fluid/pybind/CMakeLists.txt            |  1 +
 paddle/fluid/pybind/compatible.cc             | 38 +++++++++++++++++++
 paddle/fluid/pybind/compatible.h              | 23 +++++++++++
 paddle/fluid/pybind/pybind.cc                 |  2 +
 ...test_transpose_flatten_concat_fuse_pass.py | 32 ++++++++++++++++
 6 files changed, 105 insertions(+)
 create mode 100644 paddle/fluid/pybind/compatible.cc
 create mode 100644 paddle/fluid/pybind/compatible.h

diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
index 9a0a5f07a70..405cefa99eb 100644
--- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
@@ -20,6 +20,7 @@
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -145,3 +146,11 @@ void TransposeFlattenConcatFusePass::ApplyImpl(ir::Graph *graph) const {
 
 REGISTER_PASS(transpose_flatten_concat_fuse_pass,
               paddle::framework::ir::TransposeFlattenConcatFusePass);
+REGISTER_PASS_CAPABILITY(transpose_flatten_concat_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("transpose", 0)
+            .EQ("transpose2", 0)
+            .EQ("flatten", 0)
+            .EQ("concat", 0)
+            .EQ("fusion_transpose_flatten_concat", 0));
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index d733cf26ed2..92d94731410 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -38,6 +38,7 @@ set(PYBIND_SRCS
   imperative.cc
   ir.cc
   inference_api.cc
+  compatible.cc
   generator_py.cc)
 
 if(WITH_GLOO)
diff --git a/paddle/fluid/pybind/compatible.cc b/paddle/fluid/pybind/compatible.cc
new file mode 100644
index 00000000000..971d230458d
--- /dev/null
+++ b/paddle/fluid/pybind/compatible.cc
@@ -0,0 +1,38 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/pybind/compatible.h"
+
+#include <memory>
+#include <string>
+
+#include "paddle/fluid/framework/op_version_registry.h"
+
+namespace py = pybind11;
+
+using paddle::framework::compatible::PassVersionCheckerRegistrar;
+
+namespace paddle {
+namespace pybind {
+
+void BindCompatible(py::module* m) {
+  py::class_<PassVersionCheckerRegistrar>(*m, "PassVersionChecker")
+      .def_static("IsCompatible", [](const std::string& name) -> bool {
+        auto instance = PassVersionCheckerRegistrar::GetInstance();
+        return instance.IsPassCompatible(name);
+      });
+}
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/compatible.h b/paddle/fluid/pybind/compatible.h
new file mode 100644
index 00000000000..f9d4cf5888f
--- /dev/null
+++ b/paddle/fluid/pybind/compatible.h
@@ -0,0 +1,23 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace paddle {
+namespace pybind {
+void BindCompatible(pybind11::module *m);
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 4b8f7c853ce..330254ecaaf 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -60,6 +60,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/pybind/box_helper_py.h"
+#include "paddle/fluid/pybind/compatible.h"
 #include "paddle/fluid/pybind/const_value.h"
 #include "paddle/fluid/pybind/data_set_py.h"
 #include "paddle/fluid/pybind/exception.h"
@@ -2619,6 +2620,7 @@ All parameter, weight, gradient are variables in Paddle.
   BindGraph(&m);
   BindNode(&m);
   BindInferenceApi(&m);
+  BindCompatible(&m);
   BindDataset(&m);
   BindGenerator(&m);
 #ifdef PADDLE_WITH_CRYPTO
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py
index dfcd1758db2..34a52e7aed3 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py
@@ -17,6 +17,7 @@ import numpy as np
 from inference_pass_test import InferencePassTest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
 
 
 class TransposeFlattenConcatFusePassTest(InferencePassTest):
@@ -45,6 +46,37 @@ class TransposeFlattenConcatFusePassTest(InferencePassTest):
             use_gpu = True
             self.check_output_with_option(use_gpu)
 
+        PassVersionChecker.IsCompatible('transpose_flatten_concat_fuse_pass')
+
+
+class TransposeFlattenConcatFusePassWithAxisTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data1 = fluid.data(name="data1", shape=[5, 5, 5], dtype="float32")
+            data2 = fluid.data(name="data2", shape=[5, 5, 5], dtype="float32")
+            trans1 = fluid.layers.transpose(data1, perm=[2, 1, 0])
+            trans2 = fluid.layers.transpose(data2, perm=[2, 1, 0])
+            flatt1 = fluid.layers.flatten(trans1, axis=2)
+            flatt2 = fluid.layers.flatten(trans2, axis=2)
+            concat_out = fluid.layers.concat([flatt1, flatt2], axis=1)
+            # There is no parameters for above structure. 
+            # Hence, append a batch_norm to avoid failure caused by load_combined. 
+            out = fluid.layers.batch_norm(concat_out, is_test=True)
+
+        self.feeds = {
+            "data1": np.random.random([5, 5, 5]).astype("float32"),
+            "data2": np.random.random([5, 5, 5]).astype("float32")
+        }
+        self.fetch_list = [out]
+
+    def test_check_output(self):
+        # There is no cpu pass for transpose_flatten_concat_fuse
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu)
+
+        PassVersionChecker.IsCompatible('transpose_flatten_concat_fuse_pass')
+
 
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab


From cb34cf18c0a6c2e60a70f939f70976b440deaff1 Mon Sep 17 00:00:00 2001
From: chalsliu <45041955+chalsliu@users.noreply.github.com>
Date: Tue, 15 Sep 2020 15:19:22 +0800
Subject: [PATCH 080/261] Set timeout value on windows and mac (#27197)

---
 cmake/generic.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 1956e5c39ea..b0a6dfe2902 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -386,7 +386,7 @@ function(cc_test_run TARGET_NAME)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
     # No unit test should exceed 2 minutes.
     if (APPLE OR WIN32)
-        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
     else()
         set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
     endif()
@@ -748,7 +748,7 @@ function(py_test TARGET_NAME)
     endif()
     
     if (APPLE OR WIN32)
-        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+        set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
     else()
         # No unit test should exceed 2 minutes in Linux.
         set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
-- 
GitLab


From 47fdc60ecc1890e667f6301813818b386486afaa Mon Sep 17 00:00:00 2001
From: Shang Zhizhou <shangzhizhou@baidu.com>
Date: Tue, 15 Sep 2020 15:27:32 +0800
Subject: [PATCH 081/261] Optimize slice trt plugin (#26970)

* optimize slice TRT plugin

This patch removes unnecessary barrier for data transfer of needed offset,
so data transfer can be overlap with GPU kernel execution.

This patch also fixes incorrect name of slice plugin. That is, replaces
"layernorm" with "slice"

test=develop

* add serialize/deserialize to slice plugin

* add static shape slice trt plugin

* fix slice trt op convertor dynamic shape bug

* fix format by clang-format

* fix pylint format error

* fix problems commented by peiyang

Co-authored-by: Ryan Jeng <rjeng@nvidia.com>
---
 .../inference/tensorrt/convert/slice_op.cc    |  67 ++++--
 paddle/fluid/inference/tensorrt/op_teller.cc  |   1 +
 .../tensorrt/plugin/slice_op_plugin.cu        | 221 ++++++++++++++++--
 .../tensorrt/plugin/slice_op_plugin.h         |  88 ++++++-
 .../ir/inference/test_trt_slice_plugin.py     | 150 ++++++++++++
 5 files changed, 490 insertions(+), 37 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py

diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc
index 2a76317eea1..3c3fead3d36 100644
--- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc
@@ -23,9 +23,8 @@ class SliceOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
-// This OP is implemented by trt dynamic shpae plugin.
-// Dynamic shape plugin requires TRT version greater than 6.0.
-#if IS_TRT_VERSION_GE(6000)
+    // This OP is implemented by trt dynamic shpae plugin.
+    // Dynamic shape plugin requires TRT version greater than 6.0.
     VLOG(4) << "convert slice op to tensorrt layer";
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
@@ -38,27 +37,65 @@ class SliceOpConverter : public OpConverter {
     std::vector<int> ends =
         BOOST_GET_CONST(std::vector<int>, op_desc.GetAttr("ends"));
 
+    PADDLE_ENFORCE_EQ(
+        starts.size(), axes.size(),
+        platform::errors::InvalidArgument(
+            "The size of starts must be equal to the size of axes."));
+    PADDLE_ENFORCE_EQ(
+        ends.size(), axes.size(),
+        platform::errors::InvalidArgument(
+            "The size of ends must be equal to the size of axes."));
+
+    auto input_dims = input->getDimensions();
+    if (!engine_->with_dynamic_shape()) {
+      // notice that input shape is [CHW] without batch axis when input has
+      // static shape
+      for (size_t i = input_dims.nbDims; i > 0; i--) {
+        input_dims.d[i] = input_dims.d[i - 1];
+      }
+      input_dims.d[0] = 1;  // fake batchsize, not useful here
+      for (size_t i = 0; i < axes.size(); i++) {
+        // split on batch is not supported in TensorRT
+        PADDLE_ENFORCE_NE(axes[i], 0, platform::errors::InvalidArgument(
+                                          "Invalid slice axis. Slice on batch "
+                                          "axis is not supported in TensorRT"));
+        if (starts[i] < 0) {
+          starts[i] = std::max(starts[i] + input_dims.d[axes[i]], 0);
+        }
+        if (ends[i] < 0) {
+          ends[i] = std::max(ends[i] + input_dims.d[axes[i]], 0);
+        }
+        ends[i] = std::min(ends[i], input_dims.d[axes[i]]);
+        PADDLE_ENFORCE_GT(
+            ends[i], starts[i],
+            platform::errors::InvalidArgument(
+                "Attr(ends) should be greater than attr(starts) in "
+                "slice op. But received ends = %d, starts = %d.",
+                ends[i], starts[i]));
+      }
+    }
+
     nvinfer1::ILayer* layer = nullptr;
     if (engine_->with_dynamic_shape()) {
+#if IS_TRT_VERSION_GE(6000)
       bool ban_fp16 = engine_->disable_trt_plugin_fp16();
       plugin::SlicePluginDynamic* plugin =
-          new plugin::SlicePluginDynamic(starts, ends, ends, ban_fp16);
+          new plugin::SlicePluginDynamic(starts, ends, axes, ban_fp16);
       layer = engine_->AddPluginV2(&input, 1, plugin);
-    } else {
+#else
       PADDLE_THROW(platform::errors::Fatal(
-          "You are running the Ernie(Bert) model in static"
-          "shape mode, which is not supported for the time being.\n"
-          "You can use the config.SetTRTDynamicShapeInfo(...) interface"
-          " to set the shape information to run the dynamic shape mode."));
+          "You are running the TRT Dynamic Shape mode, need to confirm that "
+          "your TRT version is no less than 6.0"));
+#endif
+    } else {
+      bool ban_fp16 = engine_->disable_trt_plugin_fp16();
+      plugin::SlicePlugin* plugin =
+          new plugin::SlicePlugin(starts, ends, axes, ban_fp16);
+      layer = engine_->AddPlugin(&input, 1, plugin);
     }
 
     auto output_name = op_desc.Output("Out")[0];
-    RreplenishLayerAndOutput(layer, "skip_layernorm", {output_name}, test_mode);
-#else
-    PADDLE_THROW(platform::errors::Fatal(
-        "You are running the TRT Dynamic Shape mode, need to confirm that "
-        "your TRT version is no less than 6.0"));
-#endif
+    RreplenishLayerAndOutput(layer, "slice", {output_name}, test_mode);
   }
 };
 
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index a5b71356d0e..31128ba8c5d 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -31,6 +31,7 @@ struct SimpleOpTypeSetTeller : public Teller {
     teller_set.insert("fused_embedding_eltwise_layernorm");
     teller_set.insert("multihead_matmul");
     teller_set.insert("skip_layernorm");
+    teller_set.insert("slice");
 #endif
   }
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu
index 4fb1d824108..5c56270627a 100644
--- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu
@@ -26,8 +26,10 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {
 
-// Dynamic Plugin below.
-#if IS_TRT_VERSION_GE(6000)
+SlicePlugin *CreateSlicePluginDeserialize(const void *buffer, size_t length) {
+  return new SlicePlugin(buffer, length);
+}
+REGISTER_TRT_PLUGIN("slice_plugin", CreateSlicePluginDeserialize);
 
 template <typename T>
 __global__ void SliceKernel(int num, int dims, const T *input,
@@ -56,11 +58,196 @@ __global__ void SliceKernel(int num, int dims, const T *input,
   }
 }
 
+SlicePlugin::SlicePlugin(std::vector<int> starts, std::vector<int> ends,
+                         std::vector<int> axes, bool ban_fp16)
+    : starts_(starts), ends_(ends), axes_(axes), ban_fp16_(ban_fp16) {
+  cudaEventCreate(&copy_event_);
+  cudaStreamCreate(&copy_stream_);
+}
+
+SlicePlugin::SlicePlugin(void const *serial_data, size_t serial_length) {
+  deserializeBase(serial_data, serial_length);
+  DeserializeValue(&serial_data, &serial_length, &starts_);
+  DeserializeValue(&serial_data, &serial_length, &ends_);
+  DeserializeValue(&serial_data, &serial_length, &axes_);
+  DeserializeValue(&serial_data, &serial_length, &ban_fp16_);
+  cudaEventCreate(&copy_event_);
+  cudaStreamCreate(&copy_stream_);
+}
+
+SlicePlugin::~SlicePlugin() {
+  cudaStreamDestroy(copy_stream_);
+  cudaEventDestroy(copy_event_);
+  cudaFree(offset_temp_data_);
+}
+
+SlicePlugin *SlicePlugin::clone() const {
+  return new SlicePlugin(starts_, ends_, axes_, ban_fp16_);
+}
+
+bool SlicePlugin::supportsFormat(nvinfer1::DataType type,
+                                 nvinfer1::PluginFormat format) const {
+#ifdef SUPPORTS_CUDA_FP16
+  return ((type == nvinfer1::DataType::kFLOAT ||
+           type == nvinfer1::DataType::kHALF) &&
+          (format == nvinfer1::PluginFormat::kNCHW));
+#else
+  return ((type == nvinfer1::DataType::kFLOAT) &&
+          (format == nvinfer1::PluginFormat::kNCHW));
+#endif
+}
+
+nvinfer1::Dims SlicePlugin::getOutputDimensions(int index,
+                                                const nvinfer1::Dims *inputs,
+                                                int nb_input_dims) {
+  auto in_dims = inputs[0];
+  nvinfer1::Dims out_dims = in_dims;
+  for (size_t i = 0; i < axes_.size(); i++) {
+    int start = starts_[i];
+    int end = ends_[i];
+    out_dims.d[axes_[i] - 1] = end - start;
+  }
+  return out_dims;
+}
+
+int SlicePlugin::enqueue(int batch_size, const void *const *inputs,
+                         void **outputs, void *workspace, cudaStream_t stream) {
+  auto input_dims = getInputDims(0);
+
+  // notice input dims is [C, H, W], add input batch dim here
+  auto out_dims = getOutputDimensions(0, &input_dims, 1);
+  input_dims.nbDims += 1;
+  out_dims.nbDims += 1;
+  for (auto i = input_dims.nbDims; i > 0; --i) {
+    input_dims.d[i] = input_dims.d[i - 1];
+    out_dims.d[i] = out_dims.d[i - 1];
+  }
+  input_dims.d[0] = batch_size;
+  out_dims.d[0] = batch_size;
+
+  auto num_dims = input_dims.nbDims;
+  size_t out_num = ProductDim(out_dims);
+
+  std::vector<int> seg_offsets;
+  std::vector<int> offsets;
+  std::vector<int> extends;
+
+  offsets.resize(num_dims);
+  extends.resize(num_dims);
+  seg_offsets.resize(num_dims);
+
+  seg_offsets[num_dims - 1] = 1;
+  for (int i = num_dims - 2; i >= 0; i--) {
+    seg_offsets[i] = input_dims.d[i + 1] * seg_offsets[i + 1];
+  }
+  for (size_t i = 0; i < num_dims; ++i) {
+    offsets[i] = 0;
+    extends[i] = out_dims.d[i];
+  }
+  for (size_t i = 0; i < axes_.size(); ++i) {
+    offsets[axes_[i]] = starts_[i];
+  }
+
+  std::vector<int> offset_info;
+  for (size_t i = 0; i < num_dims; ++i) {
+    offset_info.push_back(offsets[i]);
+    offset_info.push_back(extends[i]);
+    offset_info.push_back(seg_offsets[i]);
+  }
+
+  if (offset_temp_data_ == nullptr) {
+    cudaMalloc(&offset_temp_data_, 3 * num_dims * sizeof(int));
+  }
+
+  cudaMemcpyAsync(offset_temp_data_, offset_info.data(),
+                  sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice,
+                  copy_stream_);
+
+  cudaEventRecord(copy_event_, copy_stream_);
+  cudaStreamWaitEvent(stream, copy_event_, 0);
+
+  int threads = 256;
+  int blocks = (out_num + threads - 1) / threads;
+  auto input_type = getDataType();
+  if (input_type == nvinfer1::DataType::kFLOAT) {
+    const float *input1 = static_cast<const float *>(inputs[0]);
+    float *output = static_cast<float *>(outputs[0]);
+    SliceKernel<float><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
+        out_num, num_dims, input1, offset_temp_data_, output);
+  } else if (input_type == nvinfer1::DataType::kHALF) {
+#ifdef SUPPORTS_CUDA_FP16
+    const half *input1 = static_cast<const half *>(inputs[0]);
+    half *output = static_cast<half *>(outputs[0]);
+    SliceKernel<half><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
+        out_num, num_dims, input1, offset_temp_data_, output);
+#else
+    PADDLE_THROW(platform::errors::Fatal(
+        "The cuda archs you specific should greater than 600."));
+#endif
+  } else {
+    PADDLE_THROW(platform::errors::Fatal(
+        "The Slice TRT Plugin's input type should be float or half."));
+  }
+  return cudaGetLastError() != cudaSuccess;
+}
+
+size_t SlicePlugin::getSerializationSize() {
+  return getBaseSerializationSize() + SerializedSize(getPluginType()) +
+         SerializedSize(starts_) + SerializedSize(ends_) +
+         SerializedSize(axes_) + SerializedSize(ban_fp16_);
+}
+
+void SlicePlugin::serialize(void *buffer) {
+  SerializeValue(&buffer, getPluginType());
+  serializeBase(buffer);
+  SerializeValue(&buffer, starts_);
+  SerializeValue(&buffer, ends_);
+  SerializeValue(&buffer, axes_);
+  SerializeValue(&buffer, ban_fp16_);
+}
+
+// Dynamic Plugin below.
+#if IS_TRT_VERSION_GE(6000)
+SlicePluginDynamic::SlicePluginDynamic(std::vector<int> starts,
+                                       std::vector<int> ends,
+                                       std::vector<int> axes, bool ban_fp16)
+    : starts_(starts), ends_(ends), axes_(axes), ban_fp16_(ban_fp16) {
+  cudaEventCreate(&copy_event_);
+  cudaStreamCreate(&copy_stream_);
+}
+
+SlicePluginDynamic::SlicePluginDynamic(void const *serialData,
+                                       size_t serialLength) {
+  DeserializeValue(&serialData, &serialLength, &starts_);
+  DeserializeValue(&serialData, &serialLength, &ends_);
+  DeserializeValue(&serialData, &serialLength, &axes_);
+  DeserializeValue(&serialData, &serialLength, &ban_fp16_);
+  cudaEventCreate(&copy_event_);
+  cudaStreamCreate(&copy_stream_);
+}
+
+void SlicePluginDynamic::destroy() {
+  cudaStreamDestroy(copy_stream_);
+  cudaEventDestroy(copy_event_);
+  cudaFree(offset_temp_data_);
+  delete this;
+}
+
 int SlicePluginDynamic::initialize() { return 0; }
 
-size_t SlicePluginDynamic::getSerializationSize() const { return 0; }
+size_t SlicePluginDynamic::getSerializationSize() const {
+  size_t size = SerializedSize(starts_) + SerializedSize(ends_) +
+                SerializedSize(axes_) + SerializedSize(ban_fp16_);
 
-void SlicePluginDynamic::serialize(void *buffer) const {}
+  return size;
+}
+
+void SlicePluginDynamic::serialize(void *buffer) const {
+  SerializeValue(&buffer, starts_);
+  SerializeValue(&buffer, ends_);
+  SerializeValue(&buffer, axes_);
+  SerializeValue(&buffer, ban_fp16_);
+}
 
 nvinfer1::DimsExprs SlicePluginDynamic::getOutputDimensions(
     int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
@@ -136,9 +323,9 @@ int SlicePluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
   std::vector<int> offsets;
   std::vector<int> extends;
 
-  offsets.reserve(num_dims);
-  extends.reserve(num_dims);
-  seg_offsets.reserve(num_dims);
+  offsets.resize(num_dims);
+  extends.resize(num_dims);
+  seg_offsets.resize(num_dims);
 
   seg_offsets[num_dims - 1] = 1;
   for (int i = num_dims - 2; i >= 0; i--) {
@@ -160,16 +347,16 @@ int SlicePluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
     offset_info.push_back(seg_offsets[i]);
   }
 
-  framework::Tensor offset_temp_tensor;
+  if (offset_temp_data_ == nullptr) {
+    cudaMalloc(&offset_temp_data_, 3 * num_dims * sizeof(int));
+  }
 
-  int device_id;
-  cudaGetDevice(&device_id);
-  offset_temp_tensor.Resize({3 * num_dims});
-  auto *offset_temp_data =
-      offset_temp_tensor.mutable_data<int>(platform::CUDAPlace(device_id));
+  cudaMemcpyAsync(offset_temp_data_, offset_info.data(),
+                  sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice,
+                  copy_stream_);
 
-  cudaMemcpyAsync(offset_temp_data, offset_info.data(),
-                  sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice, stream);
+  cudaEventRecord(copy_event_, copy_stream_);
+  cudaStreamWaitEvent(stream, copy_event_, 0);
 
   int threads = 256;
   int blocks = (out_num + threads - 1) / threads;
@@ -178,13 +365,13 @@ int SlicePluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
     const float *input1 = static_cast<const float *>(inputs[0]);
     float *output = static_cast<float *>(outputs[0]);
     SliceKernel<float><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
-        out_num, num_dims, input1, offset_temp_data, output);
+        out_num, num_dims, input1, offset_temp_data_, output);
   } else if (input_type == nvinfer1::DataType::kHALF) {
 #ifdef SUPPORTS_CUDA_FP16
     const half *input1 = static_cast<const half *>(inputs[0]);
     half *output = static_cast<half *>(outputs[0]);
     SliceKernel<half><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
-        out_num, num_dims, input1, offset_temp_data, output);
+        out_num, num_dims, input1, offset_temp_data_, output);
 #else
     PADDLE_THROW(platform::errors::Fatal(
         "The cuda archs you specific should greater than 600."));
diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h
index 13d86df131f..e36a270f05d 100644
--- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h
@@ -26,17 +26,56 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {
 
+class SlicePlugin : public PluginTensorRT {
+ public:
+  explicit SlicePlugin(std::vector<int> starts, std::vector<int> ends,
+                       std::vector<int> axes, bool ban_fp16);
+
+  // It was used for tensorrt deserialization.
+  // It should not be called by users.
+  SlicePlugin(void const* serial_data, size_t serial_length);
+  ~SlicePlugin();
+  SlicePlugin* clone() const override;
+
+  const char* getPluginType() const override { return "slice_plugin"; }
+  int getNbOutputs() const override { return 1; }
+  int initialize() override { return 0; }
+  bool supportsFormat(nvinfer1::DataType type,
+                      nvinfer1::PluginFormat format) const override;
+  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
+                                     int nb_input_dims) override;
+  int enqueue(int batch_size, const void* const* inputs, void** outputs,
+              void* workspace, cudaStream_t stream) override;
+
+ protected:
+  size_t getSerializationSize() override;
+
+  // TRT will call this func  to serialize the configuration of TRT
+  // It should not be called by users.
+  void serialize(void* buffer) override;
+
+ private:
+  std::vector<int> starts_;
+  std::vector<int> ends_;
+  std::vector<int> axes_;
+  bool ban_fp16_{false};
+  int* offset_temp_data_{nullptr};
+  cudaEvent_t copy_event_;
+  cudaStream_t copy_stream_;
+};
+
 #if IS_TRT_VERSION_GE(6000)
 class SlicePluginDynamic : public DynamicPluginTensorRT {
  public:
   explicit SlicePluginDynamic(std::vector<int> starts, std::vector<int> ends,
-                              std::vector<int> axes, bool ban_fp16)
-      : starts_(starts), ends_(ends), axes_(axes), ban_fp16_(ban_fp16) {}
-  SlicePluginDynamic(void const* serialData, size_t serialLength) {}
+                              std::vector<int> axes, bool ban_fp16);
+
   nvinfer1::IPluginV2DynamicExt* clone() const override {
     return new SlicePluginDynamic(starts_, ends_, axes_, ban_fp16_);
   }
 
+  SlicePluginDynamic(void const* serialData, size_t serialLength);
+
   const char* getPluginType() const override { return "slice_plugin"; }
   int getNbOutputs() const override { return 1; }
   int initialize() override;
@@ -72,15 +111,54 @@ class SlicePluginDynamic : public DynamicPluginTensorRT {
                                        const nvinfer1::DataType* inputTypes,
                                        int nbInputs) const override;
 
-  void destroy() override { delete this; }
+  void destroy() override;
 
  private:
   std::vector<int> starts_;
   std::vector<int> ends_;
   std::vector<int> axes_;
-
   bool ban_fp16_{false};
+  int* offset_temp_data_{nullptr};
+  cudaEvent_t copy_event_;
+  cudaStream_t copy_stream_;
 };
+
+class SlicePluginV2Creator : public nvinfer1::IPluginCreator {
+ public:
+  SlicePluginV2Creator() {}
+  const char* getPluginName() const override { return "slice_plugin"; }
+
+  const char* getPluginVersion() const override { return "1"; }
+
+  const nvinfer1::PluginFieldCollection* getFieldNames() override {
+    return &field_collection_;
+  }
+
+  nvinfer1::IPluginV2* createPlugin(
+      const char* name, const nvinfer1::PluginFieldCollection* fc) override {
+    return nullptr;
+  }
+
+  nvinfer1::IPluginV2* deserializePlugin(const char* name,
+                                         const void* serialData,
+                                         size_t serialLength) override {
+    auto plugin = new SlicePluginDynamic(serialData, serialLength);
+    return plugin;
+  }
+
+  void setPluginNamespace(const char* libNamespace) override {
+    namespace_ = libNamespace;
+  }
+
+  const char* getPluginNamespace() const override { return namespace_.c_str(); }
+
+ private:
+  std::string namespace_;
+  nvinfer1::PluginFieldCollection field_collection_;
+};
+
+REGISTER_TRT_PLUGIN_V2(SlicePluginV2Creator);
+
 #endif
 
 }  // namespace plugin
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py
new file mode 100644
index 00000000000..660a9c93e66
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py
@@ -0,0 +1,150 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import AnalysisConfig
+
+
+#normal starts && ends
+class SlicePluginTRTTest1(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32")
+            axes = [1, 3]
+            starts = [0, 1]
+            ends = [2, 3]
+            slice_out = fluid.layers.slice(
+                data, axes=axes, starts=starts, ends=ends)
+            out = fluid.layers.batch_norm(slice_out, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random((3, 3, 3, 3)).astype("float32"),
+        }
+        # Diff occurred between GPU and TRT. 
+        # In order to provide TRT CI ASAP, this test for trt part 
+        # is disabled temporarily. 
+        self.enable_trt = True
+        self.trt_parameters = SlicePluginTRTTest1.TensorRTParam(
+            1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+
+    def test_check_output(self):
+        use_gpu = [False]
+        if core.is_compiled_with_cuda():
+            use_gpu.append(True)
+        for i in range(len(use_gpu)):
+            self.check_output_with_option(use_gpu[i])
+
+
+#negative starts && ends
+class SlicePluginTRTTest2(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32")
+            axes = [2, 3]
+            starts = [-3, -2]
+            ends = [-1, 3]
+            slice_out = fluid.layers.slice(
+                data, axes=axes, starts=starts, ends=ends)
+            out = fluid.layers.batch_norm(slice_out, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random((3, 3, 3, 3)).astype("float32"),
+        }
+        # Diff occurred between GPU and TRT. 
+        # In order to provide TRT CI ASAP, this test for trt part 
+        # is disabled temporarily. 
+        self.enable_trt = True
+        self.trt_parameters = SlicePluginTRTTest2.TensorRTParam(
+            1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+
+    def test_check_output(self):
+        use_gpu = [False]
+        if core.is_compiled_with_cuda():
+            use_gpu.append(True)
+        for i in range(len(use_gpu)):
+            self.check_output_with_option(use_gpu[i])
+
+
+#exceeded bound starts && ends
+class SlicePluginTRTTest3(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32")
+            axes = [2, 3]
+            starts = [-5, -2]
+            ends = [-1, 8]
+            slice_out = fluid.layers.slice(
+                data, axes=axes, starts=starts, ends=ends)
+            out = fluid.layers.batch_norm(slice_out, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random((3, 3, 3, 3)).astype("float32"),
+        }
+        # Diff occurred between GPU and TRT. 
+        # In order to provide TRT CI ASAP, this test for trt part 
+        # is disabled temporarily. 
+        self.enable_trt = True
+        self.trt_parameters = SlicePluginTRTTest3.TensorRTParam(
+            1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+
+    def test_check_output(self):
+        use_gpu = [False]
+        if core.is_compiled_with_cuda():
+            use_gpu.append(True)
+        for i in range(len(use_gpu)):
+            self.check_output_with_option(use_gpu[i])
+
+
+#fp16
+class SlicePluginTRTTest4(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32")
+            axes = [2, 3]
+            starts = [-5, -2]
+            ends = [-1, 8]
+            slice_out = fluid.layers.slice(
+                data, axes=axes, starts=starts, ends=ends)
+            out = fluid.layers.batch_norm(slice_out, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random((3, 3, 3, 3)).astype("float32"),
+        }
+        # Diff occurred between GPU and TRT. 
+        # In order to provide TRT CI ASAP, this test for trt part 
+        # is disabled temporarily. 
+        self.enable_trt = True
+        self.trt_parameters = SlicePluginTRTTest3.TensorRTParam(
+            1 << 30, 32, 1, AnalysisConfig.Precision.Half, False, False)
+        self.fetch_list = [out]
+
+    def test_check_output(self):
+        use_gpu = [False]
+        if core.is_compiled_with_cuda():
+            use_gpu.append(True)
+        for i in range(len(use_gpu)):
+            self.check_output_with_option(use_gpu[i])
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab


From c8e54c5e42c64ab53c85d955bf1c85f396f009ed Mon Sep 17 00:00:00 2001
From: chalsliu <45041955+chalsliu@users.noreply.github.com>
Date: Wed, 16 Sep 2020 00:16:05 +0800
Subject: [PATCH 082/261] Disable unit-test test_fleet_rolemaker_new

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 8f3945a48e3..d50356e0e95 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -102,6 +102,7 @@ if(WIN32)
 endif()
 
 
+LIST(REMOVE_ITEM TEST_OPS test_fleet_rolemaker_new)
 LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint)
 LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint1)
 LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint2)
-- 
GitLab


From 9dedafa0df88042b348c502da07e55841df4a7a9 Mon Sep 17 00:00:00 2001
From: mapingshuo <mps2012@yeah.net>
Date: Wed, 16 Sep 2020 06:53:17 +0800
Subject: [PATCH 083/261] fix strategy, test=develop (#27323)

* fix strategy, test=develop

* fix can_apply
---
 python/paddle/distributed/fleet/base/strategy_compiler.py       | 2 +-
 .../distributed/fleet/meta_optimizers/recompute_optimizer.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/distributed/fleet/base/strategy_compiler.py b/python/paddle/distributed/fleet/base/strategy_compiler.py
index 4097fc1237f..29e10661888 100644
--- a/python/paddle/distributed/fleet/base/strategy_compiler.py
+++ b/python/paddle/distributed/fleet/base/strategy_compiler.py
@@ -60,7 +60,7 @@ class StrategyCompiler(StrategyCompilerBase):
 
     def _get_valid_strategy(self, dist_strategy, can_not_apply_optimizer_list):
         import copy
-        valid_strategy = copy.copy(dist_strategy)
+        valid_strategy = copy.deepcopy(dist_strategy)
         invalid_optimizers = []
         for candidate in self._meta_optimizer_candidates:
             is_valid = False
diff --git a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
index 8f959548692..59ca7e63309 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
@@ -38,7 +38,7 @@ class RecomputeOptimizer(MetaOptimizerBase):
             list(user_defined_strategy.recompute_configs["checkpoints"]))
 
     def _can_apply(self):
-        if self.role_maker._is_collective:
+        if not self.role_maker._is_collective:
             return False
 
         if self.user_defined_strategy.recompute == True:
-- 
GitLab


From 696a39e2f3c273472145ebf88002a45c58db71c7 Mon Sep 17 00:00:00 2001
From: wanghuancoder <wanghuan29@baidu.com>
Date: Wed, 16 Sep 2020 09:37:29 +0800
Subject: [PATCH 084/261] use clcache in windows (#27279)

---
 paddle/scripts/paddle_build.bat | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 15610abef0f..9e150763dbb 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -51,6 +51,17 @@ if %ERRORLEVEL% NEQ 0 (
     exit /b 7
 )
 
+rem ------pre install clcache and init config----------
+pip install clcache
+:: set USE_CLCACHE to enable clcache
+set USE_CLCACHE=1
+:: In some scenarios, CLCACHE_HARDLINK can save one file copy.
+set CLCACHE_HARDLINK=1
+:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported
+set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
+:: set maximum cache size to 20G
+clcache.exe -M 21474836480
+
 rem ------initialize common variable------
 if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0"
 if not defined BRANCH set BRANCH=develop
@@ -173,7 +184,7 @@ echo Build third_party successfully!
 set build_times=1
 :build_paddle
 echo Build Paddle the %build_times% time:
-msbuild /m:%PARALLEL_PROJECT_COUNT% /p:Configuration=Release /verbosity:minimal paddle.sln
+msbuild /m:%PARALLEL_PROJECT_COUNT% /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:minimal paddle.sln
 if %ERRORLEVEL% NEQ 0 (
     set /a build_times=%build_times%+1
     if %build_times% GTR 2 (
-- 
GitLab


From 4c8ea492cdc95f61b279c6006ed642b4a0b063a3 Mon Sep 17 00:00:00 2001
From: Leo Chen <chenqiuliang@baidu.com>
Date: Wed, 16 Sep 2020 10:00:01 +0800
Subject: [PATCH 085/261] use shared dev_ctx (#27313)

---
 paddle/fluid/memory/allocation/best_fit_allocator_test.cu | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu
index eb24ba84c88..59c14103ca6 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu
+++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu
@@ -16,6 +16,7 @@
 #include <random>
 #include <thread>  // NOLINT
 #include <vector>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cuda_allocator.h"
@@ -41,12 +42,14 @@ TEST(BestFitAllocator, concurrent_cuda) {
   LockedAllocator concurrent_allocator(
       std::unique_ptr<Allocator>(new BestFitAllocator(cuda_allocation.get())));
 
+  platform::CUDAPlace gpu(0);
+  platform::CUDADeviceContext dev_ctx(gpu);
+
   auto th_main = [&](std::random_device::result_type seed) {
     std::default_random_engine engine(seed);
     std::uniform_int_distribution<size_t> dist(1U, 1024U);
-    platform::CUDAPlace gpu(0);
-    platform::CUDADeviceContext dev_ctx(gpu);
     std::array<size_t, 1024> buf;
+
     for (size_t i = 0; i < 128; ++i) {
       size_t allocate_size = dist(engine);
 
-- 
GitLab


From c89f269c4d32447eccbd1e53d8d324936c1cd5ba Mon Sep 17 00:00:00 2001
From: Leo Chen <chenqiuliang@baidu.com>
Date: Wed, 16 Sep 2020 10:00:45 +0800
Subject: [PATCH 086/261] Fix bug of handling blank characters in
 operators.cmake (#27310)

---
 cmake/operators.cmake | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index f60a6dc3f0c..aea972ab3db 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -138,12 +138,17 @@ function(op_library TARGET)
     # And for detail pybind information, please see generated paddle/pybind/pybind.h.
     file(READ ${TARGET}.cc TARGET_CONTENT)
     string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
-    string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}")
+    # [ \t\r\n]* is used for blank characters
+    string(REGEX MATCH "REGISTER_OPERATOR\\([ \t\r\n]*[a-z0-9_]*," one_register "${multi_register}")
+
     if (one_register STREQUAL "")
         string(REPLACE "_op" "" TARGET "${TARGET}")
     else ()
         string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}")
         string(REPLACE "," "" TARGET "${TARGET}")
+        # [ \t\r\n]+ is used for blank characters.
+        # Here we use '+' instead of '*' since it is a REPLACE operation.
+        string(REGEX REPLACE "[ \t\r\n]+" "" TARGET "${TARGET}")
     endif()
 
     # pybind USE_NO_KERNEL_OP
-- 
GitLab


From dae62556cb3f3b87e1ba87c30a6db6ffa9209100 Mon Sep 17 00:00:00 2001
From: Wilber <jiweibo@baidu.com>
Date: Wed, 16 Sep 2020 10:30:34 +0800
Subject: [PATCH 087/261] Enhance infer error info message (#26731)

---
 paddle/fluid/inference/analysis/analyzer.cc   |   8 +-
 .../inference/analysis/analyzer_tester.cc     |   9 +-
 paddle/fluid/inference/analysis/argument.h    |  99 ++++++++-------
 paddle/fluid/inference/analysis/helper.h      |  31 +++--
 .../inference/analysis/ir_pass_manager.cc     |  22 ++--
 .../analysis/ir_passes/subgraph_util.cc       |   9 +-
 .../ir_passes/tensorrt_subgraph_pass.cc       |  17 ++-
 .../analysis/passes/ir_analysis_pass.cc       |   7 +-
 .../analysis/passes/ir_graph_build_pass.cc    |  13 +-
 .../analysis/passes/ir_graph_clean_pass.cc    |   3 +-
 .../ir_params_sync_among_devices_pass.cc      |  15 ++-
 .../analysis/passes/memory_optimize_pass.cc   |   8 +-
 paddle/fluid/inference/api/analysis_config.cc |   3 +-
 .../fluid/inference/api/analysis_predictor.cc |  73 +++++++----
 paddle/fluid/inference/api/api.cc             |  13 +-
 paddle/fluid/inference/api/api_impl.cc        |  44 +++++--
 paddle/fluid/inference/api/helper.h           |  12 +-
 .../fluid/inference/api/mkldnn_quantizer.cc   | 116 +++++++++++-------
 paddle/fluid/inference/tests/test_helper.h    |   3 +-
 19 files changed, 331 insertions(+), 174 deletions(-)

diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index d6d0371edaa..be7d6ab8680 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -27,8 +27,9 @@ Analyzer::Analyzer() {}
 void Analyzer::Run(Argument *argument) { RunAnalysis(argument); }
 
 void Analyzer::RunAnalysis(Argument *argument) {
-  PADDLE_ENFORCE(argument->analysis_passes_valid(),
-                 "analsis_passes is not valid in the argument.");
+  PADDLE_ENFORCE_EQ(argument->analysis_passes_valid(), true,
+                    platform::errors::InvalidArgument(
+                        "analsis_passes is not valid in the argument."));
   const bool disable_logs = argument->disable_logs();
   for (auto &pass : argument->analysis_passes()) {
     if (!disable_logs) {
@@ -38,7 +39,8 @@ void Analyzer::RunAnalysis(Argument *argument) {
       continue;
 
     auto *ptr = PassRegistry::Global().Retreive(pass);
-    PADDLE_ENFORCE_NOT_NULL(ptr, "no analysis pass called %s", pass);
+    PADDLE_ENFORCE_NOT_NULL(ptr, platform::errors::PreconditionNotMet(
+                                     "no analysis pass called %s", pass));
     ptr->Run(argument);
   }
 }
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index 79784fcb9bf..135ef6a9706 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -75,9 +75,14 @@ void TestWord2vecPrediction(const std::string& model_path) {
   std::vector<PaddleTensor> outputs;
   CHECK(predictor->Run(slots, &outputs));
 
-  PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
+  PADDLE_ENFORCE_EQ(outputs.size(), 1UL,
+                    platform::errors::PreconditionNotMet(
+                        "Output size should be 1, but got %d", outputs.size()));
   // Check the output buffer size and result of each tid.
-  PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL);
+  PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL,
+                    platform::errors::PreconditionNotMet(
+                        "Output's data length should be 33168 but got %d",
+                        outputs.front().data.length()));
   float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
                      0.000932706};
   const size_t num_elements = outputs.front().data.length() / sizeof(float);
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 8d28b8ace26..40ca3e85868 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -76,53 +76,62 @@ struct Argument {
     }
   }
 
-#define DECL_ARGUMENT_FIELD(field__, Field, type__)          \
- public:                                                     \
-  type__& field__() {                                        \
-    PADDLE_ENFORCE(Has(#field__), "There is no such field"); \
-    return field__##_;                                       \
-  }                                                          \
-  void Set##Field(const type__& x) {                         \
-    field__##_ = x;                                          \
-    valid_fields_.insert(#field__);                          \
-  }                                                          \
-  DECL_ARGUMENT_FIELD_VALID(field__);                        \
-  type__* field__##_ptr() { return &field__##_; }            \
-                                                             \
- private:                                                    \
+#define DECL_ARGUMENT_FIELD(field__, Field, type__)                      \
+ public:                                                                 \
+  type__& field__() {                                                    \
+    PADDLE_ENFORCE_EQ(                                                   \
+        Has(#field__), true,                                             \
+        platform::errors::PreconditionNotMet("There is no such field")); \
+    return field__##_;                                                   \
+  }                                                                      \
+  void Set##Field(const type__& x) {                                     \
+    field__##_ = x;                                                      \
+    valid_fields_.insert(#field__);                                      \
+  }                                                                      \
+  DECL_ARGUMENT_FIELD_VALID(field__);                                    \
+  type__* field__##_ptr() { return &field__##_; }                        \
+                                                                         \
+ private:                                                                \
   type__ field__##_;
 
 #define DECL_ARGUMENT_FIELD_VALID(field__) \
   bool field__##_valid() { return Has(#field__); }
 
-#define DECL_ARGUMENT_UNIQUE_FIELD(field__, Field, type__)                \
- public:                                                                  \
-  type__& field__() {                                                     \
-    PADDLE_ENFORCE_NOT_NULL(field__##_);                                  \
-    PADDLE_ENFORCE(Has(#field__));                                        \
-    return *static_cast<type__*>(field__##_.get());                       \
-  }                                                                       \
-  void Set##Field(type__* x) {                                            \
-    field__##_ =                                                          \
-        unique_ptr_t(x, [](void* x) { delete static_cast<type__*>(x); }); \
-    valid_fields_.insert(#field__);                                       \
-  }                                                                       \
-  void Set##Field##NotOwned(type__* x) {                                  \
-    valid_fields_.insert(#field__);                                       \
-    field__##_ = unique_ptr_t(x, [](void* x) {});                         \
-  }                                                                       \
-  DECL_ARGUMENT_FIELD_VALID(field__);                                     \
-  type__* field__##_ptr() {                                               \
-    PADDLE_ENFORCE(Has(#field__));                                        \
-    return static_cast<type__*>(field__##_.get());                        \
-  }                                                                       \
-  type__* Release##Field() {                                              \
-    PADDLE_ENFORCE(Has(#field__));                                        \
-    valid_fields_.erase(#field__);                                        \
-    return static_cast<type__*>(field__##_.release());                    \
-  }                                                                       \
-                                                                          \
- private:                                                                 \
+#define DECL_ARGUMENT_UNIQUE_FIELD(field__, Field, type__)                    \
+ public:                                                                      \
+  type__& field__() {                                                         \
+    PADDLE_ENFORCE_NOT_NULL(field__##_, platform::errors::PreconditionNotMet( \
+                                            "filed should not be null."));    \
+    PADDLE_ENFORCE_EQ(                                                        \
+        Has(#field__), true,                                                  \
+        platform::errors::PreconditionNotMet("There is no such field"));      \
+    return *static_cast<type__*>(field__##_.get());                           \
+  }                                                                           \
+  void Set##Field(type__* x) {                                                \
+    field__##_ =                                                              \
+        unique_ptr_t(x, [](void* x) { delete static_cast<type__*>(x); });     \
+    valid_fields_.insert(#field__);                                           \
+  }                                                                           \
+  void Set##Field##NotOwned(type__* x) {                                      \
+    valid_fields_.insert(#field__);                                           \
+    field__##_ = unique_ptr_t(x, [](void* x) {});                             \
+  }                                                                           \
+  DECL_ARGUMENT_FIELD_VALID(field__);                                         \
+  type__* field__##_ptr() {                                                   \
+    PADDLE_ENFORCE_EQ(                                                        \
+        Has(#field__), true,                                                  \
+        platform::errors::PreconditionNotMet("There is no such field"));      \
+    return static_cast<type__*>(field__##_.get());                            \
+  }                                                                           \
+  type__* Release##Field() {                                                  \
+    PADDLE_ENFORCE_EQ(                                                        \
+        Has(#field__), true,                                                  \
+        platform::errors::PreconditionNotMet("There is no such field"));      \
+    valid_fields_.erase(#field__);                                            \
+    return static_cast<type__*>(field__##_.release());                        \
+  }                                                                           \
+                                                                              \
+ private:                                                                     \
   unique_ptr_t field__##_;
 
   DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int);
@@ -227,8 +236,10 @@ struct Argument {
 };
 
 #define ARGUMENT_CHECK_FIELD(argument__, fieldname__) \
-  PADDLE_ENFORCE(argument__->Has(#fieldname__),       \
-                 "the argument field [%s] should be set", #fieldname__);
+  PADDLE_ENFORCE_EQ(                                  \
+      argument__->Has(#fieldname__), true,            \
+      platform::errors::PreconditionNotMet(           \
+          "the argument field [%s] should be set", #fieldname__));
 
 }  // namespace analysis
 }  // namespace inference
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index a4805840024..730fe35853a 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -73,12 +73,15 @@ struct DataTypeNamer {
   template <typename T>
   const std::string &repr() const {
     auto x = std::type_index(typeid(T));
-    PADDLE_ENFORCE(dic_.count(x), "unknown type for representation");
+    PADDLE_ENFORCE_GT(dic_.count(x), 0, platform::errors::PreconditionNotMet(
+                                            "unknown type for representation"));
     return dic_.at(x);
   }
 
   const std::string &repr(const std::type_index &type) const {  // NOLINT
-    PADDLE_ENFORCE(dic_.count(type), "unknown type for representation");
+    PADDLE_ENFORCE_GT(dic_.count(type), 0,
+                      platform::errors::PreconditionNotMet(
+                          "unknown type for representation"));
     return dic_.at(type);
   }
 
@@ -116,7 +119,9 @@ template <typename T>
 class OrderedRegistry {
  public:
   T *Register(const std::string &name, T *x) {
-    PADDLE_ENFORCE(!dic_.count(name), "duplicate key [%s]", name);
+    PADDLE_ENFORCE_EQ(dic_.count(name), 0,
+                      platform::errors::PreconditionNotMet(
+                          "There exists duplicate key [%s]", name));
     dic_[name] = elements_.size();
     elements_.emplace_back(std::unique_ptr<T>(x));
     return elements_.back().get();
@@ -136,14 +141,20 @@ class OrderedRegistry {
 template <typename T>
 T &GetFromScope(const framework::Scope &scope, const std::string &name) {
   framework::Variable *var = scope.FindVar(name);
-  PADDLE_ENFORCE(var != nullptr);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::PreconditionNotMet(
+               "The var which name is %s should not be nullptr.", name));
   return *var->GetMutable<T>();
 }
 
 static framework::proto::ProgramDesc LoadProgramDesc(
     const std::string &model_path) {
   std::ifstream fin(model_path, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(fin.is_open(), "Cannot open file %s", model_path);
+  PADDLE_ENFORCE_EQ(
+      fin.is_open(), true,
+      platform::errors::NotFound(
+          "Cannot open file %s, please confirm whether the file exists",
+          model_path));
   fin.seekg(0, std::ios::end);
   std::string buffer(fin.tellg(), ' ');
   fin.seekg(0, std::ios::beg);
@@ -188,10 +199,12 @@ static std::string GetDirRoot(const std::string &path) {
 static std::string GetOrCreateModelOptCacheDir(const std::string &model_root) {
   std::string opt_cache_dir = model_root + "/_opt_cache/";
   if (!PathExists(opt_cache_dir)) {
-    PADDLE_ENFORCE(MKDIR(opt_cache_dir.c_str()) != -1,
-                   "Can not create optimize cache directory: %s, Make sure you "
-                   "have permission to write",
-                   opt_cache_dir);
+    PADDLE_ENFORCE_NE(
+        MKDIR(opt_cache_dir.c_str()), -1,
+        platform::errors::PreconditionNotMet(
+            "Can not create optimize cache directory: %s, Make sure you "
+            "have permission to write",
+            opt_cache_dir));
   }
   return opt_cache_dir;
 }
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index d52d71f148c..d136f5033e7 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -38,7 +38,9 @@ IRPassManager::IRPassManager(Argument *argument) {
   graph_ = std::unique_ptr<Graph>(new Graph(argument->main_program()));
   if (argument->Has("scope")) {
     auto *scope_ptr = argument->scope_ptr();
-    PADDLE_ENFORCE(scope_ptr);
+    PADDLE_ENFORCE_NOT_NULL(scope_ptr,
+                            platform::errors::PreconditionNotMet(
+                                "The scope ptr should not be nullptr."));
     graph_->SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr);
   }
 
@@ -101,13 +103,17 @@ void IRPassManager::CreatePasses(Argument *argument,
       std::string optim_cache_dir = argument->optim_cache_dir();
       bool int8_valid =
           !(model_from_memory && optim_cache_dir.empty() && enable_int8);
-      PADDLE_ENFORCE(int8_valid,
-                     "When you are in TRT INT8 mode, and load model from "
-                     "memory, you should set optim_cache_dir using "
-                     "config.SetOptimCacheDir()");
-      PADDLE_ENFORCE(!(model_from_memory && use_static_engine),
-                     "When you are using Paddle-TRT, and also using load model "
-                     "from memory, you should set the use_static to false.");
+      PADDLE_ENFORCE_EQ(
+          int8_valid, true,
+          platform::errors::PreconditionNotMet(
+              "When you are in TRT INT8 mode, and load model from "
+              "memory, you should set optim_cache_dir using "
+              "config.SetOptimCacheDir()"));
+      PADDLE_ENFORCE_EQ(
+          !(model_from_memory && use_static_engine), true,
+          platform::errors::PreconditionNotMet(
+              "When you are using Paddle-TRT, and also using load model "
+              "from memory, you should set the use_static to false."));
 
       if (!optim_cache_dir.empty()) {
         pass->Set("model_opt_cache_dir", new std::string(optim_cache_dir));
diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
index b3bfafb0a11..ebb19fd486c 100644
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
@@ -123,7 +123,9 @@ void RenameAndGetOutputs(
   auto add_block_var = [&](const std::string &graph_arg,
                            const std::string &block_arg) {
     auto arg_var_node = graph_var_map.find(graph_arg);
-    PADDLE_ENFORCE(arg_var_node != graph_var_map.end());
+    PADDLE_ENFORCE_NE(arg_var_node, graph_var_map.end(),
+                      platform::errors::InvalidArgument(
+                          "Can not find %s in graph_var_map", graph_arg));
     auto *var_t = block_desc->Var(block_arg);
     var_t->SetShape(arg_var_node->second->Var()->GetShape());
     var_t->SetDataType(arg_var_node->second->Var()->GetDataType());
@@ -133,7 +135,10 @@ void RenameAndGetOutputs(
     framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
     framework::OpDesc op_desc(*op, nullptr);
     auto correspond_node = subgraph_nodes[index];
-    PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
+    PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type(),
+                      platform::errors::PreconditionNotMet(
+                          "We should get %s, but get %s", op->type(),
+                          correspond_node->Name()));
 
     std::unordered_map<std::string, size_t> var2id;
     std::unordered_map<std::string, framework::ir::Node *> in_vars;
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 7ef072277fb..46612c1c5b7 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -97,7 +97,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
     std::vector<std::string> *repetitive_params) const {
   auto *op_desc = node->Op();
   auto &subgraph = *framework::ir::Agent(node).subgraph();
-  PADDLE_ENFORCE(!subgraph.empty());
+  PADDLE_ENFORCE_EQ(subgraph.empty(), false,
+                    platform::errors::PreconditionNotMet(
+                        "The subgraph should not be empty."));
 
   framework::ProgramDesc *program_desc =
       Get<framework::ProgramDesc *>("program");
@@ -194,12 +196,17 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   // to Tensor.
   std::vector<std::string> output_mapping;
   for (auto name : output_names) {
-    PADDLE_ENFORCE(output_name_map.count(name) != 0);
+    PADDLE_ENFORCE_NE(output_name_map.count(name), 0,
+                      platform::errors::PreconditionNotMet(
+                          "The output_name_map should have %s", name));
     output_mapping.push_back(output_name_map[name]);
   }
-  PADDLE_ENFORCE(!output_mapping.empty());
-  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
-                 "the block has no var-desc");
+  PADDLE_ENFORCE_EQ(output_mapping.empty(), false,
+                    platform::errors::PreconditionNotMet(
+                        "The output_mapping should not be empty."));
+  PADDLE_ENFORCE_EQ(
+      !block_desc.Proto()->vars().empty(), true,
+      platform::errors::PreconditionNotMet("the block has no var-desc"));
 
   // Set attrs
   op_desc->SetType("tensorrt_engine");
diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
index d986811a827..34192965297 100644
--- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
+#include <memory>
+#include <utility>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
 
@@ -31,7 +33,10 @@ void IrAnalysisPass::RunImpl(Argument* argument) {
   // Apply passes.
   IRPassManager the_ir_manager(argument);
   graph = the_ir_manager.Apply(std::move(graph));
-  PADDLE_ENFORCE_GT(graph->Nodes().size(), 0);
+  PADDLE_ENFORCE_GT(
+      graph->Nodes().size(), 0,
+      platform::errors::PreconditionNotMet(
+          "The graph nodes size should be greater than 0, but got 0"));
   argument->SetMainGraph(graph.release());
   CollectFusionStatis(argument);
 }
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
index 970ecdbbeb0..188b2ff851d 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
@@ -31,7 +31,9 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
   if (!argument->scope_valid()) {
     argument->SetScope(new framework::Scope);
   }
-  PADDLE_ENFORCE(argument->use_gpu_valid());
+  PADDLE_ENFORCE_EQ(argument->use_gpu_valid(), true,
+                    platform::errors::PreconditionNotMet(
+                        "The use_gpu field should be valid"));
 
   // The load program should run on the same device with the inference program,
   // so that the parameters will on the same device, or they will keep copying
@@ -51,14 +53,17 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
         argument->model_from_memory_valid() && argument->model_from_memory());
     argument->SetMainProgram(program.release());
   } else {
-    PADDLE_THROW(
-        "either model_dir or (program path and parameter path) should be set.");
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "either model_dir or (program path and parameter path) should be "
+        "set."));
   }
 
   auto graph = std::unique_ptr<Graph>(new Graph(argument->main_program()));
   argument->SetMainGraph(graph.release());
   auto *scope_ptr = argument->scope_ptr();
-  PADDLE_ENFORCE(scope_ptr);
+  PADDLE_ENFORCE_NOT_NULL(scope_ptr,
+                          platform::errors::PreconditionNotMet(
+                              "The scope ptr should not be nullptr."));
   argument->main_graph().SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr);
 }
 
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc
index 1f888a28da0..c30aa2a1629 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc
@@ -31,7 +31,8 @@ void IrInferCleanGraphPass::RunImpl(Argument* argument) {
   std::unordered_set<const framework::ir::Node*> invalid_nodes;
   int valid_op = 0;
   for (auto* node : graph.Nodes()) {
-    PADDLE_ENFORCE_NOT_NULL(node);
+    PADDLE_ENFORCE_NOT_NULL(node, platform::errors::PreconditionNotMet(
+                                      "The node should not be nullptr."));
     if (is_valid_node(node)) {
       invalid_nodes.insert(node);
     } else if (node->IsOp()) {
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index fedee3ff95f..f127478b5f2 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -23,8 +23,12 @@ namespace inference {
 namespace analysis {
 
 void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
-  PADDLE_ENFORCE(argument->scope_valid());
-  PADDLE_ENFORCE(argument->use_gpu_valid());
+  PADDLE_ENFORCE_EQ(
+      argument->scope_valid(), true,
+      platform::errors::PreconditionNotMet("The scope field should be valid"));
+  PADDLE_ENFORCE_EQ(argument->use_gpu_valid(), true,
+                    platform::errors::PreconditionNotMet(
+                        "The use_gpu field should be valid"));
 
   platform::Place place;
 
@@ -40,7 +44,9 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
 
   LOG(INFO) << "Sync params from CPU to GPU";
 
-  PADDLE_ENFORCE(argument->gpu_device_id_valid());
+  PADDLE_ENFORCE_EQ(argument->gpu_device_id_valid(), true,
+                    platform::errors::PreconditionNotMet(
+                        "The gpu_device_id field should be valid"));
   place = platform::CUDAPlace(argument->gpu_device_id());
 
   auto *scope = argument->scope_ptr();
@@ -56,7 +62,8 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
       continue;
     }
     auto *var = scope->FindLocalVar(var_name);
-    PADDLE_ENFORCE(var != nullptr);
+    PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet(
+                                     "The var should not be nullptr"));
     if (var->IsType<framework::LoDTensor>() ||
         var->IsType<framework::Tensor>()) {
       auto *t = var->GetMutable<framework::LoDTensor>();
diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
index 9eb84785157..f432188131e 100644
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
@@ -224,7 +224,9 @@ void UpdateOpDescsByReuse(
 
       // modify the graph
       for (auto input_node : node->inputs) {
-        PADDLE_ENFORCE(input_node->IsVar());
+        PADDLE_ENFORCE_EQ(input_node->IsVar(), true,
+                          platform::errors::PreconditionNotMet(
+                              "The input node should be a variable."));
         std::string input_node_name = input_node->Name();
         if (reuse_table.count(input_node_name) &&
             reuse_table.at(input_node_name) != input_node_name) {
@@ -246,7 +248,9 @@ void UpdateOpDescsByReuse(
 
       // modify the graph
       for (auto out_node : node->outputs) {
-        PADDLE_ENFORCE(out_node->IsVar());
+        PADDLE_ENFORCE_EQ(out_node->IsVar(), true,
+                          platform::errors::PreconditionNotMet(
+                              "The output node should be a variable."));
         std::string out_node_name = out_node->Name();
         if (reuse_table.count(out_node_name) &&
             reuse_table.at(out_node_name) != out_node_name) {
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 9fbc97d5509..2184574aa1f 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -230,7 +230,8 @@ void AnalysisConfig::EnableMkldnnBfloat16() {
 
 MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const {
   PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_,
-                          "MkldnnQuantizer was not enabled yet.");
+                          platform::errors::PreconditionNotMet(
+                              "MkldnnQuantizer was not enabled yet."));
   return mkldnn_quantizer_config_.get();
 }
 
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 64dfdda54ac..ac914700643 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -169,7 +169,8 @@ bool AnalysisPredictor::PrepareScope(
   if (parent_scope) {
     PADDLE_ENFORCE_NOT_NULL(
         parent_scope,
-        "Both program and parent_scope should be set in Clone mode.");
+        platform::errors::PreconditionNotMet(
+            "Both program and parent_scope should be set in Clone mode."));
     scope_ = parent_scope;
     status_is_cloned_ = true;
   } else {
@@ -235,7 +236,9 @@ bool AnalysisPredictor::PrepareExecutor() {
   executor_->Prepare(sub_scope_, *inference_program_, 0,
                      config_.use_feed_fetch_ops_);
 
-  PADDLE_ENFORCE_NOT_NULL(sub_scope_);
+  PADDLE_ENFORCE_NOT_NULL(sub_scope_,
+                          platform::errors::PreconditionNotMet(
+                              "The sub_scope should not be nullptr."));
 
   return true;
 }
@@ -297,7 +300,8 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
   timer.tic();
   // set feed variable
   framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
-  PADDLE_ENFORCE_NOT_NULL(scope, "The scope should not be nullptr.");
+  PADDLE_ENFORCE_NOT_NULL(scope, platform::errors::PreconditionNotMet(
+                                     "The scope should not be nullptr."));
   if (!SetFeed(inputs, scope)) {
     LOG(ERROR) << "fail to set feed";
     return false;
@@ -399,7 +403,11 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
   outputs->resize(fetches_.size());
   for (size_t i = 0; i < fetches_.size(); ++i) {
     int idx = BOOST_GET_CONST(int, fetches_[i]->GetAttr("col"));
-    PADDLE_ENFORCE((size_t)idx == i);
+    PADDLE_ENFORCE_EQ(
+        static_cast<size_t>(idx), i,
+        platform::errors::InvalidArgument(
+            "Fetch op's col attr(%d) should be equal to the index(%d)", idx,
+            i));
     framework::FetchType &fetch_var =
         framework::GetFetchVariable(*scope, "fetch", idx);
     auto &fetch = BOOST_GET(framework::LoDTensor, fetch_var);
@@ -435,10 +443,12 @@ void AnalysisPredictor::PrepareArgument() {
   if (!config_.model_dir().empty()) {
     argument_.SetModelDir(config_.model_dir());
   } else {
-    PADDLE_ENFORCE(
-        !config_.params_file().empty(),
-        "Either model_dir or (param_file, prog_file) should be set.");
-    PADDLE_ENFORCE(!config_.prog_file().empty());
+    PADDLE_ENFORCE_EQ(config_.params_file().empty(), false,
+                      platform::errors::PreconditionNotMet(
+                          "Either model_dir or param_file should be set."));
+    PADDLE_ENFORCE_EQ(config_.prog_file().empty(), false,
+                      platform::errors::PreconditionNotMet(
+                          "Either model_dir or prog_file should be set."));
     std::string dir = inference::analysis::GetDirRoot(config_.prog_file());
 
     argument_.SetModelProgramPath(config_.prog_file());
@@ -503,7 +513,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   PrepareArgument();
   Analyzer().Run(&argument_);
 
-  PADDLE_ENFORCE(argument_.scope_valid());
+  PADDLE_ENFORCE_EQ(
+      argument_.scope_valid(), true,
+      platform::errors::InvalidArgument("The argument scope should be valid."));
   VLOG(5) << "to prepare executor";
   ARGUMENT_CHECK_FIELD((&argument_), ir_analyzed_program);
   inference_program_.reset(
@@ -525,8 +537,10 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     FLAGS_minloglevel = 2;  // GLOG_ERROR
   }
   VLOG(3) << "create AnalysisConfig";
-  PADDLE_ENFORCE(config.is_valid(),
-                 "Note: Each config can only be used for one predictor.");
+  PADDLE_ENFORCE_EQ(
+      config.is_valid(), true,
+      platform::errors::InvalidArgument(
+          "Note: Each config can only be used for one predictor."));
 
   if (config.use_gpu()) {
     static std::once_flag gflags_initialized;
@@ -623,7 +637,9 @@ bool AnalysisPredictor::MkldnnQuantize() {
 }
 
 void AnalysisPredictor::PrepareFeedFetch() {
-  PADDLE_ENFORCE_NOT_NULL(sub_scope_);
+  PADDLE_ENFORCE_NOT_NULL(sub_scope_,
+                          platform::errors::InvalidArgument(
+                              "The sub_scope should not be nullptr."));
   CreateFeedFetchVar(sub_scope_);
   for (auto *op : inference_program_->Block(0).AllOps()) {
     if (op->Type() == "feed") {
@@ -646,7 +662,8 @@ void AnalysisPredictor::PrepareFeedFetch() {
 }
 
 void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
-  PADDLE_ENFORCE_NOT_NULL(scope);
+  PADDLE_ENFORCE_NOT_NULL(scope, platform::errors::InvalidArgument(
+                                     "The scope should not be nullptr."));
   auto *var = scope->Var("feed");
   var->GetMutable<framework::FeedList>();
   var = scope->Var("fetch");
@@ -667,7 +684,8 @@ AnalysisPredictor::GetInputTensorShape() {
   std::vector<std::string> names = GetInputNames();
   for (std::string name : names) {
     auto *var = inference_program_->Block(0).FindVar(name);
-    PADDLE_ENFORCE_NOT_NULL(var, "input %s does not exist.", name);
+    PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet(
+                                     "Input %s does not exist.", name));
     input_shapes[name] = var->GetShape();
   }
   return input_shapes;
@@ -683,7 +701,11 @@ std::vector<std::string> AnalysisPredictor::GetOutputNames() {
 
 std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
     const std::string &name) {
-  PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
+  PADDLE_ENFORCE_NOT_NULL(
+      executor_->scope()->FindVar(name),
+      platform::errors::PreconditionNotMet(
+          "The variable named %s is not found in the scope of the exector.",
+          name));
   std::unique_ptr<ZeroCopyTensor> res(
       new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
   res->input_or_output_ = true;
@@ -700,7 +722,11 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
 
 std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
     const std::string &name) {
-  PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
+  PADDLE_ENFORCE_NOT_NULL(
+      executor_->scope()->FindVar(name),
+      platform::errors::PreconditionNotMet(
+          "he variable named %s is not found in the scope of the exector.",
+          name));
   std::unique_ptr<ZeroCopyTensor> res(
       new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
   res->input_or_output_ = false;
@@ -761,8 +787,11 @@ bool AnalysisPredictor::LoadProgramDesc() {
     std::string pb_content;
     // Read binary
     std::ifstream fin(filename, std::ios::in | std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fin.is_open()), "Cannot open file %s",
-                   filename);
+    PADDLE_ENFORCE_EQ(
+        static_cast<bool>(fin.is_open()), true,
+        platform::errors::NotFound(
+            "Cannot open file %s, please confirm whether the file is normal.",
+            filename));
     fin.seekg(0, std::ios::end);
     pb_content.resize(fin.tellg());
     fin.seekg(0, std::ios::beg);
@@ -779,7 +808,8 @@ bool AnalysisPredictor::LoadProgramDesc() {
 
 bool AnalysisPredictor::LoadParameters() {
   PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
-                          "The inference program should be loaded first.");
+                          platform::errors::PreconditionNotMet(
+                              "The inference program should be loaded first."));
 
   const auto &global_block = inference_program_->MutableBlock(0);
 
@@ -855,8 +885,9 @@ void AnalysisPredictor::ClearIntermediateTensor() {
 
 #if PADDLE_WITH_TENSORRT
 bool AnalysisPredictor::SaveTrtCalibToDisk() {
-  PADDLE_ENFORCE(config_.tensorrt_engine_enabled(),
-                 "This func can be invoked only in trt mode");
+  PADDLE_ENFORCE_EQ(config_.tensorrt_engine_enabled(), true,
+                    platform::errors::PreconditionNotMet(
+                        "This func can be invoked only in trt mode"));
   auto &block = inference_program_->Block(0);
   for (auto &op_desc : block.AllOps()) {
     if (op_desc->Type() == "tensorrt_engine") {
diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc
index 2f608da531f..840541246af 100644
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -62,9 +62,9 @@ PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) {
     if (other.length() && other.data())
       memcpy(data_, other.data(), other.length());
     else if (other.length())
-      PADDLE_THROW(
+      PADDLE_THROW(platform::errors::InvalidArgument(
           "Invalid argument, null pointer data with length %u is passed",
-          other.length());
+          other.length()));
 
     length_ = other.length();
     memory_owned_ = true;
@@ -92,7 +92,8 @@ void PaddleBuf::Resize(size_t length) {
     length_ = length;
     memory_owned_ = true;
   } else {
-    PADDLE_THROW("The memory is allocated externally, can not Resized");
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "The memory is allocated externally, can not Resized"));
   }
 }
 
@@ -105,7 +106,11 @@ void PaddleBuf::Reset(void *data, size_t length) {
 
 void PaddleBuf::Free() {
   if (memory_owned_ && data_) {
-    PADDLE_ENFORCE_GT(length_, 0UL);
+    PADDLE_ENFORCE_GT(
+        length_, 0UL,
+        platform::errors::PreconditionNotMet(
+            "The memory used in PaddleBuf %d should be greater than 0",
+            length_));
     delete[] static_cast<char *>(data_);
     data_ = nullptr;
     length_ = 0;
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 07d6dcf86e9..ca0a5148f06 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -87,7 +87,9 @@ bool NativePaddlePredictor::Init(
   if (parent_scope) {
     scope_ = parent_scope;
     sub_scope_ = &(parent_scope->NewScope());
-    PADDLE_ENFORCE_NOT_NULL(sub_scope_, "create sub scope fail");
+    PADDLE_ENFORCE_NOT_NULL(sub_scope_,
+                            platform::errors::PreconditionNotMet(
+                                "The sub_scope should not be nullptr."));
   } else {
     paddle::framework::InitDevices(false);
     scope_.reset(new paddle::framework::Scope());
@@ -182,7 +184,10 @@ std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
   std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
   // Hot fix the bug that result diff in multi-thread.
   // TODO(Superjomn) re-implement a real clone here.
-  PADDLE_ENFORCE_NOT_NULL(dynamic_cast<NativePaddlePredictor *>(cls.get()));
+  PADDLE_ENFORCE_NOT_NULL(
+      dynamic_cast<NativePaddlePredictor *>(cls.get()),
+      platform::errors::PreconditionNotMet(
+          "Dynamic_cast from PaddlePredictor to NativePaddlePredictor failed"));
   if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(nullptr)) {
     LOG(ERROR) << "fail to call Init";
     return nullptr;
@@ -224,8 +229,13 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
       return false;
     }
 
-    PADDLE_ENFORCE_NOT_NULL(input_ptr);
-    PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data());
+    PADDLE_ENFORCE_NOT_NULL(input_ptr,
+                            platform::errors::InvalidArgument(
+                                "The input_ptr should not be nullptr."));
+    PADDLE_ENFORCE_NOT_NULL(
+        inputs[i].data.data(),
+        platform::errors::InvalidArgument(
+            "The data of input tensor should not be null."));
     if (platform::is_cpu_place(place_)) {
       // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
       std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
@@ -241,7 +251,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                    platform::CPUPlace(), inputs[i].data.data(),
                    inputs[i].data.length(), dev_ctx->stream());
 #else
-      PADDLE_THROW("Not compile with CUDA, should not reach here.");
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Not compile with CUDA, should not reach here."));
 #endif
     }
 
@@ -287,7 +298,11 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
   outputs->resize(fetchs_.size());
   for (size_t i = 0; i < fetchs_.size(); ++i) {
     int idx = BOOST_GET_CONST(int, fetchs_[i]->GetAttr("col"));
-    PADDLE_ENFORCE((size_t)idx == i);
+    PADDLE_ENFORCE_EQ(
+        static_cast<size_t>(idx), i,
+        platform::errors::InvalidArgument(
+            "Fetch op's col attr(%d) should be equal to the index(%d)", idx,
+            i));
     framework::FetchType &fetch_var =
         framework::GetFetchVariable(*scope, "fetch", idx);
     auto fetch = BOOST_GET_CONST(framework::LoDTensor, fetch_var);
@@ -318,10 +333,15 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
   VLOG(3) << "create NativePaddlePredictor";
   if (config.use_gpu) {
     // 1. GPU memory
-    PADDLE_ENFORCE_GE(
-        config.fraction_of_gpu_memory, 0.f,
-        "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
-    PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
+    PADDLE_ENFORCE_GE(config.fraction_of_gpu_memory, 0.f,
+                      platform::errors::InvalidArgument(
+                          "fraction_of_gpu_memory in the config should be set "
+                          "to range (0., 1.]"));
+    PADDLE_ENFORCE_GE(config.device, 0,
+                      platform::errors::PreconditionNotMet(
+                          "Invalid device id %d, the device id should be "
+                          "greater than or equal to 0.",
+                          config.device));
     std::vector<std::string> flags;
     if (config.fraction_of_gpu_memory >= 0.0f ||
         config.fraction_of_gpu_memory <= 0.95f) {
@@ -336,7 +356,9 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
 
   std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
   PADDLE_ENFORCE_NOT_NULL(
-      dynamic_cast<NativePaddlePredictor *>(predictor.get()));
+      dynamic_cast<NativePaddlePredictor *>(predictor.get()),
+      platform::errors::PreconditionNotMet(
+          "Dynamic_cast from PaddlePredictor to NativePaddlePredictor failed"));
   if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
     return nullptr;
   }
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index cddb0c8daf9..014985661fd 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -112,16 +112,19 @@ static T convert(const std::string &item,
     std::string message =
         "invalid_argument exception when try to convert : " + item;
     LOG(ERROR) << message;
-    PADDLE_THROW(message);
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "invalid_argument exception when try to convert %s.", item));
   } catch (std::out_of_range &e) {
     std::string message =
         "out_of_range exception when try to convert : " + item;
     LOG(ERROR) << message;
-    PADDLE_THROW(message);
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "out_of_range exception when try to convert %s.", item));
   } catch (...) {
     std::string message = "unexpected exception when try to convert " + item;
     LOG(ERROR) << message;
-    PADDLE_THROW(message);
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "unexpected exception when try to convert %s.", item));
   }
   return res;
 }
@@ -353,7 +356,8 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
                       double batch_latency, int epoch = 1,
                       const framework::proto::VarType::Type data_type =
                           framework::proto::VarType::FP32) {
-  PADDLE_ENFORCE_GT(batch_size, 0, "Non-positive batch size.");
+  PADDLE_ENFORCE_GT(batch_size, 0, platform::errors::InvalidArgument(
+                                       "Non-positive batch size."));
   double sample_latency = batch_latency / batch_size;
   LOG(INFO) << "====== threads: " << num_threads << ", thread id: " << tid
             << " ======";
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc
index 9be12ff309a..793fc53d90b 100644
--- a/paddle/fluid/inference/api/mkldnn_quantizer.cc
+++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc
@@ -62,9 +62,12 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
             if (scales_.find(var_name) != scales_.end()) continue;
 
             auto* var = predictor_.sub_scope_->FindVar(var_name);
-            PADDLE_ENFORCE(var, "%s is not in the scope", var_name);
-            PADDLE_ENFORCE(var->IsType<LoDTensor>(),
-                           "Only support lod tensor now.");
+            PADDLE_ENFORCE_NOT_NULL(var,
+                                    platform::errors::PreconditionNotMet(
+                                        "%s is not in the scope", var_name));
+            PADDLE_ENFORCE_EQ(var->IsType<LoDTensor>(), true,
+                              platform::errors::PreconditionNotMet(
+                                  "Only support lod tensor now."));
             LoDTensor* var_tensor = var->GetMutable<LoDTensor>();
 
             // force unsigned type if already know it
@@ -82,9 +85,11 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
               } else if (op->Type() == "transpose2" ||
                          op->Type() == "reshape2" || op->Type() == "pool2d") {
                 auto input_var_name = op->Input("X")[0];
-                PADDLE_ENFORCE(scales_.find(input_var_name) != scales_.end(),
-                               "Input scales must be calculated before the "
-                               "output scales to infer if output is unsigned.");
+                PADDLE_ENFORCE_NE(
+                    scales_.find(input_var_name), scales_.end(),
+                    platform::errors::PreconditionNotMet(
+                        "Input scales must be calculated before the "
+                        "output scales to infer if output is unsigned."));
                 if (scales_.find(input_var_name) != scales_.end()) {
                   scales_[var_name] = scales_[input_var_name];
                 }
@@ -94,10 +99,11 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
                 is_unsigned = true;
                 double min_scale = std::numeric_limits<double>::max();
                 for (auto input_var_name : op->Input("X")) {
-                  PADDLE_ENFORCE(
-                      scales_.find(input_var_name) != scales_.end(),
-                      "Input scales must be calculated before the "
-                      "output scales to infer if output is unsigned.");
+                  PADDLE_ENFORCE_NE(
+                      scales_.find(input_var_name), scales_.end(),
+                      platform::errors::PreconditionNotMet(
+                          "Input scales must be calculated before the "
+                          "output scales to infer if output is unsigned."));
                   is_unsigned = is_unsigned && scales_[input_var_name].first;
                   min_scale = std::min(
                       min_scale,
@@ -132,11 +138,12 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateSingleScale(
   auto rule = qconfig_->scale_algo(op_type_name, conn_name);
   if (rule == ScaleAlgo::NONE) return;
 
-  PADDLE_ENFORCE(
-      var_tensor.numel() > 0,
-      "MkldnnQuantizer: LoDTensor of variable %s for quantization of op "
-      "%s of connection %s should not be empty.",
-      var_name, op_type_name, conn_name);
+  PADDLE_ENFORCE_GT(
+      var_tensor.numel(), 0,
+      platform::errors::InvalidArgument(
+          "MkldnnQuantizer: LoDTensor of variable %s for quantization of op "
+          "%s of connection %s should not be empty.",
+          var_name, op_type_name, conn_name));
 
   switch (rule) {
     case ScaleAlgo::MAX:
@@ -205,10 +212,11 @@ AnalysisPredictor::MkldnnQuantizer::GetKLScalingFactor(
   float min_val = eigen_tensor.minCoeff();
   bool is_positive = min_val >= 0.0f;
   if (is_unsigned)
-    PADDLE_ENFORCE(
-        is_positive,
-        "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
-        min_val);
+    PADDLE_ENFORCE_EQ(
+        is_positive, true,
+        platform::errors::InvalidArgument(
+            "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
+            min_val));
 
   int num_quantized_bins = 255;
 
@@ -316,10 +324,11 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor(
   float max_abs = eigen_tensor.abs().maxCoeff();
   float min_val = eigen_tensor.minCoeff();
   if (is_unsigned)
-    PADDLE_ENFORCE(
-        min_val >= 0.0f,
-        "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
-        min_val);
+    PADDLE_ENFORCE_GE(
+        min_val, 0.0f,
+        platform::errors::InvalidArgument(
+            "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
+            min_val));
 
   LoDTensor scale_tensor = CreateScaleTensor();
   scale_tensor.data<double>()[0] = 1.0 / max_abs;
@@ -330,16 +339,19 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor(
 std::pair<bool, LoDTensor>
 AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor(
     const LoDTensor& var_tensor, bool is_unsigned, bool is_transposed) const {
-  PADDLE_ENFORCE(var_tensor.dims().size() > 0, "Tensor dimension is empty.");
+  PADDLE_ENFORCE_GT(
+      var_tensor.dims().size(), 0,
+      platform::errors::InvalidArgument("Tensor dimension is empty."));
 
   ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
                                         var_tensor.numel(), 1};
   float min_val = eigen_tensor.minCoeff();
   if (is_unsigned)
-    PADDLE_ENFORCE(
-        min_val >= 0.0f,
-        "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
-        min_val);
+    PADDLE_ENFORCE_GE(
+        min_val, 0.0f,
+        platform::errors::InvalidArgument(
+            "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
+            min_val));
 
   auto dims = var_tensor.dims();
   constexpr int num_col_dims = 1;
@@ -367,17 +379,19 @@ AnalysisPredictor::MkldnnQuantizer::Histogram(
     const framework::LoDTensor& var_tensor, float min_val, float max_val,
     size_t num_bins) const {
   PADDLE_ENFORCE_GT(num_bins, 0,
-                    "MkldnnQuantizer: To calculate Histogram, num_bins (" +
-                        std::to_string(num_bins) + ") must be positive.");
-  PADDLE_ENFORCE_GT(
-      var_tensor.numel(), 0,
-      "MkldnnQuantizer: To calculate Histogram, the tensor must not be empty.");
-  PADDLE_ENFORCE(max_val >= min_val,
-                 "MkldnnQuantizer: To calculate Histogram, max_val (" +
-                     std::to_string(max_val) +
-                     ") must be greater or equal"
-                     "to min_val (" +
-                     std::to_string(min_val) + ").");
+                    platform::errors::InvalidArgument(
+                        "MkldnnQuantizer: To calculate Histogram, num_bins (" +
+                        std::to_string(num_bins) + ") must be positive."));
+  PADDLE_ENFORCE_GT(var_tensor.numel(), 0,
+                    platform::errors::InvalidArgument(
+                        "MkldnnQuantizer: To calculate Histogram, the tensor "
+                        "must not be empty."));
+  PADDLE_ENFORCE_GE(max_val, min_val,
+                    platform::errors::InvalidArgument(
+                        "MkldnnQuantizer: To calculate Histogram, max_val (" +
+                        std::to_string(max_val) + ") must be greater or equal"
+                                                  "to min_val (" +
+                        std::to_string(min_val) + ")."));
   ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
                                         var_tensor.numel(), 1};
   auto bin_width = std::abs(max_val - min_val) / num_bins;
@@ -407,7 +421,8 @@ void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const {
   auto graph = std::unique_ptr<Graph>(new Graph(arg.main_program()));
   arg.SetMainGraph(graph.release());
   auto* scope_ptr = arg.scope_ptr();
-  PADDLE_ENFORCE(scope_ptr);
+  PADDLE_ENFORCE_NOT_NULL(scope_ptr, platform::errors::PreconditionNotMet(
+                                         "The scope should not be nullptr."));
   arg.main_graph().SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr);
 
   auto* builder = predictor_.config_.pass_builder();
@@ -441,7 +456,9 @@ bool AnalysisPredictor::MkldnnQuantizer::RunQuantizePasses() const {
   PrepareArgument();
   auto& arg = predictor_.argument_;
   Analyzer().Run(&arg);
-  PADDLE_ENFORCE(arg.scope_valid());
+  PADDLE_ENFORCE_EQ(
+      arg.scope_valid(), true,
+      platform::errors::PreconditionNotMet("The scope should be valid."));
   VLOG(5) << "to prepare executor";
   ARGUMENT_CHECK_FIELD((&arg), ir_analyzed_program);
   predictor_.inference_program_.reset(
@@ -456,7 +473,8 @@ bool AnalysisPredictor::MkldnnQuantizer::RunWarmup() const {
   VLOG(3) << "Predictor: run a quantization warmup iteration";
   auto warmup_data = qconfig_->warmup_data();
   PADDLE_ENFORCE_NOT_NULL(warmup_data,
-                          "Warmup data cannot be NULL in the config.");
+                          platform::errors::PreconditionNotMet(
+                              "Warmup data cannot be NULL in the config."));
   PrettyLogH1("--- Running warmup iteration for quantization");
 
   // Run the inference program
@@ -469,7 +487,10 @@ bool AnalysisPredictor::MkldnnQuantizer::RunWarmup() const {
 float AnalysisPredictor::MkldnnQuantizer::SafeEntropy(
     std::vector<int> reference_distr_P, int P_sum,
     std::vector<int> candidate_distr_Q, int Q_sum) const {
-  PADDLE_ENFORCE_EQ(reference_distr_P.size(), candidate_distr_Q.size());
+  PADDLE_ENFORCE_EQ(reference_distr_P.size(), candidate_distr_Q.size(),
+                    platform::errors::InvalidArgument(
+                        "The P size %d should be equal to Q size %d",
+                        reference_distr_P.size(), candidate_distr_Q.size()));
   float tmp_sum1 = 0;
   float tmp_sum2 = 0;
   for (size_t idx = 0; idx < reference_distr_P.size(); idx++) {
@@ -479,10 +500,11 @@ float AnalysisPredictor::MkldnnQuantizer::SafeEntropy(
       tmp_sum1 += 0;
       tmp_sum2 += 0;
     } else {
-      PADDLE_ENFORCE(q_idx != 0, "MkldnnQuantizer: Fatal error!, idx = " +
-                                     std::to_string(idx) +
-                                     " qindex = 0! p_idx = " +
-                                     std::to_string(p_idx));
+      PADDLE_ENFORCE_NE(
+          q_idx, 0,
+          platform::errors::PreconditionNotMet(
+              "MkldnnQuantizer: Fatal error!, idx = " + std::to_string(idx) +
+              " qindex = 0! p_idx = " + std::to_string(p_idx)));
     }
     tmp_sum1 += p_idx * (log(Q_sum * p_idx));
     tmp_sum2 += p_idx * (log(P_sum * q_idx));
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index d27959aff6f..1457f5337e3 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -163,7 +163,8 @@ void TestInference(const std::string& dirname,
     //   int device_id = place.GetDeviceId();
     paddle::platform::SetDeviceId(0);
 #else
-    PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+    PADDLE_THROW(paddle::platform::errors::Unavailable(
+        "'CUDAPlace' is not supported in CPU only device."));
 #endif
   }
 
-- 
GitLab


From 389a9a7e0ec6908cf82ece30a8ba528c6c957ca6 Mon Sep 17 00:00:00 2001
From: danleifeng <52735331+danleifeng@users.noreply.github.com>
Date: Wed, 16 Sep 2020 10:37:26 +0800
Subject: [PATCH 088/261] fix ports conflict when use paddlecloud to launch
 analogue multi-nodes (#26191)

* fix ports conflict when launching multi-nodes in paddlecloud;test=develop

* add DISTRIBUTED_TRAINER_ENDPOINTS env for cloud;test=develop
---
 python/paddle/distributed/cloud_utils.py      | 72 +++++++++++-------
 .../paddle/distributed/fleet/cloud_utils.py   | 75 ++++++++++++-------
 python/paddle/distributed/fleet/launch.py     | 17 +++--
 .../paddle/distributed/fleet/launch_utils.py  | 15 ++--
 python/paddle/distributed/launch.py           | 17 +++--
 python/paddle/distributed/utils.py            | 14 +++-
 .../tests/unittests/test_fleet_launch.sh      |  4 +-
 .../fluid/tests/unittests/test_launch.sh      |  4 +-
 python/paddle/tests/test_dist_hapi_model.py   |  6 +-
 9 files changed, 143 insertions(+), 81 deletions(-)

diff --git a/python/paddle/distributed/cloud_utils.py b/python/paddle/distributed/cloud_utils.py
index 345b783d60b..5b7268e4b64 100644
--- a/python/paddle/distributed/cloud_utils.py
+++ b/python/paddle/distributed/cloud_utils.py
@@ -19,7 +19,7 @@ from paddle.distributed.utils import get_cluster, logger
 
 def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_gpus):
     """
-    args_node_ips, args_node_ip:string
+    args_node_ips:string, args_node_ip:string, args_port: int, selected_gpus:list
     """
     #you can automatically get ip info while using paddlecloud multi nodes mode.
     node_ips = os.getenv("PADDLE_TRAINERS")
@@ -31,6 +31,9 @@ def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_gpus):
     node_rank = os.getenv("PADDLE_TRAINER_ID")
     assert node_rank is not None, "PADDLE_TRAINER_ID should not be None"
 
+    paddle_ports_num = int(os.getenv("TRAINER_PORTS_NUM"))
+    assert paddle_ports_num is not None, "TRAINER_PORTS_NUM should not be None"
+
     node_ips = node_ips.split(",")
     num_nodes = len(node_ips)
     node_rank = int(node_rank)
@@ -47,32 +50,47 @@ automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node).\
 Your input cluster_node_ips: {} doesn't equals to IPs: {} from \
 paddlecloud environment.".format(args_node_ips, node_ips))
 
-    started_port = args_port
-    print("num_nodes:", num_nodes)
-    if num_nodes > 1:
-        try:
-            paddle_port = int(os.getenv("PADDLE_PORT", ""))
-            paddle_port_num = int(os.getenv("TRAINER_PORTS_NUM", ""))
-
-            if paddle_port_num >= len(
-                    selected_gpus) and paddle_port != args_port:
-                logger.warning("Use Cloud specified port:{}.".format(
-                    paddle_port))
-                started_port = paddle_port
-
-        except Exception as e:
-            print(e)
-            pass
-
-    if started_port is None:
-        started_port = 6170
-
-    logger.debug("parsed from args:node_ips:{} \
-        node_ip:{} node_rank:{} started_port:{}"
-                 .format(node_ips, node_ip, node_rank, started_port))
-
-    ports = [x for x in range(started_port, started_port + len(selected_gpus))]
-    cluster, pod = get_cluster(node_ips, node_ip, ports, selected_gpus)
+    # DISTRIBUTED_TRAINER_ENDPOINTS: new environment since paddlecloud 1.8.4
+    # e.g: DISTRIBUTED_TRAINER_ENDPOINTS="ip1:port1,ip1:port2,ip1:port3,ip1:port4,ip2:port5,ip2:port6,ip2:port7,ip2:port8"
+    trainer_endpoints = os.getenv("DISTRIBUTED_TRAINER_ENDPOINTS")
+    if trainer_endpoints is None:
+        started_port = args_port
+        if num_nodes > 1:
+            try:
+                paddle_port = int(os.getenv("PADDLE_PORT", ""))
+
+                if paddle_ports_num >= len(
+                        selected_gpus) and paddle_port != args_port:
+                    logger.warning("Use Cloud specified port:{}.".format(
+                        paddle_port))
+                    started_port = paddle_port
+
+            except Exception as e:
+                print(e)
+                pass
+
+        if started_port is None:
+            started_port = 6170
+        ports = [
+            x for x in range(started_port, started_port + len(selected_gpus))
+        ]
+        trainer_endpoints = []
+        for ip in node_ips:
+            trainer_endpoints.append(["%s:%d" % (ip, port) for port in ports])
+    else:
+        trainer_endpoints_ori = trainer_endpoints.split(",")
+        trainer_endpoints = []
+        assert num_nodes * paddle_ports_num == len(trainer_endpoints_ori)
+        for i in range(num_nodes):
+            trainer_endpoints.append(trainer_endpoints_ori[
+                i * paddle_ports_num:(i + 1) * paddle_ports_num])
+
+    logger.debug("parsed from args: node_ips:{} \
+        node_ip:{} node_rank:{} trainer_endpoints:{}"
+                 .format(node_ips, node_ip, node_rank, trainer_endpoints))
+
+    cluster, pod = get_cluster(node_ips, node_ip, trainer_endpoints,
+                               selected_gpus)
     return cluster, cluster.pods[node_rank]
 
 
diff --git a/python/paddle/distributed/fleet/cloud_utils.py b/python/paddle/distributed/fleet/cloud_utils.py
index 49d66118d90..a1203bed85c 100644
--- a/python/paddle/distributed/fleet/cloud_utils.py
+++ b/python/paddle/distributed/fleet/cloud_utils.py
@@ -19,7 +19,7 @@ from paddle.distributed.fleet.launch_utils import get_cluster, logger
 
 def get_cloud_cluster(args_node_ips, selected_gpus, args_port=6170):
     """
-    args_node_ips, args_node_ip:string
+    args_node_ips:string, selected_gpus:list, args_port: int
     """
     #you can automatically get ip info while using paddlecloud multi nodes mode.
     node_ips = os.getenv("PADDLE_TRAINERS")
@@ -31,6 +31,9 @@ def get_cloud_cluster(args_node_ips, selected_gpus, args_port=6170):
     node_rank = os.getenv("PADDLE_TRAINER_ID")
     assert node_rank is not None, "PADDLE_TRAINER_ID should not be None"
 
+    paddle_ports_num = int(os.getenv("TRAINER_PORTS_NUM"))
+    assert paddle_ports_num is not None, "TRAINER_PORTS_NUM should not be None"
+
     node_ips = node_ips.split(",")
     num_nodes = len(node_ips)
     node_rank = int(node_rank)
@@ -42,32 +45,47 @@ automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node).\
 Your input cluster_node_ips: {} doesn't equals to IPs: {} from \
 paddlecloud environment.".format(args_node_ips, node_ips))
 
-    started_port = args_port
-    print("num_nodes:", num_nodes)
-    if num_nodes > 1:
-        try:
-            paddle_port = int(os.getenv("PADDLE_PORT", ""))
-            paddle_port_num = int(os.getenv("TRAINER_PORTS_NUM", ""))
-
-            if paddle_port_num >= len(
-                    selected_gpus) and paddle_port != args_port:
-                logger.warning("Use Cloud specified port:{}.".format(
-                    paddle_port))
-                started_port = paddle_port
-
-        except Exception as e:
-            print(e)
-            pass
-
-    if started_port is None:
-        started_port = 6170
-
-    logger.debug("parsed from args:node_ips:{} \
-        node_ip:{} node_rank:{} started_port:{}"
-                 .format(node_ips, node_ip, node_rank, started_port))
-
-    ports = [x for x in range(started_port, started_port + len(selected_gpus))]
-    cluster, pod = get_cluster(node_ips, node_ip, ports, selected_gpus)
+    # DISTRIBUTED_TRAINER_ENDPOINTS: new environment since paddlecloud 1.8.4
+    # e.g: DISTRIBUTED_TRAINER_ENDPOINTS="ip1:port1,ip1:port2,ip1:port3,ip1:port4,ip2:port5,ip2:port6,ip2:port7,ip2:port8"
+    trainer_endpoints = os.getenv("DISTRIBUTED_TRAINER_ENDPOINTS")
+    if trainer_endpoints is None:
+        started_port = args_port
+        if num_nodes > 1:
+            try:
+                paddle_port = int(os.getenv("PADDLE_PORT", ""))
+
+                if paddle_ports_num >= len(
+                        selected_gpus) and paddle_port != args_port:
+                    logger.warning("Use Cloud specified port:{}.".format(
+                        paddle_port))
+                    started_port = paddle_port
+
+            except Exception as e:
+                print(e)
+                pass
+
+        if started_port is None:
+            started_port = 6170
+        ports = [
+            x for x in range(started_port, started_port + len(selected_gpus))
+        ]
+        trainer_endpoints = []
+        for ip in node_ips:
+            trainer_endpoints.append(["%s:%d" % (ip, port) for port in ports])
+    else:
+        trainer_endpoints_ori = trainer_endpoints.split(",")
+        trainer_endpoints = []
+        assert num_nodes * paddle_ports_num == len(trainer_endpoints_ori)
+        for i in range(num_nodes):
+            trainer_endpoints.append(trainer_endpoints_ori[
+                i * paddle_ports_num:(i + 1) * paddle_ports_num])
+
+    logger.debug("parsed from args: node_ips:{} \
+        node_ip:{} node_rank:{} trainer_endpoints:{}"
+                 .format(node_ips, node_ip, node_rank, trainer_endpoints))
+
+    cluster, pod = get_cluster(node_ips, node_ip, trainer_endpoints,
+                               selected_gpus)
     return cluster, cluster.pods[node_rank]
 
 
@@ -75,7 +93,8 @@ def use_paddlecloud():
     node_ips = os.getenv("PADDLE_TRAINERS")
     node_ip = os.getenv("POD_IP")
     node_rank = os.getenv("PADDLE_TRAINER_ID")
-    if node_ips is None or node_ip is None or node_rank is None:
+    paddle_ports_num = os.getenv("TRAINER_PORTS_NUM")
+    if node_ips is None or node_ip is None or node_rank is None or paddle_ports_num is None:
         return False
     else:
         return True
diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
index 7778acaf83b..6dba385c569 100644
--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -157,17 +157,20 @@ def get_cluster_from_args(args, gpus):
 
         free_ports = [x for x in range(start_port, start_port + len(gpus))]
 
-    return get_cluster(node_ips, node_ip, free_ports, gpus)
+    trainer_endpoints = []
+    for ip in node_ips:
+        trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
+    return get_cluster(node_ips, node_ip, trainer_endpoints, gpus)
 
 
 def get_gpus(gpus):
     if gpus is None:
         gpus_num = fluid.core.get_cuda_device_count()
-        gpus = [str(x) for x in range(0, gpus_num)]
+        res_gpus = [str(x) for x in range(0, gpus_num)]
     else:
         cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
         if cuda_visible_devices is None or cuda_visible_devices == "":
-            gpus = [x.strip() for x in gpus.split(',')]
+            res_gpus = [x.strip() for x in gpus.split(',')]
         else:
             # change gpus into relative values
             # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.gpus=4,5,6,7;
@@ -177,12 +180,16 @@ def get_gpus(gpus):
                 assert x in cuda_visible_devices_list, "Can't find "\
                 "your gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
                 % (x, cuda_visible_devices)
-            gpus = [
+            res_gpus = [
                 cuda_visible_devices_list.index(x.strip())
                 for x in gpus.split(',')
             ]
+            logger.info("Change selected_gpus into reletive values. --ips:{} "
+                        "will change into relative_ips:{} according to your "
+                        "CUDA_VISIBLE_DEVICES:{}".format(
+                            gpus, res_gpus, cuda_visible_devices_list))
 
-    return gpus
+    return res_gpus
 
 
 def launch_collective(args):
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index 0e995200dde..b6f4c75a276 100644
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -227,18 +227,23 @@ def get_logger(log_level=20, name="root"):
     return logger
 
 
-def get_cluster(node_ips, node_ip, paddle_ports, selected_gpus):
-    assert type(paddle_ports) is list, "paddle_ports must be list"
+def get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus):
+    assert type(trainer_endpoints) is list, "trainer_endpoints must be list"
     cluster = Cluster(hdfs=None)
     trainer_rank = 0
     for node_rank, ip in enumerate(node_ips):
         pod = Pod()
         pod.rank = node_rank
         pod.addr = ip
+        cur_node_endpoints = trainer_endpoints[node_rank]
+        # when use paddlecloud, endpoints may > selected_gpus(user_defined)
+        assert len(cur_node_endpoints) >= len(
+            selected_gpus
+        ), "current trainer_endpoints size should be greater equal than selected_gpus size."
         for i in range(len(selected_gpus)):
             trainer = Trainer()
             trainer.gpus.append(selected_gpus[i])
-            trainer.endpoint = "%s:%d" % (ip, paddle_ports[i])
+            trainer.endpoint = "%s" % (cur_node_endpoints[i])
             trainer.rank = trainer_rank
             trainer_rank += 1
 
@@ -424,10 +429,6 @@ def start_local_trainers(cluster,
                             len(pod.trainers),
                             pretty_print_envs(proc_env, ("Distributed Envs",
                                                          "Value"))))
-            logger.info(
-                "More details for debug about commands and environments are written in {}/run.sh".
-                format(log_dir))
-
         fn = None
         if log_dir is not None:
             os.system("mkdir -p {}".format(log_dir))
diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py
index e2ab321f9ae..9b969cf3002 100644
--- a/python/paddle/distributed/launch.py
+++ b/python/paddle/distributed/launch.py
@@ -160,18 +160,21 @@ def get_cluster_from_args(args, selected_gpus):
             x for x in range(started_port, started_port + len(selected_gpus))
         ]
 
-    return get_cluster(node_ips, node_ip, free_ports, selected_gpus)
+    trainer_endpoints = []
+    for ip in node_ips:
+        trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
+    return get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus)
 
 
 def get_gpus(selected_gpus):
     if selected_gpus is None:
         from paddle.fluid import core
         gpus_num = core.get_cuda_device_count()
-        selected_gpus = [str(x) for x in range(0, gpus_num)]
+        gpus = [str(x) for x in range(0, gpus_num)]
     else:
         cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
         if cuda_visible_devices is None or cuda_visible_devices == "":
-            selected_gpus = [x.strip() for x in selected_gpus.split(',')]
+            gpus = [x.strip() for x in selected_gpus.split(',')]
         else:
             # change selected_gpus into relative values
             # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7;
@@ -181,12 +184,16 @@ def get_gpus(selected_gpus):
                 assert x in cuda_visible_devices_list, "Can't find "\
                 "your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
                 % (x, cuda_visible_devices)
-            selected_gpus = [
+            gpus = [
                 cuda_visible_devices_list.index(x.strip())
                 for x in selected_gpus.split(',')
             ]
+            logger.info("Change selected_gpus into reletive values. --ips:{} "
+                        "will change into relative_ips:{} according to your "
+                        "CUDA_VISIBLE_DEVICES:{}".format(
+                            selected_gpus, gpus, cuda_visible_devices_list))
 
-    return selected_gpus
+    return gpus
 
 
 def get_cluster_and_pod(args):
diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py
index 1fa307c4d1b..be144a55b86 100644
--- a/python/paddle/distributed/utils.py
+++ b/python/paddle/distributed/utils.py
@@ -227,18 +227,23 @@ def get_logger(log_level, name="root"):
     return logger
 
 
-def get_cluster(node_ips, node_ip, paddle_ports, selected_gpus):
-    assert type(paddle_ports) is list, "paddle_ports must be list"
+def get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus):
+    assert type(trainer_endpoints) is list, "trainer_endpoints must be list"
     cluster = Cluster(hdfs=None)
     trainer_rank = 0
     for node_rank, ip in enumerate(node_ips):
         pod = Pod()
         pod.rank = node_rank
         pod.addr = ip
+        cur_node_endpoints = trainer_endpoints[node_rank]
+        # when use paddlecloud, endpoints may > selected_gpus(user_defined)
+        assert len(cur_node_endpoints) >= len(
+            selected_gpus
+        ), "current trainer_endpoints size should be greater equal than selected_gpus size."
         for i in range(len(selected_gpus)):
             trainer = Trainer()
             trainer.gpus.append(selected_gpus[i])
-            trainer.endpoint = "%s:%d" % (ip, paddle_ports[i])
+            trainer.endpoint = "%s" % (cur_node_endpoints[i])
             trainer.rank = trainer_rank
             trainer_rank += 1
 
@@ -253,7 +258,8 @@ def terminate_local_procs(procs):
     for p in procs:
         if p.proc.poll() is None:
             p.proc.terminate()
-            p.log_fn.close()
+            if p.log_fn:
+                p.log_fn.close()
             logger.debug("terminate process id:{}".format(p.proc.pid))
 
     #wait all process terminiated
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
index c5edc969634..e717962ead2 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
@@ -79,9 +79,9 @@ if [ -f $file_1 ]; then
     rm $file_1
 fi
 
-
+# test use DISTRIBUTED_TRAINER_ENDPOINTS env in paddlecloud
 unset PADDLE_PORT
-unset TRAINER_PORTS_NUM
+export DISTRIBUTED_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171
 
 echo ""
 echo "paddle.distributed.launch async poll process test"
diff --git a/python/paddle/fluid/tests/unittests/test_launch.sh b/python/paddle/fluid/tests/unittests/test_launch.sh
index 98c907a5519..958d7824662 100644
--- a/python/paddle/fluid/tests/unittests/test_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_launch.sh
@@ -48,9 +48,9 @@ if [ -f $file_1 ]; then
     rm $file_1
 fi
 
-
+# test use DISTRIBUTED_TRAINER_ENDPOINTS env in paddlecloud
 unset PADDLE_PORT
-unset TRAINER_PORTS_NUM
+export DISTRIBUTED_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171
 
 echo ""
 echo "paddle.distributed.launch async poll process test"
diff --git a/python/paddle/tests/test_dist_hapi_model.py b/python/paddle/tests/test_dist_hapi_model.py
index e75e08e3749..db5b63c5ae0 100644
--- a/python/paddle/tests/test_dist_hapi_model.py
+++ b/python/paddle/tests/test_dist_hapi_model.py
@@ -37,7 +37,11 @@ def get_cluster_from_args(selected_gpus):
     free_ports = find_free_ports(len(selected_gpus))
     if free_ports is not None:
         free_ports = list(free_ports)
-    return get_cluster(node_ips, node_ip, free_ports, selected_gpus)
+
+    trainer_endpoints = []
+    for ip in node_ips:
+        trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
+    return get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus)
 
 
 def get_gpus(selected_gpus):
-- 
GitLab


From d003573f90537392a064e8284d7fe95d170796fd Mon Sep 17 00:00:00 2001
From: wawltor <fangzeyang0904@hotmail.com>
Date: Wed, 16 Sep 2020 10:40:25 +0800
Subject: [PATCH 089/261] add the error message check for the some operator

add the error message check for the some operator
---
 paddle/fluid/operators/arg_min_max_op_base.h | 22 ++++++++++++--------
 paddle/fluid/operators/assign_op.h           |  5 ++++-
 paddle/fluid/operators/isfinite_op.cc        |  6 +++++-
 paddle/fluid/operators/isfinite_op.h         |  6 +++++-
 paddle/fluid/operators/linspace_op.cc        |  2 --
 paddle/fluid/operators/linspace_op.cu        |  5 ++++-
 paddle/fluid/operators/linspace_op.h         |  5 ++++-
 paddle/fluid/operators/scale_op.h            |  5 ++++-
 8 files changed, 39 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h
index c296ddcfbef..57e1c06f73c 100644
--- a/paddle/fluid/operators/arg_min_max_op_base.h
+++ b/paddle/fluid/operators/arg_min_max_op_base.h
@@ -110,10 +110,12 @@ struct VisitDataArgMinMaxFunctor {
         CALL_ARG_MINMAX_FUNCTOR(6);
         break;
       default:
-        PADDLE_THROW(
-            "%s operator doesn't supports tensors whose ranks are greater "
-            "than 6.",
-            (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax"));
+        PADDLE_ENFORCE_LE(
+            x_dims.size(), 6,
+            platform::errors::InvalidArgument(
+                "%s operator doesn't supports tensors whose ranks are greater "
+                "than 6.",
+                (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax")));
         break;
 #undef CALL_ARG_MINMAX_FUNCTOR
     }
@@ -164,7 +166,8 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_LT(
         axis, x_dims.size(),
         platform::errors::InvalidArgument(
-            "'axis'(%d) must be less than Rank(X)(%d).", axis, x_dims.size()));
+            "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", axis,
+            x_dims.size()));
 
     const int& dtype = ctx->Attrs().Get<int>("dtype");
     PADDLE_ENFORCE_EQ(
@@ -192,10 +195,11 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
         }
         PADDLE_ENFORCE_LE(
             all_element_num, INT_MAX,
-            "The element num of the argmin/argmax input at axis is "
-            "%d, is larger than int32 maximum value:%d, you must "
-            "set the dtype of argmin/argmax to 'int64'.",
-            all_element_num, INT_MAX);
+            platform::errors::InvalidArgument(
+                "The element num of the argmin/argmax input at axis is "
+                "%d, is larger than int32 maximum value:%d, you must "
+                "set the dtype of argmin/argmax to 'int64'.",
+                all_element_num, INT_MAX));
       }
     }
     std::vector<int64_t> vec;
diff --git a/paddle/fluid/operators/assign_op.h b/paddle/fluid/operators/assign_op.h
index 6ce04d19fc4..c2154f78bbe 100644
--- a/paddle/fluid/operators/assign_op.h
+++ b/paddle/fluid/operators/assign_op.h
@@ -52,7 +52,10 @@ class AssignFunctor {
 
   template <typename T>
   void operator()(const T &v) const {
-    PADDLE_THROW("Not support type for assign op %s", typeid(T).name());
+    PADDLE_ENFORCE_EQ(
+        true, false,
+        platform::errors::PermissionDenied(
+            "Not support type for assign op with type %s", typeid(T).name()));
   }
 
  private:
diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc
index af737ec42f6..9b92ce3e538 100644
--- a/paddle/fluid/operators/isfinite_op.cc
+++ b/paddle/fluid/operators/isfinite_op.cc
@@ -43,7 +43,11 @@ class OverflowOp : public framework::OperatorWithKernel {
     } else if (x_var->IsType<framework::SelectedRows>()) {
       dtype = x_var->Get<framework::SelectedRows>().value().type();
     } else {
-      PADDLE_THROW("Cannot find the input data type by all input data");
+      PADDLE_ENFORCE_EQ(
+          true, false,
+          platform::errors::InvalidArgument(
+              "The input type mismatch, the type of Input(X) must be Tensor or "
+              "SelectedRows, please check your input."));
     }
     return framework::OpKernelType(framework::proto::VarType::Type(dtype),
                                    ctx.GetPlace());
diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h
index 83b08085636..2fc0d58669b 100644
--- a/paddle/fluid/operators/isfinite_op.h
+++ b/paddle/fluid/operators/isfinite_op.h
@@ -57,7 +57,11 @@ class OverflowKernel : public framework::OpKernel<T> {
       auto& in = ctx.Input<framework::SelectedRows>("X")->value();
       functor(in, out);
     } else {
-      PADDLE_THROW("Unsupported input type.");
+      PADDLE_ENFORCE_EQ(
+          true, false,
+          platform::errors::InvalidArgument(
+              "The input type mismatch, the type of Input(X) must be Tensor or "
+              "SelectedRows, please check your input."));
     }
   }
 };
diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc
index 2c3172d2a11..667c6e89295 100644
--- a/paddle/fluid/operators/linspace_op.cc
+++ b/paddle/fluid/operators/linspace_op.cc
@@ -22,8 +22,6 @@ class LinspaceOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Start"),
-                   "Input(Start) of LinspaceOp should not be null.");
     OP_INOUT_CHECK(ctx->HasInput("Start"), "Input", "Start", "linspace");
     OP_INOUT_CHECK(ctx->HasInput("Stop"), "Input", "Stop", "linspace");
     OP_INOUT_CHECK(ctx->HasInput("Num"), "Input", "Num", "linspace");
diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu
index 793253b6b88..c51e8785263 100644
--- a/paddle/fluid/operators/linspace_op.cu
+++ b/paddle/fluid/operators/linspace_op.cu
@@ -63,7 +63,10 @@ class CUDALinspaceKernel : public framework::OpKernel<T> {
     framework::TensorCopy(*num_t, platform::CPUPlace(), &n);
     int32_t num = n.data<int32_t>()[0];
 
-    PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0.");
+    PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument(
+                                  "The num of linspace op should be larger "
+                                  "than 0, but received num is %d",
+                                  num));
 
     out->Resize(framework::make_ddim({num}));
     T* out_data = out->mutable_data<T>(context.GetPlace());
diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h
index 898f611f864..2c30a66ef8e 100644
--- a/paddle/fluid/operators/linspace_op.h
+++ b/paddle/fluid/operators/linspace_op.h
@@ -46,7 +46,10 @@ class CPULinspaceKernel : public framework::OpKernel<T> {
 
     T start = start_t.data<T>()[0];
     T stop = stop_t.data<T>()[0];
-    PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0.");
+    PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument(
+                                  "The num of linspace op should be larger "
+                                  "than 0, but received num is %d",
+                                  num));
 
     out->Resize(framework::make_ddim({num}));
 
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
index 64ee868fb6d..11c81d23b2e 100644
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
@@ -60,7 +60,10 @@ class ScaleKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(in->place());
 
     PADDLE_ENFORCE_EQ(in->dims(), out->dims(),
-                      "in and out should have the same dim");
+                      paddle::platform::errors::InvalidArgument(
+                          "the input and output should have the same dim"
+                          "but input dim is %s, output dim is %s",
+                          in->dims(), out->dims()));
 
     auto eigen_out = framework::EigenVector<T>::Flatten(*out);
     auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-- 
GitLab


From 4e8582fe5adffb2d135c78a86aa54d29b068c0a6 Mon Sep 17 00:00:00 2001
From: wawltor <fangzeyang0904@hotmail.com>
Date: Wed, 16 Sep 2020 10:40:53 +0800
Subject: [PATCH 090/261] update the error message check for the some ops

update the error message check for the some ops
---
 .../test_elementwise_add_op_inplace.cc        | 24 +++++++++++++++----
 .../test_elementwise_op_grad_grad.h           |  9 +++++--
 paddle/fluid/operators/sum_op.cc              | 13 +++++++---
 paddle/fluid/operators/sum_op.cu              | 21 ++++++++++++----
 paddle/fluid/operators/sum_op.h               | 13 +++++++---
 paddle/fluid/operators/uniform_random_op.cc   |  8 ++++---
 paddle/fluid/operators/uniform_random_op.cu   |  8 ++++---
 paddle/fluid/operators/uniform_random_op.h    | 11 +++++++--
 8 files changed, 82 insertions(+), 25 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc
index b8163169734..6ec8f2c2355 100644
--- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc
+++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc
@@ -33,9 +33,12 @@ namespace operators {
 static void Memcpy(void *dst, const void *src, size_t n, bool copy_to_gpu) {
   if (copy_to_gpu) {
 #ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE(cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice));
 #else
-    PADDLE_THROW("Not compiled with cuda");
+    PADDLE_THROW(
+        platform::errors::InvalidArgument("Check your paddle version, current "
+                                          "version is not compiled with cuda"));
 #endif
   } else {
     std::memcpy(dst, src, n);
@@ -88,11 +91,22 @@ bool TestMain(const platform::Place &place, const framework::DDim &dims,
 
   framework::LoDTensor cpu_out;
   auto &out_tensor = scope.FindVar(out_name)->Get<framework::LoDTensor>();
-  PADDLE_ENFORCE(scope.kids().empty());
+  PADDLE_ENFORCE_EQ(scope.kids().empty(), true,
+                    platform::errors::InvalidArgument(
+                        "The scope can not have the child scopes,"
+                        "please check your code."));
   if (inplace) {
-    PADDLE_ENFORCE_EQ(&out_tensor, x);
+    PADDLE_ENFORCE_EQ(
+        &out_tensor, x,
+        platform::errors::InvalidArgument(
+            "The output tensor should be same as input x in inplace mode,"
+            " but now is not same."));
   } else {
-    PADDLE_ENFORCE_EQ(&out_tensor, z);
+    PADDLE_ENFORCE_EQ(
+        &out_tensor, z,
+        platform::errors::InvalidArgument(
+            "The output tensor should be same as output z in normal mode,"
+            " but now is not same."));
   }
 
   if (is_gpu_place) {
diff --git a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h
index 89849ef92cd..54e7c7d1b6a 100644
--- a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h
+++ b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h
@@ -92,7 +92,9 @@ class TestElementwiseOpGradGrad {
         auto dst_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
         memory::Copy(dst_place, dst, src_place, src, bytes, nullptr);
 #else
-        PADDLE_THROW("Not compiled with cuda");
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Check your paddle version, current version is not compiled with "
+            "cuda"));
 #endif
       }
     }
@@ -107,7 +109,10 @@ class TestElementwiseOpGradGrad {
     op->Run(scope_, place_);
     platform::DeviceContextPool::Instance().Get(place_)->Wait();
     framework::LoDTensor cpu_out;
-    PADDLE_ENFORCE_EQ(scope_.kids().empty(), true, "scope has child scopes");
+    PADDLE_ENFORCE_EQ(scope_.kids().empty(), true,
+                      platform::errors::InvalidArgument(
+                          "The scope can not have the child scopes,"
+                          "please check your code."));
 
     // get outputs from scope and compare them with expected_outs
     bool all_equal = true;
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index b06e8202cc7..52c4c63b473 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -186,10 +186,17 @@ class SumOp : public framework::OperatorWithKernel {
           }
         }
       }
-      PADDLE_THROW("Cannot find the input data type by all input data");
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Expected each tensor in Input(x) in sum op has be initialized, but "
+          "some tensor in Input(x) is not be initialized, please check your "
+          "code.",
+          framework::ToTypeName(x_vars[0]->Type())));
     }
-    PADDLE_THROW("Unexpected branch. Input type is %s",
-                 framework::ToTypeName(x_vars[0]->Type()));
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Expected type of Input(X) must be Tensor,  SelectedRows or "
+        "LodTensorArray. But got "
+        "unsupport type: %s.",
+        framework::ToTypeName(x_vars[0]->Type())));
   }
 };
 
diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu
index d0bf3a0abf5..6034cda50c3 100644
--- a/paddle/fluid/operators/sum_op.cu
+++ b/paddle/fluid/operators/sum_op.cu
@@ -169,8 +169,18 @@ void SumToLoDTensor(const framework::ExecutionContext &context) {
       auto row_numel = sr_value.numel() / sr_rows.size();
       auto out_dims = out->dims();
 
-      PADDLE_ENFORCE_EQ(sr.height(), out_dims[0]);
-      PADDLE_ENFORCE_EQ(row_numel, out->numel() / sr.height());
+      PADDLE_ENFORCE_EQ(sr.height(), out_dims[0],
+                        platform::errors::InvalidArgument(
+                            "The table height of input must be same as output, "
+                            "but received input height is %d"
+                            ", output height is %d",
+                            sr.height(), out_dims[0]));
+      PADDLE_ENFORCE_EQ(row_numel, out->numel() / sr.height(),
+                        platform::errors::InvalidArgument(
+                            "The table width of input must be same as output, "
+                            "but received input width is %d"
+                            ", output width is %d",
+                            row_numel, out->numel() / sr.height()));
 
       auto *sr_data = sr_value.data<T>();
       auto *sr_out_data = out->data<T>();
@@ -231,8 +241,11 @@ class SumKernel<platform::CUDADeviceContext, T>
     } else if (out_var->IsType<framework::LoDTensorArray>()) {
       LodTensorArrayCompute<platform::CUDADeviceContext, T>(context);
     } else {
-      PADDLE_THROW("Unexpected branch, output variable type is %s",
-                   framework::ToTypeName(out_var->Type()));
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Expected type of Ouput(out) must be Tensor,  SelectedRows or "
+          "LodTensorArray. But got "
+          "unsupport type: %s.",
+          framework::ToTypeName(out_var->Type())));
     }
   }
 };
diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h
index 6847a813779..4c8f7be6ea2 100644
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -182,7 +182,11 @@ class SumKernel : public framework::OpKernel<T> {
           auto &in_t = in_vars[i]->Get<framework::SelectedRows>();
           functor(context.template device_context<DeviceContext>(), in_t, out);
         } else {
-          PADDLE_THROW("Variable type must be LoDTensor/SelectedRows.");
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "Expected type of Input(X) of %d-th must be Tensor, "
+              "SelectedRows. But got "
+              "unsupport type: %s.",
+              framework::ToTypeName(in_vars[i]->Type())));
         }
       }
     } else if (out_var->IsType<framework::SelectedRows>()) {
@@ -190,8 +194,11 @@ class SumKernel : public framework::OpKernel<T> {
     } else if (out_var->IsType<framework::LoDTensorArray>()) {
       LodTensorArrayCompute<DeviceContext, T>(context);
     } else {
-      PADDLE_THROW("Unexpected branch, output variable type is %s",
-                   framework::ToTypeName(out_var->Type()));
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Expected type of Output(out) must be Tensor, SelectedRows, "
+          "LoDTensorArray. But got "
+          "unsupport type: %s.",
+          framework::ToTypeName(out_var->Type())));
     }
   }
 };
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index 9cffe09a33a..6efada4343c 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -54,9 +54,11 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
       tensor = out_var->GetMutable<framework::LoDTensor>();
       if (!new_shape.empty()) tensor->Resize(framework::make_ddim(new_shape));
     } else {
-      PADDLE_THROW(
-          "uniform_random_op's output only"
-          "supports SelectedRows and LoDTensor");
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Expected type of Output(out) in uniform_random_op must be Tensor, "
+          "SelectedRows. But got "
+          "unsupport type: %s.",
+          framework::ToTypeName(out_var->Type())));
     }
     T *data = tensor->mutable_data<T>(ctx.GetPlace());
 
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index 6237137cccb..563a6c165b7 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -116,9 +116,11 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
       tensor = out_var->GetMutable<framework::LoDTensor>();
       if (!new_shape.empty()) tensor->Resize(framework::make_ddim(new_shape));
     } else {
-      PADDLE_THROW(
-          "uniform_random_op's output only"
-          "supports SelectedRows and LoDTensor");
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Expected type of Output(out) in uniform_random_op must be Tensor, "
+          "SelectedRows. But got "
+          "unsupport type: %s.",
+          framework::ToTypeName(out_var->Type())));
     }
     T* data = tensor->mutable_data<T>(context.GetPlace());
     unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h
index d263dd03dd0..6052e533643 100644
--- a/paddle/fluid/operators/uniform_random_op.h
+++ b/paddle/fluid/operators/uniform_random_op.h
@@ -50,7 +50,10 @@ inline std::vector<int64_t> GetNewDataFromShapeTensor(
     }
     return vec_new_data;
   } else {
-    PADDLE_THROW("The dtype of shape tensor must be int32 or int64.");
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Expected dtype of ShapeTensor must be int32, int64. But got "
+        "unsupport dtype: %s.",
+        paddle::framework::DataTypeToString(new_data_tensor->type())));
   }
 }
 
@@ -84,7 +87,11 @@ inline std::vector<int64_t> GetNewDataFromShapeTensorList(
         vec_new_shape.push_back(*tensor->data<int64_t>());
       }
     } else {
-      PADDLE_THROW("The dtype of shape tensor must be int32 or int64.");
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Expected dtype of ShapeTensorList of %d-th must be int32, int64. "
+          "But got "
+          "unsupport dtype: %s.",
+          i, paddle::framework::DataTypeToString(tensor->type())));
     }
   }
 
-- 
GitLab


From ef6dd6b8e60871bb766e8c0744e143e4a680aa87 Mon Sep 17 00:00:00 2001
From: Zhen Wang <wangzhen31@baidu.com>
Date: Wed, 16 Sep 2020 10:54:02 +0800
Subject: [PATCH 091/261] fix the test_fleet_lars_meta_optimizer ut. (#27291)

---
 .../fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py     | 2 +-
 .../fluid/tests/unittests/test_fleet_lars_meta_optimizer.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
index ff305fb9523..ec055178d90 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
@@ -141,7 +141,7 @@ class TestFleetLambMetaOptimizer(unittest.TestCase):
         ops = [op.type for op in avg_cost.block.ops]
         self.assertIn('lamb', ops)
         self.assertIn('cast', ops)
-        self.assertIn('isfinite', ops)
+        self.assertIn('check_finite_and_unscale', ops)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
index 34ab423e064..0a70710b459 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
@@ -145,7 +145,7 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase):
         ops = [op.type for op in avg_cost.block.ops]
         self.assertIn('lars_momentum', ops)
         self.assertIn('cast', ops)
-        self.assertIn('isfinite', ops)
+        self.assertIn('check_finite_and_unscale', ops)
 
 
 if __name__ == "__main__":
-- 
GitLab


From 8fe1c2d1c278ed9b8956e1d886de5c7287c9231e Mon Sep 17 00:00:00 2001
From: YUNSHEN XIE <1084314248@qq.com>
Date: Wed, 16 Sep 2020 11:09:32 +0800
Subject: [PATCH 092/261] move three ut to execute only at night (#27314)

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index d50356e0e95..fa092ffb191 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -400,17 +400,17 @@ py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${G
 py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS})
 py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS
     FLAGS_cudnn_deterministic=1 SERIAL)
-set_tests_properties(test_imperative_resnet PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+set_tests_properties(test_imperative_resnet PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
 py_test_modules(test_imperative_resnet_sorted_gradient MODULES test_imperative_resnet_sorted_gradient ENVS
         FLAGS_cudnn_deterministic=1 SERIAL)
-set_tests_properties(test_imperative_resnet_sorted_gradient PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+set_tests_properties(test_imperative_resnet_sorted_gradient PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
 py_test_modules(test_imperative_mnist MODULES test_imperative_mnist ENVS
     FLAGS_cudnn_deterministic=1)
 py_test_modules(test_imperative_mnist_sorted_gradient MODULES test_imperative_mnist_sorted_gradient ENVS
         FLAGS_cudnn_deterministic=1)
 py_test_modules(test_imperative_se_resnext MODULES test_imperative_se_resnext ENVS
     FLAGS_cudnn_deterministic=1 SERIAL)
-set_tests_properties(test_imperative_se_resnext PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+set_tests_properties(test_imperative_se_resnext PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
 py_test_modules(test_imperative_ocr_attention_model MODULES test_imperative_ocr_attention_model ENVS
         FLAGS_cudnn_deterministic=1 SERIAL)
 py_test_modules(test_install_check MODULES test_install_check ENVS
-- 
GitLab


From 18fc92756274e537eb1edf548f16ae1af72893be Mon Sep 17 00:00:00 2001
From: littletomatodonkey <2120160898@bit.edu.cn>
Date: Wed, 16 Sep 2020 13:02:31 +0800
Subject: [PATCH 093/261] add regularizer api (#27292)

---
 python/paddle/__init__.py                     |   1 +
 .../tests/unittests/test_regularizer_api.py   | 204 ++++++++++++++++++
 python/paddle/regularizer.py                  | 136 +++++++++++-
 python/paddle/utils/__init__.py               |   7 +-
 4 files changed, 340 insertions(+), 8 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_regularizer_api.py

diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index ed0b415d0bf..016726633ea 100755
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -49,6 +49,7 @@ import paddle.optimizer
 import paddle.metric
 import paddle.device
 import paddle.incubate.complex as complex
+import paddle.regularizer
 
 # TODO: define alias in tensor and framework directory
 
diff --git a/python/paddle/fluid/tests/unittests/test_regularizer_api.py b/python/paddle/fluid/tests/unittests/test_regularizer_api.py
new file mode 100644
index 00000000000..76186d2e39f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_regularizer_api.py
@@ -0,0 +1,204 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+from functools import partial
+import contextlib
+import numpy as np
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+import paddle.fluid.framework as framework
+import paddle.fluid.optimizer as optimizer
+import paddle.regularizer as regularizer
+from paddle.fluid.backward import append_backward
+
+
+def bow_net(data,
+            label,
+            dict_dim,
+            is_sparse=False,
+            emb_dim=8,
+            hid_dim=8,
+            hid_dim2=6,
+            class_dim=2):
+    """
+    BOW net
+    This model is from https://github.com/PaddlePaddle/models:
+    fluid/PaddleNLP/text_classification/nets.py
+    """
+    emb = fluid.layers.embedding(
+        input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim])
+    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
+    bow_tanh = fluid.layers.tanh(bow)
+    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
+    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
+    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    return avg_cost
+
+
+class TestRegularizer(unittest.TestCase):
+    def setUp(self):
+        self.word_dict = paddle.dataset.imdb.word_dict()
+        reader = paddle.batch(
+            paddle.dataset.imdb.train(self.word_dict), batch_size=1)()
+        self.train_data = [next(reader) for _ in range(1)]
+
+    def get_places(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        return places
+
+    @contextlib.contextmanager
+    def scope_prog_guard(self, main_prog, startup_prog):
+        scope = fluid.core.Scope()
+        with fluid.unique_name.guard():
+            with fluid.scope_guard(scope):
+                with fluid.program_guard(main_prog, startup_prog):
+                    yield
+
+    def run_program(self, place, feed_list):
+        exe = fluid.Executor(place)
+        feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
+        exe.run(fluid.default_startup_program())
+
+        main_prog = fluid.default_main_program()
+        param_list = [var.name for var in main_prog.block(0).all_parameters()]
+
+        param_sum = []
+        for data in self.train_data:
+            out = exe.run(main_prog,
+                          feed=feeder.feed(data),
+                          fetch_list=param_list)
+            p_sum = 0
+            for v in out:
+                p_sum += np.sum(np.abs(v))
+            param_sum.append(p_sum)
+        return param_sum
+
+    def check_l2decay_regularizer(self, place, model):
+        paddle.manual_seed(1)
+        paddle.framework.random._manual_program_seed(1)
+        main_prog = fluid.framework.Program()
+        startup_prog = fluid.framework.Program()
+        with self.scope_prog_guard(
+                main_prog=main_prog, startup_prog=startup_prog):
+            data = fluid.layers.data(
+                name="words", shape=[1], dtype="int64", lod_level=1)
+            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+            avg_cost = model(data, label, len(self.word_dict))
+
+            optimizer = fluid.optimizer.Adagrad(
+                learning_rate=0.1,
+                regularization=paddle.regularizer.L2Decay(1.0))
+            optimizer.minimize(avg_cost)
+            param_sum = self.run_program(place, [data, label])
+        return param_sum
+
+    def check_l2decay(self, place, model):
+        paddle.manual_seed(1)
+        paddle.framework.random._manual_program_seed(1)
+        main_prog = fluid.framework.Program()
+        startup_prog = fluid.framework.Program()
+
+        with self.scope_prog_guard(
+                main_prog=main_prog, startup_prog=startup_prog):
+            data = fluid.layers.data(
+                name="words", shape=[1], dtype="int64", lod_level=1)
+            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+            avg_cost_l2 = model(data, label, len(self.word_dict))
+
+            param_list = fluid.default_main_program().block(0).all_parameters()
+            para_sum = []
+            for para in param_list:
+                para_mul = fluid.layers.square(x=para)
+                para_sum.append(fluid.layers.reduce_sum(input=para_mul))
+            avg_cost_l2 += fluid.layers.sums(para_sum) * .5
+
+            optimizer = fluid.optimizer.Adagrad(learning_rate=0.1)
+            optimizer.minimize(avg_cost_l2)
+            param_sum = self.run_program(place, [data, label])
+        return param_sum
+
+    def test_l2(self):
+        for place in self.get_places():
+            dense_sparse_p_sum = []
+            for sparse in [True, False]:
+                model = partial(bow_net, is_sparse=sparse)
+                framework_l2 = self.check_l2decay_regularizer(place, model)
+                l2 = self.check_l2decay(place, model)
+                assert len(l2) == len(framework_l2)
+                for i in range(len(l2)):
+                    assert np.isclose(a=framework_l2[i], b=l2[i], rtol=5e-5)
+                dense_sparse_p_sum.append(framework_l2)
+
+            assert len(dense_sparse_p_sum[0]) == len(dense_sparse_p_sum[1])
+            for i in range(len(dense_sparse_p_sum[0])):
+                assert np.isclose(
+                    a=dense_sparse_p_sum[0][i],
+                    b=dense_sparse_p_sum[1][i],
+                    rtol=5e-5)
+
+    def test_repeated_regularization(self):
+        l1 = paddle.regularizer.L1Decay(0.1)
+        l2 = paddle.regularizer.L2Decay(0.01)
+        fc_param_attr = fluid.ParamAttr(regularizer=l1)
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.layers.uniform_random([2, 2, 3])
+            out = fluid.layers.fc(x, 5, param_attr=fc_param_attr)
+            loss = fluid.layers.reduce_sum(out)
+            sgd = fluid.optimizer.SGD(learning_rate=0.1, regularization=l2)
+            sgd.minimize(loss)
+        with fluid.dygraph.guard():
+            input = fluid.dygraph.to_variable(
+                np.random.randn(3, 2).astype('float32'))
+            paddle.manual_seed(1)
+            paddle.framework.random._manual_program_seed(1)
+
+            linear1 = fluid.dygraph.Linear(
+                2, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr)
+            linear2 = fluid.dygraph.Linear(
+                2, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr)
+
+            loss1 = linear1(input)
+            loss1.backward()
+            # set l2 regularizer in optimizer, but l1 in fluid.ParamAttr
+
+            fluid.optimizer.SGD(parameter_list=linear1.parameters(),
+                                learning_rate=1e-2,
+                                regularization=l2).minimize(loss1)
+            # only set l1 in fluid.ParamAttr
+            loss2 = linear2(input)
+            loss2.backward()
+            fluid.optimizer.SGD(parameter_list=linear2.parameters(),
+                                learning_rate=1e-2).minimize(loss2)
+            # they should both be applied by l1, and keep the same
+            self.assertTrue(
+                np.allclose(linear1.weight.numpy(), linear2.weight.numpy()),
+                "weight should use the regularization in fluid.ParamAttr!")
+            self.assertTrue(
+                np.allclose(linear1.bias.numpy(), linear2.bias.numpy()),
+                "bias should use the regularization in fluid.ParamAttr!")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/regularizer.py b/python/paddle/regularizer.py
index 2b20bb41970..b3f483fd891 100644
--- a/python/paddle/regularizer.py
+++ b/python/paddle/regularizer.py
@@ -12,8 +12,134 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define the regularizer functions 
-# __all__ = ['L1Decay',
-#            'L1DecayRegularizer',
-#            'L2Decay',
-#            'L2DecayRegularizer']
+__all__ = ['L1Decay', 'L2Decay']
+
+import paddle.fluid as fluid
+
+
+class L1Decay(fluid.regularizer.L1Decay):
+    """
+    Implement the L1 Weight Decay Regularization, which encourages the weights to be sparse.
+    
+    It can be set in :ref:`api_fluid_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). 
+    When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in 
+    ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has 
+    higher priority than ``optimizer`` , which means that for a trainable parameter, if regularizer is defined 
+    in its ParamAttr, then the regularizer in Optimizer will be ignored. Otherwise the  regularizer
+    in Optimizer will be used.
+    
+    In the implementation, the formula of L1 Weight Decay Regularization is as follows:
+	
+    .. math::
+
+        L1WeightDecay = reg\_coeff * sign(parameter)
+
+    Args:
+        coeff(float, optional): regularization coeff. Default:0.0.
+	
+    Examples:
+        .. code-block:: python
+
+            # Example1: set Regularizer in optimizer
+            import paddle
+            from paddle.regularizer import L1Decay
+            import numpy as np
+            paddle.disable_static()
+            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            inp = paddle.to_tensor(inp)
+            out = linear(inp)
+            loss = paddle.mean(out)
+            beta1 = paddle.to_tensor([0.9], dtype="float32")
+            beta2 = paddle.to_tensor([0.99], dtype="float32")
+            momentum = paddle.optimizer.Momentum(
+                learning_rate=0.1,
+                parameters=linear.parameters(),
+                weight_decay=L1Decay(0.0001))
+            back = out.backward()
+            momentum.step()
+            momentum.clear_grad()
+
+            # Example2: set Regularizer in parameters
+            # Set L1 regularization in parameters.
+            # Global regularizer does not take effect on my_conv2d for this case.
+            from paddle.nn import Conv2d
+            from paddle import ParamAttr
+            from paddle.regularizer import L2Decay
+
+            my_conv2d = Conv2d(
+                    in_channels=10,
+                    out_channels=10,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                    weight_attr=ParamAttr(regularizer=L2Decay(coeff=0.01)),
+                    bias_attr=False)
+    """
+
+    def __init__(self, coeff=0.0):
+        super(L1Decay, self).__init__(coeff)
+
+
+class L2Decay(fluid.regularizer.L2Decay):
+    """
+    Implement the L2 Weight Decay Regularization, which helps to prevent the model over-fitting.
+    
+    It can be set in :ref:`api_fluid_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). 
+    When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in 
+    ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has 
+    higher priority than ``optimizer`` , which means that for a trainable parameter, if regularizer is defined 
+    in its ParamAttr, then the regularizer in Optimizer will be ignored. Otherwise the  regularizer
+    in Optimizer will be used.
+    
+    In the implementation, the formula of L2 Weight Decay Regularization is as follows:
+
+    .. math::
+
+        L2WeightDecay = reg\_coeff * parameter
+
+    Args:
+        regularization_coeff(float, optional): regularization coeff. Default:0.0
+	
+    Examples:
+        .. code-block:: python
+
+            # Example1: set Regularizer in optimizer
+            import paddle
+            from paddle.regularizer import L2Decay
+            import numpy as np
+            paddle.disable_static()
+            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            inp = paddle.to_tensor(inp)
+            out = linear(inp)
+            loss = paddle.mean(out)
+            beta1 = paddle.to_tensor([0.9], dtype="float32")
+            beta2 = paddle.to_tensor([0.99], dtype="float32")
+            momentum = paddle.optimizer.Momentum(
+                learning_rate=0.1,
+                parameters=linear.parameters(),
+                weight_decay=L2Decay(0.0001))
+            back = out.backward()
+            momentum.step()
+            momentum.clear_grad()
+
+            # Example2: set Regularizer in parameters
+            # Set L2 regularization in parameters.
+            # Global regularizer does not take effect on my_conv2d for this case.
+            from paddle.nn import Conv2d
+            from paddle import ParamAttr
+            from paddle.regularizer import L2Decay
+
+            my_conv2d = Conv2d(
+                    in_channels=10,
+                    out_channels=10,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                    weight_attr=ParamAttr(regularizer=L2Decay(coeff=0.01)),
+                    bias_attr=False)
+    """
+
+    def __init__(self, coeff=0.0):
+        super(L2Decay, self).__init__(coeff)
diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py
index 2a649c776b4..4a786679727 100644
--- a/python/paddle/utils/__init__.py
+++ b/python/paddle/utils/__init__.py
@@ -16,12 +16,13 @@ from .profiler import ProfilerOptions
 from .profiler import Profiler
 from .profiler import get_profiler
 from .deprecated import deprecated
+from ..fluid.framework import unique_name
+from ..fluid.framework import load_op_library
+from ..fluid.framework import require_version
 
 from . import download
 
 __all__ = ['dump_config', 'deprecated', 'download']
 
 #TODO: define new api under this directory
-# __all__ = ['unique_name',
-#            'load_op_library',
-#            'require_version']
+__all__ += ['unique_name', 'load_op_library', 'require_version']
-- 
GitLab


From 950301bfa1426016eba3ac110c51e1d1d4453d44 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 16 Sep 2020 13:17:56 +0800
Subject: [PATCH 094/261] Add input_spec & output_spec for TranslatedLayer
 (#27284)

* add input_spec & output_spec for translated_layer

* update error message
---
 python/paddle/fluid/dygraph/io.py             | 69 +++++++++++++++----
 .../tests/unittests/test_translated_layer.py  | 33 ++++++++-
 2 files changed, 86 insertions(+), 16 deletions(-)

diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py
index 335ac500c89..4391843b0ef 100644
--- a/python/paddle/fluid/dygraph/io.py
+++ b/python/paddle/fluid/dygraph/io.py
@@ -19,6 +19,7 @@ import six
 import pickle
 import numpy as np
 
+import paddle
 from paddle import compat as cpt
 from paddle.fluid import core
 from paddle.fluid import framework
@@ -182,9 +183,9 @@ class _ProgramHolder(object):
         super(_ProgramHolder, self).__init__()
 
         # input, output, persistable var info
-        self._input_names = []
-        self._persistable_names = []
+        self._input_descs = []
         self._output_descs = []
+        self._persistable_names = []
 
         # execution scope
         self._inner_scope = core.Scope()
@@ -207,11 +208,11 @@ class _ProgramHolder(object):
         return self._train_program_desc
 
     @property
-    def input_names(self):
-        return self._input_names
+    def input_descs(self):
+        return self._input_descs
 
     @property
-    def output_decs(self):
+    def output_descs(self):
         return self._output_descs
 
     @property
@@ -233,7 +234,8 @@ class _ProgramHolder(object):
                 ops_to_remove.append(i)
                 feed_var_name = cpt.to_bytes(op.input('X')[0])
                 root_block._remove_var(feed_var_name)
-                self._input_names.append(cpt.to_bytes(op.output('Out')[0]))
+                self._input_descs.append(
+                    root_block.find_var(cpt.to_bytes(op.output('Out')[0])))
             elif op.type() == 'scale' and op.output('Out')[0].startswith(
                     'save_infer_model/scale_'):
                 ops_to_remove.append(i)
@@ -257,7 +259,7 @@ class _ProgramHolder(object):
             root_block._remove_op(op_idx, op_idx + 1)
 
         # 2. Input processing, reverse feed vars
-        self._input_names.reverse()
+        self._input_descs.reverse()
 
         # 3. Output processing, add scale for outputs
         tmp_program = _build_program_by_desc(program_desc)
@@ -738,7 +740,7 @@ class TranslatedLayer(layers.Layer):
                 if isinstance(value, np.ndarray):
                     var = core.VarBase(
                         value=value,
-                        name=program_holder.input_names[i],
+                        name=program_holder.input_descs[i].name(),
                         persistable=False,
                         place=framework._current_expected_place(),
                         zero_copy=True)
@@ -746,7 +748,7 @@ class TranslatedLayer(layers.Layer):
                     var = value
                     # NOTE: we changed var name here, 
                     # but it may be an important name set by user
-                    var.name = program_holder.input_names[i]
+                    var.name = program_holder.input_descs[i].name()
                 input_vars.append(var)
 
             persistable_vars = []
@@ -762,7 +764,7 @@ class TranslatedLayer(layers.Layer):
                         % var_name)
 
             output_vars = []
-            for var_desc in program_holder.output_decs:
+            for var_desc in program_holder.output_descs:
                 var = core.VarBase(var_desc.dtype(),
                                    var_desc.shape(),
                                    var_desc.name(), var_desc.type(), False)
@@ -913,11 +915,7 @@ class TranslatedLayer(layers.Layer):
                 program = translated_layer.program()
         """
         # 1. get program holder
-        program_holder = self._program_holder_dict.get(method_name, None)
-        if program_holder is None:
-            raise ValueError(
-                "The method `%s` is not exists in loaded TranslatedLayer." %
-                method_name)
+        program_holder = self._get_program_holder(method_name)
 
         # 2. get inference program desc
         program_desc = program_holder.infer_program
@@ -925,3 +923,44 @@ class TranslatedLayer(layers.Layer):
         # 3. construct program
         program = _build_program_by_desc(program_desc)
         return program
+
+    def _get_program_holder(self, method_name='forward'):
+        program_holder = self._program_holder_dict.get(method_name, None)
+        if program_holder is None:
+            raise ValueError(
+                "The method `%s` does not exist in loaded TranslatedLayer." %
+                method_name)
+        return program_holder
+
+    def _input_spec(self, method_name='forward'):
+        # 1. get program holder
+        program_holder = self._get_program_holder(method_name)
+
+        # 2. build input spec by input desc
+        input_spec = []
+        for var_desc in program_holder.input_descs:
+            spec = paddle.static.InputSpec(
+                shape=var_desc.shape(),
+                dtype=var_desc.dtype(),
+                name=var_desc.name())
+            input_spec.append(spec)
+
+        return input_spec
+
+    def _output_spec(self, method_name='forward'):
+        # 1. get program holder
+        program_holder = self._get_program_holder(method_name)
+
+        # 2. build output spec by output desc
+        output_spec = []
+        for var_desc in program_holder.output_descs:
+            # NOTE(chenweihang): InputSpec describes a tensor, not just input. 
+            # Maybe the name is not good enough. Here we use InputSpec to 
+            # construct the description of Output tensor
+            spec = paddle.static.InputSpec(
+                shape=var_desc.shape(),
+                dtype=var_desc.dtype(),
+                name=var_desc.name())
+            output_spec.append(spec)
+
+        return output_spec
diff --git a/python/paddle/fluid/tests/unittests/test_translated_layer.py b/python/paddle/fluid/tests/unittests/test_translated_layer.py
index 20c51b9afba..e5dc279750d 100644
--- a/python/paddle/fluid/tests/unittests/test_translated_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_translated_layer.py
@@ -49,7 +49,10 @@ class LinearNet(nn.Layer):
         super(LinearNet, self).__init__()
         self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)
 
-    @paddle.jit.to_static
+    @paddle.jit.to_static(input_spec=[
+        paddle.static.InputSpec(
+            shape=[None, IMAGE_SIZE], dtype='float32', name='x')
+    ])
     def forward(self, x):
         return self._linear(x)
 
@@ -152,6 +155,34 @@ class TestTranslatedLayer(unittest.TestCase):
         with self.assertRaises(ValueError):
             program = translated_layer.program('not_exists')
 
+    def test_get_input_spec(self):
+        # load
+        translated_layer = paddle.jit.load(self.model_path)
+
+        expect_spec = [
+            paddle.static.InputSpec(
+                shape=[None, IMAGE_SIZE], dtype='float32', name='x')
+        ]
+        actual_spec = translated_layer._input_spec()
+
+        for spec_x, spec_y in zip(expect_spec, actual_spec):
+            self.assertEqual(spec_x, spec_y)
+
+    def test_get_output_spec(self):
+        # load
+        translated_layer = paddle.jit.load(self.model_path)
+
+        expect_spec = [
+            paddle.static.InputSpec(
+                shape=[None, CLASS_NUM],
+                dtype='float32',
+                name='translated_layer/scale_0.tmp_1')
+        ]
+        actual_spec = translated_layer._output_spec()
+
+        for spec_x, spec_y in zip(expect_spec, actual_spec):
+            self.assertEqual(spec_x, spec_y)
+
 
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab


From 4f9d6529feb259c148ab2b2bf80643a211fb62ae Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 16 Sep 2020 13:19:53 +0800
Subject: [PATCH 095/261] Polish framework error message part 7 (#27266)

* polish framework error message part 7

* fix typo

* polish by reviewes comment
---
 paddle/fluid/framework/reader.cc          |   5 +-
 paddle/fluid/framework/rw_lock.h          |  12 ++-
 paddle/fluid/framework/save_load_util.cc  | 120 ++++++++++++----------
 paddle/fluid/framework/selected_rows.cc   |  36 ++++---
 paddle/fluid/framework/selected_rows.h    |   3 +-
 paddle/fluid/framework/shape_inference.cc |  18 ++--
 paddle/fluid/framework/tensor_util.cc     |  66 ++++++++----
 paddle/fluid/framework/tensor_util.h      |   6 +-
 8 files changed, 167 insertions(+), 99 deletions(-)

diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc
index d3513fb7dbe..b418339bf32 100644
--- a/paddle/fluid/framework/reader.cc
+++ b/paddle/fluid/framework/reader.cc
@@ -20,7 +20,10 @@ namespace framework {
 
 void ReaderBase::ReadNext(std::vector<LoDTensor> *out) {
   std::lock_guard<std::mutex> lock(mu_);
-  PADDLE_ENFORCE_EQ(status_, ReaderStatus::kRunning);
+  PADDLE_ENFORCE_EQ(status_, ReaderStatus::kRunning,
+                    platform::errors::Unavailable(
+                        "The current reader has stopped running and cannot "
+                        "continue to read the next batch of data."));
   ReadNextImpl(out);
 }
 
diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h
index f8aa87519a2..9b74a553040 100644
--- a/paddle/fluid/framework/rw_lock.h
+++ b/paddle/fluid/framework/rw_lock.h
@@ -32,17 +32,21 @@ struct RWLock {
   ~RWLock() { pthread_rwlock_destroy(&lock_); }
 
   inline void RDLock() {
-    PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0,
-                      "acquire read lock failed");
+    PADDLE_ENFORCE_EQ(
+        pthread_rwlock_rdlock(&lock_), 0,
+        platform::errors::External("The pthread failed to acquire read lock."));
   }
 
   inline void WRLock() {
     PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0,
-                      "acquire write lock failed");
+                      platform::errors::External(
+                          "The pthread failed to acquire write lock."));
   }
 
   inline void UNLock() {
-    PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
+    PADDLE_ENFORCE_EQ(
+        pthread_rwlock_unlock(&lock_), 0,
+        platform::errors::External("The pthread failed to unlock."));
   }
 
  private:
diff --git a/paddle/fluid/framework/save_load_util.cc b/paddle/fluid/framework/save_load_util.cc
index fbbbfd66b3d..602b431995c 100644
--- a/paddle/fluid/framework/save_load_util.cc
+++ b/paddle/fluid/framework/save_load_util.cc
@@ -33,7 +33,8 @@ void CheckInStreamState(std::istream& istre, size_t length) {
     VLOG(5) << "Can't read [" << length << "] from file"
             << "file seems breakem";
 
-    PADDLE_THROW("Model load error, file seems breaken");
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Model load failed, istream state error."));
   }
 }
 
@@ -58,10 +59,11 @@ size_t ReadTensorNumber(std::istream& istre) {
              sizeof(char) * tensor_number_mark.size());
   std::string str_read_tensor_number_mark(tensor_number_mark_buffer,
                                           tensor_number_mark.size());
-  PADDLE_ENFORCE_EQ(
-      tensor_number_mark, str_read_tensor_number_mark,
-      "Tensor number mark not match, expect [%s], but read from file is [%]",
-      tensor_number_mark, str_read_tensor_number_mark);
+  PADDLE_ENFORCE_EQ(tensor_number_mark, str_read_tensor_number_mark,
+                    platform::errors::InvalidArgument(
+                        "Tensor number mark does not match, expect mark is "
+                        "[%s], but the mark read from file is [%s].",
+                        tensor_number_mark, str_read_tensor_number_mark));
 
   size_t tensor_number = 0;
   istre.read(reinterpret_cast<char*>(&tensor_number), sizeof(tensor_number));
@@ -79,10 +81,11 @@ std::string ReadTensorName(std::istream& istre) {
 
   std::string str_read_tensor_name_mark(name_mark_buffer,
                                         tensor_name_mark.size());
-  PADDLE_ENFORCE_EQ(
-      tensor_name_mark, str_read_tensor_name_mark,
-      "Tensor name mark not match, expect [%s], but read from file is [%]",
-      tensor_name_mark, str_read_tensor_name_mark);
+  PADDLE_ENFORCE_EQ(tensor_name_mark, str_read_tensor_name_mark,
+                    platform::errors::InvalidArgument(
+                        "Tensor name mark does not match, expect mark is [%s], "
+                        "but the mark read from file is [%s].",
+                        tensor_name_mark, str_read_tensor_name_mark));
 
   size_t tensor_name_length = 0;
   istre.read(reinterpret_cast<char*>(&tensor_name_length),
@@ -117,16 +120,18 @@ bool SaveStaticNameListToDisk(
 
   for (size_t i = 0; i < vec_tensor_name_list.size(); ++i) {
     auto var_ptr = scope.FindVar(vec_tensor_name_list[i]);
-    PADDLE_ENFORCE_NE(
-        var_ptr, nullptr,
-        "Variable find error, when save model, can't not find vairable [%s], "
-        "Please make sure you have run StartUpProgram",
-        vec_tensor_name_list[i]);
+    PADDLE_ENFORCE_NOT_NULL(
+        var_ptr, platform::errors::NotFound("Variable (%s) is not found when "
+                                            "saving model, please make sure "
+                                            "that exe.run(startup_program) has "
+                                            "been executed.",
+                                            vec_tensor_name_list[i]));
     Tensor* tensor = var_ptr->GetMutable<LoDTensor>();
     PADDLE_ENFORCE_EQ(tensor->IsInitialized(), true,
-                      "Paramter [%s] not initialzed,"
-                      "Please make sure you have run StartUpProgram",
-                      vec_tensor_name_list[i]);
+                      platform::errors::PreconditionNotMet(
+                          "Paramter [%s] is not initialzed, please make sure "
+                          "that exe.run(startup_program) has been executed.",
+                          vec_tensor_name_list[i]));
 
     map_tensor[vec_tensor_name_list[i]] = tensor;
   }
@@ -145,9 +150,10 @@ bool SaveDygraphVarBaseListToDisk(
     Tensor* tensor = var_ptr->GetMutable<LoDTensor>();
 
     PADDLE_ENFORCE_EQ(tensor->IsInitialized(), true,
-                      "Paramter [%s] not initialzed,"
-                      "Please make sure you have run StartUpProgram",
-                      vec_var_base_list[i]->Name());
+                      platform::errors::PreconditionNotMet(
+                          "Paramter [%s] is not initialzed, please make sure "
+                          "that exe.run(startup_program) has been executed.",
+                          vec_var_base_list[i]->Name()));
 
     map_tensor[vec_var_base_list[i]->Name()] = tensor;
   }
@@ -185,34 +191,41 @@ bool LoadStaticNameListFromDisk(
 
   for (size_t i = 0; i < vec_tensor_name_list.size(); ++i) {
     auto it = map_load_tensor.find(vec_tensor_name_list[i]);
-    PADDLE_ENFORCE(it != map_load_tensor.end(),
-                   "Paramete not found in Model file, "
-                   "Can not find [%s] in model file [%s]",
-                   vec_tensor_name_list[i], file_name);
+    PADDLE_ENFORCE_NE(it, map_load_tensor.end(),
+                      platform::errors::NotFound(
+                          "Parameter (%s) not found in model file (%s).",
+                          vec_tensor_name_list[i], file_name));
     auto var_ptr = scope.FindVar(vec_tensor_name_list[i]);
 
-    PADDLE_ENFORCE_NE(
-        var_ptr, nullptr,
-        "Parameter not created, when load model, can't not find parameter [%s] "
-        "please make sure you have run StartUpProgram",
-        vec_tensor_name_list[i]);
+    PADDLE_ENFORCE_NOT_NULL(
+        var_ptr,
+        platform::errors::PreconditionNotMet(
+            "Parameter (%s) is not created when loading model, "
+            "please make sure that exe.run(startup_program) has been executed.",
+            vec_tensor_name_list[i]));
 
     Tensor* tensor = var_ptr->GetMutable<LoDTensor>();
-    PADDLE_ENFORCE_NE(tensor, nullptr,
-                      "Paramter [%s] not initialzed "
-                      "please make sure you have run startUpProgram",
-                      vec_tensor_name_list[i]);
+    PADDLE_ENFORCE_NOT_NULL(
+        tensor,
+        platform::errors::PreconditionNotMet(
+            "Paramter [%s] is not initialzed, "
+            "please make sure that exe.run(startup_program) has been executed.",
+            vec_tensor_name_list[i]));
 
     PADDLE_ENFORCE_EQ(tensor->IsInitialized(), true,
-                      "Paramter [%s] not initialzed "
-                      "please make sure you have run StartUpProgram",
-                      vec_tensor_name_list[i]);
+                      platform::errors::PreconditionNotMet(
+                          "Paramter [%s] is not initialzed, "
+                          "please make sure that exe.run(startup_program) has "
+                          "been executed.v",
+                          vec_tensor_name_list[i]));
     PADDLE_ENFORCE_EQ(
         tensor->dims(), it->second->dims(),
-        "Shape not matching: the Program requires a parameter with a shape of "
-        "(%s), "
-        "while the loaded parameter (namely [ %s ]) has a shape of  (%s).",
-        tensor->dims(), vec_tensor_name_list[i], it->second->dims());
+        platform::errors::InvalidArgument(
+            "Shape does not match, the program requires a parameter with a "
+            "shape of "
+            "(%s), while the loaded parameter (namely [ %s ]) has a shape of "
+            "(%s).",
+            tensor->dims(), vec_tensor_name_list[i], it->second->dims()));
 
     TensorCopySync(*(it->second.get()), tensor->place(), tensor);
 
@@ -239,9 +252,9 @@ bool SaveTensorToDisk(const std::string& file_name,
   MkDirRecursively(DirName(file_name).c_str());
 
   std::ofstream fout(file_name, std::ios::binary);
-  if (!fout) {
-    PADDLE_THROW("File open error. Can not open file [%s]", file_name);
-  }
+  PADDLE_ENFORCE_EQ(
+      fout.is_open(), true,
+      platform::errors::Unavailable("File (%s) open failed.", file_name));
 
   // first 256 byte for reserve for fulture upgrade
   char* kReserveBuffer = new char[model_file_reserve_size];
@@ -292,9 +305,8 @@ bool SaveTensorToDisk(const std::string& file_name,
       TensorCopySync(*tensor, platform::CPUPlace(), &temp);
       data_ptr = temp.data<void>();
 #else
-      PADDLE_THROW(
-          "Tensor is in CUDA device, but paddle not compile with CUDA, this "
-          "should not happen");
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Tensor is in CUDA device, but paddle not compiled with CUDA."));
 #endif
     }
     fout.write(static_cast<const char*>(data_ptr),
@@ -302,8 +314,9 @@ bool SaveTensorToDisk(const std::string& file_name,
   }
 
   if (!fout) {
-    PADDLE_THROW("Model save failed, data write to model file [%s] error",
-                 file_name);
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Model save failed, error when writing data into model file [%s].",
+        file_name));
   }
 
   fout.close();
@@ -316,9 +329,9 @@ bool LoadTensorFromDisk(
     std::map<std::string, std::shared_ptr<Tensor>>* map_tensor) {
   std::ifstream fin(file_name, std::ios::binary);
 
-  if (!fin) {
-    PADDLE_THROW("File open error. Can not open model file [%s]", file_name);
-  }
+  PADDLE_ENFORCE_EQ(
+      fin.is_open(), true,
+      platform::errors::Unavailable("File (%s) open failed.", file_name));
 
   ReadReserveBuffer(fin);
 
@@ -331,7 +344,8 @@ bool LoadTensorFromDisk(
     uint32_t version;
     fin.read(reinterpret_cast<char*>(&version), sizeof(version));
     CheckInStreamState(fin, sizeof(version));
-    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+    PADDLE_ENFORCE_EQ(version, 0U, platform::errors::InvalidArgument(
+                                       "Only version 0 tensor is supported."));
     proto::VarType::TensorDesc desc;
     {
       // int32_t size
@@ -344,7 +358,7 @@ bool LoadTensorFromDisk(
       CheckInStreamState(fin, sizeof(size));
       PADDLE_ENFORCE_EQ(
           desc.ParseFromArray(buf.get(), size), true,
-          platform::errors::InvalidArgument("Cannot parse tensor desc"));
+          platform::errors::InvalidArgument("Parse tensor desc failed."));
     }
 
     {  // read tensor
diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc
index 54a818250b4..1f402ea9dd3 100644
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
@@ -113,7 +113,9 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows,
     // the 1st field, unit32_t version for SelectedRows
     uint32_t version;
     is.read(reinterpret_cast<char*>(&version), sizeof(version));
-    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+    PADDLE_ENFORCE_EQ(version, 0U,
+                      platform::errors::InvalidArgument(
+                          "Only version 0 SelectedRows is supported."));
   }
   {
     // the 2st field, rows information
@@ -155,24 +157,27 @@ int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown,
   auto iter = id_to_index_.find(key);
   if (iter == id_to_index_.end()) {
     rwlock_->UNLock();
-    if (!auto_grown) {
-      PADDLE_THROW("key %d not found", key);
-    }
+    PADDLE_ENFORCE_EQ(
+        auto_grown, true,
+        platform::errors::NotFound("Input key(%lld) is not found.", key));
     rwlock_->WRLock();
     auto map_size = id_to_index_.size();
     auto vector_size = rows_.size();
     if (map_size != vector_size) {
       rwlock_->UNLock();
-      PADDLE_THROW(
-          "id_to_index_ size %d should have the same size with rows_ %d",
-          map_size, vector_size);
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Row map size(%zu) should be equal to rows size(%zu).", map_size,
+          vector_size));
     }
     auto write_iter = id_to_index_.find(key);
     if (write_iter == id_to_index_.end()) {
       int row_num = rows_.size();
       if (row_num == value_->dims()[0]) {
         rwlock_->UNLock();
-        PADDLE_THROW("selected rows is full, then length exceed %d", row_num);
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Selected rows is full, then length exceed the length of first "
+            "dimension (%d).",
+            row_num));
       }
       // key logic to put a key into id_to_index_
       rows_.push_back(key);
@@ -203,15 +208,20 @@ void SelectedRows::SyncIndex() {
 
 void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
                        bool auto_grown, bool is_test) {
-  PADDLE_ENFORCE(value->IsInitialized(),
-                 "The value tensor should be initialized.");
+  PADDLE_ENFORCE_EQ(value->IsInitialized(), true,
+                    platform::errors::InvalidArgument(
+                        "The value tensor is not initialized."));
   if (ids.numel() == 0) {
     VLOG(3) << "keys is empty, please check data!";
   } else {
     int64_t value_width = value_->numel() / value_->dims()[0];
-    PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
-                      "output tensor should have the same shape with table "
-                      "except the dims[0].");
+    PADDLE_ENFORCE_EQ(
+        value_width, value->numel() / value->dims()[0],
+        platform::errors::InvalidArgument(
+            "Output tensor should have the same shape with table "
+            "except the first dimmension, excepted value width not counting "
+            "the first dimension is %d, actual value width is %d.",
+            value_width, value->numel() / value->dims()[0]));
     for (int i = 0; i < ids.numel(); ++i) {
       auto id = ids.data<int64_t>()[i];
       int64_t index = AutoGrownIndex(id, auto_grown, is_test);
diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h
index 5f733139419..285af1d5530 100644
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
@@ -82,7 +82,8 @@ class SelectedRows {
   int64_t Index(int64_t key) const {
     auto it = std::find(rows_.begin(), rows_.end(), key);
     if (it == rows_.end()) {
-      PADDLE_THROW("id %s not in table", key);
+      PADDLE_THROW(platform::errors::NotFound(
+          "Input id (%lld) is not in current rows table.", key));
     }
     return static_cast<int64_t>(std::distance(rows_.begin(), it));
   }
diff --git a/paddle/fluid/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc
index 4ac872ac3d3..f5bb3f68007 100644
--- a/paddle/fluid/framework/shape_inference.cc
+++ b/paddle/fluid/framework/shape_inference.cc
@@ -25,20 +25,22 @@ namespace framework {
 std::vector<DDim> InferShapeContext::GetReaderDims(
     const std::string &name) const {
   const std::vector<std::string> &arg_names = Inputs(name);
-  PADDLE_ENFORCE_EQ(
-      arg_names.size(), 1UL,
-      "Reader input '%s' should hold one element, but now it holds %d", name,
-      arg_names.size());
+  PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
+                    platform::errors::InvalidArgument(
+                        "Reader input '%s' should hold one element, but now it "
+                        "holds %d elements.",
+                        name, arg_names.size()));
   return this->GetRepeatedDims(arg_names[0]);
 }
 
 void InferShapeContext::SetReaderDims(const std::string &name,
                                       const std::vector<DDim> &dims) {
   const std::vector<std::string> &arg_names = Outputs(name);
-  PADDLE_ENFORCE_EQ(
-      arg_names.size(), 1UL,
-      "Reader output '%s' should hold one element, but now it holds %d", name,
-      arg_names.size());
+  PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
+                    platform::errors::InvalidArgument(
+                        "Reader output '%s' should hold one element, but now "
+                        "it holds %d elements.",
+                        name, arg_names.size()));
   return this->SetRepeatedDims(arg_names[0], dims);
 }
 
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index c3626c5c9e0..0e3d11b9f02 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -94,9 +94,17 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
     auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place);
     auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true);
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx_place), true,
+        platform::errors::PreconditionNotMet(
+            "Context place error, excepted GPUPlace, but actually %s.",
+            ctx_place));
     auto ctx_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx_place);
-    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place,
+                      platform::errors::Unavailable(
+                          "Source place and context place do not match, source "
+                          "place is %s, context place is %s.",
+                          src_gpu_place, ctx_gpu_place));
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
@@ -106,9 +114,17 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place);
     auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
     auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true);
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx_place), true,
+        platform::errors::PreconditionNotMet(
+            "Context place error, excepted GPUPlace, but actually %s.",
+            ctx_place));
     auto ctx_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx_place);
-    PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
+    PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place,
+                      platform::errors::Unavailable(
+                          "Destination place and context place do not match, "
+                          "destination place is %s, context place is %s.",
+                          dst_gpu_place, ctx_gpu_place));
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
@@ -164,7 +180,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
     auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place);
     auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true);
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx_place), true,
+        platform::errors::PreconditionNotMet(
+            "Context place error, excepted GPUPlace, but actually %s.",
+            ctx_place));
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     if (platform::is_same_place(src_place, dst_place)) {
@@ -180,12 +200,14 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
         memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
                      stream);
       } else {
-        PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place.");
+        PADDLE_THROW(platform::errors::Unavailable(
+            "Context place dose not match the source and destination place."));
       }
     }
   }
   else {  // NOLINT
-    PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place);
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Copying from %s to %s is not supported.", src_place, dst_place));
   }
 #endif
 }
@@ -298,7 +320,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
                  nullptr);
   }
   else {  // NOLINT
-    PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place);
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Copy from %s to %s is not supported.", src_place, dst_place));
   }
 #endif
 }
@@ -832,7 +855,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
 void* GetDstPtrByDLDataType(DLDataType type, framework::Tensor* dst,
                             const platform::Place& dst_place) {
   // vector types not currently supported
-  PADDLE_ENFORCE_LE(type.lanes, 1, "vector types not currently supported");
+  PADDLE_ENFORCE_LE(type.lanes, 1,
+                    platform::errors::Unimplemented(
+                        "Vector type is not supported currently."));
 
   switch (type.bits) {
     case 8:
@@ -840,32 +865,37 @@ void* GetDstPtrByDLDataType(DLDataType type, framework::Tensor* dst,
         return static_cast<void*>(dst->mutable_data<int8_t>(dst_place));
       if (type.code == kDLUInt)
         return static_cast<void*>(dst->mutable_data<uint8_t>(dst_place));
-      PADDLE_THROW("There is no this type.code <%d> when type.bits is <%d>.",
-                   type.code, type.bits);
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
+          type.code, type.bits));
     case 16:
       if (type.code == kDLInt)
         return static_cast<void*>(dst->mutable_data<int16_t>(dst_place));
       if (type.code == kDLFloat)
         return static_cast<void*>(
             dst->mutable_data<paddle::platform::float16>(dst_place));
-      PADDLE_THROW("There is no this type.code <%d> when type.bits is <%d>.",
-                   type.code, type.bits);
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
+          type.code, type.bits));
     case 32:
       if (type.code == kDLInt)
         return static_cast<void*>(dst->mutable_data<int32_t>(dst_place));
       if (type.code == kDLFloat)
         return static_cast<void*>(dst->mutable_data<float>(dst_place));
-      PADDLE_THROW("There is no this type.code <%d> when type.bits is <%d>.",
-                   type.code, type.bits);
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
+          type.code, type.bits));
     case 64:
       if (type.code == kDLInt)
         return static_cast<void*>(dst->mutable_data<int64_t>(dst_place));
       if (type.code == kDLFloat)
         return static_cast<void*>(dst->mutable_data<double>(dst_place));
-      PADDLE_THROW("There is no this type.code <%d> when type.bits is <%d>.",
-                   type.code, type.bits);
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
+          type.code, type.bits));
     default:
-      PADDLE_THROW("Unsupport type.bits %d", type.bits);
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Unsupported DLDataType.bits %d.", type.bits));
   }
 }
 
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index fce0142b41d..a0408dbc3db 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -183,7 +183,11 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) {
   dst->resize(src.numel());
   auto dst_ptr = static_cast<void*>(dst->data());
 
-  PADDLE_ENFORCE_EQ(platform::is_cpu_place(src.place()), true);
+  PADDLE_ENFORCE_EQ(
+      platform::is_cpu_place(src.place()), true,
+      platform::errors::InvalidArgument(
+          "The input tensor should be CPU device, but actually it is in %s.",
+          src.place()));
 
   memory::Copy(dst_place, dst_ptr,
                BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size);
-- 
GitLab


From c296618c94c891775ecec76cf2a12d98dfd0aa62 Mon Sep 17 00:00:00 2001
From: ShenLiang <shenliang03@baidu.com>
Date: Wed, 16 Sep 2020 13:22:57 +0800
Subject: [PATCH 096/261] fix error message in broadcast/allreduce/gather
 (#27302)

* fix error message
---
 .../operators/distributed_ops/allreduce_op.h  |  3 +-
 .../operators/distributed_ops/broadcast_op.cc |  3 +-
 .../distributed_ops/broadcast_op.cu.cc        |  5 ++-
 paddle/fluid/operators/gather_op.cc           | 17 ++++++++-
 .../tests/unittests/test_broadcast_error.py   | 38 +++++++++++++++++++
 5 files changed, 60 insertions(+), 6 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_broadcast_error.py

diff --git a/paddle/fluid/operators/distributed_ops/allreduce_op.h b/paddle/fluid/operators/distributed_ops/allreduce_op.h
index c77113ad405..e486faa5758 100644
--- a/paddle/fluid/operators/distributed_ops/allreduce_op.h
+++ b/paddle/fluid/operators/distributed_ops/allreduce_op.h
@@ -76,7 +76,8 @@ class AllReduceOpKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
     }
 #else
-    PADDLE_THROW("PaddlePaddle should compile with GPU.");
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with GPU."));
 #endif
   }
 };
diff --git a/paddle/fluid/operators/distributed_ops/broadcast_op.cc b/paddle/fluid/operators/distributed_ops/broadcast_op.cc
index 535cf701441..61e27887b68 100644
--- a/paddle/fluid/operators/distributed_ops/broadcast_op.cc
+++ b/paddle/fluid/operators/distributed_ops/broadcast_op.cc
@@ -58,7 +58,8 @@ template <typename T>
 class BroadcastOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW("Broadcast op can run on gpu place only for now.");
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "Broadcast op can run on gpu place only for now."));
   }
 };
 
diff --git a/paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc b/paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc
index f067840e539..337422f0bd6 100644
--- a/paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc
+++ b/paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc
@@ -68,10 +68,11 @@ class NCCLBroadcastOpKernel : public framework::OpKernel<T> {
             << " From " << root_dev_id << " to " << dev_id;
 
     if (ctx.Attr<bool>("sync_mode")) {
-      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
     }
 #else
-    PADDLE_THROW("PaddlePaddle should compile with GPU.");
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with GPU."));
 #endif
   }
 };
diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc
index 28afeb6f541..a99879316d6 100644
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -37,8 +37,21 @@ class GatherOp : public framework::OperatorWithKernel {
                           "Output(Out) of GatherOp should not be null."));
 
     auto index_dims = ctx->GetInputDim("Index");
-    PADDLE_ENFORCE(index_dims.size() == 1 ||
-                   (index_dims.size() == 2 && index_dims[1] == 1));
+
+    if (index_dims.size() == 2) {
+      PADDLE_ENFORCE_EQ(
+          index_dims[1], 1,
+          platform::errors::InvalidArgument(
+              "The last dim of index should be 1 when it is 2D, but we get %d",
+              index_dims[1]));
+    } else {
+      PADDLE_ENFORCE_EQ(
+          index_dims.size(), 1,
+          platform::errors::InvalidArgument(
+              "The index should be 1D, when it is not 2D, but we get %d",
+              index_dims.size()));
+    }
+
     int batch_size = ctx->GetInputDim("Index")[0];
     framework::DDim output_dims(ctx->GetInputDim("X"));
     output_dims[0] = batch_size;
diff --git a/python/paddle/fluid/tests/unittests/test_broadcast_error.py b/python/paddle/fluid/tests/unittests/test_broadcast_error.py
new file mode 100644
index 00000000000..517de67fd6d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_broadcast_error.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+
+
+class TestBroadcastOpCpu(OpTest):
+    def setUp(self):
+        self.op_type = "broadcast"
+        input = np.random.random((100, 2)).astype("float32")
+        np_out = input[:]
+        self.inputs = {"X": input}
+        self.attrs = {"sync_mode": False, "root": 0}
+        self.outputs = {"Out": np_out}
+
+    def test_check_output_cpu(self):
+        try:
+            self.check_output_with_place(place=core.CPUPlace())
+        except:
+            print("do not support cpu test, skip")
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab


From c67c39168282bd3da37b4905b8d1856ca30275f2 Mon Sep 17 00:00:00 2001
From: yaoxuefeng <yaoxuefeng@baidu.com>
Date: Wed, 16 Sep 2020 13:37:07 +0800
Subject: [PATCH 097/261] refine fleet dataset class api (#27133)

---
 python/paddle/distributed/__init__.py         |    8 +-
 python/paddle/distributed/fleet/__init__.py   |    1 -
 .../distributed/fleet/dataset/dataset.py      | 1000 ++++++++++-------
 python/paddle/fluid/reader.py                 |    4 +-
 .../fluid/tests/unittests/dist_fleet_ctr.py   |   12 +-
 .../tests/unittests/dist_fleet_ctr_ps_gpu.py  |   10 +-
 .../tests/unittests/dist_fleet_heter_ctr.py   |   10 +-
 .../fluid/tests/unittests/test_dataset.py     |  328 ++++--
 .../unittests/test_dataset_dataloader.py      |   12 +-
 .../tests/unittests/test_fleet_rolemaker_2.py |    5 +-
 .../fluid/tests/unittests/test_monitor.py     |   13 +-
 11 files changed, 835 insertions(+), 568 deletions(-)

diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py
index b7357eef7ad..27c82227316 100644
--- a/python/paddle/distributed/__init__.py
+++ b/python/paddle/distributed/__init__.py
@@ -21,6 +21,7 @@ from .parallel import get_rank
 from .parallel import get_world_size
 from paddle.fluid.dygraph.parallel import prepare_context  #DEFINE_ALIAS
 from paddle.fluid.dygraph.parallel import ParallelEnv  #DEFINE_ALIAS
+from paddle.distributed.fleet.dataset import *
 
 from . import collective
 from .collective import *
@@ -30,11 +31,8 @@ __all__ = ["spawn"]
 
 # dygraph parallel apis
 __all__ += [
-    "init_parallel_env",
-    "get_rank",
-    "get_world_size",
-    "prepare_context",
-    "ParallelEnv",
+    "init_parallel_env", "get_rank", "get_world_size", "prepare_context",
+    "ParallelEnv", "InMemoryDataset", "QueueDataset"
 ]
 
 # collective apis
diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py
index 5f0cf9f93d6..2539fa57a34 100644
--- a/python/paddle/distributed/fleet/__init__.py
+++ b/python/paddle/distributed/fleet/__init__.py
@@ -23,7 +23,6 @@ from .dataset import *
 __all__ = [
     "DistributedStrategy",
     "UtilBase",
-    "DatasetFactory",
     "UserDefinedRoleMaker",
     "PaddleCloudRoleMaker",
     "Fleet",
diff --git a/python/paddle/distributed/fleet/dataset/dataset.py b/python/paddle/distributed/fleet/dataset/dataset.py
index f6504cacd96..5bd971181ed 100644
--- a/python/paddle/distributed/fleet/dataset/dataset.py
+++ b/python/paddle/distributed/fleet/dataset/dataset.py
@@ -14,54 +14,11 @@
 """This is definition of dataset class, which is high performance IO."""
 
 import paddle
-import paddle.fluid as fluid
 from paddle.fluid.proto import data_feed_pb2
 from google.protobuf import text_format
 import paddle.fluid.core as core
 
 
-class DatasetFactory(object):
-    """
-    DatasetFactory is a factory which create dataset by its name,
-    you can create "QueueDataset" or "InMemoryDataset", or "FileInstantDataset",
-    the default is "QueueDataset".
-
-    Example:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-
-    """
-
-    def __init__(self):
-        """ Init. """
-        pass
-
-    def create_dataset(self, datafeed_class="QueueDataset"):
-        """
-        Create "QueueDataset" or "InMemoryDataset", or "FileInstantDataset",
-        the default is "QueueDataset".
-
-        Args:
-            datafeed_class(str): datafeed class name, QueueDataset or InMemoryDataset.
-                                 Default is QueueDataset.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset()
-
-        """
-        try:
-            dataset = globals()[datafeed_class]()
-            return dataset
-        except:
-            raise ValueError("datafeed class %s does not exist" %
-                             datafeed_class)
-
-
 class DatasetBase(object):
     """ Base dataset class. """
 
@@ -75,96 +32,67 @@ class DatasetBase(object):
         self.thread_num = 1
         self.filelist = []
 
-    def set_pipe_command(self, pipe_command):
+    def init(self,
+             batch_size=1,
+             thread_num=1,
+             use_var=[],
+             pipe_command="cat",
+             input_type=0,
+             fs_name="",
+             fs_ugi="",
+             download_cmd="cat"):
         """
-        Set pipe command of current dataset
-        A pipe command is a UNIX pipeline command that can be used only
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset()
-              dataset.set_pipe_command("python my_script.py")
+        should be called only once in user's python scripts to initialize setings of dataset instance. 
+        Normally, it is called by InMemoryDataset or QueueDataset.
 
         Args:
-            pipe_command(str): pipe command
+            batch_size(int): batch size. It will be effective during training. default is 1.
+            thread_num(int): thread num, it is the num of readers. default is 1.
+            use_var(list): list of variables. Variables which you will use. default is [].
+            pipe_command(str): pipe command of current dataset. A pipe command is a UNIX pipeline command that can be used only. default is "cat"
+            input_type(int): the input type of generated input. 0 is for one sample, 1 is for one batch. defalut is 0.
+            fs_name(str): fs name. default is "".
+            fs_ugi(str): fs ugi. default is "".
+            download_cmd(str): customized download command. default is "cat"
 
-        """
-        self.proto_desc.pipe_command = pipe_command
 
-    def set_rank_offset(self, rank_offset):
         """
-        Set rank_offset for merge_pv. It set the message of Pv.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset()
-              dataset.set_rank_offset("rank_offset")
-
-        Args:
-            rank_offset(str): rank_offset's name
+        self._set_batch_size(batch_size)
+        self._set_thread(thread_num)
+        self._set_use_var(use_var)
+        self._set_pipe_command(pipe_command)
+        self._set_input_type(input_type)
+        self._set_hdfs_config(fs_name, fs_ugi)
+        self._set_download_cmd(download_cmd)
 
+    def _set_pipe_command(self, pipe_command):
         """
-        self.proto_desc.rank_offset = rank_offset
+        Set pipe command of current dataset
+        A pipe command is a UNIX pipeline command that can be used only
 
-    def set_fea_eval(self, record_candidate_size, fea_eval=True):
-        """
-        set fea eval mode for slots shuffle to debug the importance level of
-        slots(features), fea_eval need to be set True for slots shuffle.
-        
-        Args:
-            record_candidate_size(int): size of instances candidate to shuffle 
-                                        one slot
-            fea_eval(bool): whether enable fea eval mode to enable slots shuffle.
-                            default is True.
-            
         Examples:
             .. code-block:: python
 
-            import paddle.fluid as fluid
-            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-            dataset.set_fea_eval(1000000, True)
+              import paddle
+              dataset = paddle.distributed.fleet.dataset.DatasetBase()
+              dataset._set_pipe_command("python my_script.py")
 
-        """
-        if fea_eval:
-            self.dataset.set_fea_eval(fea_eval, record_candidate_size)
-        self.fea_eval = fea_eval
-
-    def slots_shuffle(self, slots):
-        """
-        Slots Shuffle 
-        Slots Shuffle is a shuffle method in slots level, which is usually used 
-        in sparse feature with large scale of instances. To compare the metric, i.e.
-        auc while doing slots shuffle on one or several slots with baseline to 
-        evaluate the importance level of slots(features).
-        
         Args:
-            slots(list[string]): the set of slots(string) to do slots shuffle.
+            pipe_command(str): pipe command
 
-        Examples:
-            import paddle.fluid as fluid
-            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-            dataset.set_merge_by_lineid()
-            #suppose there is a slot 0
-            dataset.slots_shuffle(['0'])
         """
-        if self.fea_eval:
-            slots_set = set(slots)
-            self.dataset.slots_shuffle(slots_set)
+        self.proto_desc.pipe_command = pipe_command
 
-    def set_batch_size(self, batch_size):
+    def _set_batch_size(self, batch_size):
         """
         Set batch size. Will be effective during training
 
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset()
-              dataset.set_batch_size(128)
+              import paddle
+              dataset = paddle.distributed.fleet.DatasetBase()
+              dataset._set_batch_size(128)
 
         Args:
             batch_size(int): batch size
@@ -172,32 +100,16 @@ class DatasetBase(object):
         """
         self.proto_desc.batch_size = batch_size
 
-    def set_pv_batch_size(self, pv_batch_size):
-        """
-        Set pv batch size. It will be effective during enable_pv_merge
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset()
-              dataset.set_pv_batch(128)
-        Args:
-            pv_batch_size(int): pv batch size
-
-        """
-        self.proto_desc.pv_batch_size = pv_batch_size
-
-    def set_thread(self, thread_num):
+    def _set_thread(self, thread_num):
         """
         Set thread num, it is the num of readers.
 
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset()
-               dataset.set_thread(12)
+              import paddle
+              dataset = paddle.distributed.fleet.DatasetBase()
+              dataset._set_thread(12)
 
         Args:
             thread_num(int): thread num
@@ -212,8 +124,8 @@ class DatasetBase(object):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset()
+              import paddle
+              dataset = paddle.distributed.fleet.DatasetBase()
               dataset.set_filelist(['a.txt', 'b.txt'])
 
         Args:
@@ -222,19 +134,19 @@ class DatasetBase(object):
         self.dataset.set_filelist(filelist)
         self.filelist = filelist
 
-    def set_input_type(self, input_type):
+    def _set_input_type(self, input_type):
         self.proto_desc.input_type = input_type
 
-    def set_use_var(self, var_list):
+    def _set_use_var(self, var_list):
         """
         Set Variables which you will use.
 
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset()
-              dataset.set_use_var([data, label])
+              import paddle
+              dataset = paddle.distributed.fleet.DatasetBase()
+              dataset._set_use_var([data, label])
 
         Args:
             var_list(list): variable list
@@ -253,19 +165,19 @@ class DatasetBase(object):
                 slot_var.type = "uint64"
             else:
                 raise ValueError(
-                    "Currently, fluid.dataset only supports dtype=float32 and dtype=int64"
+                    "Currently, paddle.distributed.fleet.dataset only supports dtype=float32 and dtype=int64"
                 )
 
-    def set_hdfs_config(self, fs_name, fs_ugi):
+    def _set_hdfs_config(self, fs_name, fs_ugi):
         """
         Set hdfs config: fs name ad ugi
 
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset()
-              dataset.set_hdfs_config("my_fs_name", "my_fs_ugi")
+              import paddle
+              dataset = paddle.distributed.fleet.DatasetBase()
+              dataset._set_hdfs_config("my_fs_name", "my_fs_ugi")
 
         Args:
             fs_name(str): fs name
@@ -273,16 +185,16 @@ class DatasetBase(object):
         """
         self.dataset.set_hdfs_config(fs_name, fs_ugi)
 
-    def set_download_cmd(self, download_cmd):
+    def _set_download_cmd(self, download_cmd):
         """
         Set customized download cmd: download_cmd
 
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset()
-              dataset.set_download_cmd("./read_from_afs")
+              import paddle
+              dataset = paddle.distributed.fleet.DatasetBase()
+              dataset._set_download_cmd("./read_from_afs")
 
         Args:
             download_cmd(str): customized download command
@@ -297,22 +209,22 @@ class DatasetBase(object):
         if self.thread_num > len(self.filelist):
             self.thread_num = len(self.filelist)
         self.dataset.set_thread_num(self.thread_num)
-        self.dataset.set_data_feed_desc(self.desc())
+        self.dataset.set_data_feed_desc(self._desc())
         self.dataset.create_readers()
 
     def _finish_to_run(self):
         self.dataset.destroy_readers()
 
-    def desc(self):
+    def _desc(self):
         """
         Returns a protobuf message for this DataFeedDesc
 
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset()
-              print(dataset.desc())
+              import paddle
+              dataset = paddle.distributed.fleet.DatasetBase()
+              print(dataset._desc())
 
         Returns:
             A string message
@@ -330,10 +242,10 @@ class InMemoryDataset(DatasetBase):
     """
     InMemoryDataset, it will load data into memory
     and shuffle data before training.
-    This class should be created by DatasetFactory
 
     Example:
-        dataset = paddle.fluid.DatasetFactory().create_dataset("InMemoryDataset")
+        import paddle
+        dataset = paddle.distributed.InMemoryDataset()
     """
 
     def __init__(self):
@@ -351,7 +263,229 @@ class InMemoryDataset(DatasetBase):
         self.merge_by_lineid = False
         self.fleet_send_sleep_seconds = None
 
-    def set_feed_type(self, data_feed_type):
+    def _init_distributed_settings(self, **kwargs):
+        """
+        should be called only once in user's python scripts to initialize distributed-related setings of dataset instance
+        Args:
+            kwargs: Keyword arguments. Currently, we support following keys in **kwargs:
+
+            merge_size(int): ins size to merge, if merge_size > 0, set merge by line id, 
+                             instances of same line id will be merged after shuffle, 
+                             you should parse line id in data generator. default is -1.
+            parse_ins_id(bool): Set if Dataset need to parse ins_id. default is False.
+            parse_content(bool): Set if Dataset need to parse content. default is False.
+            fleet_send_batch_size(int): Set fleet send batch size in one rpc, default is 1024
+            fleet_send_sleep_seconds(int): Set fleet send sleep time, default is 0
+            fea_eval(bool): Set if Dataset need to do feature importance evaluation using slots shuffle.
+                            default is False.
+            candidate_size(int): if fea_eval is set True, set the candidate size used in slots shuffle.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle
+              dataset = paddle.distributed.InMemoryDataset()
+              dataset.init(
+                    batch_size=1,
+                    thread_num=2,
+                    input_type=1,
+                    pipe_command="cat",
+                    use_var=[])
+              dataset._init_distributed_settings(
+                    parse_ins_id=True,
+                    parse_content=True,
+                    fea_eval=True,
+                    candidate_size=10000)
+              
+        """
+        merge_size = kwargs.get("merge_size", -1)
+        if merge_size > 0:
+            self._set_merge_by_lineid(merge_size)
+
+        parse_ins_id = kwargs.get("parse_ins_id", False)
+        self._set_parse_ins_id(parse_ins_id)
+
+        parse_content = kwargs.get("parse_content", False)
+        self._set_parse_content(parse_content)
+
+        fleet_send_batch_size = kwargs.get("fleet_send_batch_size", None)
+        if fleet_send_batch_size:
+            self._set_fleet_send_batch_size(fleet_send_batch_size)
+
+        fleet_send_sleep_seconds = kwargs.get("fleet_send_sleep_seconds", None)
+        if fleet_send_sleep_seconds:
+            self._set_fleet_send_sleep_seconds(fleet_send_sleep_seconds)
+
+        fea_eval = kwargs.get("fea_eval", False)
+        if fea_eval:
+            candidate_size = kwargs.get("candidate_size", 10000)
+            self._set_fea_eval(candidate_size, True)
+
+    def update_settings(self, **kwargs):
+        """
+        should be called in user's python scripts to update setings of dataset instance
+        Args:
+            kwargs: Keyword arguments. Currently, we support following keys in **kwargs,
+                    including single node settings and advanced distributed related settings:
+
+            batch_size(int): batch size. It will be effective during training. default is 1.
+            thread_num(int): thread num, it is the num of readers. default is 1.
+            use_var(list): list of variables. Variables which you will use. default is [].
+            input_type(int): the input type of generated input. 0 is for one sample, 1 is for one batch. defalut is 0.
+            fs_name(str): fs name. default is "".
+            fs_ugi(str): fs ugi. default is "".
+            pipe_command(str): pipe command of current dataset. A pipe command is a UNIX pipeline command that can be used only. default is "cat"
+            download_cmd(str): customized download command. default is "cat"
+            data_feed_type(str): data feed type used in c++ code. default is "MultiSlotInMemoryDataFeed".
+            queue_num(int): Dataset output queue num, training threads get data from queues. default is-1, which is set same as thread number in c++.
+
+            merge_size(int): ins size to merge, if merge_size > 0, set merge by line id, 
+                             instances of same line id will be merged after shuffle, 
+                             you should parse line id in data generator. default is -1.
+            parse_ins_id(bool): Set if Dataset need to parse ins_id. default is False.
+            parse_content(bool): Set if Dataset need to parse content. default is False.
+            fleet_send_batch_size(int): Set fleet send batch size in one rpc, default is 1024
+            fleet_send_sleep_seconds(int): Set fleet send sleep time, default is 0
+            fea_eval(bool): Set if Dataset need to do feature importance evaluation using slots shuffle.
+                            default is False.
+            candidate_size(int): if fea_eval is set True, set the candidate size used in slots shuffle.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle
+              dataset = paddle.distributed.InMemoryDataset()
+              dataset.init(
+                    batch_size=1,
+                    thread_num=2,
+                    input_type=1,
+                    pipe_command="cat",
+                    use_var=[])
+              dataset._init_distributed_settings(
+                    parse_ins_id=True,
+                    parse_content=True,
+                    fea_eval=True,
+                    candidate_size=10000)
+              dataset.update_settings(batch_size=2)
+            
+        """
+        for key in kwargs:
+            if key == "pipe_command":
+                self._set_pipe_command(kwargs[key])
+            elif key == "batch_size":
+                self._set_batch_size(kwargs[key])
+            elif key == "thread_num":
+                self._set_thread(kwargs[key])
+            elif key == "use_var":
+                self._set_use_var(kwargs[key])
+            elif key == "input_type":
+                self._set_input_type(kwargs[key])
+            elif key == "fs_name" and "fs_ugi" in kwargs:
+                self._set_hdfs_config(kwargs[key], kwargs["fs_ugi"])
+            elif key == "download_cmd":
+                self._set_download_cmd(kwargs[key])
+            elif key == "merge_size" and kwargs.get("merge_size", -1) > 0:
+                self._set_merge_by_lineid(kwargs[key])
+            elif key == "parse_ins_id":
+                self._set_parse_ins_id(kwargs[key])
+            elif key == "parse_content":
+                self._set_parse_content(kwargs[key])
+            elif key == "fleet_send_batch_size":
+                self._set_fleet_send_batch_size(kwargs[key])
+            elif key == "fleet_send_sleep_seconds":
+                self._set_fleet_send_sleep_seconds(kwargs[key])
+            elif key == "fea_eval" and kwargs[key] == True:
+                candidate_size = kwargs.get("candidate_size", 10000)
+                self._set_fea_eval(candidate_size, True)
+
+    def init(self, **kwargs):
+        """
+        should be called only once in user's python scripts to initialize setings of dataset instance
+        Args:
+            kwargs: Keyword arguments. Currently, we support following keys in **kwargs:
+            
+            batch_size(int): batch size. It will be effective during training. default is 1.
+            thread_num(int): thread num, it is the num of readers. default is 1.
+            use_var(list): list of variables. Variables which you will use. default is [].
+            input_type(int): the input type of generated input. 0 is for one sample, 1 is for one batch. defalut is 0.
+            fs_name(str): fs name. default is "".
+            fs_ugi(str): fs ugi. default is "".
+            pipe_command(str): pipe command of current dataset. A pipe command is a UNIX pipeline command that can be used only. default is "cat"
+            download_cmd(str): customized download command. default is "cat"
+            data_feed_type(str): data feed type used in c++ code. default is "MultiSlotInMemoryDataFeed".
+            queue_num(int): Dataset output queue num, training threads get data from queues. default is -1, which is set same as thread number in c++.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+                with open("test_queue_dataset_run_a.txt", "w") as f:
+                    data = "2 1 2 2 5 4 2 2 7 2 1 3\n"
+                    data += "2 6 2 2 1 4 2 2 4 2 2 3\n"
+                    data += "2 5 2 2 9 9 2 2 7 2 1 3\n"
+                    data += "2 7 2 2 1 9 2 3 7 2 5 3\n"
+                    f.write(data)
+                with open("test_queue_dataset_run_b.txt", "w") as f:
+                    data = "2 1 2 2 5 4 2 2 7 2 1 3\n"
+                    data += "2 6 2 2 1 4 2 2 4 2 2 3\n"
+                    data += "2 5 2 2 9 9 2 2 7 2 1 3\n"
+                    data += "2 7 2 2 1 9 2 3 7 2 5 3\n"
+                    f.write(data)
+
+                slots = ["slot1", "slot2", "slot3", "slot4"]
+                slots_vars = []
+                for slot in slots:
+                    var = fluid.data(
+                        name=slot, shape=[None, 1], dtype="int64", lod_level=1)
+                    slots_vars.append(var)
+
+                dataset = paddle.distributed.InMemoryDataset()
+                dataset.init(
+                    batch_size=1,
+                    thread_num=2,
+                    input_type=1,
+                    pipe_command="cat",
+                    use_var=slots_vars)
+                dataset.set_filelist(
+                    ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
+                dataset.load_into_memory()
+
+                exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda(
+                ) else fluid.CUDAPlace(0))
+                exe.run(fluid.default_startup_program())
+                exe.train_from_dataset(fluid.default_main_program(),
+                                           dataset)
+                os.remove("./test_queue_dataset_run_a.txt")
+                os.remove("./test_queue_dataset_run_b.txt")
+        """
+        batch_size = kwargs.get("batch_size", 1)
+        thread_num = kwargs.get("thread_num", 1)
+        use_var = kwargs.get("use_var", [])
+        input_type = kwargs.get("input_type", 0)
+        fs_name = kwargs.get("fs_name", "")
+        fs_ugi = kwargs.get("fs_ugi", "")
+        pipe_command = kwargs.get("pipe_command", "cat")
+        download_cmd = kwargs.get("download_cmd", "cat")
+
+        super(InMemoryDataset, self).init(
+            batch_size=batch_size,
+            thread_num=thread_num,
+            use_var=use_var,
+            pipe_command=pipe_command,
+            input_type=input_type,
+            fs_name=fs_name,
+            fs_ugi=fs_ugi,
+            download_cmd=download_cmd)
+
+        data_feed_type = kwargs.get("data_feed_type",
+                                    "MultiSlotInMemoryDataFeed")
+        self._set_feed_type(data_feed_type)
+
+        if kwargs.get("queue_num", -1) > 0:
+            queue_num = kwargs.get("queue_num", -1)
+            self._set_queue_num(queue_num)
+
+    def _set_feed_type(self, data_feed_type):
         """
         Set data_feed_desc
         """
@@ -373,7 +507,7 @@ class InMemoryDataset(DatasetBase):
         self.dataset.set_parse_logkey(self.parse_logkey)
         self.dataset.set_merge_by_sid(self.merge_by_sid)
         self.dataset.set_enable_pv_merge(self.enable_pv_merge)
-        self.dataset.set_data_feed_desc(self.desc())
+        self.dataset.set_data_feed_desc(self._desc())
         self.dataset.create_channel()
         self.dataset.create_readers()
 
@@ -387,7 +521,7 @@ class InMemoryDataset(DatasetBase):
             self.dataset.dynamic_adjust_channel_num(self.thread_num, False)
         self.dataset.dynamic_adjust_readers_num(self.thread_num)
 
-    def set_queue_num(self, queue_num):
+    def _set_queue_num(self, queue_num):
         """
         Set Dataset output queue num, training threads get data from queues
 
@@ -397,17 +531,17 @@ class InMemoryDataset(DatasetBase):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              dataset.set_queue_num(12)
+              import paddle
+              dataset = paddle.distributed.InMemoryDataset()
+              dataset._set_queue_num(12)
 
         """
         self.is_user_set_queue_num = True
         self.queue_num = queue_num
 
-    def set_parse_ins_id(self, parse_ins_id):
+    def _set_parse_ins_id(self, parse_ins_id):
         """
-        Set id Dataset need to parse insid
+        Set if Dataset need to parse insid
 
         Args:
             parse_ins_id(bool): if parse ins_id or not
@@ -415,14 +549,14 @@ class InMemoryDataset(DatasetBase):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              dataset.set_parse_ins_id(True)
+              import paddle
+              dataset = paddle.distributed.InMemoryDataset()
+              dataset._set_parse_ins_id(True)
 
         """
         self.parse_ins_id = parse_ins_id
 
-    def set_parse_content(self, parse_content):
+    def _set_parse_content(self, parse_content):
         """
         Set if Dataset need to parse content
 
@@ -432,120 +566,14 @@ class InMemoryDataset(DatasetBase):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              dataset.set_parse_content(True)
+              import paddle
+              dataset = paddle.distributed.InMemoryDataset()
+              dataset._set_parse_content(True)
 
         """
         self.parse_content = parse_content
 
-    def set_parse_logkey(self, parse_logkey):
-        """
-        Set if Dataset need to parse logkey
-
-        Args:
-            parse_content(bool): if parse logkey or not
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              dataset.set_parse_logkey(True)
-
-        """
-        self.parse_logkey = parse_logkey
-
-    def set_merge_by_sid(self, merge_by_sid):
-        """
-        Set if Dataset need to merge sid. If not, one ins means one Pv.
-
-        Args:
-            merge_by_sid(bool): if merge sid or not
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              dataset.set_merge_by_sid(True)
-
-        """
-        self.merge_by_sid = merge_by_sid
-
-    def set_enable_pv_merge(self, enable_pv_merge):
-        """
-        Set if Dataset need to merge pv.
-
-        Args:
-            enable_pv_merge(bool): if enable_pv_merge or not
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              dataset.set_enable_pv_merge(True)
-
-        """
-        self.enable_pv_merge = enable_pv_merge
-
-    def preprocess_instance(self):
-        """
-        Merge pv instance and convey it from input_channel to input_pv_channel. 
-        It will be effective when enable_pv_merge_ is True.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              filelist = ["a.txt", "b.txt"]
-              dataset.set_filelist(filelist)
-              dataset.load_into_memory()
-              dataset.preprocess_instance()
-
-        """
-        self.dataset.preprocess_instance()
-
-    def set_current_phase(self, current_phase):
-        """
-        Set current phase in train. It is useful for untest.
-        current_phase : 1 for join, 0 for update.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              filelist = ["a.txt", "b.txt"]
-              dataset.set_filelist(filelist)
-              dataset.load_into_memory()
-              dataset.set_current_phase(1)
-
-        """
-        self.dataset.set_current_phase(current_phase)
-
-    def postprocess_instance(self):
-        """
-        Divide pv instance and convey it to input_channel.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              filelist = ["a.txt", "b.txt"]
-              dataset.set_filelist(filelist)
-              dataset.load_into_memory()
-              dataset.preprocess_instance()
-              exe.train_from_dataset(dataset)
-              dataset.postprocess_instance()
-
-        """
-        self.dataset.postprocess_instance()
-
-    def set_fleet_send_batch_size(self, fleet_send_batch_size=1024):
+    def _set_fleet_send_batch_size(self, fleet_send_batch_size=1024):
         """
         Set fleet send batch size, default is 1024
 
@@ -555,14 +583,14 @@ class InMemoryDataset(DatasetBase):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              dataset.set_fleet_send_batch_size(800)
+              import paddle
+              dataset = paddle.distributed.InMemoryDataset()
+              dataset._set_fleet_send_batch_size(800)
 
         """
         self.fleet_send_batch_size = fleet_send_batch_size
 
-    def set_fleet_send_sleep_seconds(self, fleet_send_sleep_seconds=0):
+    def _set_fleet_send_sleep_seconds(self, fleet_send_sleep_seconds=0):
         """
         Set fleet send sleep time, default is 0
 
@@ -572,14 +600,14 @@ class InMemoryDataset(DatasetBase):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              dataset.set_fleet_send_sleep_seconds(2)
+              import paddle
+              dataset = paddle.distributed.InMemoryDataset()
+              dataset._set_fleet_send_sleep_seconds(2)
 
         """
         self.fleet_send_sleep_seconds = fleet_send_sleep_seconds
 
-    def set_merge_by_lineid(self, merge_size=2):
+    def _set_merge_by_lineid(self, merge_size=2):
         """
         Set merge by line id, instances of same line id will be merged after
         shuffle, you should parse line id in data generator.
@@ -590,22 +618,22 @@ class InMemoryDataset(DatasetBase):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              dataset.set_merge_by_lineid()
+              import paddle
+              dataset = paddle.distributed.InMemoryDataset()
+              dataset._set_merge_by_lineid()
 
         """
         self.dataset.set_merge_by_lineid(merge_size)
         self.merge_by_lineid = True
         self.parse_ins_id = True
 
-    def set_generate_unique_feasigns(self, generate_uni_feasigns, shard_num):
+    def _set_generate_unique_feasigns(self, generate_uni_feasigns, shard_num):
         self.dataset.set_generate_unique_feasigns(generate_uni_feasigns)
         self.gen_uni_feasigns = generate_uni_feasigns
         self.local_shard_num = shard_num
 
-    def generate_local_tables_unlock(self, table_id, fea_dim, read_thread_num,
-                                     consume_thread_num, shard_num):
+    def _generate_local_tables_unlock(self, table_id, fea_dim, read_thread_num,
+                                      consume_thread_num, shard_num):
         self.dataset.generate_local_tables_unlock(
             table_id, fea_dim, read_thread_num, consume_thread_num, shard_num)
 
@@ -616,8 +644,8 @@ class InMemoryDataset(DatasetBase):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              import paddle
+              dataset = paddle.distributed.InMemoryDataset()
               filelist = ["a.txt", "b.txt"]
               dataset.set_filelist(filelist)
               dataset.load_into_memory()
@@ -635,8 +663,8 @@ class InMemoryDataset(DatasetBase):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              import paddle
+              dataset = paddle.distributed.InMemoryDataset()
               filelist = ["a.txt", "b.txt"]
               dataset.set_filelist(filelist)
               dataset.preload_into_memory()
@@ -656,8 +684,8 @@ class InMemoryDataset(DatasetBase):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              import paddle
+              dataset = paddle.distributed.InMemoryDataset()
               filelist = ["a.txt", "b.txt"]
               dataset.set_filelist(filelist)
               dataset.preload_into_memory()
@@ -673,8 +701,8 @@ class InMemoryDataset(DatasetBase):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              import paddle
+              dataset = paddle.distributed.InMemoryDataset()
               filelist = ["a.txt", "b.txt"]
               dataset.set_filelist(filelist)
               dataset.load_into_memory()
@@ -692,9 +720,9 @@ class InMemoryDataset(DatasetBase):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
+              import paddle
               from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              dataset = paddle.distributed.InMemoryDataset()
               filelist = ["a.txt", "b.txt"]
               dataset.set_filelist(filelist)
               dataset.load_into_memory()
@@ -736,9 +764,9 @@ class InMemoryDataset(DatasetBase):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
+              import paddle
               from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              dataset = paddle.distributed.InMemoryDataset()
               filelist = ["a.txt", "b.txt"]
               dataset.set_filelist(filelist)
               dataset.load_into_memory()
@@ -751,30 +779,6 @@ class InMemoryDataset(DatasetBase):
         """
         self.dataset.release_memory()
 
-    def get_pv_data_size(self):
-        """
-        Get memory data size of Pv, user can call this function to know the pv num
-        of ins in all workers after load into memory.
-
-        Note:
-            This function may cause bad performance, because it has barrier
-
-        Returns:
-            The size of memory pv data.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
-              filelist = ["a.txt", "b.txt"]
-              dataset.set_filelist(filelist)
-              dataset.load_into_memory()
-              print dataset.get_pv_data_size()
-
-        """
-        return self.dataset.get_pv_data_size()
-
     def get_memory_data_size(self, fleet=None):
         """
         Get memory data size, user can call this function to know the num
@@ -792,9 +796,9 @@ class InMemoryDataset(DatasetBase):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
+              import paddle
               from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              dataset = paddle.distributed.InMemoryDataset()
               filelist = ["a.txt", "b.txt"]
               dataset.set_filelist(filelist)
               dataset.load_into_memory()
@@ -829,9 +833,9 @@ class InMemoryDataset(DatasetBase):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
+              import paddle
               from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
-              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+              dataset = paddle.distributed.InMemoryDataset()
               filelist = ["a.txt", "b.txt"]
               dataset.set_filelist(filelist)
               dataset.load_into_memory()
@@ -849,6 +853,51 @@ class InMemoryDataset(DatasetBase):
             return global_data_size[0]
         return local_data_size[0]
 
+    def _set_fea_eval(self, record_candidate_size, fea_eval=True):
+        """
+        set fea eval mode for slots shuffle to debug the importance level of
+        slots(features), fea_eval need to be set True for slots shuffle.
+        
+        Args:
+            record_candidate_size(int): size of instances candidate to shuffle 
+                                        one slot
+            fea_eval(bool): whether enable fea eval mode to enable slots shuffle.
+                            default is True.
+            
+        Examples:
+            .. code-block:: python
+
+            import paddle
+            dataset = paddle.distributed.InMemoryDataset()
+            dataset._set_fea_eval(1000000, True)
+
+        """
+        if fea_eval:
+            self.dataset.set_fea_eval(fea_eval, record_candidate_size)
+        self.fea_eval = fea_eval
+
+    def slots_shuffle(self, slots):
+        """
+        Slots Shuffle 
+        Slots Shuffle is a shuffle method in slots level, which is usually used 
+        in sparse feature with large scale of instances. To compare the metric, i.e.
+        auc while doing slots shuffle on one or several slots with baseline to 
+        evaluate the importance level of slots(features).
+        
+        Args:
+            slots(list[string]): the set of slots(string) to do slots shuffle.
+
+        Examples:
+            import paddle
+            dataset = paddle.distributed.InMemoryDataset()
+            dataset.set_merge_by_lineid()
+            #suppose there is a slot 0
+            dataset.slots_shuffle(['0'])
+        """
+        if self.fea_eval:
+            slots_set = set(slots)
+            self.dataset.slots_shuffle(slots_set)
+
 
 class QueueDataset(DatasetBase):
     """
@@ -857,19 +906,24 @@ class QueueDataset(DatasetBase):
     Examples:
         .. code-block:: python
 
-          import paddle.fluid as fluid
-          dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
+          import paddle
+          dataset = paddle.distributed.QueueDataset()
 
     """
 
     def __init__(self):
         """
         Initialize QueueDataset
-        This class should be created by DatasetFactory
         """
         super(QueueDataset, self).__init__()
         self.proto_desc.name = "MultiSlotDataFeed"
 
+    def init(self, **kwargs):
+        """
+        should be called only once in user's python scripts to initialize setings of dataset instance
+        """
+        super(QueueDataset, self).init(**kwargs)
+
     def _prepare_to_run(self):
         """
         Set data_feed_desc/thread num/filelist before run,
@@ -881,57 +935,9 @@ class QueueDataset(DatasetBase):
             self.thread_num = 1
         self.dataset.set_thread_num(self.thread_num)
         self.dataset.set_filelist(self.filelist)
-        self.dataset.set_data_feed_desc(self.desc())
+        self.dataset.set_data_feed_desc(self._desc())
         self.dataset.create_readers()
 
-    def local_shuffle(self):
-        """
-        Local shuffle data.
-
-        Local shuffle is not supported in QueueDataset
-        NotImplementedError will be raised
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
-              dataset.local_shuffle()
-
-        Raises:
-            NotImplementedError: QueueDataset does not support local shuffle
-
-        """
-        raise NotImplementedError(
-            "QueueDataset does not support local shuffle, "
-            "please use InMemoryDataset for local_shuffle")
-
-    def global_shuffle(self, fleet=None):
-        """
-        Global shuffle data.
-
-        Global shuffle is not supported in QueueDataset
-        NotImplementedError will be raised
-
-        Args:
-            fleet(Fleet): fleet singleton. Default None.
-
-        Examples:
-            .. code-block:: python
-
-              import paddle.fluid as fluid
-              from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
-              dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
-              dataset.global_shuffle(fleet)
-
-        Raises:
-            NotImplementedError: QueueDataset does not support global shuffle
-
-        """
-        raise NotImplementedError(
-            "QueueDataset does not support global shuffle, "
-            "please use InMemoryDataset for global_shuffle")
-
 
 class FileInstantDataset(DatasetBase):
     """
@@ -940,35 +946,22 @@ class FileInstantDataset(DatasetBase):
     Examples:
         .. code-block:: python
 
-          import paddle.fluid as fluid
-          dataset = fluid.DatasetFactory.create_dataset("FileInstantDataset")
+          import paddle
+          dataset = paddle.distributed.fleet.FileInstantDataset()
     """
 
     def __init__(self):
         """
         Initialize FileInstantDataset
-        This class should be created by DatasetFactory
         """
         super(FileInstantDataset, self).__init__()
         self.proto_desc.name = "MultiSlotFileInstantDataFeed"
 
-    def local_shuffle(self):
+    def init(self, **kwargs):
         """
-        Local shuffle
-        FileInstantDataset does not support local shuffle
+        should be called only once in user's python scripts to initialize setings of dataset instance
         """
-        raise NotImplementedError(
-            "FileInstantDataset does not support local shuffle, "
-            "please use InMemoryDataset for local_shuffle")
-
-    def global_shuffle(self, fleet=None):
-        """
-        Global shuffle
-        FileInstantDataset does not support global shuffle
-        """
-        raise NotImplementedError(
-            "FileInstantDataset does not support global shuffle, "
-            "please use InMemoryDataset for global_shuffle")
+        super(FileInstantDataset, self).init(**kwargs)
 
 
 class BoxPSDataset(InMemoryDataset):
@@ -978,19 +971,119 @@ class BoxPSDataset(InMemoryDataset):
     Examples:
         .. code-block:: python
 
-          import paddle.fluid as fluid
-          dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
+          import paddle
+          dataset = paddle.distributed.fleet.BoxPSDataset()
     """
 
     def __init__(self):
         """
         Initialize BoxPSDataset
-        This class should be created by DatasetFactory
         """
         super(BoxPSDataset, self).__init__()
         self.boxps = core.BoxPS(self.dataset)
         self.proto_desc.name = "PaddleBoxDataFeed"
 
+    def init(self, **kwargs):
+        """
+        should be called only once in user's python scripts to initialize setings of dataset instance
+        """
+        super(BoxPSDataset, self).init(**kwargs)
+
+        rank_offset = kwargs.get("rank_offset", "")
+        self._set_rank_offset(rank_offset)
+        pv_batch_size = kwargs.get("pv_batch_size", 1)
+        self._set_pv_batch_size(pv_batch_size)
+        parse_logkey = kwargs.get("parse_logkey", False)
+        self._set_parse_logkey(parse_logkey)
+        merge_by_sid = kwargs.get("merge_by_sid", False)
+        self._set_merge_by_sid(merge_by_sid)
+        enable_pv_merge = kwargs.get("enable_pv_merge", False)
+        self._set_enable_pv_merge(enable_pv_merge)
+
+    def _set_rank_offset(self, rank_offset):
+        """
+        Set rank_offset for merge_pv. It set the message of Pv.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle
+              dataset = paddle.distributed.fleet.BoxPSDataset()
+              dataset._set_rank_offset("rank_offset")
+
+        Args:
+            rank_offset(str): rank_offset's name
+
+        """
+        self.proto_desc.rank_offset = rank_offset
+
+    def _set_pv_batch_size(self, pv_batch_size):
+        """
+        Set pv batch size. It will be effective during enable_pv_merge
+
+        Examples:
+            .. code-block:: python
+
+              import paddle
+              dataset = paddle.distributed.fleet.BoxPSDataset()
+              dataset._set_pv_batch_size(128)
+        Args:
+            pv_batch_size(int): pv batch size
+
+        """
+        self.proto_desc.pv_batch_size = pv_batch_size
+
+    def _set_parse_logkey(self, parse_logkey):
+        """
+        Set if Dataset need to parse logkey
+
+        Args:
+            parse_content(bool): if parse logkey or not
+
+        Examples:
+            .. code-block:: python
+
+              import paddle
+              dataset = paddle.distributed.fleet.BoxPSDataset()
+              dataset._set_parse_logkey(True)
+
+        """
+        self.parse_logkey = parse_logkey
+
+    def _set_merge_by_sid(self, merge_by_sid):
+        """
+        Set if Dataset need to merge sid. If not, one ins means one Pv.
+
+        Args:
+            merge_by_sid(bool): if merge sid or not
+
+        Examples:
+            .. code-block:: python
+
+              import paddle
+              dataset = paddle.distributed.fleet.BoxPSDataset()
+              dataset._set_merge_by_sid(True)
+
+        """
+        self.merge_by_sid = merge_by_sid
+
+    def _set_enable_pv_merge(self, enable_pv_merge):
+        """
+        Set if Dataset need to merge pv.
+
+        Args:
+            enable_pv_merge(bool): if enable_pv_merge or not
+
+        Examples:
+            .. code-block:: python
+
+              import paddle
+              dataset = paddle.distributed.fleet.BoxPSDataset()
+              dataset._set_enable_pv_merge(True)
+
+        """
+        self.enable_pv_merge = enable_pv_merge
+
     def set_date(self, date):
         """
         Workaround for date
@@ -1008,8 +1101,8 @@ class BoxPSDataset(InMemoryDataset):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
+              import paddle
+              dataset = paddle.distributed.fleet.BoxPSDataset()
               dataset.begin_pass()
         """
         self.boxps.begin_pass()
@@ -1021,8 +1114,8 @@ class BoxPSDataset(InMemoryDataset):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
+              import paddle
+              dataset = paddle.distributed.fleet.BoxPSDataset()
               dataset.end_pass(True)
         """
         self.boxps.end_pass(need_save_delta)
@@ -1034,8 +1127,8 @@ class BoxPSDataset(InMemoryDataset):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
+              import paddle
+              dataset = paddle.distributed.fleet.BoxPSDataset()
               filelist = ["a.txt", "b.txt"]
               dataset.set_filelist(filelist)
               dataset.preload_into_memory()
@@ -1049,8 +1142,8 @@ class BoxPSDataset(InMemoryDataset):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
+              import paddle
+              dataset = paddle.distributed.fleet.BoxPSDataset()
               filelist = ["a.txt", "b.txt"]
               dataset.set_filelist(filelist)
               dataset.load_into_memory()
@@ -1064,8 +1157,8 @@ class BoxPSDataset(InMemoryDataset):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
+              import paddle
+              dataset = paddle.distributed.fleet.BoxPSDataset()
               filelist = ["a.txt", "b.txt"]
               dataset.set_filelist(filelist)
               dataset.preload_into_memory()
@@ -1093,11 +1186,90 @@ class BoxPSDataset(InMemoryDataset):
             slots(list[string]): the set of slots(string) to do slots shuffle.
 
         Examples:
-            import paddle.fluid as fluid
-            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+            import paddle
+            dataset = paddle.distributed.fleet.BoxPSDataset()
             dataset.set_merge_by_lineid()
             #suppose there is a slot 0
             dataset.slots_shuffle(['0'])
         """
         slots_set = set(slots)
         self.boxps.slots_shuffle(slots_set)
+
+    def set_current_phase(self, current_phase):
+        """
+        Set current phase in train. It is useful for untest.
+        current_phase : 1 for join, 0 for update.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle
+              dataset = paddle.distributed.fleet.BoxPSDataset()
+              filelist = ["a.txt", "b.txt"]
+              dataset.set_filelist(filelist)
+              dataset.load_into_memory()
+              dataset.set_current_phase(1)
+
+        """
+        self.dataset.set_current_phase(current_phase)
+
+    def get_pv_data_size(self):
+        """
+        Get memory data size of Pv, user can call this function to know the pv num
+        of ins in all workers after load into memory.
+
+        Note:
+            This function may cause bad performance, because it has barrier
+
+        Returns:
+            The size of memory pv data.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle
+              dataset = paddle.distributed.fleet.BoxPSDataset()
+              filelist = ["a.txt", "b.txt"]
+              dataset.set_filelist(filelist)
+              dataset.load_into_memory()
+              print dataset.get_pv_data_size()
+
+        """
+        return self.dataset.get_pv_data_size()
+
+    def preprocess_instance(self):
+        """
+        Merge pv instance and convey it from input_channel to input_pv_channel. 
+        It will be effective when enable_pv_merge_ is True.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle
+              dataset = paddle.distributed.fleet.BoxPSDataset()
+              filelist = ["a.txt", "b.txt"]
+              dataset.set_filelist(filelist)
+              dataset.load_into_memory()
+              dataset.preprocess_instance()
+
+        """
+        self.dataset.preprocess_instance()
+
+    def postprocess_instance(self):
+        """
+        Divide pv instance and convey it to input_channel.
+
+        Examples:
+            .. code-block:: python
+
+              import paddle
+              dataset = paddle.distributed.fleet.BoxPSDataset()
+              filelist = ["a.txt", "b.txt"]
+              dataset.set_filelist(filelist)
+              dataset.load_into_memory()
+              dataset.preprocess_instance()
+              exe.train_from_dataset(dataset)
+              dataset.postprocess_instance()
+
+        """
+        self.dataset.postprocess_instance()
diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
index f2bb567b95b..533222531f9 100644
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -1726,13 +1726,13 @@ class DatasetLoader(DataLoaderBase):
             logging.warn('thread_num {} which is set in Dataset is ignored'.
                          format(dataset.thread_num))
 
-        dataset.set_thread(thread_num)
+        dataset._set_thread(thread_num)
 
         if isinstance(dataset, paddle.distributed.fleet.dataset.
                       InMemoryDataset) and dataset.queue_num > thread_num:
             logging.warn("queue_num {} which is set in Dataset is ignored".
                          format(dataset.queue_num))
-            dataset.set_queue_num(thread_num)
+            dataset._set_queue_num(thread_num)
 
         self._dataset = dataset
         use_slots = [
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
index dc39472d7ae..1b0ce0c03e7 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
@@ -208,14 +208,16 @@ class TestDistCTR2x2(FleetDistRunnerBase):
         filelist = train_file_list
 
         # config dataset
-        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset()
-        dataset.set_batch_size(batch_size)
-        dataset.set_use_var(self.feeds)
+        dataset = paddle.distributed.QueueDataset()
         pipe_command = 'python ctr_dataset_reader.py'
-        dataset.set_pipe_command(pipe_command)
+
+        dataset.init(
+            batch_size=batch_size,
+            use_var=self.feeds,
+            pipe_command=pipe_command,
+            thread_num=thread_num)
 
         dataset.set_filelist(filelist)
-        dataset.set_thread(thread_num)
 
         for epoch_id in range(1):
             pass_start = time.time()
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
index 03d0fa447da..0e3c8099277 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
@@ -114,14 +114,14 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2):
             filelist.append(train_file_path)
 
         # config dataset
-        dataset = paddle.fleet.DatasetFactory().create_dataset()
-        dataset.set_batch_size(batch_size)
-        dataset.set_use_var(self.feeds)
+        dataset = paddle.distributed.QueueDataset()
+        dataset._set_batch_size(batch_size)
+        dataset._set_use_var(self.feeds)
         pipe_command = 'python ctr_dataset_reader.py'
-        dataset.set_pipe_command(pipe_command)
+        dataset._set_pipe_command(pipe_command)
 
         dataset.set_filelist(filelist)
-        dataset.set_thread(thread_num)
+        dataset._set_thread(thread_num)
 
         for epoch_id in range(1):
             pass_start = time.time()
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
index 7a4e7534f07..a5633bb0450 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
@@ -183,14 +183,14 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
         print("filelist: {}".format(filelist))
 
         # config dataset
-        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset()
-        dataset.set_batch_size(batch_size)
-        dataset.set_use_var(self.feeds)
+        dataset = paddle.distributed.QueueDataset()
+        dataset._set_batch_size(batch_size)
+        dataset._set_use_var(self.feeds)
         pipe_command = 'python ctr_dataset_reader.py'
-        dataset.set_pipe_command(pipe_command)
+        dataset._set_pipe_command(pipe_command)
 
         dataset.set_filelist(filelist)
-        dataset.set_thread(thread_num)
+        dataset._set_thread(thread_num)
 
         for epoch_id in range(1):
             pass_start = time.time()
diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py
index 582bb3dcc68..208956b825e 100644
--- a/python/paddle/fluid/tests/unittests/test_dataset.py
+++ b/python/paddle/fluid/tests/unittests/test_dataset.py
@@ -38,26 +38,22 @@ class TestDataset(unittest.TestCase):
     def test_dataset_create(self):
         """ Testcase for dataset create. """
         try:
-            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-                "InMemoryDataset")
+            dataset = paddle.distributed.InMemoryDataset()
         except:
             self.assertTrue(False)
 
         try:
-            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-                "QueueDataset")
+            dataset = paddle.distributed.QueueDataset()
         except:
             self.assertTrue(False)
 
         try:
-            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-                "FileInstantDataset")
+            dataset = paddle.distributed.fleet.dataset.FileInstantDataset()
         except:
             self.assertTrue(False)
 
         try:
-            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-                "MyOwnDataset")
+            dataset = paddle.distributed.fleet.dataset.MyOwnDataset()
             self.assertTrue(False)
         except:
             self.assertTrue(True)
@@ -95,18 +91,18 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[1], dtype="int64", lod_level=1)
             slots_vars.append(var)
 
-        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-            "InMemoryDataset")
-        dataset.set_batch_size(32)
-        dataset.set_thread(3)
+        dataset = paddle.distributed.InMemoryDataset()
+        dataset.init(
+            batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars)
+        dataset.update_settings(pipe_command="cat1")
+        dataset._init_distributed_settings(
+            parse_ins_id=True,
+            parse_content=True,
+            fea_eval=True,
+            candidate_size=10000)
         dataset.set_filelist(
             ["test_run_with_dump_a.txt", "test_run_with_dump_b.txt"])
-        dataset.set_parse_ins_id(True)
-        dataset.set_parse_content(True)
-        dataset.set_pipe_command("cat")
-        dataset.set_use_var(slots_vars)
         dataset.load_into_memory()
-        dataset.set_fea_eval(10000, True)
         dataset.local_shuffle()
 
         exe = fluid.Executor(fluid.CPUPlace())
@@ -176,14 +172,14 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[1], dtype="int64", lod_level=1)
             slots_vars.append(var)
 
-        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-            "InMemoryDataset")
-        dataset.set_batch_size(32)
-        dataset.set_thread(3)
+        dataset = paddle.distributed.InMemoryDataset()
+        dataset.init(
+            batch_size=32,
+            thread_num=3,
+            pipe_command="cat",
+            download_cmd="cat",
+            use_var=slots_vars)
         dataset.set_filelist([filename1, filename2])
-        dataset.set_pipe_command("cat")
-        dataset.set_download_cmd("cat")
-        dataset.set_use_var(slots_vars)
         dataset.load_into_memory()
         exe = fluid.Executor(fluid.CPUPlace())
         exe.run(fluid.default_startup_program())
@@ -228,22 +224,19 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[1], dtype="int64", lod_level=1)
             slots_vars.append(var)
 
-        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-            "InMemoryDataset")
-        dataset.set_batch_size(32)
-        dataset.set_thread(3)
+        dataset = paddle.distributed.InMemoryDataset()
+        dataset.init(
+            batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars)
+        dataset._init_distributed_settings(fea_eval=True, candidate_size=1)
         dataset.set_filelist([
             "test_in_memory_dataset_run_a.txt",
             "test_in_memory_dataset_run_b.txt"
         ])
-        dataset.set_pipe_command("cat")
-        dataset.set_use_var(slots_vars)
         dataset.load_into_memory()
-        dataset.set_fea_eval(1, True)
         dataset.slots_shuffle(["slot1"])
         dataset.local_shuffle()
-        dataset.set_generate_unique_feasigns(True, 15)
-        dataset.generate_local_tables_unlock(0, 11, 1, 25, 15)
+        dataset._set_generate_unique_feasigns(True, 15)
+        dataset._generate_local_tables_unlock(0, 11, 1, 25, 15)
         exe = fluid.Executor(fluid.CPUPlace())
         exe.run(fluid.default_startup_program())
         if self.use_data_loader:
@@ -300,17 +293,14 @@ class TestDataset(unittest.TestCase):
                     name=slot, shape=[1], dtype="float32", lod_level=1)
                 slots_vars.append(var)
 
-        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-            "InMemoryDataset")
-        dataset.set_batch_size(32)
-        dataset.set_thread(1)
-        dataset.set_parse_ins_id(True)
+        dataset = paddle.distributed.InMemoryDataset()
+        dataset.init(
+            batch_size=32, thread_num=1, pipe_command="cat", use_var=slots_vars)
+        dataset._init_distributed_settings(parse_ins_id=True)
         dataset.set_filelist([
             "test_in_memory_dataset_masterpatch_a.txt",
             "test_in_memory_dataset_masterpatch_b.txt"
         ])
-        dataset.set_pipe_command("cat")
-        dataset.set_use_var(slots_vars)
         dataset.load_into_memory()
         dataset.local_shuffle()
 
@@ -325,7 +315,8 @@ class TestDataset(unittest.TestCase):
             except Exception as e:
                 self.assertTrue(False)
 
-        dataset.set_merge_by_lineid(2)
+        #dataset._set_merge_by_lineid(2)
+        dataset.update_settings(merge_size=2)
         dataset.dataset.merge_by_lineid()
 
         os.remove("./test_in_memory_dataset_masterpatch_a.txt")
@@ -367,17 +358,14 @@ class TestDataset(unittest.TestCase):
                 name="slot4", shape=[1], dtype="float32", lod_level=0)
             slots_vars = [var1, var2, var3, var4]
 
-        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-            "InMemoryDataset")
-        dataset.set_batch_size(32)
-        dataset.set_thread(1)
-        dataset.set_parse_ins_id(True)
+        dataset = paddle.distributed.InMemoryDataset()
+        dataset.init(
+            batch_size=32, thread_num=1, pipe_command="cat", use_var=slots_vars)
+        dataset._init_distributed_settings(parse_ins_id=True)
         dataset.set_filelist([
             "test_in_memory_dataset_masterpatch1_a.txt",
             "test_in_memory_dataset_masterpatch1_b.txt"
         ])
-        dataset.set_pipe_command("cat")
-        dataset.set_use_var(slots_vars)
         dataset.load_into_memory()
         dataset.local_shuffle()
 
@@ -392,7 +380,7 @@ class TestDataset(unittest.TestCase):
             except Exception as e:
                 self.assertTrue(False)
 
-        dataset.set_merge_by_lineid(2)
+        dataset._set_merge_by_lineid(2)
         dataset.dataset.merge_by_lineid()
 
         os.remove("./test_in_memory_dataset_masterpatch1_a.txt")
@@ -423,16 +411,13 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[1], dtype="float32", lod_level=1)
             slots_vars.append(var)
 
-        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-            "InMemoryDataset")
-        dataset.set_batch_size(32)
-        dataset.set_thread(3)
+        dataset = paddle.distributed.InMemoryDataset()
+        dataset.init(
+            batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars)
         dataset.set_filelist([
             "test_in_memory_dataset_run_a.txt",
             "test_in_memory_dataset_run_b.txt"
         ])
-        dataset.set_pipe_command("cat")
-        dataset.set_use_var(slots_vars)
         dataset.load_into_memory()
         dataset.local_shuffle()
 
@@ -473,9 +458,9 @@ class TestDataset(unittest.TestCase):
                 except Exception as e:
                     self.assertTrue(False)
 
-        dataset.set_merge_by_lineid(2)
-        dataset.set_parse_ins_id(False)
-        dataset.set_fleet_send_sleep_seconds(2)
+        dataset._set_merge_by_lineid(2)
+        dataset._set_parse_ins_id(False)
+        dataset._set_fleet_send_sleep_seconds(2)
         dataset.preload_into_memory()
         dataset.wait_preload_done()
         dataset.release_memory()
@@ -483,10 +468,25 @@ class TestDataset(unittest.TestCase):
         dataset.wait_preload_done()
         dataset.dataset.merge_by_lineid()
         dataset.release_memory()
-        dataset.set_merge_by_lineid(30)
-        dataset.set_parse_ins_id(False)
+        dataset._set_merge_by_lineid(30)
+        dataset._set_parse_ins_id(False)
         dataset.load_into_memory()
         dataset.dataset.merge_by_lineid()
+        dataset.update_settings(
+            batch_size=1,
+            thread_num=2,
+            input_type=1,
+            pipe_command="cat",
+            use_var=[],
+            fs_name="",
+            fs_ugi="",
+            download_cmd="cat",
+            merge_size=-1,
+            parse_ins_id=False,
+            parse_content=False,
+            fleet_send_batch_size=2,
+            fleet_send_sleep_seconds=2,
+            fea_eval=True)
         fleet_ptr = fluid.core.Fleet()
         fleet_ptr.set_client2client_config(1, 1, 1)
         fleet_ptr.get_cache_threshold(0)
@@ -517,14 +517,11 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[1], dtype="int64", lod_level=1)
             slots_vars.append(var)
 
-        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-            "QueueDataset")
-        dataset.set_batch_size(32)
-        dataset.set_thread(3)
+        dataset = paddle.distributed.QueueDataset()
+        dataset.init(
+            batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars)
         dataset.set_filelist(
             ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
-        dataset.set_pipe_command("cat")
-        dataset.set_use_var(slots_vars)
 
         exe = fluid.Executor(fluid.CPUPlace())
         exe.run(fluid.default_startup_program())
@@ -543,12 +540,9 @@ class TestDataset(unittest.TestCase):
                 except Exception as e:
                     self.assertTrue(False)
 
-        dataset2 = paddle.distributed.fleet.DatasetFactory().create_dataset(
-            "QueueDataset")
-        dataset2.set_use_var(slots_vars)
-        dataset2.set_batch_size(32)
-        dataset2.set_thread(3)
-        dataset2.set_pipe_command("cat")
+        dataset2 = paddle.distributed.QueueDataset()
+        dataset2.init(
+            batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars)
         dataset.set_filelist([])
         try:
             exe.train_from_dataset(fluid.default_main_program(), dataset2)
@@ -585,14 +579,11 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[1], dtype="float32", lod_level=1)
             slots_vars.append(var)
 
-        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-            "QueueDataset")
-        dataset.set_batch_size(32)
-        dataset.set_thread(3)
+        dataset = paddle.distributed.QueueDataset()
+        dataset.init(
+            batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars)
         dataset.set_filelist(
             ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
-        dataset.set_pipe_command("cat")
-        dataset.set_use_var(slots_vars)
 
         exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda(
         ) else fluid.CUDAPlace(0))
@@ -641,15 +632,15 @@ class TestDataset(unittest.TestCase):
                 name=slot, shape=[None, 1], dtype="int64", lod_level=1)
             slots_vars.append(var)
 
-        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-            "InMemoryDataset")
-        dataset.set_input_type(1)
-        dataset.set_batch_size(1)
-        dataset.set_thread(2)
+        dataset = paddle.distributed.InMemoryDataset()
+        dataset.init(
+            batch_size=1,
+            thread_num=2,
+            input_type=1,
+            pipe_command="cat",
+            use_var=slots_vars)
         dataset.set_filelist(
             ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
-        dataset.set_pipe_command("cat")
-        dataset.set_use_var(slots_vars)
         dataset.load_into_memory()
 
         exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda(
@@ -721,13 +712,10 @@ class TestDatasetWithFetchHandler(unittest.TestCase):
             inputs(list): inputs of get_dataset
             files(list): files of  get_dataset
         """
-        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-            "QueueDataset")
-        dataset.set_batch_size(32)
-        dataset.set_thread(3)
+        dataset = paddle.distributed.QueueDataset()
+        dataset.init(
+            batch_size=32, thread_num=3, pipe_command="cat", use_var=inputs)
         dataset.set_filelist(files)
-        dataset.set_pipe_command("cat")
-        dataset.set_use_var(inputs)
         return dataset
 
     def setUp(self):
@@ -879,16 +867,17 @@ class TestDataset2(unittest.TestCase):
             except ImportError as e:
                 print("warning: no mpi4py")
             exe.run(startup_program)
-            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-                "InMemoryDataset")
-            dataset.set_batch_size(32)
-            dataset.set_thread(3)
+            dataset = paddle.distributed.InMemoryDataset()
+
+            dataset.init(
+                batch_size=32,
+                thread_num=3,
+                pipe_command="cat",
+                use_var=slots_vars)
             dataset.set_filelist([
                 "test_in_memory_dataset2_run_a.txt",
                 "test_in_memory_dataset2_run_b.txt"
             ])
-            dataset.set_pipe_command("cat")
-            dataset.set_use_var(slots_vars)
             dataset.load_into_memory()
             fleet._opt_info = None
             fleet._fleet_ptr = None
@@ -949,16 +938,16 @@ class TestDataset2(unittest.TestCase):
             except ImportError as e:
                 print("warning: no mpi4py")
             exe.run(startup_program)
-            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-                "InMemoryDataset")
-            dataset.set_batch_size(32)
-            dataset.set_thread(3)
+            dataset = paddle.distributed.InMemoryDataset()
+            dataset.init(
+                batch_size=32,
+                thread_num=3,
+                pipe_command="cat",
+                use_var=slots_vars)
             dataset.set_filelist([
                 "test_in_memory_dataset2_run2_a.txt",
                 "test_in_memory_dataset2_run2_b.txt"
             ])
-            dataset.set_pipe_command("cat")
-            dataset.set_use_var(slots_vars)
             dataset.load_into_memory()
             try:
                 dataset.global_shuffle(fleet)
@@ -966,14 +955,11 @@ class TestDataset2(unittest.TestCase):
                 print("warning: catch expected error")
             fleet._opt_info = None
             fleet._fleet_ptr = None
-            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-                "InMemoryDataset")
-            dataset.set_rank_offset("")
-            dataset.set_pv_batch_size(1)
-            dataset.set_hdfs_config("", "")
+            dataset = paddle.distributed.InMemoryDataset()
+            dataset.init(fs_name="", fs_ugi="")
             d = paddle.distributed.fleet.DatasetBase()
             try:
-                dataset.set_feed_type("MultiSlotInMemoryDataFeed")
+                dataset._set_feed_type("MultiSlotInMemoryDataFeed")
             except:
                 print("warning: catch expected error")
             dataset.thread_num = 0
@@ -981,9 +967,6 @@ class TestDataset2(unittest.TestCase):
                 dataset._prepare_to_run()
             except:
                 print("warning: catch expected error")
-            dataset.set_parse_logkey(True)
-            dataset.set_merge_by_sid(True)
-            dataset.set_enable_pv_merge(True)
             try:
                 dataset.preprocess_instance()
             except:
@@ -996,16 +979,15 @@ class TestDataset2(unittest.TestCase):
                 dataset.postprocess_instance()
             except:
                 print("warning: catch expected error")
-            dataset.set_fleet_send_batch_size(1024)
+            dataset._set_fleet_send_batch_size(1024)
             try:
                 dataset.global_shuffle()
             except:
                 print("warning: catch expected error")
-            dataset.get_pv_data_size()
+            #dataset.get_pv_data_size()
             dataset.get_memory_data_size()
             dataset.get_shuffle_data_size()
-            dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-                "QueueDataset")
+            dataset = paddle.distributed.QueueDataset()
             try:
                 dataset.local_shuffle()
             except:
@@ -1027,6 +1009,120 @@ class TestDataset2(unittest.TestCase):
         os.remove("./test_in_memory_dataset2_run2_a.txt")
         os.remove("./test_in_memory_dataset2_run2_b.txt")
 
+    def test_bosps_dataset_fleet2(self):
+        """
+        Testcase for InMemoryDataset from create to run.
+        """
+        with open("test_in_memory_dataset2_run2_a.txt", "w") as f:
+            data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
+            data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
+            data += "1 3 2 3 5 4 7 7 7 7 1 3\n"
+            f.write(data)
+        with open("test_in_memory_dataset2_run2_b.txt", "w") as f:
+            data = "1 4 2 3 3 4 5 5 5 5 1 4\n"
+            data += "1 5 2 3 4 4 6 6 6 6 1 5\n"
+            data += "1 6 2 3 5 4 7 7 7 7 1 6\n"
+            data += "1 7 2 3 6 4 8 8 8 8 1 7\n"
+            f.write(data)
+
+        train_program = fluid.Program()
+        startup_program = fluid.Program()
+        scope = fluid.Scope()
+        from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+        with fluid.program_guard(train_program, startup_program):
+            slots = ["slot1_ff", "slot2_ff", "slot3_ff", "slot4_ff"]
+            slots_vars = []
+            for slot in slots:
+                var = fluid.layers.data(\
+                    name=slot, shape=[1], dtype="float32", lod_level=1)
+                slots_vars.append(var)
+            fake_cost = \
+                fluid.layers.elementwise_sub(slots_vars[0], slots_vars[-1])
+            fake_cost = fluid.layers.mean(fake_cost)
+        with fluid.scope_guard(scope):
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            try:
+                fleet.init()
+            except ImportError as e:
+                print("warning: no mpi4py")
+            adam = fluid.optimizer.Adam(learning_rate=0.000005)
+            try:
+                adam = fleet.distributed_optimizer(
+                    adam,
+                    strategy={
+                        "fs_uri": "fs_uri_xxx",
+                        "fs_user": "fs_user_xxx",
+                        "fs_passwd": "fs_passwd_xxx",
+                        "fs_hadoop_bin": "fs_hadoop_bin_xxx"
+                    })
+                adam.minimize([fake_cost], [scope])
+            except AttributeError as e:
+                print("warning: no mpi")
+            except ImportError as e:
+                print("warning: no mpi4py")
+            exe.run(startup_program)
+            dataset = paddle.distributed.fleet.BoxPSDataset()
+            dataset.init(
+                batch_size=32,
+                thread_num=3,
+                pipe_command="cat",
+                use_var=slots_vars)
+            dataset.set_filelist([
+                "test_in_memory_dataset2_run2_a.txt",
+                "test_in_memory_dataset2_run2_b.txt"
+            ])
+            dataset.load_into_memory()
+            try:
+                dataset.global_shuffle(fleet)
+            except:
+                print("warning: catch expected error")
+            fleet._opt_info = None
+            fleet._fleet_ptr = None
+            dataset = paddle.distributed.fleet.BoxPSDataset()
+            dataset.init(
+                rank_offset="",
+                pv_batch_size=1,
+                fs_name="",
+                fs_ugi="",
+                data_feed_type="MultiSlotInMemoryDataFeed",
+                parse_logkey=True,
+                merge_by_sid=True,
+                enable_pv_merge=True)
+            d = paddle.distributed.fleet.DatasetBase()
+            try:
+                dataset._set_feed_type("MultiSlotInMemoryDataFeed")
+            except:
+                print("warning: catch expected error")
+            dataset.thread_num = 0
+            try:
+                dataset._prepare_to_run()
+            except:
+                print("warning: catch expected error")
+            dataset._set_parse_logkey(True)
+            dataset._set_merge_by_sid(True)
+            dataset._set_enable_pv_merge(True)
+            try:
+                dataset.preprocess_instance()
+            except:
+                print("warning: catch expected error")
+            try:
+                dataset.set_current_phase(1)
+            except:
+                print("warning: catch expected error")
+            try:
+                dataset.postprocess_instance()
+            except:
+                print("warning: catch expected error")
+            dataset._set_fleet_send_batch_size(1024)
+            try:
+                dataset.global_shuffle()
+            except:
+                print("warning: catch expected error")
+            #dataset.get_pv_data_size()
+            dataset.get_memory_data_size()
+            dataset.get_shuffle_data_size()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py b/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py
index c13c33f209f..9195ac277b9 100644
--- a/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py
+++ b/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py
@@ -97,9 +97,11 @@ class DatasetLoaderTestBase(unittest.TestCase):
 
     def check_batch_number(self, place, randomize_batch_num=False):
         main_prog, startup_prog, feeds = self.build_network()
-        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-            self.dataset_name)
-        dataset.set_batch_size(BATCH_SIZE)
+        if self.dataset_name == "QueueDataset":
+            dataset = paddle.distributed.QueueDataset()
+        else:
+            dataset = paddle.distributed.InMemoryDataset()
+        dataset._set_batch_size(BATCH_SIZE)
 
         if isinstance(place, fluid.CPUPlace):
             file_num = 10
@@ -128,8 +130,8 @@ class DatasetLoaderTestBase(unittest.TestCase):
                 fake_reader(batch_num=BATCH_NUM + random_delta_batch_size[i]))
 
         dataset.set_filelist(filelist)
-        dataset.set_use_var(feeds)
-        dataset.set_pipe_command("cat")
+        dataset._set_use_var(feeds)
+        dataset._set_pipe_command("cat")
         if self.dataset_name == 'InMemoryDataset':
             dataset.load_into_memory()
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
index eb5d9eb6660..a831f6e838e 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
@@ -163,10 +163,9 @@ class TestCloudRoleMaker2(unittest.TestCase):
             data = "1 1 1 1\n"
             f.write(data)
 
-        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-            "InMemoryDataset")
+        dataset = paddle.distributed.InMemoryDataset()
         dataset.set_filelist(["test_fleet_gloo_role_maker_1.txt"])
-        dataset.set_use_var([show, label])
+        dataset._set_use_var([show, label])
         dataset.load_into_memory()
         dataset.get_memory_data_size(fleet)
         dataset.get_shuffle_data_size(fleet)
diff --git a/python/paddle/fluid/tests/unittests/test_monitor.py b/python/paddle/fluid/tests/unittests/test_monitor.py
index f6207edb41c..cf273876b1f 100644
--- a/python/paddle/fluid/tests/unittests/test_monitor.py
+++ b/python/paddle/fluid/tests/unittests/test_monitor.py
@@ -52,18 +52,17 @@ class TestDatasetWithStat(unittest.TestCase):
                 name=slot, shape=[1], dtype="int64", lod_level=1)
             slots_vars.append(var)
 
-        dataset = paddle.distributed.fleet.DatasetFactory().create_dataset(
-            "InMemoryDataset")
-        dataset.set_batch_size(32)
-        dataset.set_thread(3)
+        dataset = paddle.distributed.InMemoryDataset()
+        dataset._set_batch_size(32)
+        dataset._set_thread(3)
         dataset.set_filelist([
             "test_in_memory_dataset_run_a.txt",
             "test_in_memory_dataset_run_b.txt"
         ])
-        dataset.set_pipe_command("cat")
-        dataset.set_use_var(slots_vars)
+        dataset._set_pipe_command("cat")
+        dataset._set_use_var(slots_vars)
         dataset.load_into_memory()
-        dataset.set_fea_eval(1, True)
+        dataset._set_fea_eval(1, True)
         dataset.slots_shuffle(["slot1"])
 
         exe = fluid.Executor(fluid.CPUPlace())
-- 
GitLab


From c23f09fea6722d250c4236f8bc989bc9e6d5a9e7 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 16 Sep 2020 14:54:45 +0800
Subject: [PATCH 098/261] Support load state_dict from save_params/persistables
 (#27298)

* support load state_dict from save_params/persistables

* remove failed unittest

* add load eof check & unittest

* remove eof check
---
 python/paddle/fluid/dygraph/checkpoint.py     | 148 ++++++++++++------
 .../test_load_state_dict_from_old_format.py   |  28 ++--
 2 files changed, 115 insertions(+), 61 deletions(-)

diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py
index 9876fc620b8..93cb0bafc84 100644
--- a/python/paddle/fluid/dygraph/checkpoint.py
+++ b/python/paddle/fluid/dygraph/checkpoint.py
@@ -195,58 +195,11 @@ def load_dygraph(model_path, config=None):
     params_file_path = model_prefix + ".pdparams"
     opti_file_path = model_prefix + ".pdopt"
 
-    # deal with argument `configs`
-    configs = config
-    if configs is None:
-        configs = SaveLoadConfig()
-
-    if not os.path.exists(params_file_path) and not os.path.exists(
-            opti_file_path):
-        # Load state dict by `jit.save/io.save_inference_model` save format
-        # NOTE(chenweihang): [ Compatibility of save_inference_model save format ]
-        # The model saved by `save_inference_model` does not completely correspond to 
-        # the information required by the `state_dict` under the dygraph. 
-        # `save_inference_model` not save structured name, we need to remind 
-        # the user to configure the `use_structured_name` argument when `set_state_dict`
-        # NOTE(chenweihang): `jit.save` doesn't save optimizer state 
-
-        # 1. check model path
-        if not os.path.isdir(model_prefix):
-            raise ValueError("Model saved directory '%s' is not exists." %
-                             model_prefix)
+    # deal with argument `config`
+    if config is None:
+        config = SaveLoadConfig()
 
-        # 2. load program desc & construct _ProgramHolder
-        programs = _construct_program_holders(model_path,
-                                              configs.model_filename)
-
-        # 3. load layer parameters & buffers
-        # NOTE: using fluid.dygraph.guard() here will cause import error in py2
-        with guard():
-            persistable_var_dict = _construct_params_and_buffers(
-                model_prefix,
-                programs,
-                configs.separate_params,
-                configs.params_filename,
-                append_suffix=False)
-
-            # 4. construct state_dict
-            para_dict = dict()
-            for var_name in persistable_var_dict:
-                para_dict[var_name] = persistable_var_dict[var_name].numpy()
-
-            # if __variables.info__ exists, we can recover structured_name
-            var_info_path = os.path.join(model_prefix, EXTRA_VAR_INFO_FILENAME)
-            if os.path.exists(var_info_path):
-                with open(var_info_path, 'rb') as f:
-                    extra_var_info = pickle.load(f)
-                structured_para_dict = dict()
-                for var_name in para_dict:
-                    structured_name = extra_var_info[var_name].get(
-                        'structured_name', None)
-                    assert structured_name is not None, "Cannot find saved variable (%s)'s structured name in saved model." % var_name
-                    structured_para_dict[structured_name] = para_dict[var_name]
-                para_dict = structured_para_dict
-    else:
+    if os.path.exists(params_file_path) or os.path.exists(opti_file_path):
         # Load state dict by `save_dygraph` save format
         para_dict = {}
         if os.path.exists(params_file_path):
@@ -254,12 +207,103 @@ def load_dygraph(model_path, config=None):
                 para_dict = pickle.load(f) if six.PY2 else pickle.load(
                     f, encoding='latin1')
 
-        if not configs.keep_name_table and "StructuredToParameterName@@" in para_dict:
+        if not config.keep_name_table and "StructuredToParameterName@@" in para_dict:
             del para_dict["StructuredToParameterName@@"]
 
         if os.path.exists(opti_file_path):
             with open(opti_file_path, 'rb') as f:
                 opti_dict = pickle.load(f) if six.PY2 else pickle.load(
                     f, encoding='latin1')
+    else:
+        # check model path
+        if not os.path.isdir(model_prefix):
+            raise ValueError("Model saved directory '%s' is not exists." %
+                             model_prefix)
+
+        # check whether model file exists
+        if config.model_filename is None:
+            model_filename = '__model__'
+        else:
+            model_filename = config.model_filename
+        model_file_path = os.path.join(model_path, model_filename)
+
+        if os.path.exists(model_file_path):
+            # Load state dict by `jit.save/io.save_inference_model` save format
+            # NOTE(chenweihang): [ Compatibility of save_inference_model save format ]
+            # The model saved by `save_inference_model` does not completely correspond to 
+            # the information required by the `state_dict` under the dygraph. 
+            # `save_inference_model` not save structured name, we need to remind 
+            # the user to configure the `use_structured_name` argument when `set_state_dict`
+            # NOTE(chenweihang): `jit.save` doesn't save optimizer state 
+
+            # 1. load program desc & construct _ProgramHolder
+            programs = _construct_program_holders(model_path,
+                                                  config.model_filename)
+
+            # 2. load layer parameters & buffers
+            # NOTE: using fluid.dygraph.guard() here will cause import error in py2
+            with guard():
+                persistable_var_dict = _construct_params_and_buffers(
+                    model_prefix,
+                    programs,
+                    config.separate_params,
+                    config.params_filename,
+                    append_suffix=False)
+
+                # 3. construct state_dict
+                para_dict = dict()
+                for var_name in persistable_var_dict:
+                    para_dict[var_name] = persistable_var_dict[var_name].numpy()
+
+                # if __variables.info__ exists, we can recover structured_name
+                var_info_path = os.path.join(model_prefix,
+                                             EXTRA_VAR_INFO_FILENAME)
+                if os.path.exists(var_info_path):
+                    with open(var_info_path, 'rb') as f:
+                        extra_var_info = pickle.load(f)
+                    structured_para_dict = dict()
+                    for var_name in para_dict:
+                        structured_name = extra_var_info[var_name].get(
+                            'structured_name', None)
+                        assert structured_name is not None, "Cannot find saved variable (%s)'s structured name in saved model." % var_name
+                        structured_para_dict[structured_name] = para_dict[
+                            var_name]
+                    para_dict = structured_para_dict
+        else:
+            # load state dict by `io.save_params/persistables` save format
+            # TODO(chenweihang): [ Now only supports loading parameters seperately ]
+            # If users save all parameters as one file, the [ variable.name -> variable ]
+            # mapping info will lost, so users need to give variable list, but users build 
+            # variable list in dygraph mode is difficult, we recommend users to use
+            # paddle.io.load_program_state in this case
+
+            # Try to load all the files in the directory in VarBase format, 
+            # the file name is used as the name of VarBase
+            load_var_list = []
+
+            # 1. load file names
+            var_name_list = []
+            for root, _, files in os.walk(model_path):
+                for filename in files:
+                    file_path = os.path.join(root, filename)
+                    tmp_var_name = os.path.relpath(file_path, model_path)
+                    var_name = tmp_var_name.replace("\\", "/")
+                    var_name_list.append(var_name)
+
+            # 2. create and load VarBase
+            with guard():
+                for name in var_name_list:
+                    new_var = _varbase_creator(name=name, persistable=True)
+                    _dygraph_tracer().trace_op(
+                        type='load',
+                        inputs={},
+                        outputs={'Out': new_var},
+                        attrs={'file_path': os.path.join(model_path, name)})
+                    load_var_list.append(new_var)
+
+            # 3. construct state_dict
+            para_dict = dict()
+            for var in load_var_list:
+                para_dict[var.name] = var.numpy()
 
     return para_dict, opti_dict
diff --git a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py
index ed1939dbe27..a1a9b3f444f 100644
--- a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py
+++ b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py
@@ -64,7 +64,7 @@ class TestLoadStateDictFromSaveInferenceModel(unittest.TestCase):
         self.batch_size = 128
         self.batch_num = 10
 
-    def train_and_save_model(self):
+    def train_and_save_model(self, only_params=False):
         with new_program_scope():
             startup_program = fluid.default_startup_program()
             main_program = fluid.default_main_program()
@@ -102,11 +102,15 @@ class TestLoadStateDictFromSaveInferenceModel(unittest.TestCase):
                 static_param_dict[param.name] = fluid.executor._fetch_var(
                     param.name)
 
-            fluid.io.save_inference_model(
-                self.save_dirname, ["img"], [prediction],
-                exe,
-                model_filename=self.model_filename,
-                params_filename=self.params_filename)
+            if only_params:
+                fluid.io.save_params(
+                    exe, self.save_dirname, filename=self.params_filename)
+            else:
+                fluid.io.save_inference_model(
+                    self.save_dirname, ["img"], [prediction],
+                    exe,
+                    model_filename=self.model_filename,
+                    params_filename=self.params_filename)
 
         return static_param_dict
 
@@ -120,9 +124,7 @@ class TestLoadStateDictFromSaveInferenceModel(unittest.TestCase):
         self.params_filename = None
         orig_param_dict = self.train_and_save_model()
 
-        configs = paddle.SaveLoadConfig()
-        configs.separate_params = True
-        load_param_dict, _ = paddle.load(self.save_dirname, configs)
+        load_param_dict, _ = paddle.load(self.save_dirname)
         self.check_load_state_dict(orig_param_dict, load_param_dict)
 
     def test_load_with_model_filename(self):
@@ -160,6 +162,14 @@ class TestLoadStateDictFromSaveInferenceModel(unittest.TestCase):
         load_param_dict, _ = paddle.load(self.save_dirname, configs)
         self.check_load_state_dict(orig_param_dict, load_param_dict)
 
+    def test_load_state_dict_from_save_params(self):
+        self.save_dirname = "static_mnist.load_state_dict.save_params"
+        self.params_filename = None
+        orig_param_dict = self.train_and_save_model(True)
+
+        load_param_dict, _ = paddle.load(self.save_dirname)
+        self.check_load_state_dict(orig_param_dict, load_param_dict)
+
 
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab


From 4582f697b61477a81772f031fa6b5ef0480b9a3c Mon Sep 17 00:00:00 2001
From: Jacek Czaja <jacek.czaja@intel.com>
Date: Wed, 16 Sep 2020 09:09:08 +0200
Subject: [PATCH 099/261] - Fix to concat oneDNN overwritting data (#27273)

test=develop
---
 paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
index 3cafb0e9fc6..b2815cbdc65 100644
--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -86,8 +86,10 @@ class ConcatPrimitiveFactory {
   concat CreateConcatPrimitive(const concat::primitive_desc& concat_pd,
                                Tensor* output, platform::CPUPlace place,
                                const mkldnn::engine& mkldnn_engine) {
-    dst_mem = mkldnn::memory(concat_pd.dst_desc(), mkldnn_engine,
-                             output->mutable_data<T>(place));
+    dst_mem = mkldnn::memory(
+        concat_pd.dst_desc(), mkldnn_engine,
+        output->mutable_data<T>(place, concat_pd.dst_desc().get_size()));
+
     return concat(concat_pd);
   }
 
@@ -193,7 +195,9 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         prim_creator.SetSrcDataHandleByIndex(
             *srcs, i, to_void_cast<T>(multi_input[i]->data<T>()));
       }
-      prim_creator.SetDstDataHandle(*dst_mem, output->mutable_data<T>(place));
+      prim_creator.SetDstDataHandle(
+          *dst_mem,
+          output->mutable_data<T>(place, concat_pd->dst_desc().get_size()));
     }
 
     mkldnn::stream astream(mkldnn_engine);
-- 
GitLab


From f992f8d7ef58cb632985832816fad8ff5db7947a Mon Sep 17 00:00:00 2001
From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Date: Wed, 16 Sep 2020 15:33:09 +0800
Subject: [PATCH 100/261] fix judge cache file of inference api more accurate
 (#27175)

fix judge cache file of inference api more accurate
---
 paddle/fluid/inference/tests/api/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index a1b43de4695..d7d4f7969fa 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -9,6 +9,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
 endif()
 
 function(download_data install_dir data_file)
+    string(REGEX MATCH "[^/\\]+$" data_file ${data_file})
     if (NOT EXISTS ${install_dir}/${data_file})
         inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${data_file})
     endif()
-- 
GitLab


From 34091533c95291dd96459af65dacf04a9cce9fa2 Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Wed, 16 Sep 2020 16:19:17 +0800
Subject: [PATCH 101/261] Fix bug in continuous apply, test=develop (#27337)

---
 python/paddle/fluid/optimizer.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 192effd2e42..1e7915ed781 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -3570,8 +3570,10 @@ class ExponentialMovingAverage(object):
                 # bias correction
                 with layers.control_flow.Switch() as switch:
                     with switch.case(global_step > 0):
-                        layers.assign(output=ema, input=ema / (1.0 - decay_pow))
-                layers.assign(input=ema, output=param)
+                        layers.assign(
+                            output=param, input=ema / (1.0 - decay_pow))
+                    with switch.default():
+                        layers.assign(output=param, input=ema)
 
         self.restore_program = Program()
         block = self.restore_program.global_block()
-- 
GitLab


From e25bcc987b32881a713899213f7c26383acee34d Mon Sep 17 00:00:00 2001
From: zhangchunle <clzhang_cauc@163.com>
Date: Wed, 16 Sep 2020 17:06:11 +0800
Subject: [PATCH 102/261] add setup (#27346)

---
 setup.py | 577 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 577 insertions(+)
 create mode 100644 setup.py

diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000000..af558c2ef0b
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,577 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import subprocess
+import os
+import os.path
+import errno
+import re
+import shutil
+import sys
+import fnmatch
+import errno
+import platform
+
+from contextlib import contextmanager
+from setuptools import Command
+from setuptools import setup, Distribution, Extension
+from setuptools.command.install import install as InstallCommandBase
+
+
+class BinaryDistribution(Distribution):
+    def has_ext_modules(foo):
+        return True
+
+
+RC = 0
+
+ext_name = '.dll' if os.name == 'nt' else ('.dylib' if sys.platform == 'darwin'
+                                           else '.so')
+
+
+def git_commit():
+    try:
+        cmd = ['git', 'rev-parse', 'HEAD']
+        git_commit = subprocess.Popen(
+            cmd, stdout=subprocess.PIPE,
+            cwd="@PADDLE_SOURCE_DIR@").communicate()[0].strip()
+    except:
+        git_commit = 'Unknown'
+    git_commit = git_commit.decode()
+    return str(git_commit)
+
+
+def _get_version_detail(idx):
+    assert idx < 3, "vesion info consists of %(major)d.%(minor)d.%(patch)d, \
+        so detail index must less than 3"
+
+    if re.match('@TAG_VERSION_REGEX@', '@PADDLE_VERSION@'):
+        version_details = '@PADDLE_VERSION@'.split('.')
+
+        if len(version_details) >= 3:
+            return version_details[idx]
+
+    return 0
+
+
+def get_major():
+    return int(_get_version_detail(0))
+
+
+def get_minor():
+    return int(_get_version_detail(1))
+
+
+def get_patch():
+    return str(_get_version_detail(2))
+
+
+def is_taged():
+    try:
+        cmd = [
+            'git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null'
+        ]
+        git_tag = subprocess.Popen(
+            cmd, stdout=subprocess.PIPE,
+            cwd="@PADDLE_SOURCE_DIR@").communicate()[0].strip()
+        git_tag = git_tag.decode()
+    except:
+        return False
+
+    if str(git_tag).replace('v', '') == '@PADDLE_VERSION@':
+        return True
+    else:
+        return False
+
+
+def write_version_py(filename='paddle/version.py'):
+    cnt = '''# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
+#
+full_version    = '%(major)d.%(minor)d.%(patch)s'
+major           = '%(major)d'
+minor           = '%(minor)d'
+patch           = '%(patch)s'
+rc              = '%(rc)d'
+istaged         = %(istaged)s
+commit          = '%(commit)s'
+with_mkl        = '%(with_mkl)s'
+
+def show():
+    if istaged:
+        print('full_version:', full_version)
+        print('major:', major)
+        print('minor:', minor)
+        print('patch:', patch)
+        print('rc:', rc)
+    else:
+        print('commit:', commit)
+
+def mkl():
+    return with_mkl
+'''
+    commit = git_commit()
+    with open(filename, 'w') as f:
+        f.write(cnt % {
+            'major': get_major(),
+            'minor': get_minor(),
+            'patch': get_patch(),
+            'rc': RC,
+            'version': '${PADDLE_VERSION}',
+            'commit': commit,
+            'istaged': is_taged(),
+            'with_mkl': '@WITH_MKL@'
+        })
+
+
+write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version.py')
+
+
+def write_distributed_training_mode_py(
+        filename='paddle/fluid/incubate/fleet/parameter_server/version.py'):
+    cnt = '''from __future__ import print_function
+
+# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
+
+from paddle.fluid.incubate.fleet.base.mode import Mode
+
+BUILD_MODE=Mode.%(mode)s
+
+def is_transpiler():
+    return Mode.TRANSPILER == BUILD_MODE
+
+'''
+
+    dirname = os.path.dirname(filename)
+
+    try:
+        os.makedirs(dirname)
+    except OSError as e:
+        if e.errno != errno.EEXIST:
+            raise
+
+    with open(filename, 'w') as f:
+        f.write(cnt %
+                {'mode': 'PSLIB' if '${WITH_PSLIB}' == 'ON' else 'TRANSPILER'})
+
+
+write_distributed_training_mode_py(
+    filename='@PADDLE_BINARY_DIR@/python/paddle/fluid/incubate/fleet/parameter_server/version.py'
+)
+
+packages = [
+    'paddle',
+    'paddle.libs',
+    'paddle.utils',
+    'paddle.dataset',
+    'paddle.reader',
+    'paddle.distributed',
+    'paddle.incubate',
+    'paddle.incubate.complex',
+    'paddle.incubate.complex.tensor',
+    'paddle.distributed.fleet',
+    'paddle.distributed.fleet.base',
+    'paddle.distributed.fleet.meta_optimizers',
+    'paddle.distributed.fleet.runtime',
+    'paddle.distributed.fleet.dataset',
+    'paddle.distributed.fleet.metrics',
+    'paddle.distributed.fleet.proto',
+    'paddle.distributed.fleet.utils',
+    'paddle.framework',
+    'paddle.jit',
+    'paddle.fluid',
+    'paddle.fluid.inference',
+    'paddle.fluid.dygraph',
+    'paddle.fluid.dygraph.dygraph_to_static',
+    'paddle.fluid.dygraph.amp',
+    'paddle.fluid.proto',
+    'paddle.fluid.proto.profiler',
+    'paddle.fluid.distributed',
+    'paddle.fluid.layers',
+    'paddle.fluid.dataloader',
+    'paddle.fluid.contrib',
+    'paddle.fluid.contrib.decoder',
+    'paddle.fluid.contrib.quantize',
+    'paddle.fluid.contrib.reader',
+    'paddle.fluid.contrib.slim',
+    'paddle.fluid.contrib.slim.quantization',
+    'paddle.fluid.contrib.slim.quantization.imperative',
+    'paddle.fluid.contrib.utils',
+    'paddle.fluid.contrib.extend_optimizer',
+    'paddle.fluid.contrib.mixed_precision',
+    'paddle.fluid.contrib.layers',
+    'paddle.fluid.transpiler',
+    'paddle.fluid.transpiler.details',
+    'paddle.fluid.incubate',
+    'paddle.fluid.incubate.data_generator',
+    'paddle.fluid.incubate.fleet',
+    'paddle.fluid.incubate.checkpoint',
+    'paddle.fluid.incubate.fleet.base',
+    'paddle.fluid.incubate.fleet.parameter_server',
+    'paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler',
+    'paddle.fluid.incubate.fleet.parameter_server.pslib',
+    'paddle.fluid.incubate.fleet.parameter_server.ir',
+    'paddle.fluid.incubate.fleet.collective',
+    'paddle.fluid.incubate.fleet.utils',
+    'paddle.hapi',
+    'paddle.vision',
+    'paddle.vision.models',
+    'paddle.vision.transforms',
+    'paddle.vision.datasets',
+    'paddle.text',
+    'paddle.text.datasets',
+    'paddle.incubate',
+    'paddle.io',
+    'paddle.optimizer',
+    'paddle.nn',
+    'paddle.nn.functional',
+    'paddle.nn.layer',
+    'paddle.nn.initializer',
+    'paddle.nn.utils',
+    'paddle.metric',
+    'paddle.static',
+    'paddle.static.nn',
+    'paddle.tensor',
+]
+
+with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
+    setup_requires = f.read().splitlines()
+
+# Note(wangzhongpu):
+# When compiling paddle under python36, the dependencies belonging to python2.7 will be imported, resulting in errors when installing paddle
+if sys.version_info >= (3, 6) and sys.version_info < (3, 7):
+    setup_requires_tmp = []
+    for setup_requires_i in setup_requires:
+        if "<\"3.6\"" in setup_requires_i or "<\"3.5\"" in setup_requires_i or "<=\"3.5\"" in setup_requires_i:
+            continue
+        setup_requires_tmp += [setup_requires_i]
+    setup_requires = setup_requires_tmp
+if sys.version_info >= (3, 5) and sys.version_info < (3, 6):
+    setup_requires_tmp = []
+    for setup_requires_i in setup_requires:
+        if "<\"3.5\"" in setup_requires_i:
+            continue
+        setup_requires_tmp += [setup_requires_i]
+    setup_requires = setup_requires_tmp
+if sys.version_info >= (3, 7):
+    setup_requires_tmp = []
+    for setup_requires_i in setup_requires:
+        if "<\"3.6\"" in setup_requires_i or "<=\"3.6\"" in setup_requires_i or "<\"3.5\"" in setup_requires_i or "<=\"3.5\"" in setup_requires_i or "<\"3.7\"" in setup_requires_i:
+            continue
+        setup_requires_tmp += [setup_requires_i]
+    setup_requires = setup_requires_tmp
+
+if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
+    setup_requires += ['opencv-python']
+
+# the prefix is sys.prefix which should always be usr
+paddle_bins = ''
+
+if not '${WIN32}':
+    paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
+package_data = {
+    'paddle.fluid':
+    ['${FLUID_CORE_NAME}' + ('.so' if os.name != 'nt' else '.pyd')]
+}
+if '${HAS_NOAVX_CORE}' == 'ON':
+    package_data['paddle.fluid'] += [
+        'core_noavx' + ('.so' if os.name != 'nt' else '.pyd')
+    ]
+
+package_dir = {
+    '': '${PADDLE_BINARY_DIR}/python',
+    # The paddle.fluid.proto will be generated while compiling.
+    # So that package points to other directory.
+    'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
+    'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
+    'paddle.fluid': '${PADDLE_BINARY_DIR}/python/paddle/fluid',
+}
+
+# put all thirdparty libraries in paddle.libs
+libs_path = '${PADDLE_BINARY_DIR}/python/paddle/libs'
+
+package_data['paddle.libs'] = []
+package_data['paddle.libs'] = [('libwarpctc'
+                                if os.name != 'nt' else 'warpctc') + ext_name]
+shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
+
+if '${WITH_MKL}' == 'ON':
+    shutil.copy('${MKLML_SHARED_LIB}', libs_path)
+    shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path)
+    package_data['paddle.libs'] += [
+        ('libmklml_intel' if os.name != 'nt' else 'mklml') + ext_name,
+        ('libiomp5' if os.name != 'nt' else 'libiomp5md') + ext_name
+    ]
+else:
+    if os.name == 'nt':
+        # copy the openblas.dll
+        shutil.copy('${OPENBLAS_SHARED_LIB}', libs_path)
+        package_data['paddle.libs'] += ['openblas' + ext_name]
+
+if '${WITH_LITE}' == 'ON':
+    shutil.copy('${LITE_SHARED_LIB}', libs_path)
+    package_data['paddle.libs'] += ['libpaddle_full_api_shared' + ext_name]
+
+if '${WITH_PSLIB}' == 'ON':
+    shutil.copy('${PSLIB_LIB}', libs_path)
+    if os.path.exists('${PSLIB_VERSION_PY}'):
+        shutil.copy(
+            '${PSLIB_VERSION_PY}',
+            '${PADDLE_BINARY_DIR}/python/paddle/fluid/incubate/fleet/parameter_server/pslib/'
+        )
+    package_data['paddle.libs'] += ['libps' + ext_name]
+
+if '${WITH_MKLDNN}' == 'ON':
+    if '${CMAKE_BUILD_TYPE}' == 'Release' and os.name != 'nt':
+        # only change rpath in Release mode.
+        # TODO(typhoonzero): use install_name_tool to patch mkl libs once
+        # we can support mkl on mac.
+        #
+        # change rpath of libdnnl.so.1, add $ORIGIN/ to it.
+        # The reason is that all thirdparty libraries in the same directory,
+        # thus, libdnnl.so.1 will find libmklml_intel.so and libiomp5.so.
+        command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
+        if os.system(command) != 0:
+            raise Exception("patch libdnnl.so failed, command: %s" % command)
+    shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
+    if os.name != 'nt':
+        shutil.copy('${MKLDNN_SHARED_LIB_1}', libs_path)
+        package_data['paddle.libs'] += ['libmkldnn.so.0', 'libdnnl.so.1']
+    else:
+        package_data['paddle.libs'] += ['mkldnn.dll']
+
+if '${WITH_XPU}' == 'ON':
+    # only change rpath in Release mode,
+    if '${CMAKE_BUILD_TYPE}' == 'Release':
+        if os.name != 'nt':
+            if "@APPLE@" == "1":
+                command = "install_name_tool -id \"@loader_path/\" ${XPU_API_LIB}"
+            else:
+                command = "patchelf --set-rpath '$ORIGIN/' ${XPU_API_LIB}"
+            if os.system(command) != 0:
+                raise Exception("patch ${XPU_API_LIB} failed, command: %s" %
+                                command)
+    shutil.copy('${XPU_API_LIB}', libs_path)
+    shutil.copy('${XPU_RT_LIB}', libs_path)
+    shutil.copy('${XPU_SIM_LIB}', libs_path)
+    package_data['paddle.libs'] += [
+        '${XPU_API_LIB_NAME}', '${XPU_RT_LIB_NAME}', '${XPU_SIM_LIB_NAME}'
+    ]
+
+# copy libfuild_framework.so to libs
+if os.name != 'nt' and sys.platform != 'darwin':
+    paddle_framework_lib = '${FLUID_FRAMEWORK_SHARED_LIB}'
+    shutil.copy(paddle_framework_lib, libs_path)
+    package_data['paddle.libs'] += [
+        ('libpaddle_framework'
+         if os.name != 'nt' else 'paddle_framework') + ext_name
+    ]
+
+# remove unused paddle/libs/__init__.py
+if os.path.isfile(libs_path + '/__init__.py'):
+    os.remove(libs_path + '/__init__.py')
+package_dir['paddle.libs'] = libs_path
+
+# change rpath of ${FLUID_CORE_NAME}.ext, add $ORIGIN/../libs/ to it.
+# The reason is that libwarpctc.ext, libiomp5.ext etc are in paddle.libs, and
+# ${FLUID_CORE_NAME}.ext is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries.
+# This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213
+if '${CMAKE_BUILD_TYPE}' == 'Release':
+    if os.name != 'nt':
+        # only change rpath in Release mode, since in Debug mode, ${FLUID_CORE_NAME}.xx is too large to be changed.
+        if "@APPLE@" == "1":
+            command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'
+        else:
+            command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'
+        # The dynamic library compiled under aarch64 is greater than 64M,
+        # and an oversize error will be reported when using patchelf.
+        if platform.machine() != 'aarch64':
+            if os.system(command) != 0:
+                raise Exception(
+                    "patch ${FLUID_CORE_NAME}.%s failed, command: %s" %
+                    (ext_name, command))
+
+ext_modules = [Extension('_foo', ['stub.cc'])]
+if os.name == 'nt':
+    # fix the path separator under windows
+    fix_package_dir = {}
+    for k, v in package_dir.items():
+        fix_package_dir[k] = v.replace('/', '\\')
+    package_dir = fix_package_dir
+    ext_modules = []
+elif sys.platform == 'darwin':
+    ext_modules = []
+
+
+def find_files(pattern, root):
+    for dirpath, _, files in os.walk(root):
+        for filename in fnmatch.filter(files, pattern):
+            yield os.path.join(dirpath, filename)
+
+
+headers = (
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/framework')) +
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/imperative')) +
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/memory')) +
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/platform')) +
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/string')) +
+    list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/platform')) +
+    list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/framework')) +
+    list(find_files('*.pb', '${cudaerror_INCLUDE_DIR}'))
+    +  # errorMessage.pb for errormessage
+    ['${EIGEN_INCLUDE_DIR}/Eigen/Core'] +  # eigen
+    list(find_files('*', '${EIGEN_INCLUDE_DIR}/Eigen/src')) +  # eigen
+    list(find_files('*', '${EIGEN_INCLUDE_DIR}/unsupported/Eigen')) +  # eigen
+    list(find_files('*', '${GFLAGS_INSTALL_DIR}/include')) +  # gflags
+    list(find_files('*', '${GLOG_INSTALL_DIR}/include')) +  # glog
+    list(find_files('*', '${BOOST_INCLUDE_DIR}/boost')) +  # boost
+    list(find_files('*', '${XXHASH_INSTALL_DIR}/include')) +  # xxhash
+    list(find_files('*', '${PROTOBUF_INCLUDE_DIR}')) +  # protobuf
+    list(find_files('*', '${DLPACK_INCLUDE_DIR}')) +  # dlpack
+    list(find_files('*.h', '${THREADPOOL_INCLUDE_DIR}')))  # threadpool
+
+if '${WITH_MKLDNN}' == 'ON':
+    headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include'))  # mkldnn
+
+if '${WITH_GPU}' == 'ON':
+    headers += list(find_files(
+        '*.pb', '${cudaerror_INCLUDE_DIR}'))  # errorMessage.pb for errormessage
+
+
+class InstallCommand(InstallCommandBase):
+    def finalize_options(self):
+        ret = InstallCommandBase.finalize_options(self)
+        self.install_headers = os.path.join(self.install_purelib, 'paddle',
+                                            'include')
+        self.install_lib = self.install_platlib
+        return ret
+
+
+class InstallHeaders(Command):
+    """Override how headers are copied.
+    """
+    description = 'install C/C++ header files'
+
+    user_options = [
+        ('install-dir=', 'd', 'directory to install header files to'),
+        ('force', 'f', 'force installation (overwrite existing files)'),
+    ]
+
+    boolean_options = ['force']
+
+    def initialize_options(self):
+        self.install_dir = None
+        self.force = 0
+        self.outfiles = []
+
+    def finalize_options(self):
+        self.set_undefined_options(
+            'install', ('install_headers', 'install_dir'), ('force', 'force'))
+
+    def mkdir_and_copy_file(self, header):
+        if 'pb.h' in header:
+            install_dir = re.sub('${PADDLE_BINARY_DIR}/', '', header)
+        elif 'third_party' not in header:
+            # framework
+            install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header)
+        else:
+            # third_party
+            install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header)
+            patterns = [
+                'eigen3/src/extern_eigen3', 'boost/src/extern_boost',
+                'dlpack/src/extern_dlpack/include', 'install/protobuf/include',
+                'install/gflags/include', 'install/glog/include',
+                'install/xxhash/include', 'install/mkldnn/include',
+                'threadpool/src/extern_threadpool'
+            ]
+            for pattern in patterns:
+                install_dir = re.sub(pattern, '', install_dir)
+        install_dir = os.path.join(self.install_dir,
+                                   os.path.dirname(install_dir))
+        if not os.path.exists(install_dir):
+            self.mkpath(install_dir)
+        return self.copy_file(header, install_dir)
+
+    def run(self):
+        # only copy third_party/cudaErrorMessage.pb for cudaErrorMessage on mac or windows
+        if os.name == 'nt' or sys.platform == 'darwin':
+            if '${WITH_GPU}' == 'ON':
+                self.mkdir_and_copy_file(
+                    '${cudaerror_INCLUDE_DIR}/cudaErrorMessage.pb')
+            return
+        hdrs = self.distribution.headers
+        if not hdrs:
+            return
+        self.mkpath(self.install_dir)
+        for header in hdrs:
+            (out, _) = self.mkdir_and_copy_file(header)
+            self.outfiles.append(out)
+
+    def get_inputs(self):
+        return self.distribution.headers or []
+
+    def get_outputs(self):
+        return self.outfiles
+
+
+# we redirect setuptools log for non-windows
+if sys.platform != 'win32':
+
+    @contextmanager
+    def redirect_stdout():
+        f_log = open('${SETUP_LOG_FILE}', 'w')
+        origin_stdout = sys.stdout
+        sys.stdout = f_log
+        yield
+        f_log = sys.stdout
+        sys.stdout = origin_stdout
+        f_log.close()
+else:
+
+    @contextmanager
+    def redirect_stdout():
+        yield
+
+
+if '${WITH_GPU}' == 'ON':
+    os.environ['PACKAGE_NAME'] = "paddlepaddle-gpu"
+else:
+    os.environ['PACKAGE_NAME'] = "paddlepaddle"
+
+with redirect_stdout():
+    setup(
+        name='${PACKAGE_NAME}',
+        version='${PADDLE_VERSION}',
+        description='Parallel Distributed Deep Learning',
+        install_requires=setup_requires,
+        packages=packages,
+        ext_modules=ext_modules,
+        package_data=package_data,
+        package_dir=package_dir,
+        scripts=paddle_bins,
+        distclass=BinaryDistribution,
+        headers=headers,
+        cmdclass={
+            'install_headers': InstallHeaders,
+            'install': InstallCommand,
+        },
+        entry_points={
+            'console_scripts':
+            ['fleetrun = paddle.distributed.fleet.launch:launch']
+        })
+
+# As there are a lot of files in purelib which causes many logs,
+# we don't print them on the screen, and you can open `setup.py.log`
+# for the full logs.
+if os.path.exists('${SETUP_LOG_FILE}'):
+    os.system('grep -v "purelib" ${SETUP_LOG_FILE}')
-- 
GitLab


From 6e29c2da05c7dbfbd2e799c017f16f29e85d809c Mon Sep 17 00:00:00 2001
From: Jack Zhou <136876878@qq.com>
Date: Wed, 16 Sep 2020 18:35:46 +0800
Subject: [PATCH 103/261] Error description optimize for the math dir

Error description optimize for the math dir
---
 .../operators/math/math_function_test.cc      |  4 +-
 paddle/fluid/operators/math/sampler.h         |  8 +--
 paddle/fluid/operators/math/vol2col.cc        | 68 +++++++++----------
 paddle/fluid/operators/math/vol2col.cu        | 66 +++++++++---------
 4 files changed, 70 insertions(+), 76 deletions(-)

diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc
index 587823e535a..3388d7edafe 100644
--- a/paddle/fluid/operators/math/math_function_test.cc
+++ b/paddle/fluid/operators/math/math_function_test.cc
@@ -226,8 +226,8 @@ TEST(math_funciton, set_constant) {
   for (int64_t i = 0; i < t.numel(); ++i) {
     PADDLE_ENFORCE_EQ(10, t.data<int>()[i],
                       paddle::platform::errors::InvalidArgument(
-                          "Each value of input"
-                          "tensor should be 10, but received %d.",
+                          "Each value of input tensor should be 10, "
+                          "but received %d.",
                           t.data<int>()[i]));
   }
   delete ctx;
diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h
index de9113f2bb6..b90e7e19803 100644
--- a/paddle/fluid/operators/math/sampler.h
+++ b/paddle/fluid/operators/math/sampler.h
@@ -33,10 +33,10 @@ namespace math {
 class Sampler {
  public:
   explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) {
-    PADDLE_ENFORCE_GT(range, 0, platform::errors::InvalidArgument(
-                                    "Range should be"
-                                    " greater than 0, but recevied %d.",
-                                    range));
+    PADDLE_ENFORCE_GT(
+        range, 0,
+        platform::errors::InvalidArgument(
+            "Range should be greater than 0, but recevied %d.", range));
     if (seed == 0) {
       std::random_device r;
       seed_ = r();
diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc
index c05da0062f2..794fc647172 100644
--- a/paddle/fluid/operators/math/vol2col.cc
+++ b/paddle/fluid/operators/math/vol2col.cc
@@ -34,16 +34,15 @@ class Vol2ColFunctor<platform::CPUDeviceContext, T> {
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, framework::Tensor* col,
                   const DataLayout data_layout) const {
-    PADDLE_ENFORCE_EQ(
-        vol.dims().size(), 4,
-        platform::errors::InvalidArgument("The dimension of"
-                                          " vol should be 4, but received %d.",
-                                          vol.dims().size()));
-    PADDLE_ENFORCE_EQ(
-        col->dims().size(), 7,
-        platform::errors::InvalidArgument("The dimension of"
-                                          "col should be 7, but received %d.",
-                                          col->dims().size()));
+    PADDLE_ENFORCE_EQ(vol.dims().size(), 4,
+                      platform::errors::InvalidArgument(
+                          "The dimension of vol should be 4, but received %d.",
+                          vol.dims().size()));
+
+    PADDLE_ENFORCE_EQ(col->dims().size(), 7,
+                      platform::errors::InvalidArgument(
+                          "The dimension of col should be 7, but received %d.",
+                          col->dims().size()));
 
     int input_channels =
         (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]);
@@ -152,16 +151,15 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> {
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, framework::Tensor* vol,
                   const DataLayout data_layout) const {
-    PADDLE_ENFORCE_EQ(
-        vol->dims().size(), 4,
-        platform::errors::InvalidArgument("The dimension of vol"
-                                          " should be 4, but received %d.",
-                                          vol->dims().size()));
-    PADDLE_ENFORCE_EQ(
-        col.dims().size(), 7,
-        platform::errors::InvalidArgument("The dimension of col"
-                                          " should be 7, but received %d.",
-                                          col.dims().size()));
+    PADDLE_ENFORCE_EQ(vol->dims().size(), 4,
+                      platform::errors::InvalidArgument(
+                          "The dimension of vol should be 4, but received %d.",
+                          vol->dims().size()));
+
+    PADDLE_ENFORCE_EQ(col.dims().size(), 7,
+                      platform::errors::InvalidArgument(
+                          "The dimension of col  should be 7, but received %d.",
+                          col.dims().size()));
 
     int input_channels =
         (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]);
@@ -192,29 +190,29 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> {
                             ((dilations[0] * (filter_depth - 1) + 1))) /
                                strides[0] +
                            1;
-    PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth,
-                      platform::errors::InvalidArgument(
-                          "input_depth(%d)"
-                          " and output_depth(%d) are mismatching.",
-                          input_depth_tmp, output_depth));
+    PADDLE_ENFORCE_EQ(
+        input_depth_tmp, output_depth,
+        platform::errors::InvalidArgument(
+            "input_depth(%d) and output_depth(%d) are mismatching.",
+            input_depth_tmp, output_depth));
     auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
                              ((dilations[1] * (filter_height - 1) + 1))) /
                                 strides[1] +
                             1;
-    PADDLE_ENFORCE_EQ(input_height_tmp, output_height,
-                      platform::errors::InvalidArgument(
-                          "input_height(%d)"
-                          " and output_height(%d) are mismatching.",
-                          input_height_tmp, output_height));
+    PADDLE_ENFORCE_EQ(
+        input_height_tmp, output_height,
+        platform::errors::InvalidArgument(
+            "input_height(%d) and output_height(%d) are mismatching.",
+            input_height_tmp, output_height));
     auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
                             ((dilations[2] * (filter_width - 1) + 1))) /
                                strides[2] +
                            1;
-    PADDLE_ENFORCE_EQ(input_width_tmp, output_width,
-                      platform::errors::InvalidArgument(
-                          "input_width(%d)"
-                          " and output_width(%d) are mismatching.",
-                          input_width_tmp, output_width));
+    PADDLE_ENFORCE_EQ(
+        input_width_tmp, output_width,
+        platform::errors::InvalidArgument(
+            "input_width(%d)  and output_width(%d) are mismatching.",
+            input_width_tmp, output_width));
     T* vol_data = vol->data<T>();
     const T* col_data = col.data<T>();
 
diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu
index fe5a6009098..eca39e91973 100644
--- a/paddle/fluid/operators/math/vol2col.cu
+++ b/paddle/fluid/operators/math/vol2col.cu
@@ -90,16 +90,14 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> {
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, framework::Tensor* col,
                   const DataLayout data_layout) const {
-    PADDLE_ENFORCE_EQ(
-        vol.dims().size(), 4,
-        platform::errors::InvalidArgument("The dimension of"
-                                          " vol should be 4, but received %d.",
-                                          vol.dims().size()));
-    PADDLE_ENFORCE_EQ(
-        col->dims().size(), 7,
-        platform::errors::InvalidArgument("The dimension of"
-                                          "col should be 7, but received %d.",
-                                          col->dims().size()));
+    PADDLE_ENFORCE_EQ(vol.dims().size(), 4,
+                      platform::errors::InvalidArgument(
+                          "The dimension of  vol should be 4, but received %d.",
+                          vol.dims().size()));
+    PADDLE_ENFORCE_EQ(col->dims().size(), 7,
+                      platform::errors::InvalidArgument(
+                          "The dimension of col should be 7, but received %d.",
+                          col->dims().size()));
 
     int input_channels =
         (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]);
@@ -253,16 +251,14 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, framework::Tensor* vol,
                   const DataLayout data_layout) const {
-    PADDLE_ENFORCE_EQ(
-        vol->dims().size(), 4,
-        platform::errors::InvalidArgument("The dimension of vol"
-                                          " should be 4, but received %d.",
-                                          vol->dims().size()));
-    PADDLE_ENFORCE_EQ(
-        col.dims().size(), 7,
-        platform::errors::InvalidArgument("The dimension of col"
-                                          " should be 7, but received %d.",
-                                          col.dims().size()));
+    PADDLE_ENFORCE_EQ(vol->dims().size(), 4,
+                      platform::errors::InvalidArgument(
+                          "The dimension of vol  should be 4, but received %d.",
+                          vol->dims().size()));
+    PADDLE_ENFORCE_EQ(col.dims().size(), 7,
+                      platform::errors::InvalidArgument(
+                          "The dimension of col  should be 7, but received %d.",
+                          col.dims().size()));
 
     int input_channels =
         (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]);
@@ -291,29 +287,29 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
                             ((dilations[0] * (filter_depth - 1) + 1))) /
                                strides[0] +
                            1;
-    PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth,
-                      platform::errors::InvalidArgument(
-                          "input_depth(%d)"
-                          " and output_depth(%d) are mismatching.",
-                          input_depth_tmp, output_depth));
+    PADDLE_ENFORCE_EQ(
+        input_depth_tmp, output_depth,
+        platform::errors::InvalidArgument(
+            "input_depth(%d) and output_depth(%d) are mismatching.",
+            input_depth_tmp, output_depth));
     auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
                              ((dilations[1] * (filter_height - 1) + 1))) /
                                 strides[1] +
                             1;
-    PADDLE_ENFORCE_EQ(input_height_tmp, output_height,
-                      platform::errors::InvalidArgument(
-                          "input_height(%d)"
-                          " and output_height(%d) are mismatching.",
-                          input_height_tmp, output_height));
+    PADDLE_ENFORCE_EQ(
+        input_height_tmp, output_height,
+        platform::errors::InvalidArgument(
+            "input_height(%d) and output_height(%d) are mismatching.",
+            input_height_tmp, output_height));
     auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
                             ((dilations[2] * (filter_width - 1) + 1))) /
                                strides[2] +
                            1;
-    PADDLE_ENFORCE_EQ(input_width_tmp, output_width,
-                      platform::errors::InvalidArgument(
-                          "input_width(%d)"
-                          " and output_width(%d) are mismatching.",
-                          input_width_tmp, output_width));
+    PADDLE_ENFORCE_EQ(
+        input_width_tmp, output_width,
+        platform::errors::InvalidArgument(
+            "input_width(%d) and output_width(%d) are mismatching.",
+            input_width_tmp, output_width));
 
     int num_kernels = input_channels * input_depth * input_height * input_width;
 
-- 
GitLab


From 54b81fa32c6bfacd637df590051123a31225aac2 Mon Sep 17 00:00:00 2001
From: ShenLiang <shenliang03@baidu.com>
Date: Wed, 16 Sep 2020 19:23:30 +0800
Subject: [PATCH 104/261] add adaptivelsgd in meta_optimizer (#27289)

* add adaptivelsgd

* Todo fix the code to avoid the conflict.
---
 .../framework/distributed_strategy.proto      |   7 +
 .../fleet/base/distributed_strategy.py        |  57 ++++
 .../fleet/meta_optimizers/__init__.py         |   1 +
 .../fleet/meta_optimizers/amp_optimizer.py    |   2 +-
 .../meta_optimizers/localsgd_optimizer.py     | 253 +++++++++++++++++-
 .../test_fleet_distributed_strategy.py        |   7 +
 .../test_fleet_localsgd_meta_optimizer.py     |  31 +++
 7 files changed, 356 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
index edd1700ae72..df482f43346 100644
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -41,6 +41,11 @@ message LocalSGDConfig {
   optional int32 begin_step = 2 [ default = 1 ];
 }
 
+message AdaptiveLocalSGDConfig {
+  optional int32 init_k_steps = 1 [ default = 1 ];
+  optional int32 begin_step = 2 [ default = 1 ];
+}
+
 message GradientMergeConfig {
   optional int32 k_steps = 1 [ default = 1 ];
   optional bool avg = 2 [ default = true ];
@@ -121,6 +126,7 @@ message DistributedStrategy {
   optional bool cudnn_exhaustive_search = 21 [ default = true ];
   optional int32 conv_workspace_size_limit = 22 [ default = 4000 ];
   optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ];
+  optional bool adaptive_localsgd = 24 [ default = false ];
 
   optional RecomputeConfig recompute_configs = 101;
   optional AMPConfig amp_configs = 102;
@@ -131,6 +137,7 @@ message DistributedStrategy {
   optional AsyncConfig a_sync_configs = 107;
   optional LarsConfig lars_configs = 108;
   optional LambConfig lamb_configs = 109;
+  optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110;
   optional BuildStrategy build_strategy = 201;
   optional ExecutionStrategy execution_strategy = 202;
 }
diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index 1b86056c004..f1c836468da 100755
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -728,6 +728,63 @@ class DistributedStrategy(object):
                           "localsgd_configs")
         assign_configs_value(self.strategy.localsgd_configs, configs)
 
+    @property
+    def adaptive_localsgd(self):
+        """
+        Indicating whether we are using Adaptive Local SGD training. Default Value: False
+        For more details, please refer to `Adaptive Communication Strategies to Achieve 
+        the Best Error-Runtime Trade-off in Local-Update SGD <https://arxiv.org/pdf/1810.08313.pdf>`_.
+
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.adaptive_localsgd = True # by default this is false
+
+        """
+        return self.strategy.localsgd
+
+    @adaptive_localsgd.setter
+    @is_strict_auto
+    def adaptive_localsgd(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.localsgd = flag
+        else:
+            print("WARNING: adaptive_localsgd should have value of bool type")
+
+    @property
+    def adaptive_localsgd_configs(self):
+        """
+        Set AdaptiveLocalSGD training configurations. AdaptiveLocalSGD has a configurable
+        setting that can be configured through a dict.
+
+        **Notes**:
+            init_k_steps(int) The initial steps for training before adaptive localsgd.
+                              Then, the adaptive localsgd method will modify init_k_steps automatically.
+                              Default 1.
+            begin_step(int) The step of begining training by adaptive localsgd. Default 1.
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.adaptive_localsgd = True
+            strategy.adaptive_localsgd_configs = {"init_k_steps": 1,
+                                                  "begin_step": 30}
+        """
+
+        return get_msg_dict(self.strategy.adaptive_localsgd_configs)
+
+    @adaptive_localsgd_configs.setter
+    @is_strict_auto
+    def adaptive_localsgd_configs(self, configs):
+        check_configs_key(self.strategy.adaptive_localsgd_configs, configs,
+                          "adaptive_localsgd_configs")
+        assign_configs_value(self.strategy.adaptive_localsgd_configs, configs)
+
     @property
     def dgc(self):
         """
diff --git a/python/paddle/distributed/fleet/meta_optimizers/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/__init__.py
index d98b2ef3e2a..a3a2dee7038 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/__init__.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/__init__.py
@@ -18,6 +18,7 @@ from .graph_execution_optimizer import GraphExecutionOptimizer
 from .parameter_server_optimizer import ParameterServerOptimizer
 from .pipeline_optimizer import PipelineOptimizer
 from .localsgd_optimizer import LocalSGDOptimizer
+from .localsgd_optimizer import AdaptiveLocalSGDOptimizer
 from .lars_optimizer import LarsOptimizer
 from .parameter_server_graph_optimizer import ParameterServerGraphOptimizer
 from .dgc_optimizer import DGCOptimizer
diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
index 31a9913701c..3e89d382111 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
@@ -24,7 +24,7 @@ class AMPOptimizer(MetaOptimizerBase):
         self.meta_optimizers_white_list = [
             "LarsOptimizer", "LambOptimizer", "RecomputeOptimizer",
             "LocalSGDOptimizer", "GradientMergeOptimizer",
-            "GraphExecutionOptimizer"
+            "GraphExecutionOptimizer", "AdaptiveLocalSGDOptimizer"
         ]
         self.meta_optimizers_black_list = ["DGCOptimizer"]
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
index 6fa34d8d28a..75271968fca 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
@@ -25,7 +25,9 @@ class LocalSGDOptimizer(MetaOptimizerBase):
         super(LocalSGDOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
         self.meta_optimizers_white_list = []
-        self.meta_optimizers_black_list = ["GraphExecutionOptimizer"]
+        self.meta_optimizers_black_list = [
+            "GraphExecutionOptimizer", "AdaptiveLocalSGDOptimizer"
+        ]
         self.snapshot_key = '@SNAPSHOT'
 
     def _can_apply(self):
@@ -186,3 +188,252 @@ class LocalSGDOptimizer(MetaOptimizerBase):
 
             layers.cond(step > begin_step, begin_localsgd, communicate)
         return minimized
+
+
+class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
+    def __init__(self, optimizer):
+        super(AdaptiveLocalSGDOptimizer, self).__init__(optimizer)
+        self.inner_opt = optimizer
+        self.meta_optimizers_white_list = []
+        self.meta_optimizers_black_list = [
+            "GraphExecutionOptimizer", "LocalSGDOptimizer"
+        ]
+        self.snapshot_key = '@SNAPSHOT'
+
+    def _can_apply(self):
+        if not self.role_maker._is_collective:
+            return False
+
+        if not self.user_defined_strategy.adaptive_localsgd:
+            return False
+
+        if self.role_maker.worker_num() <= 1:
+            return False
+
+        return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \
+            or isinstance(self.inner_opt, paddle.fluid.optimizer.Momentum) \
+            or isinstance(self.inner_opt, paddle.optimizer.sgd.SGD) \
+            or isinstance(self.inner_opt, paddle.fluid.optimizer.SGD)
+
+    def _disable_strategy(self, dist_strategy):
+        dist_strategy.adaptive_localsgd = False
+        dist_strategy.adaptive_localsgd_configs = {}
+
+    def _enable_strategy(self, dist_strategy, context):
+        dist_strategy.adaptive_localsgd = True
+        dist_strategy.adaptive_localsgd_configs = {
+            "init_k_steps": 1,
+            "begin_step": 1
+        }
+
+    def snapshot_name(self, param_name):
+        return param_name + self.snapshot_key
+
+    def create_snapshot_vars(self, program):
+        block = program.global_block()
+
+        non_dist_params = []
+        for param in block.iter_parameters():
+            if not param.is_distributed:
+                non_dist_params.append(param)
+
+        p2s = []
+        for param in non_dist_params:
+            snapshot = block.create_var(
+                name=self.snapshot_name(param.name),
+                shape=param.shape,
+                persistable=True,
+                stop_gradient=True,
+                dtype=param.dtype)
+            p2s.append([param, snapshot])
+        return p2s
+
+    def init_snapshot_vars(self, startup_program, param2snapshot):
+        with program_guard(startup_program):
+            for param, snapshot in param2snapshot:
+                layers.assign(param, snapshot)
+
+    def _generate_avg_loss(self, program_block, loss, avg_loss):
+        program_block.append_op(
+            type='c_allreduce_sum',
+            inputs={'X': [loss]},
+            outputs={'Out': [avg_loss]},
+            attrs={
+                'ring_id': 0,
+                OP_ROLE_KEY: OpRole.Optimize,
+                'use_calc_stream': True
+            })
+        program_block.append_op(
+            type='c_sync_calc_stream',
+            inputs={'X': [avg_loss]},
+            outputs={'Out': [avg_loss]},
+            attrs={OP_ROLE_KEY: OpRole.Optimize})
+
+        program_block.append_op(
+            type='scale',
+            inputs={'X': [avg_loss]},
+            outputs={'Out': [avg_loss]},
+            attrs={
+                'scale': 1.0 / self.role_maker.worker_num(),
+                OP_ROLE_KEY: OpRole.Optimize
+            })
+
+    def minimize_impl(self,
+                      loss,
+                      startup_program=None,
+                      parameter_list=None,
+                      no_grad_set=None):
+        minimized = self.inner_opt.minimize(
+            loss, startup_program=startup_program)
+
+        init_k_steps = self.user_defined_strategy.adaptive_localsgd_configs[
+            'init_k_steps']
+        begin_step_value = self.user_defined_strategy.adaptive_localsgd_configs[
+            'begin_step']
+
+        if startup_program is None:
+            startup_program = default_startup_program()
+        main_block = loss.block
+
+        self.nrings = 2
+        collective_helper = CollectiveHelper(self.role_maker, self.nrings)
+        collective_helper.update_startup_program(startup_program)
+        p2s = self.create_snapshot_vars(startup_program)
+        self.init_snapshot_vars(startup_program, p2s)
+
+        p2s = self.create_snapshot_vars(main_block.program)
+        with program_guard(main_block.program, startup_program):
+            step = layers.autoincreased_step_counter(begin=1)
+
+            k_steps = layers.create_global_var(
+                name="k_steps",
+                shape=[1],
+                value=int(init_k_steps),
+                dtype='int64',
+                persistable=True)
+
+            begin_step = layers.create_global_var(
+                name="begin_step",
+                shape=[1],
+                value=int(begin_step_value),
+                dtype='int64',
+                persistable=True)
+
+            last_step = layers.create_global_var(
+                name="last_step",
+                shape=[1],
+                value=int(0),
+                dtype='int64',
+                persistable=True)
+
+            avg_loss = layers.create_global_var(
+                name="avg_loss",
+                shape=[1],
+                value=float(0),
+                dtype=loss.dtype,
+                persistable=True)
+
+            lr_0 = layers.create_global_var(
+                name="lr_0",
+                shape=[1],
+                value=float(0),
+                dtype='float32',
+                persistable=True)
+
+            loss_0 = layers.create_global_var(
+                name="loss_0",
+                shape=[1],
+                value=float(0),
+                dtype='float32',
+                persistable=True)
+
+            global_lr = self.inner_opt._global_learning_rate()
+
+            def initialize():
+                self._generate_avg_loss(main_block, loss, avg_loss)
+                layers.assign(avg_loss, loss_0)
+                layers.assign(global_lr, lr_0)
+
+            layers.cond(step == 1, initialize)
+
+            def communicate():
+                sub_block = default_main_program().current_block()
+                ring_id = -1
+                for param, snapshot in p2s:
+                    sub_block.append_op(
+                        type='elementwise_sub',
+                        inputs={'X': [snapshot],
+                                'Y': [param]},
+                        outputs={'Out': [param]},
+                        attrs={OP_ROLE_KEY: OpRole.Optimize})
+                    sub_block.append_op(
+                        type='c_sync_calc_stream',
+                        inputs={'X': param},
+                        outputs={'Out': param},
+                        attrs={OP_ROLE_KEY: OpRole.Optimize})
+                    ring_id = (ring_id + 1) % self.nrings
+                    sub_block.append_op(
+                        type='c_allreduce_sum',
+                        inputs={'X': [param]},
+                        outputs={'Out': [param]},
+                        attrs={
+                            'ring_id': ring_id,
+                            OP_ROLE_KEY: OpRole.Optimize
+                        })
+
+                for ring_id in range(self.nrings):
+                    sub_block.append_op(
+                        type='c_sync_comm_stream',
+                        inputs={'X': param},
+                        outputs={'Out': param},
+                        attrs={
+                            'ring_id': ring_id,
+                            OP_ROLE_KEY: OpRole.Optimize
+                        })
+
+                for param, snapshot in p2s:
+                    sub_block.append_op(
+                        type='scale',
+                        inputs={'X': [param]},
+                        outputs={'Out': [param]},
+                        attrs={
+                            'scale': 1.0 / self.role_maker.worker_num(),
+                            OP_ROLE_KEY: OpRole.Optimize
+                        })
+                    sub_block.append_op(
+                        type='elementwise_sub',
+                        inputs={'X': [snapshot],
+                                'Y': [param]},
+                        outputs={'Out': [param]},
+                        attrs={OP_ROLE_KEY: OpRole.Optimize})
+                    sub_block.append_op(
+                        type='assign',
+                        inputs={'X': [param]},
+                        outputs={'Out': [snapshot]},
+                        attrs={OP_ROLE_KEY: OpRole.Optimize})
+                layers.assign(step, last_step)
+
+            def communicate_avg_loss():
+                communicate()
+                self._generate_avg_loss(main_block, loss, avg_loss)
+                next_local_steps = layers.cast(
+                    layers.ceil(
+                        layers.sqrt(lr_0 * avg_loss / (global_lr * loss_0) *
+                                    float(init_k_steps))),
+                    dtype='int64')
+                max_local_steps = layers.fill_constant(
+                    shape=[1], dtype='int64', value=16)
+                min_local_steps = layers.fill_constant(
+                    shape=[1], dtype='int64', value=1)
+                next_local_steps = layers.elementwise_min(next_local_steps,
+                                                          max_local_steps)
+                next_local_steps = layers.elementwise_max(next_local_steps,
+                                                          min_local_steps)
+                layers.assign(next_local_steps, k_steps)
+
+            def begin_localsgd():
+                layers.cond(step - last_step == k_steps, communicate_avg_loss)
+
+            layers.cond(step > begin_step, begin_localsgd, communicate)
+
+        return minimized
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
index 6f8af3017ef..b20f33e11b6 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
@@ -86,6 +86,13 @@ class TestStrategyConfig(unittest.TestCase):
         self.assertEqual(strategy.localsgd_configs["k_steps"], 4)
         self.assertEqual(strategy.localsgd_configs["begin_step"], 120)
 
+    def test_adaptive_localsgd_configs(self):
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        configs = {"init_k_steps": 1, "begin_step": 120}
+        strategy.adaptive_localsgd_configs = configs
+        self.assertEqual(strategy.adaptive_localsgd_configs["init_k_steps"], 1)
+        self.assertEqual(strategy.adaptive_localsgd_configs["begin_step"], 120)
+
     def test_dgc(self):
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.dgc = True
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
index 945f5ae5745..f5347b0c665 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py
@@ -52,5 +52,36 @@ class TestFleetLocalSGDMetaOptimizer(unittest.TestCase):
         optimizer.minimize(avg_cost)
 
 
+class TestFleetAdaptiveLocalSGDMetaOptimizer(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_TRAINER_ID"] = "1"
+        os.environ[
+            "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002"
+
+    def test_adaptive_localsgd_optimizer(self):
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.adaptive_localsgd = True
+        config = strategy.adaptive_localsgd_configs
+        config['init_k_steps'] = 1
+        config['begin_step'] = 1
+        strategy.adaptive_localsgd_configs = config
+
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab


From 11bcf0e21c6d03f13f9474396614f5eee655749c Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Wed, 16 Sep 2020 20:13:54 +0800
Subject: [PATCH 105/261] Cleanup redundant code files (#27319)

---
 python/paddle/distributed/fleet/utils/fs.py   |  20 --
 .../fluid/incubate/fleet/utils/fleet_util.py  |   3 +-
 .../paddle/fluid/incubate/fleet/utils/fs.py   | 180 ------------------
 .../tests/unittests/auto_checkpoint_utils.py  |   3 +-
 .../tests/unittests/test_checkpoint_saver.py  |   3 +-
 .../tests/unittests/test_fleet_checkpoint.py  |   3 +-
 6 files changed, 4 insertions(+), 208 deletions(-)
 delete mode 100644 python/paddle/fluid/incubate/fleet/utils/fs.py

diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py
index 2dbe5cefbb4..966b7219d60 100644
--- a/python/paddle/distributed/fleet/utils/fs.py
+++ b/python/paddle/distributed/fleet/utils/fs.py
@@ -200,26 +200,6 @@ class LocalFS(FS):
         return dirs
 
 
-"""HDFS Utils."""
-
-
-def _handle_errors(f):
-    def handler(*args, **kwargs):
-        start = time.time()
-        while True:
-            try:
-                return f(*args, **kwargs)
-            except ExecuteError as e:
-                o = args[0]
-                time_out = float(o._time_out) / 1000.0
-                inter = float(o._sleep_inter) / 1000.0
-                if time.time() - start >= time_out:
-                    raise FSTimeOut
-                time.sleep(inter)
-
-    return functools.wraps(f)(handler)
-
-
 def _handle_errors(max_time_out=None):
     def decorator(f):
         @functools.wraps(f)
diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py
index cb1a54ef198..58313c46c3c 100644
--- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py
+++ b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py
@@ -26,8 +26,7 @@ import paddle.fluid as fluid
 from paddle.fluid.log_helper import get_logger
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet as fleet_pslib
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet as fleet_transpiler
-from . import hdfs
-from .hdfs import *
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient
 from . import utils
 
 __all__ = ["FleetUtil"]
diff --git a/python/paddle/fluid/incubate/fleet/utils/fs.py b/python/paddle/fluid/incubate/fleet/utils/fs.py
deleted file mode 100644
index 0ba06ef934a..00000000000
--- a/python/paddle/fluid/incubate/fleet/utils/fs.py
+++ /dev/null
@@ -1,180 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import sys
-import subprocess
-import multiprocessing
-from datetime import datetime
-
-import re
-import copy
-import errno
-import time
-import logging
-import abc
-from pathlib import PurePosixPath, Path
-import shutil
-
-__all__ = ['FS', 'LocalFS']
-
-
-class ExecuteError(Exception):
-    pass
-
-
-class FSFileExistsError(Exception):
-    pass
-
-
-class FSFileNotExistsError(Exception):
-    pass
-
-
-class FSTimeOut(Exception):
-    pass
-
-
-class FSShellCmdAborted(ExecuteError):
-    pass
-
-
-class FS(object):
-    @abc.abstractmethod
-    def ls_dir(self, fs_path):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def is_file(self, fs_path):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def is_dir(self, fs_path):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def is_exist(self, fs_path):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def upload(self, local_path, fs_path):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def download(self, fs_path, local_path):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def mkdirs(self, fs_path):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def delete(self, fs_path):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def need_upload_download(self):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def rename(self, fs_src_path, fs_dst_path):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=False):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def upload_dir(self, local_dir, dest_dir):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def list_dirs(self, fs_path):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def touch(self, fs_path, exist_ok=True):
-        raise NotImplementedError
-
-
-class LocalFS(FS):
-    def ls_dir(self, fs_path):
-        return [f for f in os.listdir(fs_path)]
-
-    def mkdirs(self, fs_path):
-        assert not os.path.isfile(fs_path), "{} is already a file".format(
-            fs_path)
-        os.system("mkdir -p {}".format(fs_path))
-
-    def rename(self, fs_src_path, fs_dst_path):
-        os.rename(fs_src_path, fs_dst_path)
-
-    def _rmr(self, fs_path):
-        shutil.rmtree(fs_path)
-
-    def _rm(self, fs_path):
-        os.remove(fs_path)
-
-    def delete(self, fs_path):
-        if not self.is_exist(fs_path):
-            return
-
-        if os.path.isfile(fs_path):
-            return self._rm(fs_path)
-
-        return self._rmr(fs_path)
-
-    def need_upload_download(self):
-        return False
-
-    def is_file(self, fs_path):
-        return os.path.isfile(fs_path)
-
-    def is_dir(self, fs_path):
-        return os.path.isdir(fs_path)
-
-    def is_exist(self, fs_path):
-        return os.path.exists(fs_path)
-
-    def touch(self, fs_path, exist_ok=True):
-        if self.is_exist(fs_path):
-            if exist_ok:
-                return
-            raise FSFileExistsError
-
-        return Path(fs_path).touch(exist_ok=True)
-
-    def mv(self, src_path, dst_path, overwrite=False, test_exists=False):
-        if not self.is_exist(src_path):
-            raise FSFileNotExistsError
-
-        if overwrite and self.is_exist(dst_path):
-            self.delete(dst_path)
-
-        if self.is_exist(dst_path):
-            raise FSFileExistsError
-
-        return self.rename(src_path, dst_path)
-
-    def list_dirs(self, fs_path):
-        """	
-        list directory under fs_path, and only give the pure name, not include the fs_path	
-        """
-        if not self.is_exist(fs_path):
-            return []
-
-        dirs = [
-            f for f in os.listdir(fs_path) if os.path.isdir(fs_path + "/" + f)
-        ]
-
-        return dirs
diff --git a/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py b/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py
index 529ff4ec45d..2464882d617 100644
--- a/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py
+++ b/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py
@@ -20,8 +20,7 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
 
-from paddle.fluid.incubate.fleet.utils.fs import LocalFS
-from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient
 import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
 from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
 from paddle.fluid.framework import program_guard
diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py b/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py
index ad75f2aa8bc..4c1b1e0f0bf 100644
--- a/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py
+++ b/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py
@@ -21,8 +21,7 @@ from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver
 import os
 import sys
 
-from paddle.fluid.incubate.fleet.utils.fs import LocalFS
-from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient
 from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py b/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py
index 66baf8faac5..fc57602b445 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py
@@ -21,8 +21,7 @@ from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver
 import os
 import sys
 
-from paddle.fluid.incubate.fleet.utils.fs import LocalFS
-from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient
 from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver
 
 
-- 
GitLab


From 189e10f1dd9d5f645eac8cb2a8515c444256953e Mon Sep 17 00:00:00 2001
From: Leo Chen <chenqiuliang@baidu.com>
Date: Wed, 16 Sep 2020 22:17:40 +0800
Subject: [PATCH 106/261] Remove unnecessary requirements (#27341)

* remove objgraph

* remove graphviz

* fix ut
---
 python/paddle/fluid/dygraph/base.py           | 19 -----
 .../fluid/tests/unittests/CMakeLists.txt      |  2 -
 .../unittests/test_imperative_debug_string.py | 75 -------------------
 python/requirements.txt                       |  2 -
 4 files changed, 98 deletions(-)
 delete mode 100644 python/paddle/fluid/tests/unittests/test_imperative_debug_string.py

diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py
index 2f95c2b9007..01c2f0fed49 100644
--- a/python/paddle/fluid/dygraph/base.py
+++ b/python/paddle/fluid/dygraph/base.py
@@ -23,7 +23,6 @@ from paddle.fluid import framework
 from paddle.fluid.multiprocess_utils import CleanupFuncRegistrar
 from .tracer import Tracer
 import logging
-import objgraph
 from ..data_feeder import convert_dtype
 import warnings
 
@@ -368,24 +367,6 @@ def guard(place=None):
                     yield
 
 
-def _print_debug_msg(parameter_list, limit=5, is_test=False):
-    if not core._is_dygraph_debug_enabled():
-        logging.warn(
-            'Debug mode is not enabled. Please set FLAGS_dygraph_debug=1 to enable debug'
-        )
-        return
-    unique_name_size = len(framework.unique_name.generator.ids)
-    tracer_var_size = len(parameter_list)
-    alive_cpp_var_size = len(core.VarBase._alive_vars())
-    if not is_test:
-        logging.warn(
-            'unique_name num: {}, tracer vars num: {}, alive cpp vars num: {}'
-            .format(unique_name_size, tracer_var_size, alive_cpp_var_size))
-        objgraph.show_growth(limit=limit)
-    else:
-        return unique_name_size, tracer_var_size, alive_cpp_var_size
-
-
 @framework.dygraph_only
 def grad(outputs,
          inputs,
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index fa092ffb191..c0fff5b5c81 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -326,7 +326,6 @@ list(REMOVE_ITEM TEST_OPS test_basic_gru_api)
 list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op)
 list(REMOVE_ITEM TEST_OPS test_basic_lstm_api)
 list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op)
-list(REMOVE_ITEM TEST_OPS test_imperative_debug_string)
 list(REMOVE_ITEM TEST_OPS test_fuse_bn_act_pass)
 list(REMOVE_ITEM TEST_OPS test_imperative_static_runner_mnist)
 list(REMOVE_ITEM TEST_OPS test_imperative_static_runner_while)
@@ -416,7 +415,6 @@ py_test_modules(test_imperative_ocr_attention_model MODULES test_imperative_ocr_
 py_test_modules(test_install_check MODULES test_install_check ENVS
         FLAGS_cudnn_deterministic=1 SERIAL)
 set_tests_properties(test_install_check PROPERTIES LABELS "RUN_TYPE=DIST")
-py_test_modules(test_imperative_debug_string MODULES test_imperative_debug_string ENVS FLAGS_dygraph_debug=1)
 py_test_modules(test_imperative_static_runner_mnist MODULES test_imperative_static_runner_mnist ENVS
     FLAGS_cudnn_deterministic=1)
 py_test_modules(test_imperative_static_runner_while MODULES test_imperative_static_runner_while ENVS
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py b/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py
deleted file mode 100644
index 171687283bc..00000000000
--- a/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-import numpy as np
-
-
-class MLP(fluid.Layer):
-    def __init__(self, input_size):
-        super(MLP, self).__init__()
-        self._linear1 = fluid.dygraph.Linear(
-            input_size,
-            3,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)),
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)))
-        self._linear2 = fluid.dygraph.Linear(
-            3,
-            4,
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)),
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)))
-
-    def forward(self, inputs):
-        x = self._linear1(inputs)
-        x = self._linear2(x)
-        x = fluid.layers.reduce_sum(x)
-        return x
-
-
-class TestDygraphDebugString(unittest.TestCase):
-    def test_dygraph_debug_string(self):
-        np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
-        unique_name = 0
-        trace_var = 0
-        alive_var = 0
-        with fluid.dygraph.guard():
-            mlp = MLP(input_size=2)
-            for i in range(10):
-                var_inp = fluid.dygraph.base.to_variable(np_inp)
-                out = mlp(var_inp)
-                out.backward()
-                mlp.clear_gradients()
-                unique_name_tmp, trace_var_tmp, alive_var_tmp = fluid.dygraph.base._print_debug_msg(
-                    mlp.parameters(), is_test=True)
-                if i > 0:
-                    self.assertGreaterEqual(unique_name, unique_name_tmp)
-                    self.assertGreaterEqual(trace_var, trace_var_tmp)
-                    self.assertGreaterEqual(alive_var, alive_var_tmp)
-                else:
-                    unique_name = unique_name_tmp
-                    trace_var = trace_var_tmp
-                    alive_var = alive_var_tmp
-                try:
-                    fluid.dygraph.base._print_debug_msg(mlp.parameters())
-                except Exception as e:
-                    raise RuntimeError(
-                        "No Exception is accepted in _print_debug_msg, but we got: {}".
-                        format(e))
diff --git a/python/requirements.txt b/python/requirements.txt
index c8d3b2af179..ddd1e943df7 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -13,11 +13,9 @@ scipy ; python_version>"3.5"
 nltk ; python_version>="3.5"
 rarfile
 Pillow
-graphviz
 six
 decorator
 prettytable
-objgraph
 astor
 pathlib
 netifaces
-- 
GitLab


From ebc6d544464bb7e75c58a7a301bdbd2b93443eb8 Mon Sep 17 00:00:00 2001
From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Date: Thu, 17 Sep 2020 09:10:37 +0800
Subject: [PATCH 107/261] fix cache file judge (#27369)

---
 paddle/fluid/inference/tests/api/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index d7d4f7969fa..146d5932577 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -9,8 +9,8 @@ if(WITH_GPU AND TENSORRT_FOUND)
 endif()
 
 function(download_data install_dir data_file)
-    string(REGEX MATCH "[^/\\]+$" data_file ${data_file})
-    if (NOT EXISTS ${install_dir}/${data_file})
+    string(REGEX MATCH "[^/\\]+$" file_name ${data_file})
+    if (NOT EXISTS ${install_dir}/${file_name})
         inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${data_file})
     endif()
 endfunction()
-- 
GitLab


From bf8e030eedc72dde5f0ecf9903c2c7bca925aa1a Mon Sep 17 00:00:00 2001
From: wanghuancoder <wanghuan29@baidu.com>
Date: Thu, 17 Sep 2020 09:28:28 +0800
Subject: [PATCH 108/261] modify test_imperative_using_non_zero_gpu from use
 two gpus to one gpu (#27348)

* add op_function_generator.exe retry in windows, test=develop

* modify test_imperative_using_non_zero_gpu from use two gpus to one gpu, test=develop
---
 python/paddle/fluid/tests/unittests/CMakeLists.txt              | 2 +-
 .../fluid/tests/unittests/test_imperative_using_non_zero_gpu.py | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index c0fff5b5c81..b7848066280 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -558,7 +558,7 @@ endif()
 set_tests_properties(test_parallel_executor_test_while_train test_parallel_executor_mnist
         test_parallel_executor_feed_persistable_var
         test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass
-        test_data_norm_op test_imperative_using_non_zero_gpu
+        test_data_norm_op
         test_dataloader_keep_order
         test_dataloader_unkeep_order
         test_parallel_executor_fetch_isolated_var
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py b/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py
index 0af8132acfd..f2dfaef3977 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py
@@ -21,7 +21,6 @@ import numpy as np
 class TestImperativeUsingNonZeroGpu(unittest.TestCase):
     def run_main(self, np_arr, place):
         with guard(place):
-            embedding = Embedding(size=[10, 10])
             var = to_variable(np_arr)
             self.assertTrue(np.array_equal(np_arr, var.numpy()))
 
@@ -30,7 +29,6 @@ class TestImperativeUsingNonZeroGpu(unittest.TestCase):
             return
 
         np_arr = np.random.random([11, 13]).astype('float32')
-        self.run_main(np_arr, fluid.CUDAPlace(1))
         self.run_main(np_arr, fluid.CUDAPlace(0))
 
 
-- 
GitLab


From 746a8ded293d01be7b394266417769f561da645e Mon Sep 17 00:00:00 2001
From: ShenLiang <shenliang03@baidu.com>
Date: Thu, 17 Sep 2020 10:19:27 +0800
Subject: [PATCH 109/261] fix comment of adaptive lsgd (#27362)

---
 .../distributed/fleet/meta_optimizers/amp_optimizer.py | 10 +++++++---
 .../fleet/meta_optimizers/localsgd_optimizer.py        |  3 ++-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
index 3e89d382111..ad96e142669 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
@@ -22,9 +22,13 @@ class AMPOptimizer(MetaOptimizerBase):
         self.amp_opt = None
         # we do not allow meta optimizer to be inner optimizer currently
         self.meta_optimizers_white_list = [
-            "LarsOptimizer", "LambOptimizer", "RecomputeOptimizer",
-            "LocalSGDOptimizer", "GradientMergeOptimizer",
-            "GraphExecutionOptimizer", "AdaptiveLocalSGDOptimizer"
+            "LarsOptimizer",
+            "LambOptimizer",
+            "RecomputeOptimizer",
+            "LocalSGDOptimizer",
+            "GradientMergeOptimizer",
+            "GraphExecutionOptimizer",
+            "AdaptiveLocalSGDOptimizer",
         ]
         self.meta_optimizers_black_list = ["DGCOptimizer"]
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
index 75271968fca..4ebac20888d 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
@@ -26,7 +26,8 @@ class LocalSGDOptimizer(MetaOptimizerBase):
         self.inner_opt = optimizer
         self.meta_optimizers_white_list = []
         self.meta_optimizers_black_list = [
-            "GraphExecutionOptimizer", "AdaptiveLocalSGDOptimizer"
+            "GraphExecutionOptimizer",
+            "AdaptiveLocalSGDOptimizer",
         ]
         self.snapshot_key = '@SNAPSHOT'
 
-- 
GitLab


From 8d05c00c67cf079429db53e0058d7213c13ea038 Mon Sep 17 00:00:00 2001
From: danleifeng <52735331+danleifeng@users.noreply.github.com>
Date: Thu, 17 Sep 2020 10:21:51 +0800
Subject: [PATCH 110/261] fix paddle.fleet en-doc for apis in dynamic mode
 (#27354)

* fix fleet dynamic-mode en-doc;test=develop
---
 .../distributed/fleet/base/fleet_base.py      | 235 +++++++++---------
 1 file changed, 121 insertions(+), 114 deletions(-)

diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
index 0dfcd5f3255..805c2d1fc73 100644
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -608,25 +608,31 @@ class Fleet(object):
     @dygraph_only
     def distributed_model(self, model):
         """
-        Return dygraph distributed data parallel model (Layer)
-        Only work in dygraph mode
+        Return distributed data parallel model (Only work in dygraph mode)
+
+        Args:
+            model (Layer): the user-defind model which inherits Layer.
+
+        Returns:
+            distributed data parallel model which inherits Layer.
 
         Examples:
+
             .. code-block:: python
-            import paddle
-            import paddle.nn as nn
-            from paddle.distributed import fleet
 
-            class LinearNet(nn.Layer):
-                def __init__(self):
-                    super(LinearNet, self).__init__()
-                    self._linear1 = nn.Linear(10, 10)
-                    self._linear2 = nn.Linear(10, 1)
+                import paddle
+                import paddle.nn as nn
+                from paddle.distributed import fleet
+
+                class LinearNet(nn.Layer):
+                    def __init__(self):
+                        super(LinearNet, self).__init__()
+                        self._linear1 = nn.Linear(10, 10)
+                        self._linear2 = nn.Linear(10, 1)
 
-                def forward(self, x):
-                    return self._linear2(self._linear1(x))
+                    def forward(self, x):
+                        return self._linear2(self._linear1(x))
 
-            def train():
                 # 1. enable dynamic mode
                 paddle.disable_static()
 
@@ -658,8 +664,7 @@ class Fleet(object):
                 adam.step()
                 adam.clear_grad()
 
-            if __name__ == '__main__':
-                paddle.distributed.spawn(train)
+
         """
         assert model is not None
         self.model = paddle.DataParallel(model)
@@ -669,29 +674,30 @@ class Fleet(object):
     def state_dict(self):
         """
         Get state dict information from optimizer.
-        Only work in dygraph mode
+        (Only work in dygraph mode)
 
         Returns: 
             state_dict(dict) : dict contains all the Tensor used by optimizer
 
         Examples:
             .. code-block:: python
-            import numpy as np
-            import paddle
-            from paddle.distributed import fleet
 
-            paddle.disable_static()
-            fleet.init(is_collective=True)
+                import numpy as np
+                import paddle
+                from paddle.distributed import fleet
+
+                paddle.disable_static()
+                fleet.init(is_collective=True)
 
-            value = np.arange(26).reshape(2, 13).astype("float32")
-            a = paddle.fluid.dygraph.to_variable(value)
+                value = np.arange(26).reshape(2, 13).astype("float32")
+                a = paddle.fluid.dygraph.to_variable(value)
 
-            layer = paddle.nn.Linear(13, 5)
-            adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
+                layer = paddle.nn.Linear(13, 5)
+                adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
 
-            adam = fleet.distributed_optimizer(adam)
-            dp_layer = fleet.distributed_model(layer)
-            state_dict = adam.state_dict()
+                adam = fleet.distributed_optimizer(adam)
+                dp_layer = fleet.distributed_model(layer)
+                state_dict = adam.state_dict()
         """
         # imitate target optimizer retrieval
         return self.user_defined_optimizer.state_dict()
@@ -700,34 +706,36 @@ class Fleet(object):
     def set_state_dict(self, state_dict):
         """
         Load optimizer state dict.
-        Only work in dygraph mode
+        (Only work in dygraph mode)
 
         Args: 
             state_dict(dict) : Dict contains all the Tensor needed by optimizer
 
-        Returns: None 
+        Returns:
+            None
 
         Examples:
             .. code-block:: python
-            import numpy as np
-            import paddle
-            from paddle.distributed import fleet
 
-            paddle.disable_static()
-            fleet.init(is_collective=True)
+                import numpy as np
+                import paddle
+                from paddle.distributed import fleet
+
+                paddle.disable_static()
+                fleet.init(is_collective=True)
 
-            value = np.arange(26).reshape(2, 13).astype("float32")
-            a = paddle.fluid.dygraph.to_variable(value)
+                value = np.arange(26).reshape(2, 13).astype("float32")
+                a = paddle.fluid.dygraph.to_variable(value)
 
-            layer = paddle.nn.Linear(13, 5)
-            adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
+                layer = paddle.nn.Linear(13, 5)
+                adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
 
-            adam = fleet.distributed_optimizer(adam)
-            dp_layer = fleet.distributed_model(layer)
-            state_dict = adam.state_dict()
-            paddle.framework.save(state_dict, "paddle_dy")
-            para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy")
-            adam.set_state_dict(opti_state_dict)
+                adam = fleet.distributed_optimizer(adam)
+                dp_layer = fleet.distributed_model(layer)
+                state_dict = adam.state_dict()
+                paddle.framework.save(state_dict, "paddle_dy")
+                para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy")
+                adam.set_state_dict(opti_state_dict)
         """
         # imitate target optimizer retrieval
         return self.user_defined_optimizer.set_state_dict(state_dict)
@@ -736,42 +744,44 @@ class Fleet(object):
     def set_lr(self, value):
         """
         Set the value of the learning rate manually in the optimizer. 
-        Only work in dygraph mode
+        (Only work in dygraph mode)
 
         Args:
             value (float|Tensor): the value of learning rate
 
-        Returns: None 
+        Returns: 
+            None 
 
         Examples:
             .. code-block:: python
-            import numpy as np
-            import paddle
-            from paddle.distributed import fleet
 
-            paddle.disable_static()
-            fleet.init(is_collective=True)
+                import numpy as np
+                import paddle
+                from paddle.distributed import fleet
 
-            value = np.arange(26).reshape(2, 13).astype("float32")
-            a = paddle.fluid.dygraph.to_variable(value)
+                paddle.disable_static()
+                fleet.init(is_collective=True)
 
-            layer = paddle.nn.Linear(13, 5)
-            adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
+                value = np.arange(26).reshape(2, 13).astype("float32")
+                a = paddle.fluid.dygraph.to_variable(value)
 
-            adam = fleet.distributed_optimizer(adam)
-            dp_layer = fleet.distributed_model(layer)
+                layer = paddle.nn.Linear(13, 5)
+                adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
 
-            lr_list = [0.2, 0.3, 0.4, 0.5, 0.6]
-            for i in range(5):
-                adam.set_lr(lr_list[i])
-                lr = adam.get_lr()
-                print("current lr is {}".format(lr))
-            # Print:
-            #    current lr is 0.2
-            #    current lr is 0.3
-            #    current lr is 0.4
-            #    current lr is 0.5
-            #    current lr is 0.6
+                adam = fleet.distributed_optimizer(adam)
+                dp_layer = fleet.distributed_model(layer)
+
+                lr_list = [0.2, 0.3, 0.4, 0.5, 0.6]
+                for i in range(5):
+                    adam.set_lr(lr_list[i])
+                    lr = adam.get_lr()
+                    print("current lr is {}".format(lr))
+                # Print:
+                #    current lr is 0.2
+                #    current lr is 0.3
+                #    current lr is 0.4
+                #    current lr is 0.5
+                #    current lr is 0.6
         """
         # imitate target optimizer retrieval
         return self.user_defined_optimizer.set_lr(value)
@@ -780,31 +790,32 @@ class Fleet(object):
     def get_lr(self):
         """
         Get current step learning rate.
-        Only work in dygraph mode
+        (Only work in dygraph mode)
 
         Returns:
             float: The learning rate of the current step.
 
         Examples:
             .. code-block:: python
-            import numpy as np
-            import paddle
-            from paddle.distributed import fleet
 
-            paddle.disable_static()
-            fleet.init(is_collective=True)
+                import numpy as np
+                import paddle
+                from paddle.distributed import fleet
+
+                paddle.disable_static()
+                fleet.init(is_collective=True)
 
-            value = np.arange(26).reshape(2, 13).astype("float32")
-            a = paddle.fluid.dygraph.to_variable(value)
+                value = np.arange(26).reshape(2, 13).astype("float32")
+                a = paddle.fluid.dygraph.to_variable(value)
 
-            layer = paddle.nn.Linear(13, 5)
-            adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
+                layer = paddle.nn.Linear(13, 5)
+                adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
 
-            adam = fleet.distributed_optimizer(adam)
-            dp_layer = fleet.distributed_model(layer)
+                adam = fleet.distributed_optimizer(adam)
+                dp_layer = fleet.distributed_model(layer)
 
-            lr = adam.get_lr()
-            print(lr) # 0.01
+                lr = adam.get_lr()
+                print(lr) # 0.01
         """
         # imitate target optimizer retrieval
         return self.user_defined_optimizer.get_lr()
@@ -813,27 +824,27 @@ class Fleet(object):
     def step(self):
         """
         Execute the optimizer once.
-        Only work in dygraph mode
+        (Only work in dygraph mode)
 
-        Returns: None
+        Returns:
+            None
 
         Examples:
             .. code-block:: python
 
-            import paddle
-            import paddle.nn as nn
-            from paddle.distributed import fleet
+                import paddle
+                import paddle.nn as nn
+                from paddle.distributed import fleet
 
-            class LinearNet(nn.Layer):
-                def __init__(self):
-                    super(LinearNet, self).__init__()
-                    self._linear1 = nn.Linear(10, 10)
-                    self._linear2 = nn.Linear(10, 1)
+                class LinearNet(nn.Layer):
+                    def __init__(self):
+                        super(LinearNet, self).__init__()
+                        self._linear1 = nn.Linear(10, 10)
+                        self._linear2 = nn.Linear(10, 1)
 
-                def forward(self, x):
-                    return self._linear2(self._linear1(x))
+                    def forward(self, x):
+                        return self._linear2(self._linear1(x))
 
-            def train():
                 # 1. enable dynamic mode
                 paddle.disable_static()
 
@@ -865,8 +876,6 @@ class Fleet(object):
                 adam.step()
                 adam.clear_grad()
 
-            if __name__ == '__main__':
-                paddle.distributed.spawn(train)
 
         """
         # imitate target optimizer retrieval
@@ -875,28 +884,28 @@ class Fleet(object):
     @dygraph_only
     def clear_grad(self):
         """
-        Execute the optimizer once.
-        Only work in dygraph mode
+        Clear the gradients of all optimized parameters for model.
+        (Only work in dygraph mode)
 
-        Returns: None
+        Returns: 
+            None
 
         Examples:
             .. code-block:: python
 
-            import paddle
-            import paddle.nn as nn
-            from paddle.distributed import fleet
+                import paddle
+                import paddle.nn as nn
+                from paddle.distributed import fleet
 
-            class LinearNet(nn.Layer):
-                def __init__(self):
-                    super(LinearNet, self).__init__()
-                    self._linear1 = nn.Linear(10, 10)
-                    self._linear2 = nn.Linear(10, 1)
+                class LinearNet(nn.Layer):
+                    def __init__(self):
+                        super(LinearNet, self).__init__()
+                        self._linear1 = nn.Linear(10, 10)
+                        self._linear2 = nn.Linear(10, 1)
 
-                def forward(self, x):
-                    return self._linear2(self._linear1(x))
+                    def forward(self, x):
+                        return self._linear2(self._linear1(x))
 
-            def train():
                 # 1. enable dynamic mode
                 paddle.disable_static()
 
@@ -928,8 +937,6 @@ class Fleet(object):
                 adam.step()
                 adam.clear_grad()
 
-            if __name__ == '__main__':
-                paddle.distributed.spawn(train)
         """
         # imitate target optimizer retrieval
         return self.user_defined_optimizer.clear_grad()
-- 
GitLab


From d4b4357bc01253221fd37f6b81b596d4e8265de1 Mon Sep 17 00:00:00 2001
From: Huihuang Zheng <zhhsplendid@gmail.com>
Date: Thu, 17 Sep 2020 10:23:42 +0800
Subject: [PATCH 111/261] [Dy2stat] Change the Global Switch Name of
 ProgramTranslator for API 2.0 (#27203)

Change ProgramTranslator.enable_declarative to ProgramTranslator.enable_to_static to meet API 2.0
---
 .../dygraph_to_static/program_translator.py   | 46 +++++++++++--------
 python/paddle/fluid/dygraph/jit.py            |  6 +--
 python/paddle/hapi/model.py                   | 10 ++--
 3 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
index e5fce3e6ede..dbf030ccda1 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
@@ -246,7 +246,7 @@ class StaticLayer(object):
         self._function_spec = FunctionSpec(function, input_spec)
         self._program_cache = ProgramCache()
         self._descriptor_cache = weakref.WeakKeyDictionary()
-        # Note: Hold a reference to ProgramTranslator for switching `enable_declarative`.
+        # Note: Hold a reference to ProgramTranslator for switching `enable_to_static`.
         self._program_trans = ProgramTranslator()
 
     def __get__(self, instance, owner):
@@ -299,16 +299,17 @@ class StaticLayer(object):
         """
 
         # 1. call dygraph function directly if not enable `declarative`
-        if not self._program_trans.enable_declarative:
+        if not self._program_trans.enable_to_static:
             logging_utils.warn(
-                "The decorator '@paddle.jit.to_static' does NOT work when setting ProgramTranslator.enable=False. "
-                "We will just return dygraph output.")
+                "The decorator '@paddle.jit.to_static' does NOT work when setting ProgramTranslator.enable to False. "
+                "We will just return dygraph output. If you would like to get static graph output, please call API "
+                "ProgramTranslator.enable(True)")
             return self._call_dygraph_function(*args, **kwargs)
 
-        if not in_dygraph_mode() and self._program_trans.enable_declarative:
+        if not in_dygraph_mode():
             raise RuntimeError(
                 "Failed to run the callable object {} decorated by '@paddle.jit.to_static', "
-                "because it does NOT in dynamic mode. Please disable the static mode to enter dynamic mode with the "
+                "because it is NOT in dynamic mode. Please disable the static mode to enter dynamic mode with the "
                 "following API: paddle.disable_static().".format(
                     self.dygraph_function))
 
@@ -723,15 +724,15 @@ class ProgramTranslator(object):
             return
         self._initialized = True
         self._program_cache = ProgramCache()
-        self.enable_declarative = True
+        self.enable_to_static = True
 
-    def enable(self, enable_declarative):
+    def enable(self, enable_to_static):
         """
         Enable or disable the converting from imperative to declarative by
         ProgramTranslator globally.
 
         Args:
-            enable_declarative (bool): True or False to enable or disable declarative.
+            enable_to_static (bool): True or False to enable or disable declarative.
 
         Returns:
             None.
@@ -760,9 +761,9 @@ class ProgramTranslator(object):
                 print(func(x).numpy()) # [[2. 2.]]
 
         """
-        check_type(enable_declarative, "enable_declarative", bool,
+        check_type(enable_to_static, "enable_to_static", bool,
                    "ProgramTranslator.enable")
-        self.enable_declarative = enable_declarative
+        self.enable_to_static = enable_to_static
 
     def get_output(self, dygraph_func, *args, **kwargs):
         """
@@ -803,10 +804,12 @@ class ProgramTranslator(object):
         assert callable(
             dygraph_func
         ), "Input dygraph_func is not a callable in ProgramTranslator.get_output"
-        if not self.enable_declarative:
+        if not self.enable_to_static:
             warnings.warn(
-                "The ProgramTranslator.get_output doesn't work when setting ProgramTranslator.enable = False. "
-                "We will just return dygraph output.")
+                "The ProgramTranslator.get_output doesn't work when setting ProgramTranslator.enable to False. "
+                "We will just return dygraph output. "
+                "Please call ProgramTranslator.enable(True) if you would like to get static output."
+            )
             return dygraph_func(*args, **kwargs)
         try:
             function_spec = FunctionSpec(dygraph_func)
@@ -876,10 +879,11 @@ class ProgramTranslator(object):
         assert callable(
             dygraph_func
         ), "Input dygraph_func is not a callable in ProgramTranslator.get_func"
-        if not self.enable_declarative:
+        if not self.enable_to_static:
             warnings.warn(
-                "The ProgramTranslator.get_func doesn't work when setting ProgramTranslator.enable=False. We will "
-                "just return dygraph output.")
+                "The ProgramTranslator.get_func doesn't work when setting ProgramTranslator.enable to False. We will "
+                "just return dygraph output. Please call ProgramTranslator.enable(True) if you would like to get static output."
+            )
             return dygraph_func
 
         static_func = convert_to_static(dygraph_func)
@@ -929,10 +933,12 @@ class ProgramTranslator(object):
         assert callable(
             dygraph_func
         ), "Input dygraph_func is not a callable in ProgramTranslator.get_program"
-        if not self.enable_declarative:
+        if not self.enable_to_static:
             warnings.warn(
-                "The ProgramTranslator.get_program doesn't work when setting ProgramTranslator.enable=False."
-                "We will just return dygraph output.")
+                "The ProgramTranslator.get_program doesn't work when setting ProgramTranslator.enable to False."
+                "We will just return dygraph output. "
+                "Please call ProgramTranslator.enable(True) if you would like to get static output."
+            )
             return dygraph_func(*args, **kwargs)
 
         function_spec = FunctionSpec(dygraph_func)
diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py
index 57864efec8a..834c1a737d7 100644
--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
@@ -119,7 +119,7 @@ def _dygraph_to_static_func_(dygraph_func):
     # TODO: remove this decorator after we finalize training API
     def __impl__(*args, **kwargs):
         program_translator = ProgramTranslator()
-        if in_dygraph_mode() or not program_translator.enable_declarative:
+        if in_dygraph_mode() or not program_translator.enable_to_static:
             warnings.warn(
                 "The decorator 'dygraph_to_static_func' doesn't work in "
                 "dygraph mode or set ProgramTranslator.enable to False. "
@@ -832,9 +832,9 @@ def save(layer, model_path, input_spec=None, config=None):
 
     # 1. input check
     prog_translator = ProgramTranslator()
-    if not prog_translator.enable:
+    if not prog_translator.enable_to_static:
         raise RuntimeError(
-            "The paddle.jit.save doesn't work when setting ProgramTranslator.enable=False."
+            "The paddle.jit.save doesn't work when setting ProgramTranslator.enable to False."
         )
     if not isinstance(layer, Layer):
         raise TypeError(
diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py
index 2836a151ec3..c445977df14 100644
--- a/python/paddle/hapi/model.py
+++ b/python/paddle/hapi/model.py
@@ -1680,7 +1680,7 @@ class Model(object):
 
         # TODO:
         # 1. Make it Unnecessary to run model before calling `save_inference_model` for users in dygraph.
-        # 2. Save correct shape of input, now the interface stores the shape that the user sent to 
+        # 2. Save correct shape of input, now the interface stores the shape that the user sent to
         #    the inputs of the model in running.
         # 3. Make it Unnecessary to add `@paddle.jit.to_static` for users in dynamic mode.
         if fluid.in_dygraph_mode():
@@ -1689,9 +1689,9 @@ class Model(object):
 
                 # 1. input check
                 prog_translator = ProgramTranslator()
-                if not prog_translator.enable_declarative:
+                if not prog_translator.enable_to_static:
                     raise RuntimeError(
-                        "save_inference_model doesn't work when setting ProgramTranslator.enable=False."
+                        "save_inference_model doesn't work when setting ProgramTranslator.enable to False."
                     )
                 if not isinstance(layer, Layer):
                     raise TypeError(
@@ -1902,8 +1902,8 @@ class Model(object):
                 assert isinstance(spec, Input)
                 if spec.name is None:
                     raise ValueError(
-                        "Requires Input[{}].name != None, but receive `None` with {}.".
-                        format(i, spec))
+                        "Requires Input[{}].name != None, but receive `None` with {}."
+                        .format(i, spec))
 
         return out_specs
 
-- 
GitLab


From a7fadce82fe7b660af40a869a096fd617da468db Mon Sep 17 00:00:00 2001
From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Date: Thu, 17 Sep 2020 14:10:48 +0800
Subject: [PATCH 112/261] fix dll load bug on windows from python3.8 (#27324)

---
 python/paddle/fluid/core.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py
index a05aa3b0a84..2e3bb6b0021 100644
--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -39,6 +39,11 @@ try:
         third_lib_path = current_path + os.sep + '..' + os.sep + 'libs'
         os.environ['path'] = third_lib_path + ';' + os.environ['path']
         sys.path.insert(0, third_lib_path)
+        # Note: from python3.8, PATH will not take effect
+        # https://github.com/python/cpython/pull/12302
+        # Use add_dll_directory to specify dll resolution path
+        if sys.version_info[:2] >= (3, 8):
+            os.add_dll_directory(third_lib_path)
 
 except ImportError as e:
     from .. import compat as cpt
-- 
GitLab


From 9bea834ed4064b8229b352bd07ebb686166eb692 Mon Sep 17 00:00:00 2001
From: guofei <52460041+gfwm2013@users.noreply.github.com>
Date: Thu, 17 Sep 2020 14:14:16 +0800
Subject: [PATCH 113/261] Refine the unittest to support py38 (#27208)

* Refine the unittest to support py38

    test=develop
---
 .../fluid/tests/unittests/test_device_guard.py | 18 ++++++++++++++++--
 .../fluid/tests/unittests/test_gather_op.py    |  2 +-
 .../unittests/test_save_model_without_var.py   |  2 +-
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_device_guard.py b/python/paddle/fluid/tests/unittests/test_device_guard.py
index eb8861f1bc4..330065ecd92 100644
--- a/python/paddle/fluid/tests/unittests/test_device_guard.py
+++ b/python/paddle/fluid/tests/unittests/test_device_guard.py
@@ -33,6 +33,14 @@ def execute(main_program, startup_program):
     exe.run(main_program)
 
 
+def get_vaild_warning_num(warning, w):
+    num = 0
+    for i in range(len(w)):
+        if warning in str(w[i].message):
+            num += 1
+    return num
+
+
 class TestDeviceGuard(unittest.TestCase):
     def test_device_guard(self):
         main_program = fluid.Program()
@@ -133,7 +141,10 @@ class TestDeviceGuard(unittest.TestCase):
                         i = fluid.layers.increment(x=i, value=1, in_place=True)
                         fluid.layers.less_than(x=i, y=loop_len, cond=cond)
 
-        assert len(w) == 1
+        warning = "The Op(while) is not support to set device."
+        warning_num = get_vaild_warning_num(warning, w)
+        assert warning_num == 1
+
         all_ops = main_program.global_block().ops
         device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
         for op in all_ops:
@@ -169,7 +180,10 @@ class TestDeviceGuard(unittest.TestCase):
                         shape=[1], value=4.0, dtype='float32')
                     result = fluid.layers.less_than(x=x, y=y, force_cpu=False)
 
-        assert len(w) == 2
+        warning = "\'device_guard\' has higher priority when they are used at the same time."
+        warning_num = get_vaild_warning_num(warning, w)
+        assert warning_num == 2
+
         all_ops = main_program.global_block().ops
         device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
         for op in all_ops:
diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py
index 1f6e522d266..5dcce88acf1 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_op.py
@@ -216,7 +216,7 @@ class API_TestGather(unittest.TestCase):
                       "index": index_np,
                       'axis': axis_np},
                 fetch_list=[out])
-            expected_output = gather_numpy(x_np, index_np, axis_np)
+            expected_output = gather_numpy(x_np, index_np, axis_np[0])
         self.assertTrue(np.allclose(result, expected_output))
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_save_model_without_var.py b/python/paddle/fluid/tests/unittests/test_save_model_without_var.py
index b74a6e10917..4c63dced83b 100644
--- a/python/paddle/fluid/tests/unittests/test_save_model_without_var.py
+++ b/python/paddle/fluid/tests/unittests/test_save_model_without_var.py
@@ -50,7 +50,7 @@ class TestSaveModelWithoutVar(unittest.TestCase):
                 params_filename='params')
             expected_warn = "no variable in your model, please ensure there are any variables in your model to save"
             self.assertTrue(len(w) > 0)
-            self.assertTrue(expected_warn == str(w[0].message))
+            self.assertTrue(expected_warn == str(w[-1].message))
 
 
 if __name__ == '__main__':
-- 
GitLab


From 7d7f0fad7fa1d9db07d1836d21955fe93eb7297c Mon Sep 17 00:00:00 2001
From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Date: Thu, 17 Sep 2020 14:34:50 +0800
Subject: [PATCH 114/261] python for windows not install netifaces (#27355)

---
 python/requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/requirements.txt b/python/requirements.txt
index ddd1e943df7..47888424755 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -18,4 +18,5 @@ decorator
 prettytable
 astor
 pathlib
-netifaces
+netifaces ; platform_system != "Windows"
+netifaces ; python_version>="3.5" and platform_system == "Windows"
-- 
GitLab


From 9ee77b1f4138835dc13bb1bd3945320b5d5850ad Mon Sep 17 00:00:00 2001
From: ShenLiang <shenliang03@baidu.com>
Date: Thu, 17 Sep 2020 14:40:55 +0800
Subject: [PATCH 115/261] Fix elementwise_floordiv op (#27352)

* fix floordiv
---
 .../operators/elementwise/elementwise_floordiv_op.h   | 11 +++++++++--
 .../tests/unittests/test_elementwise_floordiv_op.py   |  7 +++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
index 5dc93740949..721c23e3830 100644
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
@@ -61,8 +61,15 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx,
                            const framework::Tensor *x,
                            const framework::Tensor *y, framework::Tensor *z) {
   int axis = ctx.Attr<int>("axis");
-  ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
-      ctx, x, y, axis, FloorDivFunctor<T>(), z);
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+  if (x_dims.size() >= y_dims.size()) {
+    ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
+        ctx, x, y, axis, FloorDivFunctor<T>(), z);
+  } else {
+    ElementwiseComputeEx<InverseFloorDivFunctor<T>, DeviceContext, T>(
+        ctx, x, y, axis, InverseFloorDivFunctor<T>(), z);
+  }
 }
 
 template <typename DeviceContext, typename T>
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
index f339081e31b..007affc1408 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
@@ -67,6 +67,13 @@ class TestElementwiseModOp_scalar(TestElementwiseModOp):
         self.out = np.floor_divide(self.x, self.y)
 
 
+class TestElementwiseModOpInverse(TestElementwiseModOp):
+    def init_input_output(self):
+        self.x = np.random.uniform(0, 10000, [10]).astype(self.dtype)
+        self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
+        self.out = np.floor_divide(self.x, self.y)
+
+
 class TestFloorDivideOp(unittest.TestCase):
     def test_name(self):
         with fluid.program_guard(fluid.Program()):
-- 
GitLab


From d773c6c94ee7622378e53dd93624825817a4fef7 Mon Sep 17 00:00:00 2001
From: chalsliu <45041955+chalsliu@users.noreply.github.com>
Date: Thu, 17 Sep 2020 15:01:12 +0800
Subject: [PATCH 116/261] Support precision test

---
 paddle/scripts/paddle_build.sh | 19 ++++++++-
 tools/get_pr_ut.py             | 74 ++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+), 2 deletions(-)
 create mode 100644 tools/get_pr_ut.py

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index ec07565c5af..afc613c4beb 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -930,6 +930,10 @@ function parallel_test_base_gpu() {
 EOF
 
 set +x
+        precison_cases=""
+        if [ ${PRECISION_TEST:-OFF} == "ON" ]; then
+            precision_cases=`python $PADDLE_ROOT/tools/get_pr_ut.py`
+        fi
         EXIT_CODE=0;
         test_cases=$(ctest -N -V) # get all test cases
         exclusive_tests=''        # cases list which would be run exclusively
@@ -959,6 +963,19 @@ set +x
                     echo $testcase" will only run at night."
                     continue
                 fi
+                if [ ${PRECISION_TEST:-OFF} == "ON" ] && [[ "$precision_cases" != "" ]]; then
+                    will_test="false"
+                    for case in $precision_cases; do
+                        if [[ $testcase == $case ]]; then
+                            will_test="true"
+                            break
+                        fi
+                    done
+                    if [[ $will_test == "false" ]]; then
+                        echo $testcase" won't run in PRECISION_TEST mode."
+                        continue
+                    fi
+                fi
 
                 if [[ "$is_multicard" == "" ]]; then
                   # trick: treat all test case with prefix "test_dist" as dist case, and would run on 2 GPUs
@@ -1077,8 +1094,6 @@ set +x
                 done
         fi
 
-
-       
         if [[ "$EXIT_CODE" != "0" ]]; then
             if [[ "$failed_test_lists" == "" ]]; then
                 echo "========================================"
diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py
new file mode 100644
index 00000000000..970f89551c5
--- /dev/null
+++ b/tools/get_pr_ut.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" For the PR that only modified the unit test, get cases in pull request. """
+
+import os
+import json
+from github import Github
+
+PADDLE_ROOT = os.getenv('PADDLE_ROOT', '/paddle/')
+
+
+class PRChecker(object):
+    """ PR Checker. """
+
+    def __init__(self):
+        self.github = Github(os.getenv('GITHUB_API_TOKEN'), timeout=60)
+        self.repo = self.github.get_repo('PaddlePaddle/Paddle')
+        self.pr = None
+
+    def init(self):
+        """ Get pull request. """
+        pr_id = os.getenv('GIT_PR_ID')
+        if not pr_id:
+            print('No PR ID')
+            exit(0)
+        self.pr = self.repo.get_pull(int(pr_id))
+
+    def get_pr_files(self):
+        """ Get files in pull request. """
+        page = 0
+        file_list = []
+        while True:
+            files = self.pr.get_files().get_page(page)
+            if not files:
+                break
+            for f in files:
+                file_list.append(PADDLE_ROOT + f.filename)
+            page += 1
+        return file_list
+
+    def get_pr_ut(self):
+        """ Get unit tests in pull request. """
+        ut_list = []
+        file_ut_map = None
+        cmd = 'wget -q --no-check-certificate https://sys-p0.bj.bcebos.com/prec/file_ut.json'
+        os.system(cmd)
+        with open('file_ut.json') as jsonfile:
+            file_ut_map = json.load(jsonfile)
+        for f in self.get_pr_files():
+            if f not in file_ut_map:
+                return ''
+            if f.endswith('.h') or f.endswith('.cu'):
+                return ''
+            else:
+                ut_list.extend(file_ut_map.get(f))
+        ut_list = list(set(ut_list))
+        return ' '.join(ut_list)
+
+
+if __name__ == '__main__':
+    pr_checker = PRChecker()
+    pr_checker.init()
+    print(pr_checker.get_pr_ut())
-- 
GitLab


From 9f9d15e2854bcfd2cb56631f9255f7b5dc2c1f03 Mon Sep 17 00:00:00 2001
From: lilong12 <lilong12@baidu.com>
Date: Thu, 17 Sep 2020 15:03:35 +0800
Subject: [PATCH 117/261] fix the bug of non-exit, test=develop (#27350)

---
 paddle/fluid/framework/device_worker.h     | 1 +
 paddle/fluid/framework/pipeline_trainer.cc | 1 +
 paddle/fluid/framework/section_worker.cc   | 2 --
 3 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 04befbe1ca0..efe6fa1b2da 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -441,6 +441,7 @@ class SectionWorker : public DeviceWorker {
     skip_vars_ = skip_vars;
   }
   static void ResetBatchId() { batch_id_ = 0; }
+  static void ResetThreadCompletedFlag() { threads_completed = false; }
 
   static std::atomic<int> cpu_id_;
 
diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc
index 758b728fd9c..d7506edbf4c 100644
--- a/paddle/fluid/framework/pipeline_trainer.cc
+++ b/paddle/fluid/framework/pipeline_trainer.cc
@@ -251,6 +251,7 @@ void PipelineTrainer::Finalize() {
   }
   root_scope_->DropKids();
   SectionWorker::ResetBatchId();
+  SectionWorker::ResetThreadCompletedFlag();
 }
 
 Scope* PipelineTrainer::GetWorkerScope(int thread_id) {
diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index 03b7afbb877..b9a3cac0ec4 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -196,7 +196,6 @@ void SectionWorker::TrainFiles() {
         if (threads_completed) {
           VLOG(3) << "thread " << thread_id_ << " completed.";
           lk.unlock();
-          threads_completed = false;
           return;
         }
         lk.unlock();
@@ -459,7 +458,6 @@ void SectionWorker::TrainFilesWithProfiler() {
                     << ", mean_time: " << op_total_time[i] / op_count[i];
           }
           VLOG(0) << "================================";
-          threads_completed = false;
           return;
         }
         lk.unlock();
-- 
GitLab


From 25902b2c12e7305e802c5bec7c1da166e97b0541 Mon Sep 17 00:00:00 2001
From: YUNSHEN XIE <1084314248@qq.com>
Date: Thu, 17 Sep 2020 15:27:15 +0800
Subject: [PATCH 118/261] del exclusive ut which name with test_dist_ (#27316)

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index b7848066280..6d349a3f934 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -4,6 +4,7 @@ set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1 FL
 set(dist_ENVS http_proxy="" https_proxy="")
 
 file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py")
+list(REMOVE_ITEM DIST_TEST_OPS "test_dist_op")
 if(NOT WITH_NCCL)
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_dgc_nccl")
 endif()
-- 
GitLab


From fd496fa9a3b04509d3996514e0a174da602cb9fd Mon Sep 17 00:00:00 2001
From: YUNSHEN XIE <1084314248@qq.com>
Date: Thu, 17 Sep 2020 15:27:37 +0800
Subject: [PATCH 119/261] modified the ut name with test_dist (#27315)

---
 paddle/scripts/paddle_build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index afc613c4beb..8bbd27a0fd3 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -979,7 +979,7 @@ set +x
 
                 if [[ "$is_multicard" == "" ]]; then
                   # trick: treat all test case with prefix "test_dist" as dist case, and would run on 2 GPUs
-                  read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist")
+                  read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist_")
                 fi
 
                 if [[ "$is_exclusive" != "" ]]; then
-- 
GitLab


From f0a5eef58fc8435a0243ea816e2bc13d9ec7b99c Mon Sep 17 00:00:00 2001
From: YUNSHEN XIE <1084314248@qq.com>
Date: Thu, 17 Sep 2020 15:27:57 +0800
Subject: [PATCH 120/261] cancel three disable ut (#27359)

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 6d349a3f934..8d236dca22f 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -103,7 +103,6 @@ if(WIN32)
 endif()
 
 
-LIST(REMOVE_ITEM TEST_OPS test_fleet_rolemaker_new)
 LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint)
 LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint1)
 LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint2)
@@ -464,8 +463,8 @@ if(WITH_DISTRIBUTE)
 	   #py_test_modules(test_fleet_auto MODULES test_fleet_auto ENVS ${dist_ENVS})
         if(NOT WIN32)
             py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS})
-            #py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS})
-            #py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS})
+            py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS})
+            py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS})
         endif(NOT WIN32)
     endif(NOT APPLE)
     if(WITH_DGC)
-- 
GitLab


From 69279207eb7226259c906b5b4ebbd7383a7198ae Mon Sep 17 00:00:00 2001
From: LiuChiachi <709153940@qq.com>
Date: Thu, 17 Sep 2020 15:39:07 +0800
Subject: [PATCH 121/261] Update hapi.model._save_inference_model by using new
 features of dy2stat in 2.0-beta API (#27272)

* update model.save_inference_model

* update doc for _save_inference_model, delete useless class in unittests

* make users not be able to set model._inputs be None

* update usage of Model class in unittests

* fix bugs of _verify_spec

* fix bugs of _verify_spec

* add unittest to increase coverage rate

* delete http.log

* update doc for save, remove requirments and limitations for using

* update doc for class Model
---
 python/paddle/hapi/model.py       | 46 +++++++---------------------
 python/paddle/tests/test_model.py | 50 ++++++++++---------------------
 2 files changed, 26 insertions(+), 70 deletions(-)

diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py
index c445977df14..d41852c9d7f 100644
--- a/python/paddle/hapi/model.py
+++ b/python/paddle/hapi/model.py
@@ -792,15 +792,14 @@ class Model(object):
     switched by `paddle.disable_static()`. The usage is as follows.
     But note, the switching between dynamic and static should be before
     instantiating a Model. The input description, i.e, paddle.static.InputSpec,
-    must be required for static graph.
+    must be required.
 
     Args:
         network (paddle.nn.Layer): The network is an instance of
             paddle.nn.Layer.
         inputs (InputSpec|list|dict|None): `inputs`, entry points of network,
             could be a InputSpec instance, or lits of InputSpec instances,
-            or dict ({name: InputSpec}), or None. For static graph,
-            inputs must be set. For dynamic graph, it could be None.
+            or dict ({name: InputSpec}), and it couldn't be None.
         labels (InputSpec|list|None): `labels`, entry points of network,
             could be a InputSpec instnace or lits of InputSpec instances,
             or None. For static graph, if labels is required in loss,
@@ -849,10 +848,9 @@ class Model(object):
         self._optimizer = None
         self._test_dataloader = None
 
-        if not in_dygraph_mode():
-            if not isinstance(inputs, (list, dict, Input)):
-                raise TypeError(
-                    "'inputs' must be list or dict in static graph mode")
+        if not isinstance(inputs, (list, dict, Input)):
+            raise TypeError(
+                "'inputs' must be list or dict, and couldn't be None.")
         self._inputs = self._verify_spec(inputs, True)
         self._labels = self._verify_spec(labels)
 
@@ -1004,11 +1002,7 @@ class Model(object):
         have no variable need to save (like SGD), the fill will not generated).
         This function will silently overwrite existing file at the target location.
 
-        If `training` is set to False, only inference model will be saved. It 
-        should be noted that before using `save`, you should run the model, and 
-        the shape of input you saved is as same as the input of its running.
-        `@paddle.jit.to_static` must be added on `forward` function of your layer 
-        in dynamic mode now and these will be optimized later.
+        If `training` is set to False, only inference model will be saved.
 
         Args:
             path (str): The file prefix to save model. The format is
@@ -1037,8 +1031,6 @@ class Model(object):
                             nn.Linear(200, 10),
                             nn.Softmax())
 
-                    # If save for inference in dygraph, need this
-                    @paddle.jit.to_static
                     def forward(self, x):
                         return self.net(x)
 
@@ -1046,7 +1038,7 @@ class Model(object):
                 device = paddle.set_device('cpu')
                 # if use static graph, do not set
                 paddle.disable_static(device) if dynamic else None
-                # inputs and labels are not required for dynamic graph.
+
                 input = InputSpec([None, 784], 'float32', 'x')
                 label = InputSpec([None, 1], 'int64', 'label')
                 model = paddle.Model(Mnist(), input, label)
@@ -1649,10 +1641,6 @@ class Model(object):
                               model_only=False):
         """
         Save inference model can be in static or dynamic mode.
-        It should be noted that before using `save_inference_model`, you should
-        run the model, and the shape you saved is as same as the input of its
-        running. `@paddle.jit.to_static` must be added on `forward` function of
-        your layer in dynamic mode now and these will be optimized later.
 
         Args:
             save_dir (str): The directory path to save the inference model.
@@ -1678,14 +1666,11 @@ class Model(object):
 
             return result_list
 
-        # TODO:
-        # 1. Make it Unnecessary to run model before calling `save_inference_model` for users in dygraph.
-        # 2. Save correct shape of input, now the interface stores the shape that the user sent to
-        #    the inputs of the model in running.
-        # 3. Make it Unnecessary to add `@paddle.jit.to_static` for users in dynamic mode.
         if fluid.in_dygraph_mode():
             with fluid.framework._dygraph_guard(None):
                 layer = self.network
+                layer.forward = paddle.jit.to_static(
+                    layer.forward, input_spec=self._inputs)
 
                 # 1. input check
                 prog_translator = ProgramTranslator()
@@ -1879,18 +1864,7 @@ class Model(object):
     def _verify_spec(self, specs, is_input=False):
         out_specs = []
 
-        if specs is None:
-            # Note(Aurelius84): If not specific specs of `Input`, using argument names of `forward` function
-            # to generate `Input`. But how can we know the actual shape of each input tensor?
-            if is_input:
-                out_specs = [
-                    Input(
-                        name=n, shape=[None])
-                    for n in extract_args(self.network.forward) if n != 'self'
-                ]
-            else:
-                out_specs = to_list(specs)
-        elif isinstance(specs, dict):
+        if isinstance(specs, dict):
             assert is_input == False
             out_specs = [specs[n] \
                 for n in extract_args(self.network.forward) if n != 'self']
diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py
index 5c4e98feaa6..62cc39c1f7b 100644
--- a/python/paddle/tests/test_model.py
+++ b/python/paddle/tests/test_model.py
@@ -67,35 +67,6 @@ class LeNetDygraph(paddle.nn.Layer):
         return x
 
 
-class LeNetDeclarative(fluid.dygraph.Layer):
-    def __init__(self, num_classes=10, classifier_activation=None):
-        super(LeNetDeclarative, self).__init__()
-        self.num_classes = num_classes
-        self.features = Sequential(
-            Conv2d(
-                1, 6, 3, stride=1, padding=1),
-            ReLU(),
-            Pool2D(2, 'max', 2),
-            Conv2d(
-                6, 16, 5, stride=1, padding=0),
-            ReLU(),
-            Pool2D(2, 'max', 2))
-
-        if num_classes > 0:
-            self.fc = Sequential(
-                Linear(400, 120), Linear(120, 84), Linear(84, 10),
-                Softmax())  #Todo: accept any activation
-
-    @declarative
-    def forward(self, inputs):
-        x = self.features(inputs)
-
-        if self.num_classes > 0:
-            x = fluid.layers.flatten(x, 1)
-            x = self.fc(x)
-        return x
-
-
 class MnistDataset(MNIST):
     def __init__(self, mode, return_label=True, sample_num=None):
         super(MnistDataset, self).__init__(mode=mode)
@@ -444,7 +415,9 @@ class TestModelFunction(unittest.TestCase):
         # dynamic saving
         device = paddle.set_device('cpu')
         fluid.enable_dygraph(device)
-        model = Model(MyModel(classifier_activation=None))
+        inputs = [InputSpec([None, 20], 'float32', 'x')]
+        labels = [InputSpec([None, 1], 'int64', 'label')]
+        model = Model(MyModel(classifier_activation=None), inputs, labels)
         optim = fluid.optimizer.SGD(learning_rate=0.001,
                                     parameter_list=model.parameters())
         model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))
@@ -543,11 +516,10 @@ class TestModelFunction(unittest.TestCase):
 
     def test_export_deploy_model(self):
         for dynamic in [True, False]:
-            fluid.enable_dygraph() if dynamic else None
-            # paddle.disable_static() if dynamic else None
+            paddle.disable_static() if dynamic else None
             prog_translator = ProgramTranslator()
             prog_translator.enable(False) if not dynamic else None
-            net = LeNetDeclarative()
+            net = LeNet()
             inputs = [InputSpec([None, 1, 28, 28], 'float32', 'x')]
             model = Model(net, inputs)
             model.prepare()
@@ -556,8 +528,9 @@ class TestModelFunction(unittest.TestCase):
                 os.makedirs(save_dir)
             tensor_img = np.array(
                 np.random.random((1, 1, 28, 28)), dtype=np.float32)
-            ori_results = model.test_batch(tensor_img)
+
             model.save(save_dir, training=False)
+            ori_results = model.test_batch(tensor_img)
             fluid.disable_dygraph() if dynamic else None
 
             place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda(
@@ -574,6 +547,7 @@ class TestModelFunction(unittest.TestCase):
                 np.testing.assert_allclose(
                     results, ori_results, rtol=1e-5, atol=1e-7)
                 shutil.rmtree(save_dir)
+            paddle.enable_static()
 
 
 class TestRaiseError(unittest.TestCase):
@@ -585,6 +559,14 @@ class TestRaiseError(unittest.TestCase):
         with self.assertRaises(ValueError):
             model = Model(net, inputs, labels)
 
+    def test_input_without_input_spec(self):
+        for dynamic in [True, False]:
+            paddle.disable_static() if dynamic else None
+            net = MyModel(classifier_activation=None)
+            with self.assertRaises(TypeError):
+                model = Model(net)
+            paddle.enable_static()
+
 
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab


From 63203c4abc2d1472c361fe7962f3885f7896559e Mon Sep 17 00:00:00 2001
From: Jack Zhou <136876878@qq.com>
Date: Thu, 17 Sep 2020 15:47:11 +0800
Subject: [PATCH 122/261] enhance reduce op which can reduce tensor with
 arbitrary rank

enhance reduce op which can reduce tensor with arbitrary rank
---
 paddle/fluid/operators/math/math_function.cc  |  51 ++++
 paddle/fluid/operators/math/math_function.cu  |  99 +++++++-
 paddle/fluid/operators/math/math_function.h   |   8 +
 paddle/fluid/operators/reduce_ops/reduce_op.h | 197 ++++++++++++----
 paddle/fluid/operators/transpose_op.h         |   7 +-
 paddle/fluid/platform/device_context.cc       |  17 ++
 paddle/fluid/platform/device_context.h        |   7 +
 .../fluid/tests/unittests/test_reduce_op.py   | 219 +++++++++++++++++-
 .../tests/unittests/test_transpose_op.py      |  12 +
 9 files changed, 558 insertions(+), 59 deletions(-)

diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index f44b33fcf2f..b8af5a21ca5 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -22,10 +22,12 @@ limitations under the License. */
 #include <cblas.h>
 #endif
 
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/operators/math/math_function_impl.h"
 #include "paddle/fluid/platform/float16.h"
+#include "unsupported/Eigen/CXX11/Tensor"
 
 namespace paddle {
 namespace operators {
@@ -63,6 +65,55 @@ DEFINE_CPU_TRANS(4);
 DEFINE_CPU_TRANS(5);
 DEFINE_CPU_TRANS(6);
 
+template <typename T>
+struct TransposeNormal<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& in, framework::Tensor* out,
+                  const std::vector<int>& axis) {
+    const int rank = axis.size();
+    auto in_stride = framework::stride(in.dims());
+    auto out_stride = framework::stride(out->dims());
+    const T* in_ptr = in.data<T>();
+    T* out_ptr = out->data<T>();
+
+    auto transpose_helper = [&](int64_t beg, int64_t end) {
+      for (int64_t out_idx = beg; out_idx < end; ++out_idx) {
+        int64_t in_idx = 0;
+        int64_t tmp_idx = out_idx;
+        // calculate the input index
+        for (int i = 0; i < rank; ++i) {
+          const int64_t coordinate = tmp_idx / out_stride[i];
+          tmp_idx -= coordinate * out_stride[i];
+          in_idx += coordinate * in_stride[axis[i]];
+        }
+        out_ptr[out_idx] = in_ptr[in_idx];
+      }
+    };
+    double cost_per_iteration =
+        rank * (Eigen::TensorOpCost::DivCost<int64_t>() +
+                2 * Eigen::TensorOpCost::MulCost<int64_t>() +
+                2 * Eigen::TensorOpCost::AddCost<int64_t>());
+    Eigen::TensorOpCost cost(sizeof(T), sizeof(T), cost_per_iteration);
+    auto* cpu_device = context.eigen_pool_device();
+    cpu_device->parallelFor(out->numel(), cost, std::move(transpose_helper));
+  }
+};
+
+// define transpose normal
+#define DEFINE_CPU_TRANS_NORMAL(TYPE) \
+  template struct TransposeNormal<platform::CPUDeviceContext, TYPE>
+
+DEFINE_CPU_TRANS_NORMAL(platform::float16);
+DEFINE_CPU_TRANS_NORMAL(platform::bfloat16);
+DEFINE_CPU_TRANS_NORMAL(float);
+DEFINE_CPU_TRANS_NORMAL(double);
+DEFINE_CPU_TRANS_NORMAL(int);
+DEFINE_CPU_TRANS_NORMAL(int64_t);
+DEFINE_CPU_TRANS_NORMAL(bool);
+DEFINE_CPU_TRANS_NORMAL(int16_t);
+DEFINE_CPU_TRANS_NORMAL(uint8_t);
+DEFINE_CPU_TRANS_NORMAL(int8_t);
+
 struct TensorSetConstantCPU {
   TensorSetConstantCPU(framework::Tensor* tensor, float value)
       : tensor_(tensor), value_(value) {}
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index 1c519d226eb..4d7c1a49286 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -11,8 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <algorithm>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/math_function_impl.h"
@@ -23,6 +26,7 @@ namespace operators {
 namespace math {
 
 using float16 = paddle::platform::float16;
+using bfloat16 = paddle::platform::bfloat16;
 
 template struct SetConstant<platform::CUDADeviceContext, platform::float16>;
 template struct SetConstant<platform::CUDADeviceContext, float>;
@@ -31,12 +35,13 @@ template struct SetConstant<platform::CUDADeviceContext, int>;
 template struct SetConstant<platform::CUDADeviceContext, int64_t>;
 template struct SetConstant<platform::CUDADeviceContext, bool>;
 
-#define DEFINE_GPU_TRANS(RANK)                                           \
-  template struct Transpose<platform::CUDADeviceContext, float, RANK>;   \
-  template struct Transpose<platform::CUDADeviceContext, double, RANK>;  \
-  template struct Transpose<platform::CUDADeviceContext, float16, RANK>; \
-  template struct Transpose<platform::CUDADeviceContext, int8_t, RANK>;  \
-  template struct Transpose<platform::CUDADeviceContext, int32_t, RANK>; \
+#define DEFINE_GPU_TRANS(RANK)                                            \
+  template struct Transpose<platform::CUDADeviceContext, float, RANK>;    \
+  template struct Transpose<platform::CUDADeviceContext, double, RANK>;   \
+  template struct Transpose<platform::CUDADeviceContext, float16, RANK>;  \
+  template struct Transpose<platform::CUDADeviceContext, bfloat16, RANK>; \
+  template struct Transpose<platform::CUDADeviceContext, int8_t, RANK>;   \
+  template struct Transpose<platform::CUDADeviceContext, int32_t, RANK>;  \
   template struct Transpose<platform::CUDADeviceContext, int64_t, RANK>;
 
 DEFINE_GPU_TRANS(1);
@@ -46,6 +51,88 @@ DEFINE_GPU_TRANS(4);
 DEFINE_GPU_TRANS(5);
 DEFINE_GPU_TRANS(6);
 
+#define REINTERPRET(T, DST_PTR, SRC_PTR) \
+  T* DST_PTR = reinterpret_cast<T*>(SRC_PTR)
+
+template <typename T>
+__global__ void TransposeNormalKernel(const T* in_ptr, T* out_ptr,
+                                      int64_t element,
+                                      const int64_t* in_stride_ptr,
+                                      const int64_t* out_stride_ptr,
+                                      const int64_t* axis_ptr, int rank) {
+  CUDA_KERNEL_LOOP(out_idx, element) {
+    int64_t in_idx = 0;
+    int64_t tmp_idx = out_idx;
+    for (int i = 0; i < rank; ++i) {
+      const int64_t coordinate = tmp_idx / out_stride_ptr[i];
+      tmp_idx -= coordinate * out_stride_ptr[i];
+      in_idx += coordinate * in_stride_ptr[axis_ptr[i]];
+    }
+    out_ptr[out_idx] = in_ptr[in_idx];
+  }
+}
+
+template <typename T>
+struct TransposeNormal<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& in, framework::Tensor* out,
+                  const std::vector<int>& axis) {
+    const int rank = axis.size();
+    auto in_stride = framework::stride(in.dims());
+    auto out_stride = framework::stride(out->dims());
+    auto* in_ptr = in.data<T>();
+    auto* out_ptr = out->data<T>();
+
+    // copy in_stride, out_stride, axis to gpu device
+    const platform::CUDAPlace& cuda_place =
+        BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace());
+    platform::CPUPlace cpu_place = platform::CPUPlace();
+    size_t size = 3 * rank * sizeof(int64_t);
+    auto cpu_buf_holder = memory::AllocShared(cpu_place, size);
+    auto cuda_buf_holder = memory::AllocShared(cuda_place, size);
+    REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr());
+    REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr());
+    for (int i = 0; i < rank; ++i) {
+      cpu_buf[i] = in_stride[i];
+      cpu_buf[rank + i] = out_stride[i];
+      cpu_buf[2 * rank + i] = axis[i];
+    }
+    memory::Copy(cuda_place, cuda_buf, cpu_place, cpu_buf, size,
+                 context.stream());
+    REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
+    REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
+    REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank);
+
+    const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock();
+    const int MAX_GRID_DIM =
+        context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
+    int64_t elements = in.numel();
+    int block_size = (elements >= MAX_BLOCK_DIM)
+                         ? MAX_BLOCK_DIM
+                         : (1 << static_cast<int>(std::log2(elements)));
+    int grid_size = elements / block_size;
+    grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size;
+    TransposeNormalKernel<T><<<grid_size, block_size, 0, context.stream()>>>(
+        in_ptr, out_ptr, elements, in_stride_ptr, out_stride_ptr, axis_ptr,
+        rank);
+  }
+};
+
+// define transpose normal
+#define DEFINE_GPU_TRANS_NORMAL(TYPE) \
+  template struct TransposeNormal<platform::CUDADeviceContext, TYPE>
+
+DEFINE_GPU_TRANS_NORMAL(float16);
+DEFINE_GPU_TRANS_NORMAL(bfloat16);
+DEFINE_GPU_TRANS_NORMAL(float);
+DEFINE_GPU_TRANS_NORMAL(double);
+DEFINE_GPU_TRANS_NORMAL(int);
+DEFINE_GPU_TRANS_NORMAL(int64_t);
+DEFINE_GPU_TRANS_NORMAL(bool);
+DEFINE_GPU_TRANS_NORMAL(int16_t);
+DEFINE_GPU_TRANS_NORMAL(uint8_t);
+DEFINE_GPU_TRANS_NORMAL(int8_t);
+
 struct TensorSetConstantGPU {
   TensorSetConstantGPU(const platform::DeviceContext& context,
                        framework::Tensor* tensor, float value)
diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h
index 333552a0c1a..6af0278d825 100644
--- a/paddle/fluid/operators/math/math_function.h
+++ b/paddle/fluid/operators/math/math_function.h
@@ -26,6 +26,14 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 namespace math {
+
+template <typename DeviceContext, typename T>
+struct TransposeNormal {
+  // for dims >= 7 situation
+  void operator()(const DeviceContext& context, const framework::Tensor& in,
+                  framework::Tensor* out, const std::vector<int>& axis);
+};
+
 template <typename DeviceContext, typename T, int Rank>
 struct Transpose {
   void operator()(const DeviceContext& context, const framework::Tensor& in,
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
index 67a19cb83c3..25f9453571a 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -18,9 +18,10 @@ limitations under the License. */
 #include <set>
 #include <string>
 #include <vector>
-
 #include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/cast_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op_function.h"
 
 namespace paddle {
@@ -34,6 +35,110 @@ namespace operators {
   }
 
 using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+
+inline void GetShuffledDim(const DDim& src_dims, DDim* dst_dims,
+                           const std::vector<int>& reduced_dims,
+                           std::vector<int>* perm_axis) {
+  // check if it's a reduced dim
+  std::vector<bool> src_dims_check(src_dims.size(), false);
+  size_t src_size = src_dims.size();
+  size_t reduce_size = reduced_dims.size();
+  for (size_t i = 0; i < reduce_size; ++i) {
+    dst_dims->at(src_size - reduce_size + i) = src_dims[reduced_dims[i]];
+    (*perm_axis)[src_size - reduce_size + i] = reduced_dims[i];
+    src_dims_check[reduced_dims[i]] = true;
+  }
+
+  size_t offset = 0;
+  for (size_t i = 0; i < src_dims_check.size(); ++i) {
+    bool is_reduced = src_dims_check[i];
+    if (!is_reduced) {
+      (*perm_axis)[offset] = i;
+      dst_dims->at(offset++) = src_dims[i];
+    }
+  }
+}
+
+template <typename DeviceContext, typename OutT>
+void GetShuffledInput(const framework::ExecutionContext& context,
+                      const Tensor* input, Tensor* shuffled_input,
+                      const std::vector<int>& dims) {
+  DDim shuffled_dims(input->dims());
+  std::vector<int> perm_axis(input->dims().size());
+  GetShuffledDim(input->dims(), &shuffled_dims, dims, &perm_axis);
+
+  shuffled_input->Resize(shuffled_dims);
+  shuffled_input->mutable_data<OutT>(context.GetPlace());
+
+  math::TransposeNormal<DeviceContext, OutT> trans;
+  trans(context.template device_context<DeviceContext>(), *input,
+        shuffled_input, perm_axis);
+}
+
+inline void GetOriginDimFromShuffled(const DDim& src_dim,
+                                     const std::vector<int>& dims,
+                                     std::vector<int>* origin_dim) {
+  DDim shuffled_dims(src_dim);
+  size_t n = src_dim.size();
+  std::vector<int> perm_axis(n);
+  GetShuffledDim(src_dim, &shuffled_dims, dims, &perm_axis);
+  for (size_t i = 0; i < n; ++i) {
+    (*origin_dim)[perm_axis[i]] = i;
+  }
+}
+
+template <typename DeviceContext, typename OutT, typename Functor>
+void HandleLargeDim(const framework::ExecutionContext& context,
+                    const Tensor* input, Tensor* output,
+                    const std::vector<int>& dims, bool keep_dim) {
+  //  shuffle the reduced dim to the end
+  Tensor shuffled_input;
+  GetShuffledInput<DeviceContext, OutT>(context, input, &shuffled_input, dims);
+
+  // transpose to 2D tensor whose shape is {unreduced, reduced}.
+  const int64_t unreduced = output->numel();
+  const int64_t reduced = shuffled_input.numel() / unreduced;
+  shuffled_input.Resize({unreduced, reduced});
+  DDim output_dim = output->dims();
+  output->Resize({unreduced});
+  ReduceFunctor<DeviceContext, OutT, 2, 1, Functor>(
+      context.template device_context<DeviceContext>(), shuffled_input, output,
+      {1}, keep_dim);
+  output->Resize(output_dim);
+}
+
+template <typename DeviceContext, typename T, typename Functor>
+void HandleLargeDimGrad(const framework::ExecutionContext& context,
+                        const framework::Tensor* x,
+                        const framework::Tensor* out,
+                        const framework::Tensor* dout, framework::Tensor* dx,
+                        const std::vector<int>& dims) {
+  const int64_t unreduced = out->numel();
+  const int64_t reduced = x->numel() / unreduced;
+  DDim out_dim(out->dims());
+  DDim x_dim(x->dims());
+  // transpose and reshape X
+  Tensor shuffled_x;
+  GetShuffledInput<DeviceContext, T>(context, x, &shuffled_x, dims);
+  DDim shuffled_dim = shuffled_x.dims();
+  shuffled_x.Resize({unreduced, reduced});
+  // reshape dX {unreduced, reduced}
+  dx->Resize({unreduced, reduced});
+  ReduceGradFunctor<DeviceContext, T, 2, Functor>(
+      context.template device_context<DeviceContext>(), shuffled_x, *out, *dout,
+      dx, {1});
+  // transpose dX
+  std::vector<int> origin_axis(x_dim.size());
+  GetOriginDimFromShuffled(x_dim, dims, &origin_axis);
+  Tensor dx_tmp;
+  framework::TensorCopy(*dx, context.GetPlace(), &dx_tmp);
+  dx_tmp.Resize(shuffled_dim);
+  dx->Resize(x_dim);
+  math::TransposeNormal<DeviceContext, T> trans;
+  trans(context.template device_context<DeviceContext>(), dx_tmp, dx,
+        origin_axis);
+}
 
 template <typename DeviceContext, typename T, typename Functor>
 struct ReduceKernelFunctor {
@@ -69,22 +174,27 @@ struct ReduceKernelFunctor {
     } else {
       int ndim = input->dims().size();
       int rdim = dims.size();
-      HANDLE_DIM(6, 5);
-      HANDLE_DIM(6, 4);
-      HANDLE_DIM(6, 3);
-      HANDLE_DIM(6, 2);
-      HANDLE_DIM(6, 1);
-      HANDLE_DIM(5, 4);
-      HANDLE_DIM(5, 3);
-      HANDLE_DIM(5, 2);
-      HANDLE_DIM(5, 1);
-      HANDLE_DIM(4, 3);
-      HANDLE_DIM(4, 2);
-      HANDLE_DIM(4, 1);
-      HANDLE_DIM(3, 2);
-      HANDLE_DIM(3, 1);
-      HANDLE_DIM(2, 1);
-      HANDLE_DIM(1, 1);
+      if (ndim > 6) {
+        HandleLargeDim<DeviceContext, OutT, Functor>(context, input, output,
+                                                     dims, keep_dim);
+      } else {
+        HANDLE_DIM(6, 5);
+        HANDLE_DIM(6, 4);
+        HANDLE_DIM(6, 3);
+        HANDLE_DIM(6, 2);
+        HANDLE_DIM(6, 1);
+        HANDLE_DIM(5, 4);
+        HANDLE_DIM(5, 3);
+        HANDLE_DIM(5, 2);
+        HANDLE_DIM(5, 1);
+        HANDLE_DIM(4, 3);
+        HANDLE_DIM(4, 2);
+        HANDLE_DIM(4, 1);
+        HANDLE_DIM(3, 2);
+        HANDLE_DIM(3, 1);
+        HANDLE_DIM(2, 1);
+        HANDLE_DIM(1, 1);
+      }
     }
   }
 };
@@ -137,7 +247,6 @@ class ReduceKernel : public framework::OpKernel<T> {
     }
   }
 };
-
 template <typename DeviceContext, typename OutT, typename Functor>
 class BoolReduceKernel : public framework::OpKernel<OutT> {
  public:
@@ -175,22 +284,27 @@ class BoolReduceKernel : public framework::OpKernel<OutT> {
       int ndim = input->dims().size();
       int rdim = dims.size();
       // comments for accelerating compiling temporarily.
-      //      HANDLE_DIM(6, 5);
-      //      HANDLE_DIM(6, 4);
-      //      HANDLE_DIM(6, 3);
-      //      HANDLE_DIM(6, 2);
-      //      HANDLE_DIM(6, 1);
-      //      HANDLE_DIM(5, 4);
-      //      HANDLE_DIM(5, 3);
-      //      HANDLE_DIM(5, 2);
-      //      HANDLE_DIM(5, 1);
-      HANDLE_DIM(4, 3);
-      HANDLE_DIM(4, 2);
-      HANDLE_DIM(4, 1);
-      HANDLE_DIM(3, 2);
-      HANDLE_DIM(3, 1);
-      HANDLE_DIM(2, 1);
-      HANDLE_DIM(1, 1);
+      if (ndim > 6) {
+        HandleLargeDim<DeviceContext, OutT, Functor>(context, input, output,
+                                                     dims, keep_dim);
+      } else {
+        HANDLE_DIM(6, 5);
+        HANDLE_DIM(6, 4);
+        HANDLE_DIM(6, 3);
+        HANDLE_DIM(6, 2);
+        HANDLE_DIM(6, 1);
+        HANDLE_DIM(5, 4);
+        HANDLE_DIM(5, 3);
+        HANDLE_DIM(5, 2);
+        HANDLE_DIM(5, 1);
+        HANDLE_DIM(4, 3);
+        HANDLE_DIM(4, 2);
+        HANDLE_DIM(4, 1);
+        HANDLE_DIM(3, 2);
+        HANDLE_DIM(3, 1);
+        HANDLE_DIM(2, 1);
+        HANDLE_DIM(1, 1);
+      }
     }
   }
 };
@@ -279,6 +393,10 @@ class ReduceGradKernel : public framework::OpKernel<T> {
               context.template device_context<DeviceContext>(), *input0,
               *input1, *input2, output, dims);
           break;
+        default:
+          HandleLargeDimGrad<DeviceContext, T, Functor>(context, input0, input1,
+                                                        input2, output, dims);
+          break;
       }
     }
   }
@@ -313,12 +431,6 @@ class ReduceOp : public framework::OperatorWithKernel {
     OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ReduceOp");
     auto x_dims = ctx->GetInputDim("X");
     auto x_rank = x_dims.size();
-    PADDLE_ENFORCE_LE(x_rank, 6,
-                      platform::errors::InvalidArgument(
-                          "The input tensor X's dimensions of ReduceOp "
-                          "should be less equal than 6. But received X's "
-                          "dimensions = %d, X's shape = [%s].",
-                          x_rank, x_dims));
     auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
     PADDLE_ENFORCE_GT(dims.size(), 0,
                       platform::errors::InvalidArgument(
@@ -402,11 +514,6 @@ class ReduceGradOp : public framework::OperatorWithKernel {
                    "Out@GRAD", "ReduceOp");
     auto x_dims = ctx->GetInputDim("X");
     auto x_rank = x_dims.size();
-    PADDLE_ENFORCE_LE(x_rank, 6,
-                      platform::errors::InvalidArgument(
-                          "Tensors with rank at most 6 are supported by "
-                          "ReduceOp. Received tensor with rank %d.",
-                          x_rank));
     auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
     for (size_t i = 0; i < dims.size(); ++i) {
       PADDLE_ENFORCE_LT(dims[i], x_rank,
diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h
index d7f5c3dd457..e4e5dfdba9f 100644
--- a/paddle/fluid/operators/transpose_op.h
+++ b/paddle/fluid/operators/transpose_op.h
@@ -53,10 +53,9 @@ inline void TransCompute(const int dim, const DeviceContext& dev_ctx,
       trans6(dev_ctx, in, out, axis);
       break;
     default:
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Tensors with rank at most 6 are supported"
-          ", but received input tensor's rank is %d,",
-          dim));
+      // for dim >= 7 situation
+      math::TransposeNormal<DeviceContext, T> trans_normal;
+      trans_normal(dev_ctx, in, out, axis);
   }
 }
 
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 29982c13c8c..34305c404b4 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -12,6 +12,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 #include <set>
 #include <string>
+#include <thread>  //NOLINT
 #include <unordered_set>
 #include <vector>
 
@@ -23,6 +24,7 @@ limitations under the License. */
 #endif
 
 #include "glog/logging.h"
+#include "unsupported/Eigen/CXX11/ThreadPool"
 
 namespace paddle {
 namespace memory {
@@ -131,16 +133,31 @@ DeviceContextPool::DeviceContextPool(
 
 CPUDeviceContext::CPUDeviceContext() {
   eigen_device_.reset(new Eigen::DefaultDevice());
+  InitPoolDevice();
 }
 
 CPUDeviceContext::CPUDeviceContext(CPUPlace place) : place_(place) {
   eigen_device_.reset(new Eigen::DefaultDevice());
+  InitPoolDevice();
+}
+
+void CPUDeviceContext::InitPoolDevice() {
+  using EigenEnv = Eigen::StlThreadEnvironment;
+  using EigenThreadPool = Eigen::ThreadPoolTempl<EigenEnv>;
+  int num_threads = std::thread::hardware_concurrency();
+  eigen_threadpool_.reset(new EigenThreadPool(num_threads));
+  eigen_pool_device_.reset(
+      new Eigen::ThreadPoolDevice(eigen_threadpool_.get(), num_threads));
 }
 
 Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
   return eigen_device_.get();
 }
 
+Eigen::ThreadPoolDevice* CPUDeviceContext::eigen_pool_device() const {
+  return eigen_pool_device_.get();
+}
+
 Place CPUDeviceContext::GetPlace() const { return place_; }
 
 #ifdef PADDLE_WITH_XPU
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 8bfdfc8a1c6..28d94627f95 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -41,6 +41,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/stream/cuda_stream.h"
 #endif
+#define EIGEN_USE_THREADS
 #include "unsupported/Eigen/CXX11/Tensor"
 
 #ifdef PADDLE_WITH_XPU
@@ -65,11 +66,17 @@ class CPUDeviceContext : public DeviceContext {
 
   Eigen::DefaultDevice* eigen_device() const;
 
+  Eigen::ThreadPoolDevice* eigen_pool_device() const;
+
   Place GetPlace() const override;
 
+  inline void InitPoolDevice();
+
  private:
   CPUPlace place_;
   std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
+  std::unique_ptr<Eigen::ThreadPoolDevice> eigen_pool_device_;
+  std::unique_ptr<Eigen::ThreadPool> eigen_threadpool_;
 };
 
 template <typename Place>
diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py
index b0b85f633a2..80b201d0842 100644
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -67,6 +67,22 @@ class TestSumOp6D(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestSumOp8D(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {
+            'X': np.random.random((1, 3, 1, 2, 1, 4, 3, 10)).astype("float64")
+        }
+        self.attrs = {'dim': (0, 3)}
+        self.outputs = {'Out': self.inputs['X'].sum(axis=(0, 3))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
 @skip_check_grad_ci(
     reason="reduce_max is discontinuous non-derivable function,"
     " its gradient check is not supported by unittest framework.")
@@ -103,6 +119,40 @@ class TestMinOp(OpTest):
         self.check_output()
 
 
+class TestMin6DOp(OpTest):
+    """Remove Min with subgradient from gradient check to confirm the success of CI."""
+
+    def setUp(self):
+        self.op_type = "reduce_min"
+        self.inputs = {
+            'X': np.random.random((2, 4, 3, 5, 6, 10)).astype("float64")
+        }
+        self.attrs = {'dim': [2, 4]}
+        self.outputs = {
+            'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim']))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestMin8DOp(OpTest):
+    """Remove Min with subgradient from gradient check to confirm the success of CI."""
+
+    def setUp(self):
+        self.op_type = "reduce_min"
+        self.inputs = {
+            'X': np.random.random((2, 4, 3, 5, 6, 3, 2, 4)).astype("float64")
+        }
+        self.attrs = {'dim': [2, 3, 4]}
+        self.outputs = {
+            'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim']))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestProdOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_prod"
@@ -116,6 +166,42 @@ class TestProdOp(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestProd6DOp(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_prod"
+        self.inputs = {
+            'X': np.random.random((5, 6, 2, 3, 4, 2)).astype("float64")
+        }
+        self.attrs = {'dim': [2, 3, 4]}
+        self.outputs = {
+            'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim']))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestProd8DOp(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_prod"
+        self.inputs = {
+            'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64")
+        }
+        self.attrs = {'dim': [2, 3, 4]}
+        self.outputs = {
+            'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim']))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
 class TestAllOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_all"
@@ -127,12 +213,40 @@ class TestAllOp(OpTest):
         self.check_output()
 
 
+class TestAll8DOp(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_all"
+        self.inputs = {
+            'X': np.random.randint(0, 2,
+                                   (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool")
+        }
+        self.attrs = {'reduce_all': True, 'dim': (2, 3, 4)}
+        self.outputs = {'Out': self.inputs['X'].all(axis=self.attrs['dim'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestAllOpWithDim(OpTest):
     def setUp(self):
         self.op_type = "reduce_all"
         self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
-        self.attrs = {'dim': [1]}
-        self.outputs = {'Out': self.inputs['X'].all(axis=1)}
+        self.attrs = {'dim': (1, )}
+        self.outputs = {'Out': self.inputs['X'].all(axis=self.attrs['dim'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAll8DOpWithDim(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_all"
+        self.inputs = {
+            'X': np.random.randint(0, 2,
+                                   (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool")
+        }
+        self.attrs = {'dim': (1, 3, 4)}
+        self.outputs = {'Out': self.inputs['X'].all(axis=self.attrs['dim'])}
 
     def test_check_output(self):
         self.check_output()
@@ -152,6 +266,23 @@ class TestAllOpWithKeepDim(OpTest):
         self.check_output()
 
 
+class TestAll8DOpWithKeepDim(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_all"
+        self.inputs = {
+            'X': np.random.randint(0, 2,
+                                   (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool")
+        }
+        self.attrs = {'dim': (5, ), 'keep_dim': True}
+        self.outputs = {
+            'Out': np.expand_dims(
+                self.inputs['X'].all(axis=self.attrs['dim']), axis=5)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestAllOpError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
@@ -175,6 +306,20 @@ class TestAnyOp(OpTest):
         self.check_output()
 
 
+class TestAny8DOp(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_any"
+        self.inputs = {
+            'X': np.random.randint(0, 2,
+                                   (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool")
+        }
+        self.attrs = {'reduce_all': True, 'dim': (3, 5, 4)}
+        self.outputs = {'Out': self.inputs['X'].any(axis=self.attrs['dim'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestAnyOpWithDim(OpTest):
     def setUp(self):
         self.op_type = "reduce_any"
@@ -186,14 +331,45 @@ class TestAnyOpWithDim(OpTest):
         self.check_output()
 
 
+class TestAny8DOpWithDim(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_any"
+        self.inputs = {
+            'X': np.random.randint(0, 2,
+                                   (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool")
+        }
+        self.attrs = {'dim': (3, 6)}
+        self.outputs = {'Out': self.inputs['X'].any(axis=self.attrs['dim'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestAnyOpWithKeepDim(OpTest):
     def setUp(self):
         self.op_type = "reduce_any"
         self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
-        self.attrs = {'dim': [1], 'keep_dim': True}
+        self.attrs = {'dim': (1, ), 'keep_dim': True}
+        self.outputs = {
+            'Out': np.expand_dims(
+                self.inputs['X'].any(axis=self.attrs['dim']), axis=1)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAny8DOpWithKeepDim(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_any"
+        self.inputs = {
+            'X': np.random.randint(0, 2,
+                                   (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool")
+        }
+        self.attrs = {'dim': (1, ), 'keep_dim': True}
         self.outputs = {
             'Out': np.expand_dims(
-                self.inputs['X'].any(axis=1), axis=1)
+                self.inputs['X'].any(axis=self.attrs['dim']), axis=1)
         }
 
     def test_check_output(self):
@@ -283,6 +459,18 @@ class Test3DReduce3(Test1DReduce):
         }
 
 
+class Test8DReduce0(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': (4, 2, 3)}
+        self.inputs = {
+            'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64")
+        }
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
+
+
 class TestKeepDimReduce(Test1DReduce):
     def setUp(self):
         self.op_type = "reduce_sum"
@@ -294,6 +482,19 @@ class TestKeepDimReduce(Test1DReduce):
         }
 
 
+class TestKeepDim8DReduce(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {
+            'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64")
+        }
+        self.attrs = {'dim': (3, 4, 5), 'keep_dim': True}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
+                                        keepdims=self.attrs['keep_dim'])
+        }
+
+
 class TestReduceAll(Test1DReduce):
     def setUp(self):
         self.op_type = "reduce_sum"
@@ -302,6 +503,16 @@ class TestReduceAll(Test1DReduce):
         self.outputs = {'Out': self.inputs['X'].sum()}
 
 
+class TestReduceAll(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {
+            'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64")
+        }
+        self.attrs = {'reduce_all': True, 'dim': (3, 4, 5)}
+        self.outputs = {'Out': self.inputs['X'].sum(axis=self.attrs['dim'])}
+
+
 @skip_check_grad_ci(
     reason="reduce_max is discontinuous non-derivable function,"
     " its gradient check is not supported by unittest framework.")
diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py
index d5d1fdc5b20..56333211469 100644
--- a/python/paddle/fluid/tests/unittests/test_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py
@@ -99,6 +99,18 @@ class TestCase7(TestTransposeOp):
         self.axis = (0, 1, 3, 2)
 
 
+class TestCase8(TestTransposeOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 2, 3, 2, 4, 3, 3)
+        self.axis = (0, 1, 3, 2, 4, 5, 6, 7)
+
+
+class TestCase9(TestTransposeOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 2, 3, 2, 4, 3, 3)
+        self.axis = (6, 1, 3, 5, 0, 2, 4, 7)
+
+
 class TestTransposeOpError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
-- 
GitLab


From bcb4a583aae31b3f98f8c84556e1c889b6f31023 Mon Sep 17 00:00:00 2001
From: guofei <52460041+gfwm2013@users.noreply.github.com>
Date: Thu, 17 Sep 2020 16:13:51 +0800
Subject: [PATCH 123/261] Replace the 'spawn' start method with 'fork' start
 method for multiprocessing, on MacOS with python>=3.8 (#27317)

* Replace the 'spawn' start method with 'fork' start method for multiprocessing, on MacOs when python>=3.8

test=develop
---
 python/paddle/reader/decorator.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py
index ff09f4c562a..aadfb3f49ed 100644
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
@@ -32,6 +32,21 @@ import random
 import zlib
 import paddle.compat as cpt
 
+# On macOS, the 'spawn' start method is now the default in Python3.8 multiprocessing,
+# Paddle is currently unable to solve this, so forces the process to start using 
+# the 'fork' start method.
+#
+# TODO: This solution is not good, because the fork start method could lead to 
+# crashes of the subprocess. Figure out how to make 'spawn' work.
+#
+# For more details, please refer to
+# https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
+# https://bugs.python.org/issue33725
+if sys.version_info >= (3, 8):
+    fork_context = multiprocessing.get_context('fork')
+else:
+    fork_context = multiprocessing
+
 
 def cache(reader):
     """
@@ -560,9 +575,9 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
             six.reraise(*sys.exc_info())
 
     def queue_reader():
-        queue = multiprocessing.Queue(queue_size)
+        queue = fork_context.Queue(queue_size)
         for reader in readers:
-            p = multiprocessing.Process(
+            p = fork_context.Process(
                 target=_read_into_queue, args=(reader, queue))
             p.start()
 
@@ -593,9 +608,9 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
     def pipe_reader():
         conns = []
         for reader in readers:
-            parent_conn, child_conn = multiprocessing.Pipe()
+            parent_conn, child_conn = fork_context.Pipe()
             conns.append(parent_conn)
-            p = multiprocessing.Process(
+            p = fork_context.Process(
                 target=_read_into_pipe, args=(reader, child_conn))
             p.start()
 
-- 
GitLab


From da583edf6d6d8adfd30dbd7adce36d4dcffdd2b9 Mon Sep 17 00:00:00 2001
From: tianshuo78520a <707759223@qq.com>
Date: Thu, 17 Sep 2020 16:44:46 +0800
Subject: [PATCH 124/261] Fix paddle build install requirements (#27378)

* Fix install pr requirements.txt

* test=document_fix
---
 paddle/scripts/paddle_build.sh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 8bbd27a0fd3..3de577d847d 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -621,6 +621,7 @@ function generate_upstream_develop_api_spec() {
     git checkout -b develop_base_pr upstream/$BRANCH
     cmake_gen $1
     build $2
+    cp ${PADDLE_ROOT}/python/requirements.txt /tmp
 
     git checkout $cur_branch
     generate_api_spec "$1" "DEV"
@@ -641,7 +642,12 @@ function generate_api_spec() {
     cd ${PADDLE_ROOT}/build/.check_api_workspace
     virtualenv .${spec_kind}_env
     source .${spec_kind}_env/bin/activate
-    pip install -r ${PADDLE_ROOT}/python/requirements.txt
+
+    if [ "$spec_kind" == "DEV" ]; then
+        pip install -r /tmp/requirements.txt
+    else
+        pip install -r ${PADDLE_ROOT}/python/requirements.txt
+    fi
     pip --no-cache-dir install ${PADDLE_ROOT}/build/python/dist/*whl
     spec_path=${PADDLE_ROOT}/paddle/fluid/API_${spec_kind}.spec
     python ${PADDLE_ROOT}/tools/print_signatures.py paddle > $spec_path
-- 
GitLab


From e9a0fbfff2bac1ec4c22b9dc713492cc19859401 Mon Sep 17 00:00:00 2001
From: Yi Liu <gavin1332@gmail.com>
Date: Thu, 17 Sep 2020 17:50:46 +0800
Subject: [PATCH 125/261] =?UTF-8?q?OP=E6=8A=A5=E9=94=99=E4=BF=A1=E6=81=AF?=
 =?UTF-8?q?=E4=BC=98=E5=8C=96=20(#27301)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

paddle/fluid/operators/distributed_ops OP报错信息优化
---
 .../operators/distributed_ops/fake_init_op.cc |  4 +-
 .../distributed_ops/listen_and_serv_op.cc     | 45 ++++++++++++++-----
 2 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/operators/distributed_ops/fake_init_op.cc b/paddle/fluid/operators/distributed_ops/fake_init_op.cc
index 1da164175e1..cb27dc75eb2 100644
--- a/paddle/fluid/operators/distributed_ops/fake_init_op.cc
+++ b/paddle/fluid/operators/distributed_ops/fake_init_op.cc
@@ -43,9 +43,9 @@ class FakeInitOp : public framework::OperatorBase {
       tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
       tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
     } else {
-      PADDLE_THROW(
+      PADDLE_THROW(platform::errors::InvalidArgument(
           "fake init op's output only"
-          "supports SelectedRows and LoDTensor");
+          "supports SelectedRows and LoDTensor"));
     }
   }
 };
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
index 5e1e408eb2c..43de8488a0e 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
@@ -134,7 +134,10 @@ void ListenAndServOp::RunSyncLoop(
   auto optimize_blocks =
       Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
   PADDLE_ENFORCE_GE(num_blocks, 2,
-                    "server program should have at least 2 blocks");
+                    platform::errors::PreconditionNotMet(
+                        "Invalid number of blocks in server program. Expected "
+                        "equal or greater than 2. Recieved %zu",
+                        num_blocks));
 
   // Prepare all the server block
   std::vector<int> optimize_blocks_list;
@@ -218,7 +221,8 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
       VLOG(3) << "reset sparse var: " << varname;
       var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
     } else {
-      PADDLE_THROW("The type of sparse var should be SelectedRows");
+      PADDLE_THROW(platform::errors::PreconditionNotMet(
+          "The type of sparse var should be SelectedRows"));
     }
   }
   if (UNLIKELY(reset_all)) {
@@ -235,7 +239,8 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
         math::set_constant(*dev_ctx, var->GetMutable<framework::Tensor>(),
                            static_cast<float>(0));
       } else {
-        PADDLE_THROW("The type of dense var should be in [LoDTensor, Tensor]");
+        PADDLE_THROW(platform::errors::PreconditionNotMet(
+            "The type of dense var should be in [LoDTensor, Tensor]"));
       }
     }
   }
@@ -254,8 +259,15 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
     std::vector<std::string> pieces;
     split(grad_and_id, ':', &pieces);
     VLOG(3) << "after split, key = " << pieces[0] << ", id=" << pieces[1];
-    PADDLE_ENFORCE_EQ(pieces.size(), 2);
-    PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0);
+    PADDLE_ENFORCE_EQ(pieces.size(), 2,
+                      platform::errors::PreconditionNotMet(
+                          "Invalid format of grad_and_id argument. "
+                          "Expected \"grad:block_id\". Recieved %s",
+                          grad_and_id.c_str()));
+    PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0,
+                      platform::errors::AlreadyExists(
+                          "The gradient name %s has already existed in out_map",
+                          pieces[0].c_str()));
 
     int block_id = std::stoi(pieces[1]);
     (*out_map)[pieces[0]] = block_id;
@@ -267,7 +279,10 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
 
   size_t num_blocks = program->Size();
   PADDLE_ENFORCE_GE(num_blocks, 2,
-                    "server program should have at least 2 blocks");
+                    platform::errors::PreconditionNotMet(
+                        "Invalid number of blocks in server program. Expected "
+                        "equal or greater than 2. Recieved %zu",
+                        num_blocks));
   std::vector<int> block_list;
   for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
     block_list.push_back(blkid);
@@ -342,9 +357,9 @@ void ListenAndServOp::CacheVarsType(const std::vector<std::string> &varnames,
                var->IsType<framework::Tensor>()) {
       dense_vars_.push_back(varname);
     } else {
-      PADDLE_THROW(
+      PADDLE_THROW(platform::errors::PreconditionNotMet(
           "The type of received var should be in [SelectedRows, LoDTensor, "
-          "Tensor].");
+          "Tensor]."));
     }
   }
 }
@@ -450,7 +465,12 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
     split(prefetch_var_name_and_id, ':', &pieces);
     VLOG(3) << "after split, prefetch_var = " << pieces[0]
             << ", id=" << pieces[1];
-    PADDLE_ENFORCE_EQ(pieces.size(), 2);
+    PADDLE_ENFORCE_EQ(
+        pieces.size(), 2,
+        platform::errors::PreconditionNotMet(
+            "Invalid format of prefetch_var_name_and_id argument. "
+            "Expected \"xxx:xxx\". Recieved %s",
+            prefetch_var_name_and_id.c_str()));
 
     int block_id = std::stoi(pieces[1]);
     prefetch_block_id_list.push_back(block_id);
@@ -476,7 +496,12 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
        sparse_grad_name_to_param_name_str) {
     std::vector<std::string> pieces;
     split(sparse_grad_name_and_param_name, ':', &pieces);
-    PADDLE_ENFORCE_EQ(pieces.size(), 2);
+    PADDLE_ENFORCE_EQ(
+        pieces.size(), 2,
+        platform::errors::PreconditionNotMet(
+            "Invalid format of sparse_grad_name_and_param_name argument. "
+            "Expected \"xxx:xxx\". Recieved %s",
+            sparse_grad_name_and_param_name.c_str()));
     VLOG(3) << "after split, sparse_grad_name = " << pieces[0]
             << ", param_name = " << pieces[1];
     sparse_grad_name_to_param_name[pieces[0]] = pieces[1];
-- 
GitLab


From f36b9a7f790d6c3fbd08b5a37fe205f770a53ed5 Mon Sep 17 00:00:00 2001
From: 123malin <malin10@baidu.com>
Date: Thu, 17 Sep 2020 19:05:47 +0800
Subject: [PATCH 126/261] =?UTF-8?q?=E3=80=90Fleet2.0=20Util=E3=80=91=20add?=
 =?UTF-8?q?=20documents=20(#26698)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* test=develop, util documents
---
 .../distributed/fleet/base/role_maker.py      |   3 +-
 .../distributed/fleet/base/util_factory.py    | 217 +++++++-
 python/paddle/distributed/fleet/launch.py     |  23 +-
 .../distributed/fleet/utils/__init__.py       |   5 -
 python/paddle/distributed/fleet/utils/fs.py   | 476 +++++++++++++++++-
 .../incubate/checkpoint/auto_checkpoint.py    |   4 +-
 .../fluid/tests/unittests/hdfs_test_utils.py  |   2 +-
 .../test_auto_checkpoint_dist_basic.py        |   4 +-
 .../tests/unittests/test_fleet_rolemaker_4.py |   6 +-
 .../fluid/tests/unittests/test_fleet_util.py  |   4 +-
 .../tests/unittests/test_fs_interface.py      |   2 +-
 .../fluid/tests/unittests/test_hdfs1.py       |   5 +-
 .../fluid/tests/unittests/test_hdfs2.py       |   5 +-
 .../fluid/tests/unittests/test_hdfs3.py       |   5 +-
 14 files changed, 694 insertions(+), 67 deletions(-)

diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py
index 8614b186134..a3a809ee375 100644
--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -637,7 +637,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         return "lo"
 
     def __start_kv_server(self, http_server_d, size_d):
-        from paddle.distributed.fleet.utils import KVServer
+        from paddle.distributed.fleet.utils.http_server import KVServer
         http_server = KVServer(int(self._http_ip_port[1]), size_d)
         http_server.start()
         wait_seconds = 5
@@ -651,6 +651,7 @@ class UserDefinedRoleMaker(PaddleCloudRoleMaker):
     def __init__(self, is_collective=False, init_gloo=False, **kwargs):
         super(UserDefinedRoleMaker, self).__init__(
             is_collective=is_collective, init_gloo=init_gloo, **kwargs)
+        self._init_gloo = init_gloo
 
     def _user_defined_ps_env(self):
         self._server_endpoints = self._kwargs.get("server_endpoints")
diff --git a/python/paddle/distributed/fleet/base/util_factory.py b/python/paddle/distributed/fleet/base/util_factory.py
index f5a6c417c0c..4fa247c3196 100644
--- a/python/paddle/distributed/fleet/base/util_factory.py
+++ b/python/paddle/distributed/fleet/base/util_factory.py
@@ -16,20 +16,18 @@
 """basic collective operations in python"""
 """remote file system"""
 
-__all__ = ['UtilBase']
-
-import numpy as np
-import os
-
-import subprocess
-from paddle.fluid import core
-from collections import OrderedDict
-import paddle.fluid as fluid
-from google.protobuf import text_format
-from paddle.fluid import debugger
-from paddle.fluid.framework import Program
-from paddle.fluid.proto import framework_pb2
 from ..utils.fs import FS, LocalFS, HDFSClient
+from paddle.fluid.proto import framework_pb2
+from paddle.fluid.framework import Program
+from paddle.fluid import debugger
+from google.protobuf import text_format
+import paddle.fluid as fluid
+from collections import OrderedDict
+from paddle.fluid import core
+import subprocess
+import os
+import numpy as np
+__all__ = ['UtilBase']
 
 
 class UtilFactory(object):
@@ -53,7 +51,7 @@ class UtilBase(object):
     def _set_role_maker(self, role_maker):
         self.role_maker = role_maker
 
-    def set_file_system(self, fs_client):
+    def _set_file_system(self, fs_client):
         assert isinstance(
             fs_client, FS
         ), "fs_client must be the instance of paddle.distributed.fleet.utils.FS"
@@ -87,36 +85,183 @@ class UtilBase(object):
         return _comm_world
 
     def all_reduce(self, input, mode, comm_world="worker"):
+        """
+        All reduce `input` between specified collection. This is a distributed API.
+
+        Args:
+            input (list|numpy.array): The input variable to do all_reduce between specified collection.
+            mode (str): "sum" or "min" or "max".
+            comm_world (str, optional): Collection used to execute all_reduce operation. Supported collections incude `worker` , `server` and `all` . The default is `worker` .
+
+        Returns:
+            output(Numpy.array|None): A numpy array with the same shape as the `input` .
+
+        Examples:
+            .. code-block:: python
+
+                # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` .
+                from paddle.distributed.fleet.base.util_factory import fleet_util
+                import paddle.distributed.fleet as fleet
+                from paddle.distributed.fleet import PaddleCloudRoleMaker
+                import sys
+                import numpy as np
+
+                def train():
+                    role = PaddleCloudRoleMaker(
+                        is_collective=False,
+                        init_gloo=True,
+                        path="./tmp_gloo")
+                    fleet.init(role)
+                    fleet_util._set_role_maker(role)
+
+                    if fleet.is_server():
+                        input = [1, 2]
+                        output = fleet_util.all_reduce(input, "sum", "server")
+                        print(output)
+                        # [2, 4]
+                    elif fleet.is_worker():
+                        input = np.array([3, 4])
+                        output = fleet_util.all_reduce(input, "sum", "worker")
+                        print(output)
+                        # [6, 8]
+                    output = fleet_util.all_reduce(input, "sum", "all")
+                    print(output)
+                    # [8, 12]
+                if __name__ == "__main__":
+                    train()
+        """
         _comm_world = self.__check_comm_world(comm_world)
         return self.role_maker._all_reduce(_comm_world, input, mode)
 
     def barrier(self, comm_world="worker"):
+        """
+        Barrier between specified collection.
+
+        Args:
+            comm_world (str, optional): Collection used to execute barrier operation. Supported collections incude `worker` , `server` and `all` . The default is `worker` .
+
+        Examples:
+
+            .. code-block:: python
+                # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` .
+
+                from paddle.distributed.fleet.base.util_factory import fleet_util
+                import paddle.distributed.fleet as fleet
+                from paddle.distributed.fleet import PaddleCloudRoleMaker
+                import sys
+
+                def train():
+                    role = PaddleCloudRoleMaker(
+                        is_collective=False,
+                        init_gloo=True,
+                        path="./tmp_gloo")
+                    fleet.init(role)
+                    fleet_util._set_role_maker(role)
+
+                    if fleet.is_server():
+                        fleet_util.barrier("server")
+                        print("all server arrive here")
+                    elif fleet.is_worker():
+                        fleet_util.barrier("worker")
+                        print("all server arrive here")
+                    fleet_util.barrier("all")
+                    print("all servers and workers arrive here")
+
+                if __name__ == "__main__":
+                    train()
+        """
         _comm_world = self.__check_comm_world(comm_world)
         self.role_maker._barrier(_comm_world)
 
     def all_gather(self, input, comm_world="worker"):
+        """
+        All gather `input` between specified collection.
+
+        Args:
+            input (Int|Float): The input variable to do all_gather between specified collection.
+            comm_world (str, optional): Collection used to execute all_reduce operation. Supported collections incude `worker` , `server` and `all` . The default is `worker` .
+
+        Returns:
+            output (List): A list of gathered values.
+
+        Examples:
+
+            .. code-block:: python
+
+                # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` .
+                from paddle.distributed.fleet.base.util_factory import fleet_util
+                import paddle.distributed.fleet as fleet
+                from paddle.distributed.fleet import PaddleCloudRoleMaker
+                import sys
+
+                def train():
+                    role = PaddleCloudRoleMaker(
+                        is_collective=False,
+                        init_gloo=True,
+                        path="./tmp_gloo")
+                    fleet.init(role)
+                    fleet_util._set_role_maker(role)
+
+                    if fleet.is_server():
+                        input = fleet.server_index()
+                        output = fleet_util.all_gather(input, "server")
+                        print(output)
+                        # output = [0, 1]
+                    elif fleet.is_worker():
+                        input = fleet.worker_index()
+                        output = fleet_util.all_gather(input, "worker")
+                        # output = [0, 1]
+                        print(output)
+                    output = fleet_util.all_gather(input, "all")
+                    print(output)
+                    # output = [0, 1, 0, 1]
+
+                if __name__ == "__main__":
+                    train()
+        """
         _comm_world = self.__check_comm_world(comm_world)
         return self.role_maker._all_gather(_comm_world, input)
 
-    def broadcast(self):
+    def _broadcast(self):
         pass
 
-    def scatter(self):
+    def _scatter(self):
         pass
 
     def get_file_shard(self, files):
         """
-        split files before distributed training,
-        example 1: files is [a, b, c ,d, e]  and trainer_num = 2, then trainer
-                   0 gets [a, b, c] and trainer 1 gets [d, e].
-        example 2: files is [a, b], and trainer_num = 3, then trainer 0 gets
-                   [a], trainer 1 gets [b],  trainer 2 gets []
+        Split files before distributed training, and return filelist assigned to the current trainer.
+
+        .. code-block:: text
+
+            example 1: files is [a, b, c ,d, e]  and trainer_num = 2, then trainer
+                    0 gets [a, b, c] and trainer 1 gets [d, e].
+            example 2: files is [a, b], and trainer_num = 3, then trainer 0 gets
+                    [a], trainer 1 gets [b],  trainer 2 gets []
 
         Args:
-            files(list): file list need to be read.
+            files(list): File list need to be read.
 
         Returns:
-            list: files belongs to this worker.
+            List: Files belong to this worker.
+
+        Examples:
+
+            .. code-block:: python
+
+                from paddle.distributed.fleet.base.util_factory import fleet_util
+                import paddle.distributed.fleet.base.role_maker as role_maker
+
+                role = role_maker.UserDefinedRoleMaker(
+                    is_collective=False,
+                    init_gloo=False,
+                    current_id=0,
+                    role=role_maker.Role.WORKER,
+                    worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
+                    server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
+                fleet_util._set_role_maker(role)
+                files = fleet_util.get_file_shard(["file1", "file2", "file3"])
+                # files = ["file1", "file2"]
         """
         if not isinstance(files, list):
             raise TypeError("files should be a list of file need to be read.")
@@ -140,6 +285,30 @@ class UtilBase(object):
         return trainer_files[trainer_id]
 
     def print_on_rank(self, message, rank_id):
+        """
+        Woker of rank `rank_id` print some message. 
+
+        Args:
+            message(str): Log to be printed.
+            rank_id(int): trainer id.
+
+        Examples:
+
+            .. code-block:: python
+
+                from paddle.distributed.fleet.base.util_factory import fleet_util
+                import paddle.distributed.fleet.base.role_maker as role_maker
+
+                role = role_maker.UserDefinedRoleMaker(
+                    is_collective=False,
+                    init_gloo=False,
+                    current_id=0,
+                    role=role_maker.Role.WORKER,
+                    worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
+                    server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
+                fleet_util._set_role_maker(role)
+                fleet_util.print_on_rank("I'm worker 0", 0)
+        """
         if self.role_maker.worker_index() != rank_id:
             return
         print(message)
@@ -297,7 +466,7 @@ class UtilBase(object):
         with fluid.scope_guard(scope):
             inference_program, feed_target_names, fetch_targets = \
                 fluid.io.load_inference_model(config.dump_model_dir, exe, model_filename=model_filename,
-                                            params_filename=config.save_params_filename)
+                                              params_filename=config.save_params_filename)
 
             # check program vars and saved vars shape
             orig_para_shape = {
diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
index 6dba385c569..a527393f602 100644
--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -87,7 +87,7 @@ def _parse_args():
 see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
 ''')
 
-    #Optional arguments for the launch helper
+    # Optional arguments for the launch helper
     parser.add_argument(
         "--ips",
         type=str,
@@ -115,7 +115,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
         default="log",
         help="The path for each process's log.If it's not set, the log will printed to default pipe."
     )
-    #positional
+    # positional
     parser.add_argument(
         "training_script",
         type=str,
@@ -124,7 +124,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
         "followed by all the arguments for the "
         "training script")
 
-    #rest from the training program
+    # rest from the training program
     parser.add_argument('training_script_args', nargs=REMAINDER)
     return parser.parse_args()
 
@@ -138,7 +138,7 @@ def get_cluster_from_args(args, gpus):
 
     # node_ip = args.node_ip
     assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \
-                % (node_ip, node_ips)
+        % (node_ip, node_ips)
     node_rank = node_ips.index(node_ip)
 
     logger.debug("parsed from args: node_ips:{} node_ip:{} node_rank:{}".format(
@@ -280,7 +280,7 @@ def launch_ps(args):
         _, current_node_ip = get_host_name_ip()
 
     assert current_node_ip in node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \
-                % (current_node_ip, node_ips)
+        % (current_node_ip, node_ips)
     node_rank = node_ips.index(current_node_ip)
     logger.debug(
         "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}, server_ports:{}".
@@ -323,10 +323,12 @@ def launch_ps(args):
     for idx, cur_server in enumerate(pod.servers):
         proc_env = {
             "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints,
+            "PADDLE_TRAINER_ENDPOINTS": worker_endpoints,
             "PADDLE_PORT": cur_server.endpoint.split(":")[1],
             "TRAINING_ROLE": "PSERVER",
             "PADDLE_TRAINERS_NUM": str(worker_num),
-            "POD_IP": cur_server.endpoint.split(":")[0]
+            "POD_IP": cur_server.endpoint.split(":")[0],
+            "PADDLE_WITH_GLOO": "1"
         }
         current_env.update(proc_env)
 
@@ -365,7 +367,8 @@ def launch_ps(args):
             "PADDLE_TRAINER_ENDPOINTS": worker_endpoints,
             "PADDLE_TRAINERS_NUM": str(worker_num),
             "TRAINING_ROLE": "TRAINER",
-            "PADDLE_TRAINER_ID": str(cur_worker.rank)
+            "PADDLE_TRAINER_ID": str(cur_worker.rank),
+            "PADDLE_WITH_GLOO": "1"
         }
         current_env.update(proc_env)
 
@@ -430,7 +433,11 @@ def launch():
         co_arg for co_arg in collective_args
         if co_arg in " ".join(sys.argv[1:-1])
     ]
-    cuda_device_num = fluid.core.get_cuda_device_count()
+    if fluid.core.is_compiled_with_cuda():
+        cuda_device_num = fluid.core.get_cuda_device_count()
+    else:
+        cuda_device_num = 0
+
     if len(has_ps_args) > 0 or cuda_device_num == 0:
         logger.info(
             "Run parameter-sever cpu mode. pserver arguments:{}, cuda count:{}".
diff --git a/python/paddle/distributed/fleet/utils/__init__.py b/python/paddle/distributed/fleet/utils/__init__.py
index f1911408c84..abf198b97e6 100644
--- a/python/paddle/distributed/fleet/utils/__init__.py
+++ b/python/paddle/distributed/fleet/utils/__init__.py
@@ -11,8 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from .fs import *
-from .http_server import KVHandler, KVHTTPServer, KVServer
-
-#__all__ = ['KVHandler', 'KVHTTPServer', 'KVServer'] + fs.__all__
diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py
index 966b7219d60..b7c50bda3ea 100644
--- a/python/paddle/distributed/fleet/utils/fs.py
+++ b/python/paddle/distributed/fleet/utils/fs.py
@@ -32,10 +32,7 @@ import functools
 from pathlib import PurePosixPath, Path
 import shutil
 
-__all__ = [
-    'FS', 'LocalFS', 'HDFSClient', 'ExecuteError', 'FSTimeOut',
-    'FSFileExistsError', 'FSFileNotExistsError', 'FSShellCmdAborted'
-]
+__all__ = ['LocalFS', 'HDFSClient']
 
 
 class ExecuteError(Exception):
@@ -117,7 +114,37 @@ class FS(object):
 
 
 class LocalFS(FS):
+    """
+    A tool of local file system.
+
+    Examples:
+        .. code-block:: python
+
+            from paddle.distributed.fleet.utils.fs import LocalFS
+
+            client = LocalFS()
+            subdirs, files = client.ls_dir("./")
+    """
+
     def ls_dir(self, fs_path):
+        """	
+        List directorys and files under `fs_path` .
+
+        Args:
+            fs_path(str): The local file path.
+
+        Returns:
+            Tuple: Return a 2-tuple, the first is a list of all its subdirectories, 
+            and the second is a list of all its subfiles, e.g. ([subdirname1, subdirname1, ...], [filename1, filename2, ...]).
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                subdirs, files = client.ls_dir("./")
+        """
         if not self.is_exist(fs_path):
             return [], []
 
@@ -132,11 +159,46 @@ class LocalFS(FS):
         return dirs, files
 
     def mkdirs(self, fs_path):
+        """
+        Create a remote HDFS directory.
+
+        Args:
+            fs_path(str): The local directory path.
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                client.mkdirs("test_mkdirs")
+                client.delete("test_mkdirs")
+        """
         assert not os.path.isfile(fs_path), "{} is already a file".format(
             fs_path)
         os.system("mkdir -p {}".format(fs_path))
 
     def rename(self, fs_src_path, fs_dst_path):
+        """
+        Rename the file.
+
+        Args:
+            fs_src_path(str): The actual name of the file or directory
+            fs_dst_path(str): The new name of the file or directory.
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                client.touch("test_rename_src")
+                print(client.is_exists("test_rename_src")) # True
+                client.rename("test_rename_src", "test_rename_dst")
+                print(client.is_exists("test_rename_src")) # False
+                print(client.is_exists("test_rename_dst")) # True
+                client.delete("test_rename_dst")
+        """
         os.rename(fs_src_path, fs_dst_path)
 
     def _rmr(self, fs_path):
@@ -146,6 +208,21 @@ class LocalFS(FS):
         os.remove(fs_path)
 
     def delete(self, fs_path):
+        """
+        Delete the local file path, whether it's a file or directory.
+
+        Args:
+            fs_path(str): The local file path.
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                client.mkdirs("test_localFS_mkdirs")
+                client.delete("test_localFS_mkdirs")
+        """
         if not self.is_exist(fs_path):
             return
 
@@ -158,15 +235,88 @@ class LocalFS(FS):
         return False
 
     def is_file(self, fs_path):
+        """
+        Whether the local file path is a file.
+
+        Args:
+            fs_path(str): The local file path.
+
+        Returns:
+            Bool: Return true if the path exists and it's a file, otherwise return false.
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                client.touch("test_is_file")
+                print(client.is_file("test_is_file")) # True
+                client.delete("test_is_file")
+        """
         return os.path.isfile(fs_path)
 
     def is_dir(self, fs_path):
+        """
+        Whether the local file path is a directory.
+
+        Args:
+            fs_path(str): The local file path.
+
+        Returns:
+            Bool: Return true if the path exists and it's a directory, otherwise return false.
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                client.mkdirs("test_is_dir")
+                print(client.is_dir("test_is_file")) # True
+                client.delete("test_is_dir")
+        """
         return os.path.isdir(fs_path)
 
     def is_exist(self, fs_path):
+        """
+        Whether the local file path exists.
+
+        Args:
+            fs_path(str): The local file path.
+
+        Returns:
+            Bool: Wheter it's a file or directory, return true if the path exists, 
+            otherwise return false.
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                ret = local_fs.is_exist("test_is_exist")
+        """
         return os.path.exists(fs_path)
 
     def touch(self, fs_path, exist_ok=True):
+        """
+        Create a local file.
+
+        Args:
+            fs_path(str): The local file path.
+            exist_ok(bool): When `fs_path` exists, if `exist_ok` is set false,
+            program will throw an Exception. Default is true.
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                client.touch("test_touch")
+                client.delete("test_touch")
+        """
         if self.is_exist(fs_path):
             if exist_ok:
                 return
@@ -175,6 +325,26 @@ class LocalFS(FS):
         return Path(fs_path).touch(exist_ok=True)
 
     def mv(self, src_path, dst_path, overwrite=False, test_exists=False):
+        """
+        Move a local file or directory from `src_path` to `dst_path` .
+
+        Args:
+            src_path(str):  Name of the file or directory, that's needed to be moved.
+            dst_path(str):  Name of the file or directory to which to move to.
+            overwrite(bool): Whether to re-write `dst_path` if that exists. Default is False.
+            test_exists(bool): Check the existence of `src_path` and `dst_path` . 
+            When `test_exists` is set true, if `src_path` doesn't exist or `dst_path` exists, program will throw an Excetption. 
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                client.touch("test_mv_src")
+                client.mv("test_mv_src", "test_mv_dst")
+                client.delete("test_mv_dst")
+        """
         if not self.is_exist(src_path):
             raise FSFileNotExistsError
 
@@ -188,7 +358,21 @@ class LocalFS(FS):
 
     def list_dirs(self, fs_path):
         """	
-        list directory under fs_path, and only give the pure name, not include the fs_path	
+        Only list directorys under `fs_path` .
+
+        Args:
+            fs_path(str): The local file path.
+
+        Returns:
+            List: A list of all its subdirectories, e.g. [subdirname1, subdirname1, ...].
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                subdirs = client.list_dirs("./")
         """
         if not self.is_exist(fs_path):
             return []
@@ -217,7 +401,7 @@ def _handle_errors(max_time_out=None):
             while True:
                 try:
                     return f(*args, **kwargs)
-                #important: only ExecuteError need to retry
+                # important: only ExecuteError need to retry
                 except ExecuteError as e:
                     if time.time() - start >= time_out:
                         raise FSTimeOut("args:{} timeout:{}".format(
@@ -236,12 +420,36 @@ def _handle_errors(max_time_out=None):
 
 
 class HDFSClient(FS):
+    """
+    A tool of HDFS.
+
+    Args:
+        hadoop_home(str): Hadoop home. 
+        configs(dict): Hadoop config. It is a dictionary and needs to contain the
+            keys: "fs.default.name" and "hadoop.job.ugi".
+
+    Examples:
+
+        .. code-block:: text
+
+            from paddle.distributed.fleet.utils.fs import HDFSClient
+            hadoop_home = "/home/client/hadoop-client/hadoop/"
+
+            configs = {
+                "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                "hadoop.job.ugi": "hello,hello123"
+            }
+
+            client = HDFSClient(hadoop_home, configs)
+            client.ls_dir("hdfs:/test_hdfs_client")
+    """
+
     def __init__(
             self,
             hadoop_home,
             configs,
-            time_out=5 * 60 * 1000,  #ms
-            sleep_inter=1000):  #ms
+            time_out=5 * 60 * 1000,  # ms
+            sleep_inter=1000):  # ms
         # Raise exception if JAVA_HOME not exists.
         java_home = os.environ["JAVA_HOME"]
 
@@ -272,6 +480,30 @@ class HDFSClient(FS):
 
     @_handle_errors()
     def list_dirs(self, fs_path):
+        """	
+        Only list directorys under `fs_path` .
+
+        Args:
+            fs_path(str): The HDFS file path.
+
+        Returns:
+            List: A list of all its subdirectories, e.g. [subdirname1, subdirname1, ...].
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                subdirs = client.list_dirs("hdfs:/test_hdfs_client")
+        """
         if not self.is_exist(fs_path):
             return []
 
@@ -281,7 +513,29 @@ class HDFSClient(FS):
     @_handle_errors()
     def ls_dir(self, fs_path):
         """	
-        list directory under fs_path, and only give the pure name, not include the fs_path	
+        List directorys and files under `fs_path` .
+
+        Args:
+            fs_path(str): The HDFS file path.
+
+        Returns:
+            Tuple: Return a 2-tuple, the first element is the list of all its subdirectories, 
+            and the second one is the list of all its subfiles, e.g. ([subdirname1, subdirname1, ...], [filename1, filename2, ...]).
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                subdirs, files = client.ls_dir("hdfs:/test_hdfs_client")
         """
         if not self.is_exist(fs_path):
             return [], []
@@ -320,6 +574,30 @@ class HDFSClient(FS):
 
     @_handle_errors()
     def is_dir(self, fs_path):
+        """
+        Whether the remote HDFS path is a directory.
+
+        Args:
+            fs_path(str): The HDFS file path.
+
+        Returns:
+            Bool: Return true if the path exists and it's a directory, otherwise return false.
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                ret = client.is_file("hdfs:/test_hdfs_client")
+        """
         if not self.is_exist(fs_path):
             return False
 
@@ -338,6 +616,30 @@ class HDFSClient(FS):
         return True
 
     def is_file(self, fs_path):
+        """
+        Whether the remote HDFS path is a file.
+
+        Args:
+            fs_path(str): The HDFS file path.
+
+        Returns:
+            Bool: Return true if the path exists and it's a file, otherwise return false.
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                ret = client.is_file("hdfs:/test_hdfs_client")
+        """
         if not self.is_exist(fs_path):
             return False
 
@@ -345,6 +647,31 @@ class HDFSClient(FS):
 
     @_handle_errors()
     def is_exist(self, fs_path):
+        """
+        Whether the remote HDFS path exists.
+
+        Args:
+            fs_path(str): The hdfs file path.
+
+        Returns:
+            Bool: Whether it's is file or directory, return true if the path exists,
+            otherwise return false.
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                ret = client.is_exist("hdfs:/test_hdfs_client")
+        """
         cmd = "ls {} ".format(fs_path)
         ret, out = self._run_cmd(cmd, redirect_stderr=True)
         if ret != 0:
@@ -357,6 +684,28 @@ class HDFSClient(FS):
 
     # can't retry
     def upload(self, local_path, fs_path):
+        """
+        Upload the local path to remote HDFS.
+
+        Args:
+            local_path(str): The local path.
+            fs_path(str): The HDFS path.
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                client.upload("test_hdfs_client", "hdfs:/test_hdfs_client")
+        """
         if self.is_exist(fs_path):
             raise FSFileExistsError("{} exists".format(fs_path))
 
@@ -380,6 +729,28 @@ class HDFSClient(FS):
 
     # can't retry
     def download(self, fs_path, local_path):
+        """
+        Download remote HDFS path to the local.
+
+        Args:
+            fs_path(str):  The HDFS path.
+            local_path(str): The local path.
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                client.download("hdfs:/test_hdfs_client", "./")
+        """
         if self.is_exist(local_path):
             raise FSFileExistsError("{} exists".format(local_path))
 
@@ -403,6 +774,27 @@ class HDFSClient(FS):
 
     @_handle_errors()
     def mkdirs(self, fs_path):
+        """
+        Create a remote HDFS directory.
+
+        Args:
+            fs_path(str): The HDFS directory path.
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                client.mkdirs("hdfs:/test_hdfs_client")
+        """
         if self.is_exist(fs_path):
             return
 
@@ -425,6 +817,30 @@ class HDFSClient(FS):
                 raise ExecuteError(cmd)
 
     def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True):
+        """
+        Move a remote HDFS file or directory from `fs_src_path` to `fs_dst_path` .
+
+        Args:
+            fs_src_path(str):  Name of the file or directory, that's needed to be moved.
+            fs_dst_path(str):  Name of the file or directory to which to move to.
+            overwrite(bool): Whether to re-write `fs_dst_path` if that exists. Default is False.
+            test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Excetption. 
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                client.mv("hdfs:/test_hdfs_client", "hdfs:/test_hdfs_client2")
+        """
         if overwrite and self.is_exist(fs_dst_path):
             self.delete(fs_dst_path)
 
@@ -467,6 +883,27 @@ class HDFSClient(FS):
 
     @_handle_errors()
     def delete(self, fs_path):
+        """
+        Delete a remote HDFS path, whether it's a file or directory.
+
+        Args:
+            fs_path(str): The HDFS file path.
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                client.delete("hdfs:/test_hdfs_client")
+        """
         if not self.is_exist(fs_path):
             return
 
@@ -477,6 +914,27 @@ class HDFSClient(FS):
         return self._rm(fs_path)
 
     def touch(self, fs_path, exist_ok=True):
+        """
+        Create a remote HDFS file.
+
+        Args:
+            fs_path(str): The HDFS file path.
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                client.touch("hdfs:/test_hdfs_client")
+        """
         if self.is_exist(fs_path):
             if exist_ok:
                 return
diff --git a/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py b/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
index ad51a043a0a..a8c1656b2b0 100644
--- a/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
+++ b/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
@@ -98,7 +98,7 @@ class AutoCheckpointChecker(object):
             self._fs_cache = os.getenv("PADDLE_EDL_FS_CACHE", ".cache")
 
             self._save_checkpoint_inter = int(
-                os.getenv("PADDLE_EDL_SAVE_CHECKPOINT_INTER", "900"))  #s
+                os.getenv("PADDLE_EDL_SAVE_CHECKPOINT_INTER", "900"))  # s
 
             if not self._ce_test:
                 assert len(self._hdfs_home) > 3 and \
@@ -132,7 +132,7 @@ class AutoCheckpointChecker(object):
         if in_dygraph_mode():
             return False
 
-        return  self._run_env is not None and \
+        return self._run_env is not None and \
             self._platform is not None and \
             self._job_id is not None and \
             self._hdfs_home is not None and \
diff --git a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py
index 6a752bc3053..766dcc39af1 100644
--- a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py
+++ b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py
@@ -19,7 +19,7 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
 
-from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 
 java_home = os.environ["JAVA_HOME"]
 
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py
index 90db9595d92..3c78438bdf6 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py
@@ -67,13 +67,13 @@ class AutoCheckpointTestDist(AutoCheckPointACLBase):
         save_dir = "./run_save_0"
         fs.delete(save_dir)
 
-        #basic
+        # basic
         exe, main_prog, startup_prog = self._generate()
 
         compiled, data_loader, optimizer, loss, image, label = \
             self._init_env(exe, main_prog, startup_prog, minimize=False)
 
-        #fleet
+        # fleet
         os.environ["TRAINING_ROLE"] = "TRAINER"
         os.environ["PADDLE_TRAINER_ID"] = "0"
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py
index 6414ef18d63..6cb40eef27e 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py
@@ -40,9 +40,9 @@ class TestCloudRoleMaker(unittest.TestCase):
             from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
             from paddle.fluid.incubate.fleet.base.role_maker import \
                 GeneralRoleMaker
-            from paddle.distributed.fleet.utils import KVHandler
-            from paddle.distributed.fleet.utils import KVServer
-            from paddle.distributed.fleet.utils import KVHTTPServer
+            from paddle.distributed.fleet.utils.http_server import KVHandler
+            from paddle.distributed.fleet.utils.http_server import KVServer
+            from paddle.distributed.fleet.utils.http_server import KVHTTPServer
         except:
             print("warning: no fleet, skip test_pslib_4")
             return
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_util.py b/python/paddle/fluid/tests/unittests/test_fleet_util.py
index dde36e073fb..d506088fde0 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_util.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_util.py
@@ -81,12 +81,12 @@ class TestFleetUtil(unittest.TestCase):
         self.assertEqual(user_id, 10)
 
     def test_fs(self):
-        from paddle.distributed.fleet.utils import LocalFS
+        from paddle.distributed.fleet.utils.fs import LocalFS
         fs = LocalFS()
         dirs, files = fs.ls_dir("test_tmp")
         dirs, files = fs.ls_dir("./")
         self.assertFalse(fs.need_upload_download())
-        fleet_util.set_file_system(fs)
+        fleet_util._set_file_system(fs)
 
     def test_barrier(self):
         try:
diff --git a/python/paddle/fluid/tests/unittests/test_fs_interface.py b/python/paddle/fluid/tests/unittests/test_fs_interface.py
index c01876531c9..581fa973811 100644
--- a/python/paddle/fluid/tests/unittests/test_fs_interface.py
+++ b/python/paddle/fluid/tests/unittests/test_fs_interface.py
@@ -20,7 +20,7 @@ import os
 import sys
 import inspect
 
-from paddle.distributed.fleet.utils import LocalFS, FS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.distributed.fleet.utils.fs import LocalFS, FS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 
 
 class FSTest(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_hdfs1.py b/python/paddle/fluid/tests/unittests/test_hdfs1.py
index 430ed1abe86..1aac1236156 100644
--- a/python/paddle/fluid/tests/unittests/test_hdfs1.py
+++ b/python/paddle/fluid/tests/unittests/test_hdfs1.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
@@ -19,12 +20,10 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
 
-from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 
 java_home = os.environ["JAVA_HOME"]
 
-from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
-
 
 class FSTest1(FSTestBase):
     def test_timeout(self):
diff --git a/python/paddle/fluid/tests/unittests/test_hdfs2.py b/python/paddle/fluid/tests/unittests/test_hdfs2.py
index 7754f89e3c9..1fa019bb9cd 100644
--- a/python/paddle/fluid/tests/unittests/test_hdfs2.py
+++ b/python/paddle/fluid/tests/unittests/test_hdfs2.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
@@ -19,12 +20,10 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
 
-from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 
 java_home = os.environ["JAVA_HOME"]
 
-from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
-
 
 class FSTest2(FSTestBase):
     def test_hdfs(self):
diff --git a/python/paddle/fluid/tests/unittests/test_hdfs3.py b/python/paddle/fluid/tests/unittests/test_hdfs3.py
index 1a045f4b17f..218bf12ca60 100644
--- a/python/paddle/fluid/tests/unittests/test_hdfs3.py
+++ b/python/paddle/fluid/tests/unittests/test_hdfs3.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
@@ -19,12 +20,10 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
 
-from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 
 java_home = os.environ["JAVA_HOME"]
 
-from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
-
 
 class FSTest3(FSTestBase):
     def test_hdfs(self):
-- 
GitLab


From 515efe42406f918ac26b17304e24ab0451577bda Mon Sep 17 00:00:00 2001
From: furnace <34057289+windstamp@users.noreply.github.com>
Date: Thu, 17 Sep 2020 20:34:35 +0800
Subject: [PATCH 127/261] add empty_like op (python, and unit test), use c++
 implementation of empty op, (#27287)

and optimize the c++ implmentation of empty op as PR#26659 reviews,
and add bool for shape op.
---
 paddle/fluid/operators/empty_op.cc            |  33 +--
 paddle/fluid/operators/shape_op.cc            |   2 +-
 paddle/fluid/operators/shape_op.cu            |   4 +-
 python/paddle/__init__.py                     |   1 +
 python/paddle/fluid/layers/nn.py              |   6 +-
 .../tests/unittests/test_empty_like_op.py     | 192 ++++++++++++++++++
 python/paddle/tensor/__init__.py              |   1 +
 python/paddle/tensor/creation.py              |  68 +++++++
 8 files changed, 288 insertions(+), 19 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_empty_like_op.py

diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc
index f539e2e6f6d..3d28ca90a5a 100644
--- a/paddle/fluid/operators/empty_op.cc
+++ b/paddle/fluid/operators/empty_op.cc
@@ -55,31 +55,38 @@ class EmptyOp : public framework::OperatorWithKernel {
     OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty");
 
     if (context->HasInput("ShapeTensor")) {
-      auto dims = context->GetInputDim("ShapeTensor");
+      auto shape_dims = context->GetInputDim("ShapeTensor");
       int num_ele = 1;
-      for (int i = 0; i < dims.size(); ++i) {
-        num_ele *= dims[i];
+      for (int i = 0; i < shape_dims.size(); ++i) {
+        num_ele *= shape_dims[i];
       }
-
-      context->SetOutputDim("Out", framework::make_ddim({num_ele}));
+      auto vec_dims = std::vector<int>(num_ele, -1);
+      context->SetOutputDim("Out", framework::make_ddim(vec_dims));
     } else if (context->HasInputs("ShapeTensorList")) {
       std::vector<int> out_dims;
       auto dims_list = context->GetInputsDim("ShapeTensorList");
       for (size_t i = 0; i < dims_list.size(); ++i) {
         auto& dims = dims_list[i];
-        PADDLE_ENFORCE_EQ(
-            dims, framework::make_ddim({1}),
-            "ShapeError: The shape of Tensor in list must be [1]. "
-            "But received the shape "
-            "is [%s]",
-            dims);
-
-        out_dims.push_back(dims[0]);
+        PADDLE_ENFORCE_EQ(dims, framework::make_ddim({1}),
+                          platform::errors::InvalidArgument(
+                              "The shape of Tensor in list must be [1]. "
+                              "But received the shape is [%s]",
+                              dims));
+
+        out_dims.push_back(-1);
       }
 
       context->SetOutputDim("Out", framework::make_ddim(out_dims));
     } else {
       auto& shape = context->Attrs().Get<std::vector<int64_t>>("shape");
+      for (size_t i = 0; i < shape.size(); ++i) {
+        PADDLE_ENFORCE_GE(
+            shape[i], 0,
+            platform::errors::InvalidArgument(
+                "Each value of attribute 'shape' is expected to be no less "
+                "than 0. But recieved: shape[%u] = %d; shape = [%s].",
+                i, shape[i], framework::make_ddim(shape)));
+      }
       context->SetOutputDim("Out", framework::make_ddim(shape));
     }
   }
diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc
index 62bffe63048..0ecf9bfb5d8 100644
--- a/paddle/fluid/operators/shape_op.cc
+++ b/paddle/fluid/operators/shape_op.cc
@@ -68,6 +68,6 @@ REGISTER_OPERATOR(
     shape, ops::ShapeOp, ops::ShapeOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<int>, ops::ShapeKernel<int32_t>,
+REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<bool>, ops::ShapeKernel<int>,
                        ops::ShapeKernel<int64_t>, ops::ShapeKernel<float>,
                        ops::ShapeKernel<double>);
diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu
index 4b9dca0d402..5d50b17818c 100644
--- a/paddle/fluid/operators/shape_op.cu
+++ b/paddle/fluid/operators/shape_op.cu
@@ -15,8 +15,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/shape_op.h"
 
 REGISTER_OP_CUDA_KERNEL(
-    shape, paddle::operators::ShapeKernel<int>,
-    paddle::operators::ShapeKernel<int32_t>,
+    shape, paddle::operators::ShapeKernel<bool>,
+    paddle::operators::ShapeKernel<int>,
     paddle::operators::ShapeKernel<int64_t>,
     paddle::operators::ShapeKernel<float>,
     paddle::operators::ShapeKernel<double>,
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 016726633ea..661471599cb 100755
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -77,6 +77,7 @@ from .tensor.creation import triu  #DEFINE_ALIAS
 from .tensor.creation import tril  #DEFINE_ALIAS
 from .tensor.creation import meshgrid  #DEFINE_ALIAS
 from .tensor.creation import empty  #DEFINE_ALIAS
+from .tensor.creation import empty_like  #DEFINE_ALIAS
 from .tensor.linalg import matmul  #DEFINE_ALIAS
 from .tensor.linalg import dot  #DEFINE_ALIAS
 # from .tensor.linalg import einsum        #DEFINE_ALIAS
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index bc9f182d95e..4a750f301a0 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -11229,7 +11229,7 @@ def shape(input):
                 input.shape = [3, 2]
 
     Args:
-        input (Variable): The input can be N-D Tensor or SelectedRows with data type float16, float32, float64, int32, int64.
+        input (Variable): The input can be N-D Tensor or SelectedRows with data type bool, float16, float32, float64, int32, int64.
                           If input variable is type of SelectedRows, returns the shape of it's inner tensor.
 
     Returns:
@@ -11253,8 +11253,8 @@ def shape(input):
             print(res) # [array([  3, 100, 100], dtype=int32)]
     """
     check_variable_and_dtype(
-        input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'],
-        'shape')
+        input, 'input',
+        ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], 'shape')
     helper = LayerHelper('shape', **locals())
     out = helper.create_variable_for_type_inference(dtype='int32')
     helper.append_op(
diff --git a/python/paddle/fluid/tests/unittests/test_empty_like_op.py b/python/paddle/fluid/tests/unittests/test_empty_like_op.py
new file mode 100644
index 00000000000..32d732d9a80
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_empty_like_op.py
@@ -0,0 +1,192 @@
+#  Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.data_feeder import convert_dtype
+import paddle.fluid.core as core
+from paddle.static import program_guard, Program
+
+
+class TestEmptyLikeAPICommon(unittest.TestCase):
+    def __check_out__(self, out):
+        data_type = convert_dtype(out.dtype)
+        self.assertEqual(data_type, self.dst_dtype,
+                         'dtype should be %s, but get %s' %
+                         (self.dst_dtype, data_type))
+
+        shape = out.shape
+        self.assertTupleEqual(shape, self.dst_shape,
+                              'shape should be %s, but get %s' %
+                              (self.dst_shape, shape))
+
+        if data_type in ['float32', 'float64', 'int32', 'int64']:
+            max_value = np.nanmax(out)
+            min_value = np.nanmin(out)
+            always_non_full_zero = max_value > min_value
+            always_full_zero = max_value == 0.0 and min_value == 0.0
+            self.assertTrue(always_full_zero or always_non_full_zero,
+                            'always_full_zero or always_non_full_zero.')
+        elif data_type in ['bool']:
+            total_num = out.size
+            true_num = np.sum(out == True)
+            false_num = np.sum(out == False)
+            self.assertTrue(total_num == true_num + false_num,
+                            'The value should always be True or False.')
+        else:
+            self.assertTrue(False, 'invalid data type')
+
+
+class TestEmptyLikeAPI(TestEmptyLikeAPICommon):
+    def setUp(self):
+        self.init_config()
+
+    def test_dygraph_api_out(self):
+        paddle.disable_static()
+        out = paddle.empty_like(self.x, self.dtype)
+        self.__check_out__(out.numpy())
+        paddle.enable_static()
+
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("float32")
+        self.dtype = self.x.dtype
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI2(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("float64")
+        self.dtype = self.x.dtype
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI3(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("int")
+        self.dtype = self.x.dtype
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI4(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("int64")
+        self.dtype = self.x.dtype
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI5(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("bool")
+        self.dtype = self.x.dtype
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI6(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("float64")
+        self.dtype = "float32"
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI7(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("int")
+        self.dtype = "float32"
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI8(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("int64")
+        self.dtype = "float32"
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI9(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("bool")
+        self.dtype = "float32"
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI10(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("float32")
+        self.dtype = "bool"
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI_Static(TestEmptyLikeAPICommon):
+    def setUp(self):
+        self.init_config()
+
+    def test_static_graph(self):
+        dtype = 'float32'
+
+        train_program = Program()
+        startup_program = Program()
+
+        with program_guard(train_program, startup_program):
+            x = np.random.random(self.x_shape).astype(dtype)
+            data_x = paddle.static.data(
+                'x', shape=self.data_x_shape, dtype=dtype)
+
+            out = paddle.empty_like(data_x)
+
+        place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else paddle.CPUPlace()
+        exe = paddle.static.Executor(place)
+        res = exe.run(train_program, feed={'x': x}, fetch_list=[out])
+
+        self.dst_dtype = dtype
+        self.dst_shape = x.shape
+        self.__check_out__(res[0])
+
+    def init_config(self):
+        self.x_shape = (200, 3)
+        self.data_x_shape = [200, 3]
+
+
+class TestEmptyLikeAPI_Static2(TestEmptyLikeAPI_Static):
+    def init_config(self):
+        self.x_shape = (3, 200, 3)
+        self.data_x_shape = [-1, 200, 3]
+
+
+class TestEmptyError(unittest.TestCase):
+    def test_attr(self):
+        def test_dtype():
+            x = np.random.random((200, 3)).astype("float64")
+            dtype = 'uint8'
+            result = paddle.empty_like(x, dtype=dtype)
+
+        self.assertRaises(TypeError, test_dtype)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 8bb584be236..a713663e182 100755
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -41,6 +41,7 @@ from .creation import triu  #DEFINE_ALIAS
 from .creation import tril  #DEFINE_ALIAS
 from .creation import meshgrid  #DEFINE_ALIAS
 from .creation import empty  #DEFINE_ALIAS
+from .creation import empty_like  #DEFINE_ALIAS
 from .io import save  #DEFINE_ALIAS
 from .io import load  #DEFINE_ALIAS
 from .linalg import matmul  #DEFINE_ALIAS
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index 8011b92964b..9aee911e568 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -49,6 +49,7 @@ __all__ = [
     'full',
     'full_like',
     'empty',
+    'empty_like',
     'triu',
     'tril',
     'meshgrid'
@@ -1068,3 +1069,70 @@ def empty(shape, dtype=None, name=None):
         stop_gradient=True)
     out.stop_gradient = True
     return out
+
+
+def empty_like(x, dtype=None, name=None):
+    """
+    This Op returns a Tensor with uninitialized data which has identical shape of ``x`` and ``dtype``.
+    If the ``dtype`` is None, the data type of Tensor is same with ``x``.
+    
+    Args:
+        x(Tensor): The input tensor which specifies shape and data type. The data type can be bool, float16, float32, float64, int32, int64.
+        dtype(np.dtype|str, optional): The data type of output. The data type can be one
+            of bool, float16, float32, float64, int32, int64. The default value is None, which means the output 
+            data type is the same as input.
+        name(str, optional): The default value is None. Normally there is no need for user to set this
+            property. For more information, please refer to :ref:`api_guide_Name`.
+    
+    Returns:
+        Tensor: Tensor which is created according to ``x`` and ``dtype``, and is uninitialized.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()   # Now we are in imperative mode
+          paddle.set_device("cpu")  # and use cpu device
+
+          x = paddle.randn([2, 3], 'float32')
+          output = paddle.empty_like(x)
+          #[[1.8491974e+20 1.8037303e+28 1.7443726e+28]     # uninitialized
+          # [4.9640171e+28 3.0186127e+32 5.6715899e-11]]    # uninitialized
+    """
+
+    if dtype is None:
+        dtype = x.dtype
+    dtype = convert_dtype(dtype)
+
+    if in_dygraph_mode():
+        out = core.ops.empty('shape', x.shape, 'dtype',
+                             convert_np_dtype_to_dtype_(dtype))
+        out.stop_gradient = True
+        return out
+
+    helper = LayerHelper("empty_like", **locals())
+    check_variable_and_dtype(
+        x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
+        'empty_like')
+    check_dtype(dtype, 'dtype',
+                ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
+                'empty_like')
+    out = helper.create_variable_for_type_inference(dtype=dtype)
+
+    inputs = {}
+    attrs = {}
+    attrs['dtype'] = convert_np_dtype_to_dtype_(dtype)
+    shape = paddle.shape(x)
+    utils.get_shape_tensor_inputs(
+        inputs=inputs, attrs=attrs, shape=shape, op_type='empty_like')
+
+    helper.append_op(
+        type='empty',
+        inputs=inputs,
+        outputs={'Out': [out]},
+        attrs=attrs,
+        stop_gradient=True)
+    out.stop_gradient = True
+    return out
-- 
GitLab


From 3c11717988b5bb3a320f65fe17dcf907a4d39de6 Mon Sep 17 00:00:00 2001
From: Shang Zhizhou <shangzhizhou@baidu.com>
Date: Thu, 17 Sep 2020 21:05:53 +0800
Subject: [PATCH 128/261] add op version checker to ir passes (#27329)

---
 .../embedding_eltwise_layernorm_fuse_pass.cc  |   6 +
 ...ding_eltwise_layernorm_fuse_pass_tester.cc |   9 +-
 .../ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc   |  19 ++
 .../conv_bias_mkldnn_fuse_pass_tester.cc      |   7 +
 .../conv_elementwise_add_mkldnn_fuse_pass.cc  |   6 +
 ...elementwise_add_mkldnn_fuse_pass_tester.cc |   7 +
 .../ir/mkldnn/depthwise_conv_mkldnn_pass.cc   |   5 +
 .../depthwise_conv_mkldnn_pass_tester.cc      |   8 +
 .../ir/multihead_matmul_fuse_pass.cc          |  11 ++
 .../ir/multihead_matmul_fuse_pass_tester.cc   |   7 +
 .../framework/ir/skip_layernorm_fuse_pass.cc  |   6 +
 .../ir/skip_layernorm_fuse_pass_tester.cc     |   7 +
 .../test_conv_bias_mkldnn_fuse_pass.py        | 171 ++++++++++++++++++
 13 files changed, 268 insertions(+), 1 deletion(-)
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py

diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
index 7612df9ab91..3f88a460d14 100644
--- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
@@ -19,6 +19,7 @@
 #include <vector>
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -334,3 +335,8 @@ void EmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const {
 
 REGISTER_PASS(embedding_eltwise_layernorm_fuse_pass,
               paddle::framework::ir::EmbeddingEltwiseLayerNormFusePass);
+REGISTER_PASS_CAPABILITY(embedding_eltwise_layernorm_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("lookup_table", 0)
+            .EQ("elementweise_add", 0));
diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc
index 71c9dbae1a4..727e42629f9 100644
--- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc
@@ -16,12 +16,13 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
-TEST(SkipLayerNormFusePass, basic) {
+TEST(EmbeddingElewiseLayernormFusePass, basic) {
   // inputs                           operator            output
   // --------------------------------------------------------------------
   // (x, y)                       elementwise_add    -> elementwise_out
@@ -91,6 +92,12 @@ TEST(SkipLayerNormFusePass, basic) {
           "The number of fusion nodes does not meet expectations after fuse"));
 }
 
+TEST(EmbeddingElewiseLayernormFusePass, pass_op_version_check) {
+  ASSERT_TRUE(
+      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
+          .IsPassCompatible("embedding_eltwise_layernorm_fuse_pass"));
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
index 82e0af3c198..f7a8e3e3f6c 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -84,6 +85,19 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
       VLOG(3) << "do not perform " + type() + "+bias fuse";
       return;
     }
+    if (conv->Op()->HasAttr("dilations")) {
+      auto dilations =
+          BOOST_GET_CONST(std::vector<int>, conv->Op()->GetAttr("dilations"));
+      for (const auto& d : dilations) {
+        if (d != 1) {
+          LOG(WARNING)
+              << "dilation conv not supported in MKLDNN, fuse not apply "
+              << "and set conv attribute use_mkldnn = false";
+          conv->Op()->SetAttr("use_mkldnn", false);
+          return;
+        }
+      }
+    }
 
     auto* eltwise_bias_tensor =
         scope->FindVar(eltwise_bias->Name())->GetMutable<LoDTensor>();
@@ -151,3 +165,8 @@ REGISTER_PASS(conv_transpose_bias_mkldnn_fuse_pass,
               paddle::framework::ir::Conv2DTransposeBiasFusePass);
 REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass,
               paddle::framework::ir::Conv3DBiasFusePass);
+REGISTER_PASS_CAPABILITY(conv_bias_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0));
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
index 88aac001a93..455350d2f70 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
@@ -18,6 +18,7 @@
 #include "paddle/fluid/platform/place.h"
 
 #include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/imperative/type_defs.h"
 
 namespace paddle {
@@ -149,6 +150,12 @@ TEST(ConvBiasFusePass, conv2d_transpose) {
   ASSERT_EQ(pass.type(), std::string("conv2d_transpose"));
 }
 
+TEST(ConvBiasFusePass, pass_op_version_check) {
+  ASSERT_TRUE(
+      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
+          .IsPassCompatible("conv_bias_mkldnn_fuse_pass"));
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
index af2b1308e08..2fb131aceaa 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
@@ -19,6 +19,7 @@
 #include <memory>
 #include <tuple>
 #include "paddle/fluid/framework/ir/graph_traits.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -341,3 +342,8 @@ void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const {
 
 REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass,
               paddle::framework::ir::ResidualConnectionMKLDNNFusePass);
+REGISTER_PASS_CAPABILITY(conv_elementwise_add_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0));
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
index 8a13596cd50..fd4910fc8e9 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
@@ -17,6 +17,7 @@
 
 #include "paddle/fluid/framework/ir/graph_traits.h"
 #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -267,6 +268,12 @@ TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) {
   AssertOpsCount(graph, 2, 1);
 }
 
+TEST(ConvElementwiseAddMKLDNNFusePass, pass_op_version_check) {
+  ASSERT_TRUE(
+      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
+          .IsPassCompatible("conv_elementwise_add_mkldnn_fuse_pass"));
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
index c5965701a53..df5ba3314e6 100644
--- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -57,3 +58,7 @@ void DepthwiseConvMKLDNNPass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(depthwise_conv_mkldnn_pass,
               paddle::framework::ir::DepthwiseConvMKLDNNPass);
+REGISTER_PASS_CAPABILITY(depthwise_conv_mkldnn_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination().EQ(
+            "depthwise_conv2d", 0));
diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc
index a37565236cd..c6c72ba33d6 100644
--- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc
@@ -16,6 +16,8 @@
 
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/framework/op_version_registry.h"
+
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -70,6 +72,12 @@ ProgramDesc BuildProgramDesc() {
   return prog;
 }
 
+TEST(DepthwiseConvMKLDNNPass, pass_op_version_check) {
+  ASSERT_TRUE(
+      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
+          .IsPassCompatible("depthwise_conv_mkldnn_pass"));
+}
+
 TEST(DepthwiseConvMKLDNNPass, basic) {
   auto prog = BuildProgramDesc();
 
diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
index 198107ea082..9d2b4ebaf8c 100644
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
@@ -19,6 +19,7 @@
 #include <vector>
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/errors.h"
 
 namespace paddle {
@@ -707,3 +708,13 @@ REGISTER_PASS(multihead_matmul_fuse_pass,
 
 REGISTER_PASS(multihead_matmul_fuse_pass_v2,
               paddle::framework::ir::MultiHeadMatmulV2FusePass);
+REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v2)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("mul", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("reshape2", 0)
+            .EQ("transpose2", 0)
+            .EQ("scale", 0)
+            .EQ("matmul", 0)
+            .EQ("softmax", 0));
diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc
index d8a06b037bd..2eda643d4e5 100644
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc
@@ -12,6 +12,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h"  // NOLINT
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -133,6 +134,12 @@ TEST(MultiHeadMatmulFusePass, basic) {
                         num_fused_nodes_after));
 }
 
+TEST(MultiHeadMatmulFusePass, pass_op_version_check) {
+  ASSERT_TRUE(
+      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
+          .IsPassCompatible("multihead_matmul_fuse_pass_v2"));
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
index 9dddc9154f8..2e3cd16d5ce 100644
--- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -180,3 +181,8 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
 
 REGISTER_PASS(skip_layernorm_fuse_pass,
               paddle::framework::ir::SkipLayerNormFusePass);
+REGISTER_PASS_CAPABILITY(skip_layernorm_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("elementwise_add", 0)
+            .EQ("layer_norm", 0));
diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc
index d2d74698728..eff5dcddf54 100644
--- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -54,6 +55,12 @@ TEST(SkipLayerNormFusePass, basic) {
           "The number of fusion nodes does not meet expectations after fuse"));
 }
 
+TEST(SkipLayerNormFusePass, pass_op_version_check) {
+  ASSERT_TRUE(
+      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
+          .IsPassCompatible("skip_layernorm_fuse_pass"));
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py
new file mode 100644
index 00000000000..5eb397b5a95
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py
@@ -0,0 +1,171 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import AnalysisConfig
+"""Test for fusion of conv and bias."""
+
+
+#padding SAME
+class ConvBiasMkldnnFusePassTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                padding="SAME",
+                bias_attr=param_attr)
+
+        self.feeds = {
+            "data": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+
+#padding VALID
+class ConvBiasMkldnnFusePassTest1(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                padding="VALID",
+                bias_attr=param_attr)
+
+        self.feeds = {
+            "data": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+
+#padding number
+class ConvBiasMkldnnFusePassTest2(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                padding=[2, 4, 6, 8],
+                bias_attr=param_attr)
+
+        self.feeds = {
+            "data": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+
+#dilation not supported yet, just print warning log and does not fuse
+class ConvBiasMkldnnFusePassTest3(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                padding="VALID",
+                dilation=2,
+                groups=3,
+                bias_attr=param_attr,
+                use_cudnn=False,
+                act="softmax",
+                data_format="NCHW")
+
+        self.feeds = {
+            "data": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+
+#all conv params except for dilation
+class ConvBiasMkldnnFusePassTest4(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                padding="VALID",
+                groups=3,
+                bias_attr=param_attr,
+                use_cudnn=False,
+                act="softmax",
+                data_format="NCHW")
+
+        self.feeds = {
+            "data": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab


From 01659a6961668454cb593018d933cc290e466cce Mon Sep 17 00:00:00 2001
From: HappyAngel <chenjiaobuaa@126.com>
Date: Thu, 17 Sep 2020 21:41:11 +0800
Subject: [PATCH 129/261] Polish operators error message in average_accumlate
 OP (#27268)

* fix op print error info problem. test=develop

* fix build error

* fix format

* fix error msg info

* fix format
---
 paddle/fluid/operators/average_accumulates_op.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h
index 3958d3f6854..338e46111fc 100644
--- a/paddle/fluid/operators/average_accumulates_op.h
+++ b/paddle/fluid/operators/average_accumulates_op.h
@@ -54,9 +54,13 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> {
     float average_window = ctx.Attr<float>("average_window");
     int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
     int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
-    PADDLE_ENFORCE_LE(min_average_window, max_average_window,
-                      "min_average_window shouldn't be larger than "
-                      "max_average_window");
+    PADDLE_ENFORCE_LE(
+        min_average_window, max_average_window,
+        platform::errors::InvalidArgument(
+            "The min_average_window > "
+            "max_average_window is not right, min_average_window is %ld, "
+            "max_average_window is %ld.",
+            min_average_window, max_average_window));
 
     // Get inputs
     auto* param = ctx.Input<Tensor>("param");
-- 
GitLab


From ac82baa80dd457032a54771c74505f2e07ef303e Mon Sep 17 00:00:00 2001
From: liym27 <33742067+liym27@users.noreply.github.com>
Date: Thu, 17 Sep 2020 23:01:26 +0800
Subject: [PATCH 130/261] [Dy2Stat-log] Add feature also_to_stdout and optimize
 log messages (#27285)

* Add env value to  log to stdout; 2.Add logger name

* Optimize log messages in dygraph-to-static

* Replace logging.warn and warnings.warn with logging_utils.warn
---
 .../dygraph_to_static/ast_transformer.py      |  2 +-
 .../dygraph_to_static/function_spec.py        |  5 +-
 .../dygraph_to_static/logging_utils.py        | 88 +++++++++++++++----
 .../dygraph_to_static/partial_program.py      |  8 +-
 .../dygraph_to_static/print_transformer.py    |  8 +-
 .../dygraph_to_static/program_translator.py   | 19 ++--
 python/paddle/fluid/dygraph/jit.py            |  5 +-
 .../dygraph_to_static/test_logging_utils.py   | 49 ++++++++++-
 8 files changed, 137 insertions(+), 47 deletions(-)

diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
index 5152799ca72..5050067e48a 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
@@ -60,7 +60,7 @@ class DygraphToStaticAst(gast.NodeTransformer):
     def transfer_from_node_type(self, node_wrapper):
         translator_logger = logging_utils.TranslatorLogger()
         translator_logger.log(
-            1, "   Source code: \n{}".format(ast_to_source_code(self.root)))
+            1, "Source code: \n{}".format(ast_to_source_code(self.root)))
         # Generic transformation
         self.visit(node_wrapper.node)
 
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
index 37ce8b0a152..3d1ed836ff1 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
@@ -12,17 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import logging
 import six
 import inspect
 import numpy as np
 import collections
+
 import paddle
 from paddle.fluid import core
 from paddle.fluid.dygraph import layers
 from paddle.fluid.layers.utils import flatten
 from paddle.fluid.layers.utils import pack_sequence_as
 from paddle.fluid.dygraph.base import switch_to_static_graph
+from paddle.fluid.dygraph.dygraph_to_static import logging_utils
 from paddle.fluid.dygraph.dygraph_to_static.utils import parse_arg_and_kwargs
 from paddle.fluid.dygraph.dygraph_to_static.utils import type_name
 from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code
@@ -291,7 +292,7 @@ def convert_to_input_spec(inputs, input_spec):
         if len(inputs) > len(input_spec):
             for rest_input in inputs[len(input_spec):]:
                 if isinstance(rest_input, (core.VarBase, np.ndarray)):
-                    logging.warning(
+                    logging_utils.warn(
                         "The inputs constain `{}` without specificing InputSpec, its shape and dtype will be treated immutable. "
                         "Please specific InputSpec information in `@declarative` if you expect them as mutable inputs.".
                         format(type_name(rest_input)))
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py
index c52872b1501..4d9ed5916ad 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py
@@ -26,6 +26,8 @@ CODE_LEVEL_ENV_NAME = 'TRANSLATOR_CODE_LEVEL'
 DEFAULT_VERBOSITY = -1
 DEFAULT_CODE_LEVEL = -1
 
+LOG_AllTransformer = 100
+
 
 def synchronized(func):
     def wrapper(*args, **kwargs):
@@ -53,10 +55,15 @@ class TranslatorLogger(object):
             return
 
         self._initialized = True
+        self.logger_name = "Dynamic-to-Static"
         self._logger = log_helper.get_logger(
-            __name__, 1, fmt='%(asctime)s-%(levelname)s: %(message)s')
+            self.logger_name,
+            1,
+            fmt='%(asctime)s %(name)s %(levelname)s: %(message)s')
         self._verbosity_level = None
         self._transformed_code_level = None
+        self._need_to_echo_log_to_stdout = None
+        self._need_to_echo_code_to_stdout = None
 
     @property
     def logger(self):
@@ -86,6 +93,28 @@ class TranslatorLogger(object):
         self.check_level(level)
         self._transformed_code_level = level
 
+    @property
+    def need_to_echo_log_to_stdout(self):
+        if self._need_to_echo_log_to_stdout is not None:
+            return self._need_to_echo_log_to_stdout
+        return False
+
+    @need_to_echo_log_to_stdout.setter
+    def need_to_echo_log_to_stdout(self, log_to_stdout):
+        assert isinstance(log_to_stdout, (bool, type(None)))
+        self._need_to_echo_log_to_stdout = log_to_stdout
+
+    @property
+    def need_to_echo_code_to_stdout(self):
+        if self._need_to_echo_code_to_stdout is not None:
+            return self._need_to_echo_code_to_stdout
+        return False
+
+    @need_to_echo_code_to_stdout.setter
+    def need_to_echo_code_to_stdout(self, code_to_stdout):
+        assert isinstance(code_to_stdout, (bool, type(None)))
+        self._need_to_echo_code_to_stdout = code_to_stdout
+
     def check_level(self, level):
         if isinstance(level, (six.integer_types, type(None))):
             rv = level
@@ -110,34 +139,56 @@ class TranslatorLogger(object):
 
     def error(self, msg, *args, **kwargs):
         self.logger.error(msg, *args, **kwargs)
+        if self.need_to_echo_log_to_stdout:
+            self._output_to_stdout('ERROR: ' + msg, *args)
 
     def warn(self, msg, *args, **kwargs):
-        self.logger.warn(msg, *args, **kwargs)
+        self.logger.warning(msg, *args, **kwargs)
+        if self.need_to_echo_log_to_stdout:
+            self._output_to_stdout('WARNING: ' + msg, *args)
 
     def log(self, level, msg, *args, **kwargs):
         if self.has_verbosity(level):
-            self.logger.log(level, msg, *args, **kwargs)
+            msg_with_level = '(Level {}) {}'.format(level, msg)
+            self.logger.info(msg_with_level, *args, **kwargs)
+            if self.need_to_echo_log_to_stdout:
+                self._output_to_stdout('INFO: ' + msg_with_level, *args)
 
     def log_transformed_code(self, level, ast_node, transformer_name, *args,
                              **kwargs):
         if self.has_code_level(level):
             source_code = ast_to_source_code(ast_node)
-            header_msg = "After the level {} ast transformer: '{}', the transformed code:\n"\
-                .format(level, transformer_name)
+            if level == LOG_AllTransformer:
+                header_msg = "After the last level ast transformer: '{}', the transformed code:\n" \
+                    .format(transformer_name)
+            else:
+                header_msg = "After the level {} ast transformer: '{}', the transformed code:\n"\
+                    .format(level, transformer_name)
 
             msg = header_msg + source_code
             self.logger.info(msg, *args, **kwargs)
 
+            if self.need_to_echo_code_to_stdout:
+                self._output_to_stdout('INFO: ' + msg, *args)
+
+    def _output_to_stdout(self, msg, *args):
+        msg = self.logger_name + ' ' + msg
+        print(msg % args)
+
 
 _TRANSLATOR_LOGGER = TranslatorLogger()
 
 
-def set_verbosity(level=0):
+def set_verbosity(level=0, also_to_stdout=False):
     """
-    Sets the verbosity level of log for dygraph to static graph.
+    Sets the verbosity level of log for dygraph to static graph. Logs can be output to stdout by setting `also_to_stdout`.
+
     There are two means to set the logging verbosity:
-     1. Call function `set_verbosity`
-     2. Set environment variable `TRANSLATOR_VERBOSITY`
+
+    1. Call function `set_verbosity`
+
+    2. Set environment variable `TRANSLATOR_VERBOSITY`
+
 
     **Note**:
     `set_verbosity` has a higher priority than the environment variable.
@@ -145,6 +196,7 @@ def set_verbosity(level=0):
     Args:
         level(int): The verbosity level. The larger value idicates more verbosity.
             The default value is 0, which means no logging.
+        also_to_stdout(bool): Whether to also output log messages to `sys.stdout`.
 
     Examples:
         .. code-block:: python
@@ -159,27 +211,30 @@ def set_verbosity(level=0):
             # The verbosity level is now 3, but it has no effect because it has a lower priority than `set_verbosity`
     """
     _TRANSLATOR_LOGGER.verbosity_level = level
+    _TRANSLATOR_LOGGER.need_to_echo_log_to_stdout = also_to_stdout
 
 
 def get_verbosity():
     return _TRANSLATOR_LOGGER.verbosity_level
 
 
-LOG_AllTransformer = 100
-
-
-def set_code_level(level=LOG_AllTransformer):
+def set_code_level(level=LOG_AllTransformer, also_to_stdout=False):
     """
-    Sets the level to print code from specific level of Ast Transformer.
+    Sets the level to print code from specific level Ast Transformer. Code can be output to stdout by setting `also_to_stdout`.
+
     There are two means to set the code level:
-     1. Call function `set_code_level`
-     2. Set environment variable `TRANSLATOR_CODE_LEVEL`
+
+    1. Call function `set_code_level`
+
+    2. Set environment variable `TRANSLATOR_CODE_LEVEL`
+
 
     **Note**:
     `set_code_level` has a higher priority than the environment variable.
 
     Args:
         level(int): The level to print code. Default is 100, which means to print the code after all AST Transformers.
+        also_to_stdout(bool): Whether to also output code to `sys.stdout`.
 
     Examples:
         .. code-block:: python
@@ -195,6 +250,7 @@ def set_code_level(level=LOG_AllTransformer):
 
     """
     _TRANSLATOR_LOGGER.transformed_code_level = level
+    _TRANSLATOR_LOGGER.need_to_echo_code_to_stdout = also_to_stdout
 
 
 def get_code_level():
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
index 59cb5fb144e..1004665ca15 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
@@ -14,21 +14,17 @@
 
 from __future__ import print_function
 import numpy as np
-import logging
 import six
 
-from paddle.fluid import log_helper
 from paddle.fluid import framework, backward, core
 from paddle.fluid.dygraph import layers
 from paddle.fluid.dygraph.base import switch_to_static_graph
+from paddle.fluid.dygraph.dygraph_to_static import logging_utils
 from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_NO_VALUE_MAGIC_NUM
 from paddle.fluid.layers.utils import flatten
 from paddle.fluid.layers.utils import pack_sequence_as
 import paddle.compat as cpt
 
-_logger = log_helper.get_logger(
-    __name__, logging.WARNING, fmt='%(asctime)s-%(levelname)s: %(message)s')
-
 
 class NestSequence(object):
     """
@@ -72,7 +68,7 @@ class NestSequence(object):
                 if not isinstance(var, (framework.Variable, core.VarBase)):
                     warning_types.add(type(var))
             if warning_types:
-                _logger.warning(
+                logging_utils.warn(
                     "Output of traced function contains non-tensor type values: {}. "
                     "Currently, We don't support to update them while training and will return "
                     "what we first saw. Please try to return them as tensor.".
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py
index d555c8ed28f..efde2481721 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py
@@ -15,14 +15,8 @@
 from __future__ import print_function
 
 import gast
-import logging
 
-from paddle.fluid import log_helper
-from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper, NodeVarType, StaticAnalysisVisitor
-from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code
-
-_logger = log_helper.get_logger(
-    __name__, logging.WARNING, fmt='%(asctime)s-%(levelname)s: %(message)s')
+from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper, StaticAnalysisVisitor
 
 
 class PrintTransformer(gast.NodeTransformer):
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
index dbf030ccda1..5218c0aac95 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
@@ -13,17 +13,15 @@
 # limitations under the License.
 
 from __future__ import print_function
-import gast
+
 import collections
-import logging
+import gast
 import inspect
 import six
 import textwrap
 import threading
-import warnings
 import weakref
 
-import gast
 from paddle.fluid import framework
 from paddle.fluid import in_dygraph_mode
 from paddle.fluid.dygraph import layers
@@ -451,7 +449,7 @@ class StaticLayer(object):
                     format(self._function_spec))
         # If more than one programs have been cached, return the recent converted program by default.
         elif cached_program_len > 1:
-            logging.warning(
+            logging_utils.warn(
                 "Current {} has more than one cached programs: {}, the last traced progam will be return by default.".
                 format(self._function_spec, cached_program_len))
 
@@ -632,7 +630,7 @@ class ProgramCache(object):
             # Note: raise warnings if number of traced program is more than `max_tracing_count`
             current_tracing_count = len(self._caches)
             if current_tracing_count > MAX_TRACED_PROGRAM_COUNT:
-                logging.warning(
+                logging_utils.warn(
                     "Current traced program number: {} > `max_tracing_count`:{}. Too much cached programs will bring expensive overhead. "
                     "The reason may be: (1) passing tensors with different shapes, (2) passing python objects instead of tensors.".
                     format(current_tracing_count, MAX_TRACED_PROGRAM_COUNT))
@@ -804,8 +802,9 @@ class ProgramTranslator(object):
         assert callable(
             dygraph_func
         ), "Input dygraph_func is not a callable in ProgramTranslator.get_output"
+
         if not self.enable_to_static:
-            warnings.warn(
+            logging_utils.warn(
                 "The ProgramTranslator.get_output doesn't work when setting ProgramTranslator.enable to False. "
                 "We will just return dygraph output. "
                 "Please call ProgramTranslator.enable(True) if you would like to get static output."
@@ -879,8 +878,9 @@ class ProgramTranslator(object):
         assert callable(
             dygraph_func
         ), "Input dygraph_func is not a callable in ProgramTranslator.get_func"
+
         if not self.enable_to_static:
-            warnings.warn(
+            logging_utils.warn(
                 "The ProgramTranslator.get_func doesn't work when setting ProgramTranslator.enable to False. We will "
                 "just return dygraph output. Please call ProgramTranslator.enable(True) if you would like to get static output."
             )
@@ -933,8 +933,9 @@ class ProgramTranslator(object):
         assert callable(
             dygraph_func
         ), "Input dygraph_func is not a callable in ProgramTranslator.get_program"
+
         if not self.enable_to_static:
-            warnings.warn(
+            logging_utils.warn(
                 "The ProgramTranslator.get_program doesn't work when setting ProgramTranslator.enable to False."
                 "We will just return dygraph output. "
                 "Please call ProgramTranslator.enable(True) if you would like to get static output."
diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py
index 834c1a737d7..10819e4b320 100644
--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
@@ -26,6 +26,7 @@ from paddle.fluid import core
 from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy
 from paddle.fluid.data_feeder import check_type
 from paddle.fluid.dygraph.base import program_desc_tracing_guard, switch_to_static_graph
+from paddle.fluid.dygraph.dygraph_to_static import logging_utils
 from paddle.fluid.dygraph.dygraph_to_static.logging_utils import set_code_level, set_verbosity
 from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, StaticLayer, unwrap_decorators
 from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME, TranslatedLayer
@@ -120,7 +121,7 @@ def _dygraph_to_static_func_(dygraph_func):
     def __impl__(*args, **kwargs):
         program_translator = ProgramTranslator()
         if in_dygraph_mode() or not program_translator.enable_to_static:
-            warnings.warn(
+            logging_utils.warn(
                 "The decorator 'dygraph_to_static_func' doesn't work in "
                 "dygraph mode or set ProgramTranslator.enable to False. "
                 "We will just return dygraph output.")
@@ -215,7 +216,7 @@ def declarative(function=None, input_spec=None):
         if isinstance(function, Layer):
             if isinstance(function.forward, StaticLayer):
                 class_name = function.__class__.__name__
-                warnings.warn(
+                logging_utils.warn(
                     "`{}.forward` has already been decorated somewhere. It will be redecorated to replace previous one.".
                     format(class_name))
             function.forward = decorated(function.forward)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py
index 510b6156547..b8a18179742 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py
@@ -56,8 +56,30 @@ class TestLoggingUtils(unittest.TestCase):
         with self.assertRaises(TypeError):
             paddle.jit.set_verbosity(3.3)
 
-    def test_code_level(self):
+    def test_also_to_stdout(self):
+        logging_utils._TRANSLATOR_LOGGER.need_to_echo_log_to_stdout = None
+        self.assertEqual(
+            logging_utils._TRANSLATOR_LOGGER.need_to_echo_log_to_stdout, False)
 
+        paddle.jit.set_verbosity(also_to_stdout=False)
+        self.assertEqual(
+            logging_utils._TRANSLATOR_LOGGER.need_to_echo_log_to_stdout, False)
+
+        logging_utils._TRANSLATOR_LOGGER.need_to_echo_node_to_stdout = None
+        self.assertEqual(
+            logging_utils._TRANSLATOR_LOGGER.need_to_echo_code_to_stdout, False)
+
+        paddle.jit.set_code_level(also_to_stdout=True)
+        self.assertEqual(
+            logging_utils._TRANSLATOR_LOGGER.need_to_echo_code_to_stdout, True)
+
+        with self.assertRaises(AssertionError):
+            paddle.jit.set_verbosity(also_to_stdout=1)
+
+        with self.assertRaises(AssertionError):
+            paddle.jit.set_code_level(also_to_stdout=1)
+
+    def test_set_code_level(self):
         paddle.jit.set_code_level(None)
         os.environ[logging_utils.CODE_LEVEL_ENV_NAME] = '2'
         self.assertEqual(logging_utils.get_code_level(), 2)
@@ -71,7 +93,25 @@ class TestLoggingUtils(unittest.TestCase):
         with self.assertRaises(TypeError):
             paddle.jit.set_code_level(3.3)
 
-    def test_log(self):
+    def test_log_api(self):
+        # test api for CI Converage
+        logging_utils.set_verbosity(1, True)
+
+        logging_utils.warn("warn")
+        logging_utils.error("error")
+
+        logging_utils.log(1, "log level 1")
+        logging_utils.log(2, "log level 2")
+
+        source_code = "x = 3"
+        ast_code = gast.parse(source_code)
+        logging_utils.set_code_level(1, True)
+        logging_utils.log_transformed_code(1, ast_code, "TestTransformer")
+        logging_utils.set_code_level(logging_utils.LOG_AllTransformer, True)
+        logging_utils.log_transformed_code(logging_utils.LOG_AllTransformer,
+                                           ast_code, "TestTransformer")
+
+    def test_log_message(self):
         stream = io.BytesIO() if six.PY2 else io.StringIO()
         log = self.translator_logger.logger
         stdout_handler = logging.StreamHandler(stream)
@@ -84,13 +124,14 @@ class TestLoggingUtils(unittest.TestCase):
 
         if six.PY3:
             with mock.patch.object(sys, 'stdout', stream):
+                logging_utils.set_verbosity(1, False)
                 logging_utils.warn(warn_msg)
                 logging_utils.error(error_msg)
-                self.translator_logger.verbosity_level = 1
                 logging_utils.log(1, log_msg_1)
                 logging_utils.log(2, log_msg_2)
 
-            result_msg = '\n'.join([warn_msg, error_msg, log_msg_1, ""])
+            result_msg = '\n'.join(
+                [warn_msg, error_msg, "(Level 1) " + log_msg_1, ""])
             self.assertEqual(result_msg, stream.getvalue())
 
     def test_log_transformed_code(self):
-- 
GitLab


From b6a4349dd40eee17e485e149e09af4b29caa3d66 Mon Sep 17 00:00:00 2001
From: wawltor <fangzeyang0904@hotmail.com>
Date: Fri, 18 Sep 2020 10:14:47 +0800
Subject: [PATCH 131/261] fix the error message for the math dir

https://github.com/PaddlePaddle/Paddle/pull/27332
---
 paddle/fluid/operators/math/beam_search.cc |   5 +-
 paddle/fluid/operators/math/beam_search.cu |   5 +-
 paddle/fluid/operators/math/blas.cc        |   6 +-
 paddle/fluid/operators/math/blas_impl.cu.h |  26 ++--
 paddle/fluid/operators/math/blas_impl.h    | 148 ++++++++++++++++-----
 5 files changed, 146 insertions(+), 44 deletions(-)

diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc
index 0155ef188ef..550de1aadde 100644
--- a/paddle/fluid/operators/math/beam_search.cc
+++ b/paddle/fluid/operators/math/beam_search.cc
@@ -87,7 +87,10 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
     lod[0].assign(high_level.begin(), high_level.end());
     lod[1].assign(low_level.begin(), low_level.end());
     if (!framework::CheckLoD(lod)) {
-      PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "lod %s is not right in"
+          " beam_search, please check your code.",
+          framework::LoDToString(lod)));
     }
     selected_ids->set_lod(lod);
     selected_scores->set_lod(lod);
diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu
index cf6d44c1abc..ed3ead47d17 100644
--- a/paddle/fluid/operators/math/beam_search.cu
+++ b/paddle/fluid/operators/math/beam_search.cu
@@ -400,7 +400,10 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
 
     context.Wait();
     if (!framework::CheckLoD(selected_lod)) {
-      PADDLE_THROW("lod %s is not right", framework::LoDToString(selected_lod));
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "lod %s is not right in"
+          " beam_search, please check your code.",
+          framework::LoDToString(selected_lod)));
     }
 
     selected_ids->set_lod(selected_lod);
diff --git a/paddle/fluid/operators/math/blas.cc b/paddle/fluid/operators/math/blas.cc
index 6a143b3c056..2a7ce83967f 100644
--- a/paddle/fluid/operators/math/blas.cc
+++ b/paddle/fluid/operators/math/blas.cc
@@ -20,7 +20,11 @@ namespace operators {
 namespace math {
 MatDescriptor CreateMatrixDescriptor(const framework::DDim &tensor_dim,
                                      int num_flatten_cols, bool trans) {
-  PADDLE_ENFORCE_GT(tensor_dim.size(), 1);
+  PADDLE_ENFORCE_GT(
+      tensor_dim.size(), 1,
+      platform::errors::InvalidArgument("The tensor dim size should be greater "
+                                        "than 1, but reveived dim size is %d",
+                                        tensor_dim.size()));
   MatDescriptor retv;
   if (num_flatten_cols > 1) {
     auto flatten_dim = framework::flatten_to_2d(tensor_dim, num_flatten_cols);
diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h
index d0c5f74d4ef..a0464cf70e2 100644
--- a/paddle/fluid/operators/math/blas_impl.cu.h
+++ b/paddle/fluid/operators/math/blas_impl.cu.h
@@ -60,7 +60,8 @@ struct CUBlas<float> {
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cublasSgemmStridedBatched(args...));
 #else
-    PADDLE_THROW("SgemmStridedBatched is not supported on cuda <= 7.5");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "SgemmStridedBatched is not supported on cuda <= 7.5"));
 #endif
   }
 
@@ -85,7 +86,8 @@ struct CUBlas<float> {
           beta, C, Ctype, ldc));
     });
 #else
-    PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "cublasSgemmEx is not supported on cuda <= 7.5"));
 #endif
   }
 
@@ -146,13 +148,15 @@ struct CUBlas<double> {
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cublasDgemmStridedBatched(args...));
 #else
-    PADDLE_THROW("DgemmStridedBatched is not supported on cuda <= 7.5");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "DgemmStridedBatched is not supported on cuda <= 7.5"));
 #endif
   }
 
   template <typename... ARGS>
   static void GEMM_EX(ARGS... args) {
-    PADDLE_THROW("Currently there are not cublasDgemmEx.");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Currently there are not cublasDgemmEx."));
   }
 
   template <typename... ARGS>
@@ -216,7 +220,8 @@ struct CUBlas<platform::float16> {
         reinterpret_cast<const __half *>(beta), reinterpret_cast<__half *>(C),
         ldc, strideC, batchCount));
 #else
-    PADDLE_THROW("HgemmStridedBatched is not supported on cuda <= 7.5");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "HgemmStridedBatched is not supported on cuda <= 7.5"));
 #endif
   }
 
@@ -247,7 +252,8 @@ struct CUBlas<platform::float16> {
           beta, C, Ctype, ldc, computeType, algo));
     });
 #else
-    PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "cublasGemmEx is not supported on cuda <= 7.5"));
 #endif
   }
 };
@@ -302,8 +308,12 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
 
   // TODO(kexinzhao): add processing code for compute capability < 53 case
-  PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53,
-                    "cublas fp16 gemm requires GPU compute capability >= 53");
+  PADDLE_ENFORCE_GE(
+      context_.GetComputeCapability(), 53,
+      platform::errors::InvalidArgument(
+          "cublas fp16 gemm requires GPU compute capability >= 53,"
+          "but received %d",
+          context_.GetComputeCapability()));
 
   float h_alpha = static_cast<float>(alpha);
   float h_beta = static_cast<float>(beta);
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 892bf157381..515d6a2435e 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -29,7 +29,8 @@ template <>
 struct CBlas<int8_t> {
   template <typename... ARGS>
   static void VCOPY(ARGS... args) {
-    PADDLE_THROW("Blas VCOPY don't support int8_t");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Blas VCOPY do not supported on CPU, please check your code"));
   }
 };
 
@@ -347,22 +348,47 @@ struct CBlas<double> {
 
 template <>
 struct CBlas<platform::float16> {
-  static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
+  static void GEMM(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 GEMM not supported on CPU, please check your code"));
+  }
+
   static void SMM_GEMM(...) {
-    PADDLE_THROW("float16 SMM_GEMM not supported on CPU");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 SMM_GEMM not supported on CPU, please check your code"));
   }
-  static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); }
-  static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); }
-  static void VSQUARE(...) {
-    PADDLE_THROW("float16 VSQUARE not supported on CPU");
+  static void VMUL(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 VMUL not supported on CPU, please check your code"));
   }
-  static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); }
-  static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); };
-  static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); };
-  static void ASUM(...) { PADDLE_THROW("float16 ASUM not supported on CPU"); };
+  static void VEXP(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 VEXP not supported on CPU, please check your code"));
+  }
+  static void VSQUARE(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 VSQUARE not supported on CPU, please check your code"));
+  }
+  static void VPOW(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 VPOW not supported on CPU, please check your code"));
+  }
+  static void DOT(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 DOT not supported on CPU, please check your code"));
+  };
+  static void SCAL(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 SCAL not supported on CPU, please check your code"));
+  };
+  static void ASUM(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 ASUM not supported on CPU, please check your code"));
+  };
 #ifdef PADDLE_WITH_MKLML
   static void GEMM_BATCH(...) {
-    PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 GEMM_BATCH not supported on CPU, please check your code"));
   }
 #endif
 };
@@ -446,11 +472,18 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a, bool trans_a,
   auto dim_a = mat_a.dims();
   auto dim_b = mat_b.dims();
   auto dim_out = mat_out->dims();
-  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
-                 "The input and output of matmul be matrix");
-  PADDLE_ENFORCE(
-      mat_a.place() == mat_b.place() && mat_a.place() == mat_out->place(),
-      "The places of matrices must be same");
+  PADDLE_ENFORCE_EQ(
+      dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, true,
+      platform::errors::InvalidArgument(
+          "The input and output of matmul should be matrix, the dim size must "
+          "be 2,"
+          "but received dim size input_a:%d, input_b:%d, output:%d",
+          dim_a.size(), dim_b.size(), dim_out.size()));
+  PADDLE_ENFORCE_EQ(
+      mat_a.place() == mat_b.place() && mat_a.place() == mat_out->place(), true,
+      platform::errors::InvalidArgument("The places of matrices in the matmul "
+                                        "should be same, please check your "
+                                        "code."));
 
   int M = dim_out[0];
   int N = dim_out[1];
@@ -715,7 +748,13 @@ void Blas<platform::CPUDeviceContext>::BatchedGEMMWithHead(
     }
 
   } else {
-    PADDLE_ENFORCE_EQ(W1, H2);
+    PADDLE_ENFORCE_EQ(
+        W1, H2,
+        platform::errors::InvalidArgument(
+            "The fisrt matrix width should be same as second matrix height,"
+            "but received fisrt matrix width %d"
+            ", second matrix height %d",
+            W1, H2));
     int ldc = W2 * head_number;
     int sub_width = W1 / head_number;
 
@@ -785,7 +824,14 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
                                  const framework::Tensor &mat_b,
                                  const MatDescriptor &dim_b, T alpha,
                                  framework::Tensor *mat_out, T beta) const {
-  PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_);
+  PADDLE_ENFORCE_EQ(
+      dim_a.width_, dim_b.height_,
+      platform::errors::InvalidArgument(
+          "The fisrt matrix width should be same as second matrix height,"
+          "but received fisrt matrix width %d"
+          ", second matrix height %d",
+          dim_a.width_, dim_b.height_));
+
   CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
   CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
   if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
@@ -793,12 +839,14 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
                            dim_a.width_, alpha, mat_a.data<T>(),
                            mat_b.data<T>(), beta, mat_out->data<T>());
   } else {
-    PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
-                       dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0,
-                   "dim_a.batch_size should be equal to dim_b.batch_size, or "
-                   "one of dim_a.batch_size and dim_b.batch_size should be 0. "
-                   "But got dim_a.batch_size = %d, dim_b.batch_size = %d.",
-                   dim_a.batch_size_, dim_b.batch_size_);
+    PADDLE_ENFORCE_EQ(
+        dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 ||
+            dim_b.batch_size_ == 0,
+        true, platform::errors::InvalidArgument(
+                  "dim_a.batch_size should be equal to dim_b.batch_size, or "
+                  "one of dim_a.batch_size and dim_b.batch_size should be 0. "
+                  "But got dim_a.batch_size = %d, dim_b.batch_size = %d.",
+                  dim_a.batch_size_, dim_b.batch_size_));
     this->template BatchedGEMM<T>(
         transA, transB, dim_a.height_, dim_b.width_, dim_a.width_, alpha,
         mat_a.data<T>(), mat_b.data<T>(), beta, mat_out->data<T>(),
@@ -834,15 +882,42 @@ void Blas<DeviceContext>::MatMulWithHead(const framework::Tensor &mat_a,
                                          int head_number,
                                          framework::Tensor *mat_out, T beta,
                                          bool mat_b_split_vertical) const {
-  PADDLE_ENFORCE_EQ(dim_a.width_ % head_number, 0);
-  PADDLE_ENFORCE_GE(head_number, 1);
-  PADDLE_ENFORCE_LE(head_number, dim_a.width_);
+  PADDLE_ENFORCE_EQ(
+      dim_a.width_ % head_number, 0,
+      platform::errors::InvalidArgument(
+          "The first input width must be some times the head number"
+          "but received first input width %d"
+          ",  head_number %d",
+          dim_a.width_, head_number));
+  PADDLE_ENFORCE_GE(head_number, 1,
+                    platform::errors::InvalidArgument(
+                        "The head number should be greater equal 1,"
+                        "but received head number %d",
+                        head_number));
+  PADDLE_ENFORCE_LE(
+      head_number, dim_a.width_,
+      platform::errors::InvalidArgument(
+          "The head number should be less equal first input width,"
+          "but received first input width %d"
+          ",  head_number %d",
+          dim_a.width_, head_number));
   CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
   CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
 
   if (mat_b_split_vertical) {
-    PADDLE_ENFORCE_EQ(dim_b.height_, dim_a.width_ / head_number);
-    PADDLE_ENFORCE_EQ(dim_b.width_ % head_number, 0);
+    PADDLE_ENFORCE_EQ(
+        dim_b.height_, dim_a.width_ / head_number,
+        platform::errors::InvalidArgument(
+            "The second input height should be equal than first input width,"
+            "but received second input height %d, first input width %d",
+            dim_b.height_, dim_a.width_ / head_number));
+    PADDLE_ENFORCE_EQ(
+        dim_a.width_ % head_number, 0,
+        platform::errors::InvalidArgument(
+            "The second input width should be some times the head number"
+            "but received second input width %d"
+            ",  head_number %d",
+            dim_b.width_, head_number));
   }
 
   if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
@@ -888,9 +963,16 @@ void Blas<DeviceContext>::MatMulWithHead(const framework::Tensor &mat_a,
                              mat_out->data<T>() + sub_matC_offset, ldc);
     }
   } else {
-    PADDLE_ENFORCE_EQ((dim_a.batch_size_ == dim_b.batch_size_ ||
-                       dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0),
-                      true);
+    PADDLE_ENFORCE_EQ(
+        (dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 ||
+         dim_b.batch_size_ == 0),
+        true,
+        platform::errors::InvalidArgument(
+            "The first input batch size should be equal than second input,"
+            "either two input batch size is 0, but received first input batch "
+            "size"
+            " %d, second input batch size %d",
+            dim_a.batch_size_, dim_b.batch_size_));
 
     this->template BatchedGEMMWithHead<T>(
         transA, transB, dim_a.width_, dim_a.height_, dim_b.width_,
-- 
GitLab


From d28162b97fd2d224968c18c9da735900ef280e7c Mon Sep 17 00:00:00 2001
From: Zhen Wang <wangzhen31@baidu.com>
Date: Fri, 18 Sep 2020 10:19:07 +0800
Subject: [PATCH 132/261] Remove save_quantized_model in ImperativeQuantAware.
 (#27240)

---
 .../slim/quantization/imperative/qat.py       | 83 ++-----------------
 .../contrib/slim/tests/test_imperative_qat.py | 36 ++++----
 2 files changed, 27 insertions(+), 92 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
index 8d399c92901..7b276293638 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -67,6 +67,7 @@ class ImperativeQuantAware(object):
         Examples:
         .. code-block:: python
 
+            import paddle
             from paddle.fluid.contrib.slim.quantization \
                 import ImperativeQuantAware
             from paddle.vision.models \
@@ -86,13 +87,12 @@ class ImperativeQuantAware(object):
             # ...
             
             # Save quant model for the inference.
-            imperative_qat.save_quantized_model(
-                dirname="./resnet50_qat",
-                model=model,
-                input_shape=[(3, 224, 224)],
-                input_dtype=['float32'],
-                feed=[0],
-                fetch=[0])
+            paddle.jit.save(
+                layer=model,
+                model_path="./resnet50_qat",
+                input_spec=[
+                    paddle.static.InputSpec(
+                    shape=[None, 3, 224, 224], dtype='float32')])
         """
         super(ImperativeQuantAware, self).__init__()
         self._weight_bits = weight_bits
@@ -148,75 +148,6 @@ class ImperativeQuantAware(object):
             quant_layer = self._get_quantized_counterpart(layer)
             setattr(obj, target, quant_layer)
 
-    def save_quantized_model(self,
-                             dirname,
-                             model,
-                             input_shape,
-                             input_dtype,
-                             feed,
-                             fetch,
-                             append_batch_size=True):
-        """
-        Save the quantized model for the inference.
-
-        Args:
-            dirname (str): the directory to save the quantized model.
-            model(fluid.dygraph.Layer): the quantized model to be saved.
-            input_shape(list[tuple(int)]): The shape value for each input,
-                e.g. [(3, 224, 224)].
-            input_dtype(list[str]): The dtype value for each input,
-                e.g. ['float32'].
-            feed(list[int]): the indices of the input variables of the
-                imperative functions which will be saved as input variables in
-                inference model.
-            fetch(list[int]): the indices of the returned variable of the
-                imperative functions which will be saved as output variables in
-                inference model.
-            append_batch_size(bool, optional):
-                If true, it prepends an extra axis to the input_shape, meanwhile,
-                the input_shape shouldn't contain the batch size dimension.
-                Otherwise, it just uses the input_shape. Default True.
-        Returns:
-            None
-        """
-        assert isinstance(
-            input_shape, list), "The parameter `input_shape` shoubld be a list."
-        assert isinstance(
-            input_dtype, list), "The parameter `input_dtype` shoubld be a list."
-        assert isinstance(feed, list), "The parameter `feed` shoubld be a list."
-        assert isinstance(fetch,
-                          list), "The parameter `fetch` shoubld be a list."
-        assert len(input_shape) == len(
-            input_dtype
-        ), "The length of input_shape should be equal to  input_dtype's."
-        assert len(input_dtype) == len(
-            feed), "The length of input_shape should be equal to  feed's."
-
-        with dygraph.guard():
-            model.eval()
-            input_vars = []
-            for i, (shape, dtype) in enumerate(zip(input_shape, input_dtype)):
-                if append_batch_size:
-                    shape = [None] + list(shape)
-                # Note(Aurelius84): need a elegant way to name this.
-                in_spec = paddle.static.InputSpec(shape, dtype, 'feed_%d' % i)
-                input_vars.append(in_spec)
-            # use `declarative` to convert dygraph into static program
-            model.forward = dygraph.jit.declarative(
-                model.forward, input_spec=input_vars)
-            outputs = model.forward.concrete_program.outputs
-        input_spec = [input_vars[i] for i in feed]
-        configs = dygraph.jit.SaveLoadConfig()
-        configs.separate_params = True
-        if not isinstance(outputs, (tuple, list)):
-            outputs = [outputs]
-        configs.output_spec = [outputs[i] for i in fetch]
-        dygraph.jit.save(
-            layer=model,
-            model_path=dirname,
-            input_spec=input_spec,
-            configs=configs)
-
     def _get_quantized_counterpart(self, layer):
         quant_layers = tuple(self._quant_layers_map.values())
         quantized_counterpart = tuple('Quantized' + k
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
index 79b0bbd6a4d..f076d274b64 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
@@ -221,7 +221,7 @@ class TestImperativeQat(unittest.TestCase):
             model_dict = lenet.state_dict()
             fluid.save_dygraph(model_dict, "save_temp")
 
-            # test the correctness of `save_quantized_model`
+            # test the correctness of `paddle.jit.save`
             data = next(test_reader())
             test_data = np.array([x[0].reshape(1, 28, 28)
                                   for x in data]).astype('float32')
@@ -231,13 +231,14 @@ class TestImperativeQat(unittest.TestCase):
 
         # save inference quantized model
         path = "./mnist_infer_model"
-        imperative_qat.save_quantized_model(
-            dirname=path,
-            model=lenet,
-            input_shape=[(1, 28, 28)],
-            input_dtype=['float32'],
-            feed=[0],
-            fetch=[0])
+        paddle.jit.save(
+            layer=lenet,
+            model_path=path,
+            input_spec=[
+                paddle.static.InputSpec(
+                    shape=[None, 1, 28, 28], dtype='float32')
+            ])
+
         if core.is_compiled_with_cuda():
             place = core.CUDAPlace(0)
         else:
@@ -245,7 +246,10 @@ class TestImperativeQat(unittest.TestCase):
         exe = fluid.Executor(place)
         [inference_program, feed_target_names, fetch_targets] = (
             fluid.io.load_inference_model(
-                dirname=path, executor=exe))
+                dirname=path,
+                executor=exe,
+                model_filename="__model__",
+                params_filename="__variables__"))
         after_save, = exe.run(inference_program,
                               feed={feed_target_names[0]: test_data},
                               fetch_list=fetch_targets)
@@ -332,13 +336,13 @@ class TestImperativeQat(unittest.TestCase):
                 if batch_id % 100 == 0:
                     _logger.info('{}: {}'.format('loss', avg_loss.numpy()))
 
-        imperative_qat.save_quantized_model(
-            dirname="./dynamic_mnist",
-            model=lenet,
-            input_shape=[(1, 28, 28)],
-            input_dtype=['float32'],
-            feed=[0],
-            fetch=[0])
+        paddle.jit.save(
+            layer=lenet,
+            model_path="./dynamic_mnist",
+            input_spec=[
+                paddle.static.InputSpec(
+                    shape=[None, 1, 28, 28], dtype='float32')
+            ])
 
         # static graph train
         _logger.info(
-- 
GitLab


From fef94eac4e531b8043c422e45ba6bde3a4e5eac9 Mon Sep 17 00:00:00 2001
From: chajchaj <57249073+chajchaj@users.noreply.github.com>
Date: Fri, 18 Sep 2020 10:46:09 +0800
Subject: [PATCH 133/261] fix cross_entropy bug of the axis parameter in
 log_softmax (#27311)

---
 python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py | 2 +-
 python/paddle/nn/functional/loss.py                            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
index 4982cd19582..c6190590108 100644
--- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
@@ -26,7 +26,7 @@ def stable_softmax(x):
     return exps / np.sum(exps)
 
 
-def log_softmax(x, axis=-1):
+def log_softmax(x, axis=1):
     softmax_out = np.apply_along_axis(stable_softmax, axis, x)
     return np.log(softmax_out)
 
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index da086c0955e..4395520eec7 100644
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -1093,7 +1093,7 @@ def cross_entropy(input,
             " 'none', but received %s, which is not allowed." % reduction)
 
     #step 1. log_softmax
-    log_softmax_out = paddle.nn.functional.log_softmax(input)
+    log_softmax_out = paddle.nn.functional.log_softmax(input, axis=1)
     if weight is not None and not isinstance(weight, Variable):
         raise ValueError(
             "The weight' is not a Variable, please convert to Variable.")
-- 
GitLab


From 1a7559718e0983e321ebf821d4dd452d3965cdee Mon Sep 17 00:00:00 2001
From: GaoWei8 <53294385+GaoWei8@users.noreply.github.com>
Date: Fri, 18 Sep 2020 11:16:02 +0800
Subject: [PATCH 134/261] fix cudnn dyload (#27308)

* fix cudnn dyload error
---
 paddle/fluid/operators/cudnn_lstm_cache.h | 10 ++++++++++
 paddle/fluid/platform/cudnn_helper.h      |  2 ++
 paddle/fluid/platform/dynload/cudnn.cc    |  4 ++++
 paddle/fluid/platform/dynload/cudnn.h     | 21 +++++++++++++--------
 4 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/operators/cudnn_lstm_cache.h b/paddle/fluid/operators/cudnn_lstm_cache.h
index 4b46e2b475e..3181e4b1d99 100644
--- a/paddle/fluid/operators/cudnn_lstm_cache.h
+++ b/paddle/fluid/operators/cudnn_lstm_cache.h
@@ -54,6 +54,8 @@ class ScopedRNNBase {
       x_descs_.emplace_back(x_desc_.descriptor<T>(dims_x, strides_x));
       y_descs_.emplace_back(y_desc_.descriptor<T>(dims_y, strides_y));
     }
+
+#if CUDNN_VERSION >= 7201
     if (!sequence_length.empty()) {
       x_seq_desc_.descriptor<T>(seq_length_, batch_size_, input_size_, true,
                                 sequence_length);
@@ -61,6 +63,7 @@ class ScopedRNNBase {
                                 hidden_size_ * numDirections, true,
                                 sequence_length);
     }
+#endif
 
     // ------------------- cudnn hx, hy, cx, cy descriptors----------
     std::vector<int> dims_hx = {num_layers_ * numDirections, batch_size_,
@@ -96,10 +99,13 @@ class ScopedRNNBase {
         is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
         cudnn_type));
 #endif
+
+#if CUDNN_VERSION >= 7201
     if (!sequence_length.empty()) {
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode(
           rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED));
     }
+#endif
 
     // ------------------- cudnn weights_size ---------------------
     size_t weights_size_;
@@ -125,8 +131,10 @@ class ScopedRNNBase {
   }
   cudnnTensorDescriptor_t* x_descs() { return x_descs_.data(); }
   cudnnTensorDescriptor_t* y_descs() { return y_descs_.data(); }
+#if CUDNN_VERSION >= 7201
   cudnnRNNDataDescriptor_t x_seq_desc() { return x_seq_desc_.desc(); }
   cudnnRNNDataDescriptor_t y_seq_desc() { return y_seq_desc_.desc(); }
+#endif
   cudnnTensorDescriptor_t init_h_desc() { return init_h_desc_.desc(); }
   cudnnTensorDescriptor_t init_c_desc() { return init_c_desc_.desc(); }
   cudnnTensorDescriptor_t last_h_desc() { return last_h_desc_.desc(); }
@@ -151,8 +159,10 @@ class ScopedRNNBase {
 
   platform::ScopedTensorDescriptor x_desc_;
   platform::ScopedTensorDescriptor y_desc_;
+#if CUDNN_VERSION >= 7201
   platform::ScopedRNNTensorDescriptor x_seq_desc_;
   platform::ScopedRNNTensorDescriptor y_seq_desc_;
+#endif
   platform::ScopedTensorDescriptor init_h_desc_;
   platform::ScopedTensorDescriptor init_c_desc_;
   platform::ScopedTensorDescriptor last_h_desc_;
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index bb4c2a89f6f..4b9c5c429da 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -294,6 +294,7 @@ class ScopedTensorDescriptor {
   DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor);
 };
 
+#if CUDNN_VERSION >= 7201
 class ScopedRNNTensorDescriptor {
  public:
   ScopedRNNTensorDescriptor() {
@@ -337,6 +338,7 @@ class ScopedRNNTensorDescriptor {
   cudnnRNNDataDescriptor_t desc_;
   DISABLE_COPY_AND_ASSIGN(ScopedRNNTensorDescriptor);
 };
+#endif
 
 class ScopedDropoutDescriptor {
  public:
diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc
index 44a03d6f14a..1166dc5e4ad 100644
--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
@@ -46,6 +46,10 @@ CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
 CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
 #endif
 
+#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7
+CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP);
+#endif
+
 #ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7
 CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
 #endif
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 7e85cb57f33..fba41417648 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -101,9 +101,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
   __macro(cudnnDropoutGetStatesSize);                     \
   __macro(cudnnSetDropoutDescriptor);                     \
   __macro(cudnnRestoreDropoutDescriptor);                 \
-  __macro(cudnnCreateRNNDataDescriptor);                  \
-  __macro(cudnnDestroyRNNDataDescriptor);                 \
-  __macro(cudnnSetRNNDataDescriptor);                     \
   __macro(cudnnCreateRNNDescriptor);                      \
   __macro(cudnnGetRNNParamsSize);                         \
   __macro(cudnnGetRNNWorkspaceSize);                      \
@@ -112,11 +109,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
   __macro(cudnnRNNBackwardData);                          \
   __macro(cudnnRNNBackwardWeights);                       \
   __macro(cudnnRNNForwardInference);                      \
-  __macro(cudnnRNNForwardTrainingEx);                     \
-  __macro(cudnnSetRNNPaddingMode);                        \
-  __macro(cudnnRNNBackwardDataEx);                        \
-  __macro(cudnnRNNBackwardWeightsEx);                     \
-  __macro(cudnnRNNForwardInferenceEx);                    \
   __macro(cudnnDestroyDropoutDescriptor);                 \
   __macro(cudnnDestroyRNNDescriptor);                     \
   __macro(cudnnSetTensorNdDescriptorEx);
@@ -188,6 +180,19 @@ CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
 
+#if CUDNN_VERSION >= 7201
+#define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \
+  __macro(cudnnCreateRNNDataDescriptor);             \
+  __macro(cudnnDestroyRNNDataDescriptor);            \
+  __macro(cudnnSetRNNDataDescriptor);                \
+  __macro(cudnnSetRNNPaddingMode);                   \
+  __macro(cudnnRNNForwardTrainingEx);                \
+  __macro(cudnnRNNBackwardDataEx);                   \
+  __macro(cudnnRNNBackwardWeightsEx);                \
+  __macro(cudnnRNNForwardInferenceEx);
+CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+
 #if CUDNN_VERSION >= 7401
 #define CUDNN_DNN_ROUTINE_EACH_AFTER_R7(__macro)                     \
   __macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \
-- 
GitLab


From 03b0e0c42b571247e7dc0570eab58596fe601ae3 Mon Sep 17 00:00:00 2001
From: LielinJiang <50691816+LielinJiang@users.noreply.github.com>
Date: Fri, 18 Sep 2020 11:21:04 +0800
Subject: [PATCH 135/261] Remove dependences of cv2 (#27286)

* rm dependence of cv2
---
 python/paddle/utils/__init__.py               |  1 +
 python/paddle/utils/lazy_import.py            | 34 +++++++++++
 python/paddle/vision/datasets/folder.py       |  3 +-
 python/paddle/vision/transforms/functional.py | 51 +++++++++++-----
 python/paddle/vision/transforms/transforms.py | 61 +++++++++++++++----
 python/requirements.txt                       |  1 -
 python/setup.py.in                            |  3 -
 7 files changed, 121 insertions(+), 33 deletions(-)
 create mode 100644 python/paddle/utils/lazy_import.py

diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py
index 4a786679727..77f5ef7e966 100644
--- a/python/paddle/utils/__init__.py
+++ b/python/paddle/utils/__init__.py
@@ -16,6 +16,7 @@ from .profiler import ProfilerOptions
 from .profiler import Profiler
 from .profiler import get_profiler
 from .deprecated import deprecated
+from .lazy_import import try_import
 from ..fluid.framework import unique_name
 from ..fluid.framework import load_op_library
 from ..fluid.framework import require_version
diff --git a/python/paddle/utils/lazy_import.py b/python/paddle/utils/lazy_import.py
new file mode 100644
index 00000000000..69a32b77a8f
--- /dev/null
+++ b/python/paddle/utils/lazy_import.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Lazy imports for heavy dependencies."""
+
+import importlib
+
+
+def try_import(module_name):
+    """Try importing a module, with an informative error message on failure."""
+    install_name = module_name
+    if module_name == 'cv2':
+        install_name = 'opencv-python'
+
+    try:
+        mod = importlib.import_module(module_name)
+        return mod
+    except ImportError:
+        err_msg = (
+            "Failed importing {}. This likely means that some paddle modules "
+            "requires additional dependencies that have to be "
+            "manually installed (usually with `pip install {}`). ").format(
+                module_name, install_name)
+        raise ImportError(err_msg)
diff --git a/python/paddle/vision/datasets/folder.py b/python/paddle/vision/datasets/folder.py
index 725fd9acafb..8a3053abefc 100644
--- a/python/paddle/vision/datasets/folder.py
+++ b/python/paddle/vision/datasets/folder.py
@@ -14,9 +14,9 @@
 
 import os
 import sys
-import cv2
 
 from paddle.io import Dataset
+from paddle.utils import try_import
 
 __all__ = ["DatasetFolder", "ImageFolder"]
 
@@ -191,6 +191,7 @@ IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif',
 
 
 def cv2_loader(path):
+    cv2 = try_import('cv2')
     return cv2.imread(path)
 
 
diff --git a/python/paddle/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py
index b5668fa8c7d..acceb111e6f 100644
--- a/python/paddle/vision/transforms/functional.py
+++ b/python/paddle/vision/transforms/functional.py
@@ -18,10 +18,11 @@ import random
 import math
 import functools
 
-import cv2
 import numbers
 import numpy as np
 
+from paddle.utils import try_import
+
 if sys.version_info < (3, 3):
     Sequence = collections.Sequence
     Iterable = collections.Iterable
@@ -54,8 +55,8 @@ def flip(image, code):
     Accordding to the code (the type of flip), flip the input image
 
     Args:
-        image: Input image, with (H, W, C) shape
-        code: Code that indicates the type of flip.
+        image (np.ndarray): Input image, with (H, W, C) shape
+        code (int): Code that indicates the type of flip.
             -1 : Flip horizontally and vertically
             0 : Flip vertically
             1 : Flip horizontally
@@ -77,18 +78,28 @@ def flip(image, code):
             # flip horizontally
             F.flip(fake_img, 1)
     """
+    cv2 = try_import('cv2')
     return cv2.flip(image, flipCode=code)
 
 
 @keepdims
-def resize(img, size, interpolation=cv2.INTER_LINEAR):
+def resize(img, size, interpolation=1):
     """
     resize the input data to given size
 
     Args:
-        input: Input data, could be image or masks, with (H, W, C) shape
-        size: Target size of input data, with (height, width) shape.
-        interpolation: Interpolation method.
+        input (np.ndarray): Input data, could be image or masks, with (H, W, C) shape
+        size (int|list|tuple): Target size of input data, with (height, width) shape.
+        interpolation (int, optional): Interpolation method.
+            0 : cv2.INTER_NEAREST 
+            1 : cv2.INTER_LINEAR 
+            2 : cv2.INTER_CUBIC 
+            3 : cv2.INTER_AREA 
+            4 : cv2.INTER_LANCZOS4 
+            5 : cv2.INTER_LINEAR_EXACT
+            7 : cv2.INTER_MAX 
+            8 : cv2.WARP_FILL_OUTLIERS 
+            16: cv2.WARP_INVERSE_MAP 
 
     Examples:
         .. code-block:: python
@@ -102,7 +113,7 @@ def resize(img, size, interpolation=cv2.INTER_LINEAR):
 
             F.resize(fake_img, (200, 150))
     """
-
+    cv2 = try_import('cv2')
     if isinstance(interpolation, Sequence):
         interpolation = random.choice(interpolation)
 
@@ -179,6 +190,8 @@ def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'):
     assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'], \
         'Expected padding mode be either constant, edge, reflect or symmetric, but got {}'.format(padding_mode)
 
+    cv2 = try_import('cv2')
+
     PAD_MOD = {
         'constant': cv2.BORDER_CONSTANT,
         'edge': cv2.BORDER_REPLICATE,
@@ -214,18 +227,22 @@ def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'):
 
 
 @keepdims
-def rotate(img,
-           angle,
-           interpolation=cv2.INTER_LINEAR,
-           expand=False,
-           center=None):
+def rotate(img, angle, interpolation=1, expand=False, center=None):
     """Rotates the image by angle.
 
     Args:
         img (numpy.ndarray): Image to be rotated.
         angle (float|int): In degrees clockwise order.
-        interpolation (int, optional):
-            interpolation: Interpolation method.
+        interpolation (int, optional): Interpolation method. Default: 1.
+            0 : cv2.INTER_NEAREST 
+            1 : cv2.INTER_LINEAR 
+            2 : cv2.INTER_CUBIC 
+            3 : cv2.INTER_AREA 
+            4 : cv2.INTER_LANCZOS4 
+            5 : cv2.INTER_LINEAR_EXACT
+            7 : cv2.INTER_MAX 
+            8 : cv2.WARP_FILL_OUTLIERS 
+            16: cv2.WARP_INVERSE_MAP 
         expand (bool|optional): Optional expansion flag.
             If true, expands the output image to make it large enough to hold the entire rotated image.
             If false or omitted, make the output image the same size as the input image.
@@ -250,8 +267,9 @@ def rotate(img,
             fake_img = rotate(fake_img, 10)
             print(fake_img.shape)
     """
-    dtype = img.dtype
+    cv2 = try_import('cv2')
 
+    dtype = img.dtype
     h, w, _ = img.shape
     point = center or (w / 2, h / 2)
     M = cv2.getRotationMatrix2D(point, angle=-angle, scale=1)
@@ -312,6 +330,7 @@ def to_grayscale(img, num_output_channels=1):
             fake_img = to_grayscale(fake_img)
             print(fake_img.shape)
     """
+    cv2 = try_import('cv2')
 
     if num_output_channels == 1:
         img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py
index 14809e0c1ac..9ea82827176 100644
--- a/python/paddle/vision/transforms/transforms.py
+++ b/python/paddle/vision/transforms/transforms.py
@@ -17,7 +17,6 @@ from __future__ import division
 import math
 import sys
 import random
-import cv2
 
 import numpy as np
 import numbers
@@ -26,6 +25,7 @@ import collections
 import warnings
 import traceback
 
+from paddle.utils import try_import
 from . import functional as F
 
 if sys.version_info < (3, 3):
@@ -214,7 +214,16 @@ class Resize(object):
             smaller edge of the image will be matched to this number.
             i.e, if height > width, then image will be rescaled to
             (size * height / width, size)
-        interpolation (int): Interpolation mode of resize. Default: cv2.INTER_LINEAR.
+        interpolation (int, optional): Interpolation mode of resize. Default: 1.
+            0 : cv2.INTER_NEAREST 
+            1 : cv2.INTER_LINEAR 
+            2 : cv2.INTER_CUBIC 
+            3 : cv2.INTER_AREA 
+            4 : cv2.INTER_LANCZOS4 
+            5 : cv2.INTER_LINEAR_EXACT
+            7 : cv2.INTER_MAX 
+            8 : cv2.WARP_FILL_OUTLIERS 
+            16: cv2.WARP_INVERSE_MAP 
 
     Examples:
     
@@ -232,7 +241,7 @@ class Resize(object):
             print(fake_img.shape)
     """
 
-    def __init__(self, size, interpolation=cv2.INTER_LINEAR):
+    def __init__(self, size, interpolation=1):
         assert isinstance(size, int) or (isinstance(size, Iterable) and
                                          len(size) == 2)
         self.size = size
@@ -252,6 +261,16 @@ class RandomResizedCrop(object):
         output_size (int|list|tuple): Target size of output image, with (height, width) shape.
         scale (list|tuple): Range of size of the origin size cropped. Default: (0.08, 1.0)
         ratio (list|tuple): Range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33)
+        interpolation (int, optional): Interpolation mode of resize. Default: 1.
+            0 : cv2.INTER_NEAREST 
+            1 : cv2.INTER_LINEAR 
+            2 : cv2.INTER_CUBIC 
+            3 : cv2.INTER_AREA 
+            4 : cv2.INTER_LANCZOS4 
+            5 : cv2.INTER_LINEAR_EXACT
+            7 : cv2.INTER_MAX 
+            8 : cv2.WARP_FILL_OUTLIERS 
+            16: cv2.WARP_INVERSE_MAP 
 
     Examples:
     
@@ -273,7 +292,7 @@ class RandomResizedCrop(object):
                  output_size,
                  scale=(0.08, 1.0),
                  ratio=(3. / 4, 4. / 3),
-                 interpolation=cv2.INTER_LINEAR):
+                 interpolation=1):
         if isinstance(output_size, int):
             self.output_size = (output_size, output_size)
         else:
@@ -328,7 +347,16 @@ class CenterCropResize(object):
     Args:
         size (int|list|tuple): Target size of output image, with (height, width) shape.
         crop_padding (int): Center crop with the padding. Default: 32.
-        interpolation (int): Interpolation mode of resize. Default: cv2.INTER_LINEAR.
+        interpolation (int, optional): Interpolation mode of resize. Default: 1.
+            0 : cv2.INTER_NEAREST 
+            1 : cv2.INTER_LINEAR 
+            2 : cv2.INTER_CUBIC 
+            3 : cv2.INTER_AREA 
+            4 : cv2.INTER_LANCZOS4 
+            5 : cv2.INTER_LINEAR_EXACT
+            7 : cv2.INTER_MAX 
+            8 : cv2.WARP_FILL_OUTLIERS 
+            16: cv2.WARP_INVERSE_MAP 
 
     Examples:
     
@@ -346,7 +374,7 @@ class CenterCropResize(object):
             print(fake_img.shape)
     """
 
-    def __init__(self, size, crop_padding=32, interpolation=cv2.INTER_LINEAR):
+    def __init__(self, size, crop_padding=32, interpolation=1):
         if isinstance(size, int):
             self.size = (size, size)
         else:
@@ -661,6 +689,7 @@ class ContrastTransform(object):
         if self.value == 0:
             return img
 
+        cv2 = try_import('cv2')
         dtype = img.dtype
         img = img.astype(np.float32)
         alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value)
@@ -701,6 +730,8 @@ class SaturationTransform(object):
         if self.value == 0:
             return img
 
+        cv2 = try_import('cv2')
+
         dtype = img.dtype
         img = img.astype(np.float32)
         alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value)
@@ -742,6 +773,7 @@ class HueTransform(object):
         if self.value == 0:
             return img
 
+        cv2 = try_import('cv2')
         dtype = img.dtype
         img = img.astype(np.uint8)
         hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV_FULL)
@@ -1036,7 +1068,16 @@ class RandomRotate(object):
         degrees (sequence or float or int): Range of degrees to select from.
             If degrees is a number instead of sequence like (min, max), the range of degrees
             will be (-degrees, +degrees) clockwise order.
-        interpolation (int|optional): Interpolation mode of resize. Default: cv2.INTER_LINEAR.
+        interpolation (int, optional): Interpolation mode of resize. Default: 1.
+            0 : cv2.INTER_NEAREST 
+            1 : cv2.INTER_LINEAR 
+            2 : cv2.INTER_CUBIC 
+            3 : cv2.INTER_AREA 
+            4 : cv2.INTER_LANCZOS4 
+            5 : cv2.INTER_LINEAR_EXACT
+            7 : cv2.INTER_MAX 
+            8 : cv2.WARP_FILL_OUTLIERS 
+            16: cv2.WARP_INVERSE_MAP 
         expand (bool|optional): Optional expansion flag. Default: False.
             If true, expands the output to make it large enough to hold the entire rotated image.
             If false or omitted, make the output image the same size as the input image.
@@ -1061,11 +1102,7 @@ class RandomRotate(object):
             print(fake_img.shape)
     """
 
-    def __init__(self,
-                 degrees,
-                 interpolation=cv2.INTER_LINEAR,
-                 expand=False,
-                 center=None):
+    def __init__(self, degrees, interpolation=1, expand=False, center=None):
         if isinstance(degrees, numbers.Number):
             if degrees < 0:
                 raise ValueError(
diff --git a/python/requirements.txt b/python/requirements.txt
index 47888424755..6a88d61a94c 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -1,4 +1,3 @@
-opencv-python<=4.2.0.32
 requests>=2.20.0
 numpy>=1.13, <=1.16.4 ; python_version<"3.5"
 numpy>=1.13 ; python_version>="3.5"
diff --git a/python/setup.py.in b/python/setup.py.in
index 77316640034..d85a23a5edd 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -237,9 +237,6 @@ if sys.version_info >= (3,7):
         setup_requires_tmp+=[setup_requires_i]
     setup_requires = setup_requires_tmp
 
-if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
-    setup_requires+=['opencv-python']
-
 # the prefix is sys.prefix which should always be usr
 paddle_bins = ''
 
-- 
GitLab


From 7e6dfcf9b22e650b9c9631c4fc6ec329d59e54b4 Mon Sep 17 00:00:00 2001
From: haozech <chenhaoze94@gmail.com>
Date: Fri, 18 Sep 2020 12:30:11 +0800
Subject: [PATCH 136/261] Add 3 pass version check (#27283)

---
 .../framework/ir/conv_elementwise_add2_act_fuse_pass.cc  | 9 ++++++++-
 .../framework/ir/conv_elementwise_add_act_fuse_pass.cc   | 8 ++++++++
 .../fluid/framework/ir/conv_elementwise_add_fuse_pass.cc | 9 +++++++--
 .../test_conv_elementwise_add2_act_fuse_pass.py          | 4 ++++
 .../inference/test_conv_elementwise_add_act_fuse_pass.py | 4 ++++
 .../ir/inference/test_conv_elementwise_add_fuse_pass.py  | 3 +++
 6 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
index 2627da7dc40..ad6af69ae02 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
@@ -11,9 +11,9 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
 #include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h"
 #include <string>
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -116,3 +116,10 @@ void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(conv_elementwise_add2_act_fuse_pass,
               paddle::framework::ir::ConvElementwiseAdd2ActFusePass);
+REGISTER_PASS_CAPABILITY(conv_elementwise_add2_act_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("relu", 0)
+            .EQ("identity", 0));
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
index 0b454a0407e..c5fa47ec55f 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h"
 #include <string>
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -102,3 +103,10 @@ void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(conv_elementwise_add_act_fuse_pass,
               paddle::framework::ir::ConvElementwiseAddActFusePass);
+REGISTER_PASS_CAPABILITY(conv_elementwise_add_act_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("relu", 0)
+            .EQ("identity", 0));
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
index 007770cf57d..38c0b773dde 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <string>
-
 #include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h"
+#include <string>
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -89,3 +89,8 @@ void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(conv_elementwise_add_fuse_pass,
               paddle::framework::ir::ConvElementwiseAddFusePass);
+REGISTER_PASS_CAPABILITY(conv_elementwise_add_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0));
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py
index d6dbd397b90..6907b6a7eb5 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py
@@ -19,6 +19,7 @@ import numpy as np
 from inference_pass_test import InferencePassTest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
 from paddle.fluid.core import AnalysisConfig
 """Test for fusion of conv, elementwise_add and 2 act."""
 
@@ -46,6 +47,9 @@ class ConvElementwiseAdd2ActFusePassTest(InferencePassTest):
         if core.is_compiled_with_cuda():
             use_gpu = True
             self.check_output_with_option(use_gpu)
+        self.assertTrue(
+            PassVersionChecker.IsCompatible(
+                'conv_elementwise_add2_act_fuse_pass'))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py
index 2e9035420d7..6ff60aa6deb 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py
@@ -19,6 +19,7 @@ import numpy as np
 from inference_pass_test import InferencePassTest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
 from paddle.fluid.core import AnalysisConfig
 """Test for fusion of conv, elementwise_add and act."""
 
@@ -48,6 +49,9 @@ class ConvElementwiseAddActFusePassTest(InferencePassTest):
         if core.is_compiled_with_cuda():
             use_gpu = True
             self.check_output_with_option(use_gpu)
+        self.assertTrue(
+            PassVersionChecker.IsCompatible(
+                'conv_elementwise_add_act_fuse_pass'))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py
index 7c4e0d6e76e..96b046edaec 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py
@@ -19,6 +19,7 @@ import numpy as np
 from inference_pass_test import InferencePassTest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
 from paddle.fluid.core import AnalysisConfig
 """Test for fusion of conv and elementwise_add."""
 
@@ -44,6 +45,8 @@ class ConvElementwiseAddFusePassTest(InferencePassTest):
         if core.is_compiled_with_cuda():
             use_gpu = True
             self.check_output_with_option(use_gpu)
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_elementwise_add_fuse_pass'))
 
 
 if __name__ == "__main__":
-- 
GitLab


From fd7ab4e63c5960c0ba6a9c0ce0d00478cbc78c7f Mon Sep 17 00:00:00 2001
From: Pei Yang <peiyang@baidu.com>
Date: Fri, 18 Sep 2020 13:39:33 +0800
Subject: [PATCH 137/261] register pass compatibility  (#27357)

* pass compatibility

* add compatibility registry

* add unittests for different padding

* add assert

* drop errmsg
---
 .../ir/conv_affine_channel_fuse_pass.cc       |  12 +
 .../fluid/framework/ir/conv_bn_fuse_pass.cc   |  12 +
 .../ir/repeated_fc_relu_fuse_pass.cc          |  10 +
 .../ir/shuffle_channel_detect_pass.cc         |   8 +
 .../test_conv_affine_channel_fuse_pass.py     | 228 ++++++++++++++++++
 .../ir/inference/test_conv_bn_fuse_pass.py    | 177 ++++++++++++++
 .../test_repeated_fc_relu_fuse_pass.py        |  94 ++++++++
 .../test_trt_shuffle_channel_detect_pass.py   |  51 ++++
 8 files changed, 592 insertions(+)
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_repeated_fc_relu_fuse_pass.py
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py

diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
index b50b4f37cae..fd8b55a6b7d 100644
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
@@ -18,6 +18,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -225,3 +226,14 @@ REGISTER_PASS(conv_affine_channel_fuse_pass,
               paddle::framework::ir::ConvAffineChannelFusePass);
 REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass,
               paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass);
+REGISTER_PASS_CAPABILITY(conv_affine_channel_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("affine_channel", 0));
+REGISTER_PASS_CAPABILITY(conv_eltwiseadd_affine_channel_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("affine_channel", 0));
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
index 9d3e0806ac7..fb787e08814 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
@@ -18,6 +18,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -372,3 +373,14 @@ REGISTER_PASS(depthwise_conv_bn_fuse_pass,
               paddle::framework::ir::DepthwiseConvBNFusePass);
 REGISTER_PASS(depthwise_conv_eltwiseadd_bn_fuse_pass,
               paddle::framework::ir::DepthwiseConvEltwiseAddBNFusePass);
+REGISTER_PASS_CAPABILITY(conv_bn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("batch_norm", 0));
+REGISTER_PASS_CAPABILITY(conv_eltwiseadd_bn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("batch_norm", 0));
diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
index 2396a7f3c4f..23f794c11c2 100644
--- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 #define MAX_NUM_FC 10
 
@@ -174,6 +175,10 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern,
             if (x->outputs.size() <= 0 || x->inputs.size() <= 0U) {
               return false;
             }
+            if (x->IsVar() && x->Var() && x->Var()->GetShape().size() > 2) {
+              LOG(WARNING) << "repeated fc relu only supports input dims = 2";
+              return false;
+            }
             int fc_idx = FindFCIdx(x);
             if (fc_idx < 0) {
               return false;
@@ -384,3 +389,8 @@ void RepeatedFCReluFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(repeated_fc_relu_fuse_pass,
               paddle::framework::ir::RepeatedFCReluFusePass);
+REGISTER_PASS_CAPABILITY(repeated_fc_relu_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("fc", 0)
+            .EQ("relu", 0));
diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
index d9a65e71592..74ba0093a17 100644
--- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
+++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
@@ -16,6 +16,7 @@
 
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -34,6 +35,8 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const {
   const std::string pattern_name = "shufflechannel_pattern";
   FusePassBase::Init(pattern_name, graph);
 
+  LOG(WARNING) << "There is fluid.layers.shuffle_channel API already, you can "
+                  "use it instead of (reshape + transpose +reshape)";
   GraphPatternDetector gpd;
   auto* x = gpd.mutable_pattern()
                 ->NewNode("x")
@@ -93,3 +96,8 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(shuffle_channel_detect_pass,
               paddle::framework::ir::ShuffleChannelDetectPass);
+REGISTER_PASS_CAPABILITY(shuffle_channel_detect_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("reshape2", 0)
+            .EQ("transpose2", 0));
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py
new file mode 100644
index 00000000000..ec0bd52e926
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py
@@ -0,0 +1,228 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+
+
+class ConvAffineChannelFusePassExplicitPaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=3,
+                padding=[1, 1, 1, 1],
+                bias_attr=False,
+                act=None)
+            input_scale = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            input_bias = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            ac_out = fluid.layers.affine_channel(
+                x=conv_out, scale=input_scale, bias=input_bias)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [ac_out]
+
+    def test_check_output(self):
+        self.check_output()
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_affine_channel_fuse_pass'))
+
+
+class ConvAffineChannelFusePassValidPaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=3,
+                padding='VALID',
+                bias_attr=False,
+                act=None)
+            input_scale = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            input_bias = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            ac_out = fluid.layers.affine_channel(
+                x=conv_out, scale=input_scale, bias=input_bias)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [ac_out]
+
+    def test_check_output(self):
+        self.check_output()
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_affine_channel_fuse_pass'))
+
+
+class ConvAffineChannelFusePassSamePaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=3,
+                padding='SAME',
+                bias_attr=False,
+                act=None)
+            input_scale = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            input_bias = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            ac_out = fluid.layers.affine_channel(
+                x=conv_out, scale=input_scale, bias=input_bias)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [ac_out]
+
+    def test_check_output(self):
+        self.check_output()
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_affine_channel_fuse_pass'))
+
+
+class ConvEltwiseAddAffineChannelFusePassExplicitPaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=3,
+                padding=[1, 1, 1, 1],
+                bias_attr=param_attr,
+                act=None)
+            input_scale = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            input_bias = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            ac_out = fluid.layers.affine_channel(
+                x=conv_out, scale=input_scale, bias=input_bias)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [ac_out]
+
+    def test_check_output(self):
+        self.check_output()
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible(
+                'conv_eltwiseadd_affine_channel_fuse_pass'))
+
+
+class ConvEltwiseAddAffineChannelFusePassValidPaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=3,
+                padding='VALID',
+                bias_attr=param_attr,
+                act=None)
+            input_scale = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            input_bias = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            ac_out = fluid.layers.affine_channel(
+                x=conv_out, scale=input_scale, bias=input_bias)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [ac_out]
+
+    def test_check_output(self):
+        self.check_output()
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible(
+                'conv_eltwiseadd_affine_channel_fuse_pass'))
+
+
+class ConvEltwiseAddAffineChannelFusePassSamePaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=3,
+                padding='Same',
+                bias_attr=param_attr,
+                act=None)
+            input_scale = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            input_bias = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            ac_out = fluid.layers.affine_channel(
+                x=conv_out, scale=input_scale, bias=input_bias)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [ac_out]
+
+    def test_check_output(self):
+        self.check_output()
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible(
+                'conv_eltwiseadd_affine_channel_fuse_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py
new file mode 100644
index 00000000000..ffe177e59d8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py
@@ -0,0 +1,177 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+
+
+class ConvBnFusePassExplicitPaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=6,
+                filter_size=6,
+                groups=3,
+                padding=[1, 1, 1, 1],
+                bias_attr=False,
+                act=None)
+            bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [bn_out]
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(PassVersionChecker.IsCompatible('conv_bn_fuse_pass'))
+
+
+class ConvBnFusePassValidPaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=6,
+                filter_size=6,
+                groups=3,
+                padding='VALID',
+                bias_attr=False,
+                act=None)
+            bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [bn_out]
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(PassVersionChecker.IsCompatible('conv_bn_fuse_pass'))
+
+
+class ConvBnFusePassSamePaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=6,
+                filter_size=6,
+                groups=3,
+                padding='SAME',
+                bias_attr=False,
+                act=None)
+            bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [bn_out]
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(PassVersionChecker.IsCompatible('conv_bn_fuse_pass'))
+
+
+class ConvEltwiseAddBnFuseExplicitPaddingPass(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=6,
+                filter_size=6,
+                groups=3,
+                padding=[1, 1, 1, 1],
+                bias_attr=None,
+                act=None)
+            bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [bn_out]
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_eltwiseadd_bn_fuse_pass'))
+
+
+class ConvEltwiseAddBnFuseValidPaddingPass(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=6,
+                filter_size=6,
+                groups=3,
+                padding='VALID',
+                bias_attr=None,
+                act=None)
+            bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [bn_out]
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_eltwiseadd_bn_fuse_pass'))
+
+
+class ConvEltwiseAddBnFuseSamePaddingPass(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=6,
+                filter_size=6,
+                groups=3,
+                padding='SAME',
+                bias_attr=None,
+                act=None)
+            bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [bn_out]
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_eltwiseadd_bn_fuse_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_repeated_fc_relu_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_repeated_fc_relu_fuse_pass.py
new file mode 100644
index 00000000000..c78884480da
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_repeated_fc_relu_fuse_pass.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+
+
+class RepeatedFcReluFusePass3Test(InferencePassTest):
+    def setUp(self):
+        fc_num = 3
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                bias_attr=param_attr,
+                act=None)
+            fc_outs = []
+            fc_outs.append(
+                fluid.layers.fc(input=[conv_out], act="relu", size=1000))
+            for i in range(1, fc_num):
+                fc_outs.append(
+                    fluid.layers.fc(
+                        input=[fc_outs[i - 1]], act="relu", size=1000))
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [fc_outs[fc_num - 1]]
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('repeated_fc_relu_fuse_pass'))
+
+
+class RepeatedFcReluFusePass9Test(InferencePassTest):
+    def setUp(self):
+        fc_num = 9
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                bias_attr=param_attr,
+                act=None)
+            fc_outs = []
+            fc_outs.append(
+                fluid.layers.fc(input=[conv_out], act="relu", size=1000))
+            for i in range(1, fc_num):
+                fc_outs.append(
+                    fluid.layers.fc(
+                        input=[fc_outs[i - 1]], act="relu", size=1000))
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [fc_outs[fc_num - 1]]
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('repeated_fc_relu_fuse_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py
new file mode 100644
index 00000000000..e9c304496af
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+from paddle.fluid.core import AnalysisConfig
+
+
+class ShuffleChannelFuseTRTPassTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 6, 64, 64], dtype="float32")
+            reshape1 = fluid.layers.reshape(x=data, shape=[-1, 2, 3, 64, 64])
+            trans = fluid.layers.transpose(x=reshape1, perm=[0, 2, 1, 3, 4])
+            reshape2 = fluid.layers.reshape(x=trans, shape=[-1, 6, 64, 64])
+            out = fluid.layers.batch_norm(reshape2, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([1, 6, 64, 64]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = ShuffleChannelFuseTRTPassTest.TensorRTParam(
+            1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+
+    def test_check_output(self):
+
+        self.check_output()
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('shuffle_channel_detect_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab


From 9fdcfe89819ec3f0ba13a4fb3126a836a4b36a37 Mon Sep 17 00:00:00 2001
From: guofei <52460041+gfwm2013@users.noreply.github.com>
Date: Fri, 18 Sep 2020 15:45:59 +0800
Subject: [PATCH 138/261] Support python3.8 (#26850)

* Support python3.8

test=notest
---
 paddle/scripts/paddle_build.sh | 45 +++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 3de577d847d..ac89116fc49 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -121,6 +121,18 @@ function cmake_base() {
             else
                 exit 1
             fi
+        elif [ "$1" == "cp38-cp38" ]; then
+            if [ -d "/Library/Frameworks/Python.framework/Versions/3.8" ]; then
+                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/
+                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/
+                export PATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/:${PATH}
+                PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/python3
+            -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.8/include/python3.8/
+            -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/libpython3.8.dylib"
+                pip3.8 install --user -r ${PADDLE_ROOT}/python/requirements.txt
+            else
+                exit 1
+            fi
         fi
         # delete `gym` to avoid modifying requirements.txt in *.whl
         sed -i .bak "/^gym$/d" ${PADDLE_ROOT}/python/requirements.txt
@@ -176,6 +188,13 @@ function cmake_base() {
             -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m
             -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so"
                 pip3.7 install -r ${PADDLE_ROOT}/python/requirements.txt
+            elif [ "$1" == "cp38-cp38" ]; then
+                export LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH}
+                export PATH=/opt/_internal/cpython-3.8.0/bin/:${PATH}
+                export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.8.0/bin/python3.8
+            -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.8.0/include/python3.8
+            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.8.0/lib/libpython3.so"
+                pip3.8 install -r ${PADDLE_ROOT}/python/requirements.txt
            fi
         else
             pip install -r ${PADDLE_ROOT}/python/requirements.txt
@@ -514,6 +533,8 @@ EOF
             pip3.6 uninstall -y paddlepaddle
         elif [ "$1" == "cp37-cp37m" ]; then
             pip3.7 uninstall -y paddlepaddle
+        elif [ "$1" == "cp38-cp38" ]; then
+            pip3.8 uninstall -y paddlepaddle
         fi
         set -ex
 
@@ -527,6 +548,8 @@ EOF
             pip3.6 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
         elif [ "$1" == "cp37-cp37m" ]; then
             pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+        elif [ "$1" == "cp38-cp38" ]; then
+            pip3.8 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
         fi
         tmpfile_rand=`date +%s%N`
         tmpfile=$tmp_dir/$tmpfile_rand
@@ -666,7 +689,7 @@ function generate_api_spec() {
 
     awk -F '(' '{print $NF}' $spec_path >${spec_path}.doc
     awk -F '(' '{$NF="";print $0}' $spec_path >${spec_path}.api
-    if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then 
+    if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ] || [ "$1" == "cp38-cp38" ]; then
         # Use sed to make python2 and python3 sepc keeps the same
         sed -i 's/arg0: str/arg0: unicode/g' $spec_path
         sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" $spec_path
@@ -1244,21 +1267,25 @@ EOF
     ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl
     ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl
     ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl
+    ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl
 
     ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl
     ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl
     ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl
     ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl
+    ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl
 
     if [[ ${PADDLE_BRANCH} != "0.0.0" && ${WITH_MKL} == "ON" && ${WITH_GPU} == "ON" ]]; then
         ref_paddle2=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl
         ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl
         ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl
         ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl
+        ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl
         ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl
         ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl
         ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl
         ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl
+        ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl
     fi
 
     #ref_paddle2_mv1=""
@@ -1363,6 +1390,22 @@ EOF
         apt-get clean -y && \
         rm -f ${ref_paddle37} && \
         ldconfig
+EOF
+    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+    # run paddle version to install python packages first
+    RUN apt-get update && ${NCCL_DEPS}
+    RUN apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
+        libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \
+        xz-utils tk-dev libffi-dev liblzma-dev
+    RUN wget -q https://www.python.org/ftp/python/3.8.0/Python-3.8.0.tgz && \
+        tar -xzf Python-3.8.0.tgz && cd Python-3.8.0 && \
+        CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
+        make -j8 > /dev/null && make altinstall > /dev/null && cd ../ && rm Python-3.8.0.tgz
+    RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && ldconfig && \
+        pip3.8 install opencv-python && wget ${ref_web}/${ref_paddle38} && pip3.8 install ${ref_paddle38_whl}; apt-get install -f -y && \
+        apt-get clean -y && \
+        rm -f ${ref_paddle38} && \
+        ldconfig
 EOF
     cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
     # run paddle version to install python packages first
-- 
GitLab


From d726fd5e862332abef1fae8ab0573b08ffc4cad7 Mon Sep 17 00:00:00 2001
From: yaoxuefeng <yaoxuefeng@baidu.com>
Date: Fri, 18 Sep 2020 16:18:00 +0800
Subject: [PATCH 139/261] enhance dataset err msg (#27363)

---
 paddle/fluid/framework/data_feed.cc | 70 ++++++++++++++++++++++++-----
 1 file changed, 58 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
index 96d54ec8691..aec27bd9d91 100644
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@@ -527,6 +527,8 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
         VLOG(0) << "error: the number of ids is a negative number: " << num;
         VLOG(0) << "please check line<" << instance_cout << "> in file<"
                 << filename << ">";
+        VLOG(0) << "Error occured when parsing " << i
+                << " th slot with total slots number: " << all_slots_.size();
         return false;
       } else if (num == 0) {
         VLOG(0)
@@ -536,42 +538,66 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
                "characters.";
         VLOG(0) << "please check line<" << instance_cout << "> in file<"
                 << filename << ">";
+        VLOG(0) << "Error occured when parsing " << i
+                << " th slot with total slots number: " << all_slots_.size();
         return false;
       } else if (errno == ERANGE || num > INT_MAX) {
         VLOG(0) << "error: the number of ids greater than INT_MAX";
         VLOG(0) << "please check line<" << instance_cout << "> in file<"
                 << filename << ">";
+        VLOG(0) << "Error occured when parsing " << i
+                << " th slot with total slots number: " << all_slots_.size();
         return false;
       }
       if (all_slots_type_[i] == "float") {
-        for (int i = 0; i < num; ++i) {
+        for (int j = 0; j < num; ++j) {
           strtof(endptr, &endptr);
           if (errno == ERANGE) {
             VLOG(0) << "error: the value is out of the range of "
                        "representable values for float";
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
+            VLOG(0) << "Error occured when parsing " << i
+                    << " th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << " th id with total id number: " << num;
             return false;
           }
-          if (i + 1 != num && endptr - str == len) {
+          if (j + 1 != num && endptr - str == len) {
             VLOG(0) << "error: there is a wrong with the number of ids.";
+            VLOG(0) << "Error occured when parsing " << i
+                    << " th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << " th id with total id number: " << num;
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
             return false;
           }
         }
       } else if (all_slots_type_[i] == "uint64") {
-        for (int i = 0; i < num; ++i) {
+        for (int j = 0; j < num; ++j) {
           strtoull(endptr, &endptr, 10);
           if (errno == ERANGE) {
             VLOG(0) << "error: the value is out of the range of "
                        "representable values for uint64_t";
+            VLOG(0) << "Error occured when parsing " << i
+                    << " th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << " th id with total id number: " << num;
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
             return false;
           }
-          if (i + 1 != num && endptr - str == len) {
+          if (j + 1 != num && endptr - str == len) {
             VLOG(0) << "error: there is a wrong with the number of ids.";
+            VLOG(0) << "Error occured when parsing " << i
+                    << " th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << " th id with total id number: " << num;
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
             return false;
@@ -632,8 +658,13 @@ bool MultiSlotDataFeed::ParseOneInstanceFromPipe(
               "The number of ids can not be zero, you need padding "
               "it in data generator; or if there is something wrong with "
               "the data, please check if the data contains unresolvable "
-              "characters.\nplease check this error line: %s",
-              str));
+              "characters.\nplease check this error line: %s, \n Specifically, "
+              "something wrong happened(the length of this slot's feasign is 0)"
+              "when we parse the %d th slots."
+              "Maybe something wrong around this slot",
+              "\nWe detect the feasign number of this slot is %d, "
+              "which is illegal.",
+              str, i, num));
       if (idx != -1) {
         (*instance)[idx].Init(all_slots_type_[i]);
         if ((*instance)[idx].GetType()[0] == 'f') {  // float
@@ -683,8 +714,13 @@ bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>* instance) {
               "The number of ids can not be zero, you need padding "
               "it in data generator; or if there is something wrong with "
               "the data, please check if the data contains unresolvable "
-              "characters.\nplease check this error line: %s.",
-              str));
+              "characters.\nplease check this error line: %s, \n Specifically, "
+              "something wrong happened(the length of this slot's feasign is 0)"
+              "when we parse the %d th slots."
+              "Maybe something wrong around this slot",
+              "\nWe detect the feasign number of this slot is %d, "
+              "which is illegal.",
+              str, i, num));
 
       if (idx != -1) {
         (*instance)[idx].Init(all_slots_type_[i]);
@@ -916,8 +952,13 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
               "The number of ids can not be zero, you need padding "
               "it in data generator; or if there is something wrong with "
               "the data, please check if the data contains unresolvable "
-              "characters.\nplease check this error line: %s.",
-              str));
+              "characters.\nplease check this error line: %s, \n Specifically, "
+              "something wrong happened(the length of this slot's feasign is 0)"
+              "when we parse the %d th slots."
+              "Maybe something wrong around this slot",
+              "\nWe detect the feasign number of this slot is %d, "
+              "which is illegal.",
+              str, i, num));
       if (idx != -1) {
         if (all_slots_type_[i][0] == 'f') {  // float
           for (int j = 0; j < num; ++j) {
@@ -982,8 +1023,13 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstance(Record* instance) {
               "The number of ids can not be zero, you need padding "
               "it in data generator; or if there is something wrong with "
               "the data, please check if the data contains unresolvable "
-              "characters.\nplease check this error line: %s.",
-              str));
+              "characters.\nplease check this error line: %s, \n Specifically, "
+              "something wrong happened(the length of this slot's feasign is 0)"
+              "when we parse the %d th slots."
+              "Maybe something wrong around this slot",
+              "\nWe detect the feasign number of this slot is %d, "
+              "which is illegal.",
+              str, i, num));
 
       if (idx != -1) {
         if (all_slots_type_[i][0] == 'f') {  // float
-- 
GitLab


From 4c5cfdea1b4fee23c0d3dfdcd93d2e0731790727 Mon Sep 17 00:00:00 2001
From: liu zhengxi <380185688@qq.com>
Date: Fri, 18 Sep 2020 16:18:49 +0800
Subject: [PATCH 140/261] fix paddle.nn.Transformer api (#27391)

---
 .../tests/unittests/test_transformer_api.py   | 135 ++++++++++++++++++
 python/paddle/nn/layer/transformer.py         | 102 ++++++++++---
 2 files changed, 217 insertions(+), 20 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py
index 5fea9f69a18..bd76edc9d8c 100644
--- a/python/paddle/fluid/tests/unittests/test_transformer_api.py
+++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py
@@ -474,6 +474,141 @@ class TestTransformer(unittest.TestCase):
             trans_output = transformer(src, tgt, src_mask, tgt_mask,
                                        memory_mask)
 
+    def test_transformer_attr_1(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+
+        # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            transformer = Transformer(
+                d_model,
+                n_head,
+                dim_feedforward=dim_feedforward,
+                dropout=dropout,
+                weight_attr=[None],
+                bias_attr=[False])
+            src = paddle.to_variable(
+                np.random.rand(batch_size, source_length, d_model).astype(
+                    "float32"))
+            tgt = paddle.to_variable(
+                np.random.rand(batch_size, target_length, d_model).astype(
+                    "float32"))
+            src_mask = np.zeros((batch_size, n_head, source_length,
+                                 source_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+            src_mask = paddle.to_variable(src_mask)
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            tgt_mask, memory_mask = paddle.to_variable(
+                tgt_mask), paddle.to_variable(memory_mask)
+            trans_output = transformer(src, tgt, src_mask, tgt_mask,
+                                       memory_mask)
+
+    def test_transformer_attr_2(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+
+        # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            transformer = Transformer(
+                d_model,
+                n_head,
+                dim_feedforward=dim_feedforward,
+                dropout=dropout,
+                weight_attr=[None, None],
+                bias_attr=[False, False])
+            src = paddle.to_variable(
+                np.random.rand(batch_size, source_length, d_model).astype(
+                    "float32"))
+            tgt = paddle.to_variable(
+                np.random.rand(batch_size, target_length, d_model).astype(
+                    "float32"))
+            src_mask = np.zeros((batch_size, n_head, source_length,
+                                 source_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+            src_mask = paddle.to_variable(src_mask)
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            tgt_mask, memory_mask = paddle.to_variable(
+                tgt_mask), paddle.to_variable(memory_mask)
+            trans_output = transformer(src, tgt, src_mask, tgt_mask,
+                                       memory_mask)
+
+    def test_transformer_attr_3(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+
+        # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            transformer = Transformer(
+                d_model,
+                n_head,
+                dim_feedforward=dim_feedforward,
+                dropout=dropout,
+                weight_attr=[None, None, None],
+                bias_attr=[False, False, True])
+            src = paddle.to_variable(
+                np.random.rand(batch_size, source_length, d_model).astype(
+                    "float32"))
+            tgt = paddle.to_variable(
+                np.random.rand(batch_size, target_length, d_model).astype(
+                    "float32"))
+            src_mask = np.zeros((batch_size, n_head, source_length,
+                                 source_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+            src_mask = paddle.to_variable(src_mask)
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            tgt_mask, memory_mask = paddle.to_variable(
+                tgt_mask), paddle.to_variable(memory_mask)
+            trans_output = transformer(src, tgt, src_mask, tgt_mask,
+                                       memory_mask)
+
+    def test_transformer_attr_boolean(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+
+        # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            transformer = Transformer(
+                d_model,
+                n_head,
+                dim_feedforward=dim_feedforward,
+                dropout=dropout,
+                bias_attr=False)
+            src = paddle.to_variable(
+                np.random.rand(batch_size, source_length, d_model).astype(
+                    "float32"))
+            tgt = paddle.to_variable(
+                np.random.rand(batch_size, target_length, d_model).astype(
+                    "float32"))
+            src_mask = np.zeros((batch_size, n_head, source_length,
+                                 source_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+            src_mask = paddle.to_variable(src_mask)
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            tgt_mask, memory_mask = paddle.to_variable(
+                tgt_mask), paddle.to_variable(memory_mask)
+            trans_output = transformer(src, tgt, src_mask, tgt_mask,
+                                       memory_mask)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py
index 63069e83952..4b199d5816c 100644
--- a/python/paddle/nn/layer/transformer.py
+++ b/python/paddle/nn/layer/transformer.py
@@ -53,7 +53,22 @@ def _convert_param_attr_to_list(param_attr, n):
     if isinstance(param_attr, (list, tuple)):
         assert len(param_attr) == n, (
             "length of param_attr should be %d when it is a list/tuple" % n)
-        param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr]
+        param_attrs = []
+        for attr in param_attr:
+            if isinstance(attr, bool):
+                if attr:
+                    param_attrs.append(ParamAttr._to_attr(None))
+                else:
+                    param_attrs.append(False)
+            else:
+                param_attrs.append(ParamAttr._to_attr(attr))
+        # param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr]
+    elif isinstance(param_attr, bool):
+        param_attrs = []
+        if param_attr:
+            param_attrs = [ParamAttr._to_attr(None) for i in range(n)]
+        else:
+            param_attrs = [False] * n
     else:
         param_attrs = []
         attr = ParamAttr._to_attr(param_attr)
@@ -417,7 +432,7 @@ class TransformerEncoderLayer(Layer):
             Otherwise, MHA and FFN both use it as `weight_attr` to create parameters.
             Default: None, which means the default weight parameter property is used.
             See usage for details in :code:`ParamAttr` . 
-        bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property.
+        bias_attr (ParamAttr|tuple|bool, optional): To specify the bias parameter property.
             If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
             MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN.
             Otherwise, MHA and FFN both use it as `bias_attr` to create parameters.
@@ -986,22 +1001,31 @@ class Transformer(Layer):
             Otherwise, no pre-process and post-precess includes dropout, residual
             connection, layer normalization. Default False
         weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property.
-            If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for
-            self attention, `weight_attr[1]` would be used as `weight_attr` for
-            cross attention, and `weight_attr[2]` would be used as `weight_attr`
-            for linear in FFN. Otherwise, the three sub-layers all uses it as
-            `weight_attr` to create parameters. Default: None, which means the
-            default weight parameter property is used. See usage for details
+            If it is a tuple, the length of `weight_attr` could be 1, 2 or 3. If it is 3, 
+            `weight_attr[0]` would be used as `weight_attr` for self attention, `weight_attr[1]` 
+            would be used as `weight_attr` for cross attention of `TransformerDecoder`, 
+            and `weight_attr[2]` would be used as `weight_attr` for linear in FFN. 
+            If it is 2, `weight_attr[0]` would be used as `weight_attr` both for self attention 
+            and cross attntion and `weight_attr[1]` would be used as `weight_attr` for 
+            linear in FFN. If it is 1, `weight_attr[0]` would be used as `weight_attr` 
+            for self attention, cross attention and linear in FFN. Otherwise, 
+            the three sub-layers all uses it as `weight_attr` to create parameters. 
+            Default: None, which means the default weight parameter property is used. 
+            See usage for details
             in :code:`ParamAttr` . 
         bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property.
-            If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
-            self attention, `bias_attr[1]` would be used as `bias_attr` for
-            cross attention, and `bias_attr[2]` would be used as `bias_attr`
-            for linear in FFN. Otherwise, the three sub-layers all uses it as
-            `bias_attr` to create parameters. The `False` value means the
-            corresponding layer would not have trainable bias parameter. See
-            usage for details in :code:`ParamAttr` . Default: None,which means
-            the default bias parameter property is used.
+            If it is a tuple, the length of `bias_attr` could be 1, 2 or 3. If it is 3, 
+            `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]` 
+            would be used as `bias_attr` for cross attention of `TransformerDecoder`, 
+            and `bias_attr[2]` would be used as `bias_attr` for linear in FFN. 
+            If it is 2, `bias_attr[0]` would be used as `bias_attr` both for self attention 
+            and cross attntion and `bias_attr[1]` would be used as `bias_attr` for 
+            linear in FFN. If it is 1, `bias_attr[0]` would be used as `bias_attr` 
+            for self attention, cross attention and linear in FFN. Otherwise, 
+            the three sub-layers all uses it as `bias_attr` to create parameters. 
+            The `False` value means the corresponding layer would not have trainable 
+            bias parameter. See usage for details in :code:`ParamAttr` . 
+            Default: None,which means the default bias parameter property is used.
         custom_encoder (Layer): If custom encoder is provided, use it as the encoder.
             Default None
         custom_decoder (Layer): If custom decoder is provided, use it as the decoder.
@@ -1049,13 +1073,51 @@ class Transformer(Layer):
                  custom_decoder=None):
         super(Transformer, self).__init__()
 
+        if isinstance(bias_attr, (list, tuple)):
+            if len(bias_attr) == 1:
+                encoder_bias_attr = [bias_attr[0]] * 2
+                decoder_bias_attr = [bias_attr[0]] * 3
+            elif len(bias_attr) == 2:
+                encoder_bias_attr = bias_attr
+                decoder_bias_attr = [bias_attr[0], bias_attr[0], bias_attr[-1]]
+            elif len(bias_attr) == 3:
+                encoder_bias_attr = [bias_attr[0], bias_attr[-1]]
+                decoder_bias_attr = bias_attr
+            else:
+                assert False, (
+                    "length of bias_attr should be 1 or 2 or 3 when it is a list/tuple"
+                )
+        else:
+            encoder_bias_attr = bias_attr
+            decoder_bias_attr = bias_attr
+
+        if isinstance(weight_attr, (list, tuple)):
+            if len(weight_attr) == 1:
+                encoder_weight_attr = [weight_attr[0]] * 2
+                decoder_weight_attr = [weight_attr[0]] * 3
+            elif len(weight_attr) == 2:
+                encoder_weight_attr = weight_attr
+                decoder_weight_attr = [
+                    weight_attr[0], weight_attr[0], weight_attr[-1]
+                ]
+            elif len(weight_attr) == 3:
+                encoder_weight_attr = [weight_attr[0], weight_attr[-1]]
+                decoder_weight_attr = weight_attr
+            else:
+                assert False, (
+                    "length of weight_attr should be 1 or 2 or 3 when it is a list/tuple"
+                )
+        else:
+            encoder_weight_attr = weight_attr
+            decoder_weight_attr = weight_attr
+
         if custom_encoder is not None:
             self.encoder = custom_encoder
         else:
             encoder_layer = TransformerEncoderLayer(
                 d_model, nhead, dim_feedforward, dropout, activation,
-                attn_dropout, act_dropout, normalize_before, weight_attr,
-                bias_attr)
+                attn_dropout, act_dropout, normalize_before,
+                encoder_weight_attr, encoder_bias_attr)
             encoder_norm = LayerNorm(d_model)
             self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
                                               encoder_norm)
@@ -1065,8 +1127,8 @@ class Transformer(Layer):
         else:
             decoder_layer = TransformerDecoderLayer(
                 d_model, nhead, dim_feedforward, dropout, activation,
-                attn_dropout, act_dropout, normalize_before, weight_attr,
-                bias_attr)
+                attn_dropout, act_dropout, normalize_before,
+                decoder_weight_attr, decoder_bias_attr)
             decoder_norm = LayerNorm(d_model)
             self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers,
                                               decoder_norm)
-- 
GitLab


From a5ef246cac5012506861c8d54230c5024f891fc8 Mon Sep 17 00:00:00 2001
From: Pei Yang <peiyang@baidu.com>
Date: Fri, 18 Sep 2020 16:33:24 +0800
Subject: [PATCH 141/261] Optimize emb_eltwise_layernorm_plugin and support
 fp16 (#27128)

---
 cmake/cuda.cmake                              |   3 +
 .../tensorrt/convert/emb_eltwise_layernorm.cc |   6 +-
 .../plugin/emb_eltwise_layernorm_plugin.cu    | 214 +++++++++++-------
 .../plugin/emb_eltwise_layernorm_plugin.h     | 178 ++++++++++++---
 ...rt_dynamic_shape_ernie_deserialize_test.cc |  10 +-
 5 files changed, 287 insertions(+), 124 deletions(-)

diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 7a94bda0f5f..c78fe5f6c7f 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -107,6 +107,9 @@ function(select_nvcc_arch_flags out_variable)
   elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
     set(cuda_arch_bin "50")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
+      add_definitions("-DSUPPORTS_CUDA_FP16")
+    endif()
     set(cuda_arch_bin "60 61")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
     if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
index cdc0e415d46..9fff558c583 100644
--- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
@@ -80,10 +80,10 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
     nvinfer1::ILayer* layer = nullptr;
 
     if (engine_->with_dynamic_shape()) {
-      plugin::DynamicPluginTensorRT* plugin = nullptr;
-      plugin = new plugin::EmbEltwiseLayernormPluginDynamic<float>(
+      auto use_fp16 = engine_->WithFp16();
+      auto plugin = new plugin::EmbEltwiseLayernormPluginDynamic(
           input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
-          eps);
+          eps, use_fp16);
       layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin);
     } else {
       PADDLE_THROW(platform::errors::Fatal(
diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
index 5e43be90de3..873631fea61 100644
--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
@@ -32,13 +32,34 @@ namespace plugin {
 #if IS_TRT_VERSION_GE(6000)
 
 template <typename T>
-int EmbEltwiseLayernormPluginDynamic<T>::initialize() {
+EmbEltwiseLayernormPluginDynamicImpl<
+    T>::~EmbEltwiseLayernormPluginDynamicImpl() {
+  this->terminate();
+}
+
+inline half fp32tofp16(float x) { return static_cast<half>(x); }
+
+template <typename T>
+int EmbEltwiseLayernormPluginDynamicImpl<T>::initialize() {
   embs_gpu_.resize(embs_.size());
   for (int i = 0; i < embs_.size(); i++) {
     if (embs_[i]) {
-      cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]);
-      cudaMemcpy(embs_gpu_[i], embs_[i], emb_sizes_[i] * sizeof(float),
+      T *host_ptr;
+      auto size = emb_sizes_[i];
+
+      if (std::is_same<T, half>::value) {
+        host_ptr = new T[size];
+        std::transform(embs_[i], (embs_[i] + size), host_ptr, fp32tofp16);
+      } else {
+        host_ptr = reinterpret_cast<T *>(embs_[i]);
+      }
+
+      cudaMalloc(&embs_gpu_[i], sizeof(T) * size);
+      cudaMemcpy(embs_gpu_[i], host_ptr, size * sizeof(T),
                  cudaMemcpyHostToDevice);
+      if (std::is_same<T, half>::value) {
+        delete[] host_ptr;
+      }
     }
   }
 
@@ -53,11 +74,105 @@ int EmbEltwiseLayernormPluginDynamic<T>::initialize() {
                cudaMemcpyHostToDevice);
   }
 
+  int input_num = embs_.size();
+  in_ptr_tensor_.Resize({input_num});
+  emb_ptr_tensor_.Resize({input_num});
+
+  cudaGetDevice(&device_id_);
+  auto emb_ptr_gpu_d =
+      emb_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id_));
+  cudaMemcpy(emb_ptr_gpu_d, embs_gpu_.data(), sizeof(uintptr_t) * input_num,
+             cudaMemcpyHostToDevice);
+
   return 0;
 }
 
 template <typename T>
-nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
+void EmbEltwiseLayernormPluginDynamicImpl<T>::terminate() {
+  for (int i = 0; i < embs_gpu_.size(); ++i) {
+    if (embs_gpu_[i]) {
+      cudaFree(embs_gpu_[i]);
+      embs_gpu_[i] = nullptr;
+    }
+  }
+
+  if (bias_gpu_) {
+    cudaFree(bias_gpu_);
+    bias_gpu_ = nullptr;
+  }
+
+  if (scale_gpu_) {
+    cudaFree(scale_gpu_);
+    scale_gpu_ = nullptr;
+  }
+}
+
+template <typename T>
+int EmbEltwiseLayernormPluginDynamicImpl<T>::enqueue(
+    const nvinfer1::PluginTensorDesc *input_desc,
+    const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs,
+    void *const *outputs, void *workspace, cudaStream_t stream) {
+  auto id_dims = input_desc[0].dims;
+  int batch = id_dims.d[0];
+  int seq_len = id_dims.d[1];
+  int input_num = embs_.size();
+
+  auto in_ptr_gpu_d =
+      in_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id_));
+  auto emb_ptr_gpu_d =
+      emb_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id_));
+
+  auto new_input_ptr = reinterpret_cast<uintptr_t>(inputs[0]);
+
+  if (old_input_ptr_ != new_input_ptr) {
+    old_input_ptr_ = new_input_ptr;
+
+    cudaMemcpyAsync(in_ptr_gpu_d, reinterpret_cast<const void *>(inputs),
+                    sizeof(uintptr_t) * input_num, cudaMemcpyHostToDevice,
+                    stream);
+  }
+
+  auto out_type = output_desc[0].type;
+
+  if (std::is_same<T, float>::value) {
+    PADDLE_ENFORCE_EQ(
+        out_type == nvinfer1::DataType::kFLOAT, true,
+        platform::errors::InvalidArgument(
+            "The EmbEltwiseLayernorm Plugin only support fp32 input."));
+  } else if (std::is_same<T, half>::value) {
+    PADDLE_ENFORCE_EQ(
+        out_type == nvinfer1::DataType::kHALF, true,
+        platform::errors::InvalidArgument(
+            "The EmbEltwiseLayernorm Plugin only support fp16 input."));
+  } else {
+    PADDLE_THROW(platform::errors::Fatal(
+        "Unsupport data type, the out type of EmbEltwiseLayernorm should be "
+        "float or half."));
+  }
+
+  auto *output_d = reinterpret_cast<T *>(outputs[0]);
+
+  operators::math::EmbEltwiseLayerNormFunctor<T> emb_eltwise_layernorm_func;
+  emb_eltwise_layernorm_func(batch, seq_len, hidden_size_, in_ptr_gpu_d,
+                             scale_gpu_, bias_gpu_, emb_ptr_gpu_d, output_d,
+                             eps_, input_num, stream);
+  return cudaGetLastError() != cudaSuccess;
+}
+
+template class EmbEltwiseLayernormPluginDynamicImpl<float>;
+#ifdef SUPPORTS_CUDA_FP16
+template class EmbEltwiseLayernormPluginDynamicImpl<half>;
+#endif  // SUPPORTS_CUDA_FP16
+
+int EmbEltwiseLayernormPluginDynamic::initialize() {
+  impl_->initialize();
+
+  return 0;
+}
+
+void EmbEltwiseLayernormPluginDynamic::terminate() { impl_->terminate(); }
+
+nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions(
     int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
     nvinfer1::IExprBuilder &expr_builder) {  // NOLINT
   PADDLE_ENFORCE_EQ(output_index, 0,
@@ -76,18 +191,7 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
   return ret;
 }
 
-template <typename T>
-void EmbEltwiseLayernormPluginDynamic<T>::terminate() {
-  for (auto ptr : embs_gpu_) {
-    if (ptr) cudaFree(ptr);
-  }
-
-  if (bias_gpu_) cudaFree(bias_gpu_);
-  if (scale_gpu_) cudaFree(scale_gpu_);
-}
-
-template <typename T>
-bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
+bool EmbEltwiseLayernormPluginDynamic::supportsFormatCombination(
     int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
     int nb_outputs) {
   PADDLE_ENFORCE_NOT_NULL(
@@ -98,6 +202,11 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
                         "The EmbEltwiseLayerNorm's output should be one"
                         "but it's (%d) outputs.",
                         nb_outputs));
+  PADDLE_ENFORCE_EQ(nb_outputs, 1,
+                    platform::errors::InvalidArgument(
+                        "The EmbEltwiseLayerNorm's output should be one"
+                        "but it's (%d) outputs.",
+                        nb_outputs));
   PADDLE_ENFORCE_LT(
       pos, nb_inputs + nb_outputs,
       platform::errors::InvalidArgument("The pos(%d) should be less than the "
@@ -122,7 +231,7 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
   }
 
   if (pos == all_nums - 1) {
-    if (sizeof(T) == sizeof(float)) {
+    if (with_fp16_ == false) {
       return desc.type == nvinfer1::DataType::kFLOAT;
     } else {
       return desc.type == nvinfer1::DataType::kHALF;
@@ -131,84 +240,27 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
   return false;
 }
 
-template <typename T>
-nvinfer1::DataType EmbEltwiseLayernormPluginDynamic<T>::getOutputDataType(
+nvinfer1::DataType EmbEltwiseLayernormPluginDynamic::getOutputDataType(
     int index, const nvinfer1::DataType *input_types, int nb_inputs) const {
   PADDLE_ENFORCE_EQ(
       index, 0, platform::errors::InvalidArgument(
                     "The EmbEltwiseLayernorm Plugin only has one input, so the "
                     "index value should be 0, but get %d.",
                     index));
-  return nvinfer1::DataType::kFLOAT;
+  if (with_fp16_)
+    return nvinfer1::DataType::kHALF;
+  else
+    return nvinfer1::DataType::kFLOAT;
 }
 
-template <typename T>
-int EmbEltwiseLayernormPluginDynamic<T>::enqueue(
+int EmbEltwiseLayernormPluginDynamic::enqueue(
     const nvinfer1::PluginTensorDesc *input_desc,
     const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs,
     void *const *outputs, void *workspace, cudaStream_t stream) {
-  auto id_dims = input_desc[0].dims;
-  int batch = id_dims.d[0];
-  int seq_len = id_dims.d[1];
-  int input_num = embs_.size();
-
-  framework::Tensor in_ptr_tensor, emb_ptr_tensor;
-  int device_id;
-  cudaGetDevice(&device_id);
-
-  in_ptr_tensor.Resize({input_num});
-  emb_ptr_tensor.Resize({input_num});
-  int64_t *in_ptr_gpu_d =
-      in_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
-  int64_t *emb_ptr_gpu_d =
-      emb_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
-
-  std::vector<uintptr_t> in_ptr, emb_ptr;
-  for (int i = 0; i < input_num; i++) {
-    in_ptr.push_back(reinterpret_cast<uintptr_t>(inputs[i]));
-    emb_ptr.push_back(reinterpret_cast<uintptr_t>(embs_gpu_[i]));
-  }
-
-  cudaMemcpyAsync(in_ptr_gpu_d, in_ptr.data(), sizeof(int64_t) * input_num,
-                  cudaMemcpyHostToDevice, stream);
-  cudaMemcpyAsync(emb_ptr_gpu_d, emb_ptr.data(), sizeof(int64_t) * input_num,
-                  cudaMemcpyHostToDevice, stream);
-
-  auto out_type = output_desc[0].type;
-
-  const unsigned tpb = 256;
-  const dim3 grid(seq_len, batch, 1);
-  const dim3 block(tpb, 1, 1);
-  if (sizeof(T) == sizeof(float)) {
-    PADDLE_ENFORCE_EQ(
-        out_type == nvinfer1::DataType::kFLOAT, true,
-        platform::errors::InvalidArgument(
-            "The EmbEltwiseLayernorm Plugin only support fp32 input."));
-  } else if (sizeof(T) == sizeof(int16_t)) {
-    PADDLE_ENFORCE_EQ(
-        out_type == nvinfer1::DataType::kHALF, true,
-        platform::errors::InvalidArgument(
-            "The EmbEltwiseLayernorm Plugin only support fp16 input."));
-  } else {
-    PADDLE_THROW(platform::errors::Fatal(
-        "Unsupport data type, the out type of EmbEltwiseLayernorm should be "
-        "float or half."));
-  }
-
-  T *output_d = static_cast<T *>(outputs[0]);
-
-  operators::math::EmbEltwiseLayerNormFunctor<T> emb_eltwise_layernorm_func;
-  emb_eltwise_layernorm_func(batch, seq_len, hidden_size_, in_ptr_gpu_d,
-                             scale_gpu_, bias_gpu_, emb_ptr_gpu_d, output_d,
-                             eps_, input_num, stream);
+  impl_->enqueue(input_desc, output_desc, inputs, outputs, workspace, stream);
   return cudaGetLastError() != cudaSuccess;
 }
 
-template class EmbEltwiseLayernormPluginDynamic<float>;
-#ifdef SUPPORTS_CUDA_FP16
-template class EmbEltwiseLayernormPluginDynamic<half>;
-#endif  // SUPPORTS_CUDA_FP16
-
 #endif
 
 }  // namespace plugin
diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
index 5babd87db06..24ca853104e 100644
--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
@@ -27,14 +27,76 @@ namespace tensorrt {
 namespace plugin {
 
 #if IS_TRT_VERSION_GE(6000)
+
+class EmbEltwiseLayernormPluginDynamicImplBase {
+ public:
+  EmbEltwiseLayernormPluginDynamicImplBase() {}
+  virtual ~EmbEltwiseLayernormPluginDynamicImplBase() {}
+
+  virtual int initialize() = 0;
+  virtual void terminate() = 0;
+  virtual int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+                      const nvinfer1::PluginTensorDesc* outputDesc,
+                      const void* const* inputs, void* const* outputs,
+                      void* workspace, cudaStream_t stream) = 0;
+};
+
 template <typename T>
+class EmbEltwiseLayernormPluginDynamicImpl
+    : public EmbEltwiseLayernormPluginDynamicImplBase {
+ public:
+  explicit EmbEltwiseLayernormPluginDynamicImpl(std::vector<float*> input_embs,
+                                                float* bias, float* scale,
+                                                std::vector<int> emb_sizes,
+                                                int bias_size, int scale_size,
+                                                int hidden_size, float eps)
+      : embs_(input_embs),
+        bias_(bias),
+        scale_(scale),
+        emb_sizes_(emb_sizes),
+        bias_size_(bias_size),
+        scale_size_(scale_size),
+        hidden_size_(hidden_size),
+        eps_(eps) {}
+
+  ~EmbEltwiseLayernormPluginDynamicImpl();
+
+  int initialize();
+  void terminate();
+  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+              const nvinfer1::PluginTensorDesc* outputDesc,
+              const void* const* inputs, void* const* outputs, void* workspace,
+              cudaStream_t stream);
+
+ private:
+  std::vector<float*> embs_;
+  float* bias_{nullptr};
+  float* scale_{nullptr};
+
+  // data on devices
+  float* bias_gpu_{nullptr};
+  float* scale_gpu_{nullptr};
+  std::vector<T*> embs_gpu_;
+
+  std::vector<int> emb_sizes_;
+  int bias_size_;
+  int scale_size_;
+  int hidden_size_;
+  float eps_;
+
+  framework::Tensor in_ptr_tensor_, emb_ptr_tensor_;
+  int device_id_{0};
+  uintptr_t old_input_ptr_{0};
+};
+
 class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
  public:
   explicit EmbEltwiseLayernormPluginDynamic(std::vector<float*> input_embs,
                                             float* bias, float* scale,
                                             std::vector<int> emb_sizes,
                                             int bias_size, int scale_size,
-                                            int hidden_size, float eps)
+                                            int hidden_size, float eps,
+                                            bool with_fp16)
       : embs_(input_embs),
         bias_(bias),
         scale_(scale),
@@ -42,51 +104,81 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
         bias_size_(bias_size),
         scale_size_(scale_size),
         hidden_size_(hidden_size),
-        eps_(eps) {}
+        eps_(eps),
+        with_fp16_(with_fp16),
+        own_host_buff_(false) {
+    if (with_fp16) {
+#ifdef SUPPORTS_CUDA_FP16
+      impl_ = new EmbEltwiseLayernormPluginDynamicImpl<half>(
+          embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
+          hidden_size_, eps_);
+#else
+      PADDLE_THROW(platform::errors::Fatal(
+          "Unsupported data type, current GPU doesn't support half."));
+#endif  // SUPPORTS_CUDA_FP16
+    } else {
+      impl_ = new EmbEltwiseLayernormPluginDynamicImpl<float>(
+          embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
+          hidden_size_, eps_);
+    }
+  }
 
   EmbEltwiseLayernormPluginDynamic(void const* serial_data,
-                                   size_t serial_length) {
+                                   size_t serial_length)
+      : own_host_buff_(true) {
     DeserializeValue(&serial_data, &serial_length, &emb_sizes_);
 
-    embs_gpu_.resize(emb_sizes_.size());
     embs_.resize(emb_sizes_.size());
     for (size_t i = 0; i < emb_sizes_.size(); i++) {
-      cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]);
-      cudaMemcpy(embs_gpu_[i], serial_data, emb_sizes_[i] * sizeof(float),
-                 cudaMemcpyHostToDevice);
+      auto size = emb_sizes_[i];
+      auto ptr = new float[size];
+      memcpy(ptr, serial_data, sizeof(float) * size);
+      embs_[i] = ptr;
       reinterpret_cast<char const*&>(serial_data) +=
           emb_sizes_[i] * sizeof(float);
       serial_length -= emb_sizes_[i] * sizeof(float);
-      embs_[i] = nullptr;
     }
     DeserializeValue(&serial_data, &serial_length, &bias_size_);
     DeserializeValue(&serial_data, &serial_length, &scale_size_);
 
-    cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
-    cudaMemcpy(bias_gpu_, serial_data, bias_size_ * sizeof(float),
-               cudaMemcpyHostToDevice);
-    bias_ = nullptr;
+    if (bias_size_) {
+      bias_ = new float[bias_size_];
+      memcpy(bias_, serial_data, sizeof(float) * bias_size_);
+    }
     reinterpret_cast<char const*&>(serial_data) += bias_size_ * sizeof(float);
     serial_length -= bias_size_ * sizeof(float);
 
-    cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
-    cudaMemcpy(scale_gpu_, serial_data, scale_size_ * sizeof(float),
-               cudaMemcpyHostToDevice);
-    scale_ = nullptr;
+    if (scale_size_) {
+      scale_ = new float[scale_size_];
+      memcpy(scale_, serial_data, sizeof(float) * scale_size_);
+    }
     reinterpret_cast<char const*&>(serial_data) += scale_size_ * sizeof(float);
     serial_length -= scale_size_ * sizeof(float);
 
     DeserializeValue(&serial_data, &serial_length, &hidden_size_);
     DeserializeValue(&serial_data, &serial_length, &eps_);
+    DeserializeValue(&serial_data, &serial_length, &with_fp16_);
+
+    if (with_fp16_) {
+#ifdef SUPPORTS_CUDA_FP16
+      impl_ = new EmbEltwiseLayernormPluginDynamicImpl<half>(
+          embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
+          hidden_size_, eps_);
+#else
+      PADDLE_THROW(platform::errors::Fatal(
+          "Unsupported data type, current GPU doesn't support half."));
+#endif  // SUPPORTS_CUDA_FP16
+    } else {
+      impl_ = new EmbEltwiseLayernormPluginDynamicImpl<float>(
+          embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
+          hidden_size_, eps_);
+    }
   }
 
   nvinfer1::IPluginV2DynamicExt* clone() const override {
     auto ptr = new EmbEltwiseLayernormPluginDynamic(
         embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_,
-        eps_);
-    ptr->embs_gpu_ = embs_gpu_;
-    ptr->bias_gpu_ = bias_gpu_;
-    ptr->scale_gpu_ = scale_gpu_;
+        eps_, with_fp16_);
     return ptr;
   }
 
@@ -95,6 +187,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
   }
   int getNbOutputs() const override { return 1; }
   int initialize() override;
+  void terminate() override;
 
   size_t getSerializationSize() const override {
     int sum_num = 0;
@@ -110,24 +203,32 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
     sum_num += (bias_size_ + scale_size_) * sizeof(float);
     sum_num += SerializedSize(hidden_size_);
     sum_num += SerializedSize(eps_);
-    // sum_num += SerializedSize(with_fp16_);
+    sum_num += SerializedSize(with_fp16_);
 
     return sum_num;
   }
 
-  void terminate() override;
   void serialize(void* buffer) const override {
-    // SerializeValue(&buffer, with_fp16_);
     SerializeValue(&buffer, emb_sizes_);
     for (size_t i = 0; i < emb_sizes_.size(); i++) {
-      SerializeCudaPointer(&buffer, embs_gpu_[i], emb_sizes_[i]);
+      auto size = emb_sizes_[i];
+      for (int j = 0; j < size; ++j) {
+        SerializeValue(&buffer, embs_[i][j]);
+      }
     }
     SerializeValue(&buffer, bias_size_);
     SerializeValue(&buffer, scale_size_);
-    SerializeCudaPointer(&buffer, bias_gpu_, bias_size_);
-    SerializeCudaPointer(&buffer, scale_gpu_, scale_size_);
+    for (int i = 0; i < bias_size_; ++i) {
+      SerializeValue(&buffer, bias_[i]);
+    }
+
+    for (int i = 0; i < scale_size_; ++i) {
+      SerializeValue(&buffer, scale_[i]);
+    }
+
     SerializeValue(&buffer, hidden_size_);
     SerializeValue(&buffer, eps_);
+    SerializeValue(&buffer, with_fp16_);
   }
 
   nvinfer1::DimsExprs getOutputDimensions(
@@ -158,23 +259,33 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
                                        const nvinfer1::DataType* input_types,
                                        int nb_inputs) const override;
 
-  void destroy() override { delete this; }
+  void destroy() override {
+    if (own_host_buff_) {
+      for (auto ptr : embs_) {
+        delete[] ptr;
+      }
+      delete[] bias_;
+      delete[] scale_;
+    }
+
+    delete impl_;
+    delete this;
+  }
 
  private:
   std::vector<float*> embs_;
   float* bias_;
   float* scale_;
 
-  // data on devices
-  float* bias_gpu_;
-  float* scale_gpu_;
-  std::vector<float*> embs_gpu_;
-
   std::vector<int> emb_sizes_;
   int bias_size_;
   int scale_size_;
   int hidden_size_;
   float eps_;
+
+  bool with_fp16_;
+  bool own_host_buff_{false};
+  EmbEltwiseLayernormPluginDynamicImplBase* impl_{nullptr};
 };
 
 class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator {
@@ -198,8 +309,7 @@ class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator {
   nvinfer1::IPluginV2* deserializePlugin(const char* name,
                                          const void* serial_data,
                                          size_t serial_length) override {
-    return new EmbEltwiseLayernormPluginDynamic<float>(serial_data,
-                                                       serial_length);
+    return new EmbEltwiseLayernormPluginDynamic(serial_data, serial_length);
   }
 
   void setPluginNamespace(const char* lib_namespace) override {
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
index 685f7b6600e..d49f83b9d38 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
@@ -151,7 +151,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
   run(config, &out_data);         // serialize
   run(*config_deser, &out_data);  // deserialize
   for (size_t i = 0; i < out_data.size(); i++) {
-    EXPECT_NEAR(result[i], out_data[i], 1e-6);
+    EXPECT_NEAR(result[i], out_data[i], 1e-2);
   }
 }
 
@@ -159,13 +159,11 @@ TEST(AnalysisPredictor, no_fp16) {
   std::vector<float> result = {0.597841, 0.219972, 0.182187};
   trt_ernie(false, result);
 }
-
-TEST(AnalysisPredictor, fp16) {
 #ifdef SUPPORTS_CUDA_FP16
-  std::vector<float> result = {0.598336, 0.219558, 0.182106};
+TEST(AnalysisPredictor, fp16) {
+  std::vector<float> result = {0.59923654, 0.21923761, 0.18152587};
   trt_ernie(true, result);
-#endif
 }
-
+#endif  // SUPPORTS_CUDA_FP16
 }  // namespace inference
 }  // namespace paddle
-- 
GitLab


From 99626502f747cc85d518d87267cec821ffbf69a3 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Fri, 18 Sep 2020 22:32:28 +0800
Subject: [PATCH 142/261] =?UTF-8?q?=E3=80=90paddle.fleet=E3=80=91gloo=20an?=
 =?UTF-8?q?d=20util=20(#27213)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix worker endpoints

* fix gloo wrapper for hdfs

* GPU fleetrun support gloo

* parameterserver fleetrun support gloo

* fix get server endpoint
---
 paddle/fluid/framework/fleet/gloo_wrapper.cc  |  25 +-
 python/paddle/distributed/fleet/__init__.py   |   1 +
 .../distributed/fleet/base/fleet_base.py      |  22 +-
 .../distributed/fleet/base/role_maker.py      | 592 ++++++++++++------
 .../distributed/fleet/base/util_factory.py    |  39 +-
 python/paddle/distributed/fleet/launch.py     |  26 +-
 .../paddle/distributed/fleet/launch_utils.py  |  10 +-
 .../fluid/tests/unittests/test_fleet_base.py  |  19 +-
 .../unittests/test_fleet_rolemaker_new.py     | 283 ++++++++-
 .../fluid/tests/unittests/test_fleet_util.py  |  97 +--
 10 files changed, 749 insertions(+), 365 deletions(-)

diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.cc b/paddle/fluid/framework/fleet/gloo_wrapper.cc
index bb958f1ac01..f195dde4084 100644
--- a/paddle/fluid/framework/fleet/gloo_wrapper.cc
+++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc
@@ -19,6 +19,8 @@ limitations under the License. */
 namespace gloo {
 namespace rendezvous {
 
+constexpr int kNodeSize = 136;
+
 HdfsStore::HdfsStore(const std::string& path) {
   path_ = path;
   wait_sleep_ms_ = 10000;
@@ -213,12 +215,14 @@ void ParallelConnectContext::connectFullMesh(
   storeKey << rank;
   store.set(storeKey.str(), allBytes);
 
+  auto total_add_size = kNodeSize * (size - 1);
+
   std::vector<std::shared_ptr<std::thread>> connect_threads(thread_num_);
   // Connect every pair
   for (uint32_t i = 0; i < connect_threads.size(); ++i) {
     connect_threads[i].reset(new std::thread(
-        [&store, &transportContext, this](size_t thread_idx,
-                                          size_t thread_num) -> void {
+        [&store, &transportContext, total_add_size, this](
+            size_t thread_idx, size_t thread_num) -> void {
           for (int i = thread_idx; i < size; i += thread_num) {
             if (i == rank) {
               continue;
@@ -226,8 +230,23 @@ void ParallelConnectContext::connectFullMesh(
             // Wait for address of other side of this pair to become available
             std::string key = std::to_string(i);
             store.wait({key}, getTimeout());
+
+            std::vector<char> allAddrs;
+            auto max_retry_times = 5;
             // Connect to other side of this pair
-            auto allAddrs = store.get(key);
+
+            while (max_retry_times > 0) {
+              allAddrs = store.get(key);
+
+              VLOG(3) << "store get all address size: " << allAddrs.size()
+                      << " except: " << total_add_size;
+              if (allAddrs.size() == static_cast<size_t>(total_add_size)) {
+                break;
+              }
+
+              --max_retry_times;
+            }
+
             auto addr = extractAddress(allAddrs, i);
             transportContext->getPair(i)->connect(addr);
           }
diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py
index 2539fa57a34..f3ee09a6d9e 100644
--- a/python/paddle/distributed/fleet/__init__.py
+++ b/python/paddle/distributed/fleet/__init__.py
@@ -39,6 +39,7 @@ server_num = fleet.server_num
 server_index = fleet.server_index
 server_endpoints = fleet.server_endpoints
 is_server = fleet.is_server
+set_util = fleet.set_util
 util = fleet.util
 barrier_worker = fleet.barrier_worker
 init_worker = fleet.init_worker
diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
index 805c2d1fc73..aeb8cac98e2 100644
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -180,6 +180,8 @@ class Fleet(object):
                 raise ValueError(
                     "`role_maker` should be subclass of `RoleMakerBase`, but got {}".
                     format(type(role_maker)))
+        self._role_maker.generate_role()
+
         self.strategy_compiler = StrategyCompiler()
         if paddle.fluid.framework.in_dygraph_mode():
             if parallel_helper._is_parallel_ctx_initialized():
@@ -187,7 +189,6 @@ class Fleet(object):
                     "The dygraph parallel environment has been initialized.")
             else:
                 paddle.distributed.init_parallel_env()
-        return None
 
     def is_first_worker(self):
         """
@@ -275,13 +276,10 @@ class Fleet(object):
                 fleet.worker_endpoints()
 
         """
-        '''
         if to_string:
             return ",".join(self._role_maker.get_trainer_endpoints())
         else:
             return self._role_maker.get_trainer_endpoints()
-        '''
-        return ["127.0.0.1:1001", "127.0.0.1:1002"]
 
     def server_num(self):
         """
@@ -355,7 +353,9 @@ class Fleet(object):
         return self._role_maker.is_server(
         ) or self._role_maker._is_heter_worker()
 
-    @property
+    def set_util(self, util):
+        self._util = util
+
     def util(self):
         """
         Utility functions that can be used under certain runtime
@@ -376,16 +376,6 @@ class Fleet(object):
         """
         return self._util
 
-    @util.setter
-    def util(self, util):
-        """
-        Set Utility functions for userd-defined runtime
-
-        Returns:
-            None
-        """
-        self._util = util
-
     def barrier_worker(self):
         """
         barrier all workers
@@ -393,7 +383,7 @@ class Fleet(object):
         Returns:
             None
         """
-        self._role_maker.barrier_worker()
+        self._role_maker._barrier("worker")
 
     @is_non_distributed_check
     @inited_runtime_handler
diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py
index a3a809ee375..d36c06047f5 100644
--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -13,18 +13,332 @@
 # limitations under the License.
 """Defination of Role Makers."""
 import os
+import time
 import numpy as np
 import warnings
 from multiprocessing import Process, Manager
-import paddle.fluid as fluid
 
-#__all__ = ['UserDefinedRoleMaker', 'PaddleCloudRoleMaker']
+import paddle.fluid as fluid
 
 
 class Role:
     WORKER = 1
     SERVER = 2
     HETER_WORKER = 3
+    ALL = 4
+
+
+class Gloo(object):
+    """
+    Gloo is a universal class for barrier and collective communication
+    """
+
+    class RENDEZVOUS:
+        HDFS = 1
+        FILE = 2
+        HTTP = 3
+
+    def __init__(self):
+        self._worker_comm = None
+        self._server_comm = None
+        self._nodes_comm = None
+
+        self._comm_world = ["worker", "server", "all"]
+        self._err_init = "gloo is not initialized, will not communicator with other nodes"
+        self._err_type = "gloo initialized error, please check arguments"
+        self._err_world = "argument error, comm_world must in {}".format(
+            self._comm_world)
+
+        self._is_initialized = False
+        self._init_timeout_seconds = 3600
+        self._run_timeout_seconds = 9999999
+
+        self._rendezvous = None
+        self._role = None
+        self._iface = None
+
+        self._role_id = -1
+        self._worker_num = -1
+        self._server_num = -1
+        self._need_init_all = False
+
+    def init(self,
+             rendezvous,
+             role,
+             role_id,
+             worker_num,
+             server_num,
+             need_init_all=False,
+             kwargs=None):
+
+        self._rendezvous = rendezvous
+        self._role = role
+        self._role_id = role_id
+        self._worker_num = worker_num
+        self._server_num = server_num
+        self._need_init_all = need_init_all
+        self._iface = self.__get_default_iface()
+        self._prefix = kwargs.get("store.prefix", "")
+
+        if self._rendezvous == Gloo.RENDEZVOUS.HDFS:
+            dfs_name = kwargs.get("dfs.name", "")
+            dfs_ugi = kwargs.get("dfs.ugi", "")
+            dfs_path = kwargs.get("dfs.path", "")
+
+            if not dfs_name or not dfs_ugi or not dfs_path:
+                raise ValueError(self._err_type)
+            self._init_dfs(dfs_name, dfs_ugi, dfs_path, self._prefix)
+
+        elif self._rendezvous == Gloo.RENDEZVOUS.FILE:
+            fs_path = kwargs.get("dfs.path", "")
+
+            if not fs_path:
+                raise ValueError(self._err_type)
+            self._init_fs(fs_path, self._prefix)
+
+        elif self._rendezvous == Gloo.RENDEZVOUS.HTTP:
+            ip = kwargs.get("http.host", "")
+            port = kwargs.get("http.port", "")
+
+            if not ip or not port:
+                raise ValueError(self._err_type)
+            self._init_http(ip, port, self._prefix)
+
+        else:
+            raise ValueError(self._err_type)
+
+        self._is_initialized = True
+
+    def _init_fs(self, fs_path, prefix):
+        def init(rank, nodes, role):
+            gloo = fluid.core.Gloo()
+            gloo.set_rank(rank)
+            gloo.set_size(nodes)
+            gloo.set_prefix(prefix)
+            gloo.set_iface(self._iface)
+            gloo.set_timeout_seconds(self._init_timeout_seconds,
+                                     self._run_timeout_seconds)
+            gloo.set_hdfs_store(os.path.join(fs_path, role), "", "")
+            gloo.init()
+            return gloo
+
+        if self._role == Role.WORKER:
+            rank, nodes = self._get_rank_nodes(Role.WORKER)
+            gloo = init(rank, nodes, "WORKER")
+            self._worker_comm = gloo
+        else:
+            rank, nodes = self._get_rank_nodes(Role.SERVER)
+            gloo = init(rank, nodes, "SERVER")
+            self._server_comm = gloo
+
+        if self._need_init_all:
+            rank, nodes = self._get_rank_nodes(Role.ALL)
+            gloo = init(rank, nodes, "ALL")
+            self._nodes_comm = gloo
+
+    def _init_dfs(self, dfs_name, dfs_ugi, dfs_path, prefix):
+        def init(rank, nodes, role):
+            gloo = fluid.core.Gloo()
+            gloo.set_rank(rank)
+            gloo.set_size(nodes)
+            gloo.set_prefix(prefix)
+            gloo.set_iface(self._iface)
+            gloo.set_timeout_seconds(self._init_timeout_seconds,
+                                     self._run_timeout_seconds)
+            gloo.set_hdfs_store(os.path.join(dfs_path, role), dfs_name, dfs_ugi)
+            gloo.init()
+            return gloo
+
+        if self._role == Role.WORKER:
+            rank, nodes = self._get_rank_nodes(Role.WORKER)
+            gloo = init(rank, nodes, "WORKER")
+            self._worker_comm = gloo
+        else:
+            rank, nodes = self._get_rank_nodes(Role.SERVER)
+            gloo = init(rank, nodes, "SERVER")
+            self._server_comm = gloo
+
+        if self._need_init_all:
+            rank, nodes = self._get_rank_nodes(Role.ALL)
+            gloo = init(rank, nodes, "ALL")
+            self._nodes_comm = gloo
+
+    def _init_http(self, ip, port, prefix):
+        def __start_kv_server(http_server_d, size_d):
+            from paddle.distributed.fleet.utils.http_server import KVServer
+            http_server = KVServer(port, size_d)
+            http_server.start()
+            wait_seconds = 5
+            while http_server_d.get("running",
+                                    False) and not http_server.shoud_stop():
+                time.sleep(wait_seconds)
+            http_server.stop()
+
+        def init_kv_server():
+            size_d = {
+                "trainer": self._worker_num,
+                "pserver": self._server_num,
+                "all": self._worker_num + self._server_num
+            }
+
+            _http_server_d = {"running": True}
+            # child process for http server
+            _http_server = Process(
+                target=__start_kv_server, args=(_http_server_d, size_d))
+            _http_server.daemon = True
+            # set running status to True
+            # start child process
+            _http_server.start()
+
+        def init(rank, nodes, role):
+            gloo = fluid.core.Gloo()
+            gloo.set_rank(rank)
+            gloo.set_size(nodes)
+            gloo.set_prefix(prefix)
+            gloo.set_iface(self._iface)
+            gloo.set_timeout_seconds(self._init_timeout_seconds,
+                                     self._run_timeout_seconds)
+            gloo.set_http_store(ip, port, role)
+            return gloo
+
+        port = int(port)
+
+        if self._role == Role.SERVER and self._role_id == 0:
+            init_kv_server()
+
+        if self._role == Role.WORKER:
+            rank, nodes = self._get_rank_nodes(Role.WORKER)
+            gloo = init(rank, nodes, "WORKER")
+            self._worker_comm = gloo
+        else:
+            rank, nodes = self._get_rank_nodes(Role.SERVER)
+            gloo = init(rank, nodes, "SERVER")
+            self._server_comm = gloo
+
+        if self._need_init_all:
+            rank, nodes = self._get_rank_nodes(Role.ALL)
+            gloo = init(rank, nodes, "ALL")
+            self._nodes_comm = gloo
+
+    def _get_rank_nodes(self, role):
+        nodes = 0
+        rank = -1
+
+        if role == Role.WORKER:
+            nodes = self._worker_num
+            rank = self._role_id
+        elif role == Role.SERVER:
+            nodes = self._server_num
+            rank = self._role_id
+        elif role == Role.ALL:
+            nodes = self._worker_num + self._server_num
+
+            if self._role == Role.WORKER:
+                rank = self._role_id
+            else:
+                rank = self._worker_num + self._role_id
+        else:
+            ValueError(self._err_type)
+
+        return rank, nodes
+
+    def __get_default_iface(self):
+        """
+        get default physical interface
+        """
+        default1 = self.__get_default_iface_from_gateway()
+        default2 = self.__get_default_iface_from_interfaces()
+        return default2 if default1 == "lo" else default1
+
+    def __get_default_iface_from_gateway(self):
+        """
+        get default physical interface
+        """
+        import netifaces
+        gateways = netifaces.gateways()
+        if gateways.get(netifaces.AF_INET) != None:
+            gateway = gateways[netifaces.AF_INET]
+            if len(gateway) > 0 and len(gateway[0]) > 1:
+                return gateway[0][1]
+        return "lo"
+
+    def __get_default_iface_from_interfaces(self):
+        """
+        get default physical interface
+        """
+        import netifaces
+        for intf_name in netifaces.interfaces():
+            addresses = netifaces.ifaddresses(intf_name)
+            if netifaces.AF_INET in addresses:
+                ipv4_addresses = addresses[netifaces.AF_INET]
+                for ipv4_address in ipv4_addresses:
+                    if 'broadcast' in ipv4_address:
+                        return intf_name
+        return "lo"
+
+    def barrier(self, comm_world):
+        """
+        dummy barrier, do nothing
+        """
+        if not self._is_initialized:
+            warnings.warn(self._err_init)
+            return
+
+        if comm_world not in self._comm_world:
+            raise ValueError(self._err_world)
+
+        if comm_world == "worker":
+            self._worker_comm.barrier()
+        elif comm_world == "server":
+            self._server_comm.barrier()
+        else:
+            self._nodes_comm.barrier()
+
+    def all_reduce(self, input, mode="sum", comm_world="worker"):
+        if not self._is_initialized:
+            warnings.warn(self._err_init)
+            return input
+
+        if comm_world not in self._comm_world:
+            raise ValueError(self._err_world)
+
+        input = np.array(input)
+        input_shape = input.shape
+        input_list = input.reshape(-1).tolist()
+
+        self.barrier(comm_world)
+
+        if comm_world == "worker":
+            ans = self._worker_comm.all_reduce(input_list, mode)
+        elif comm_world == "server":
+            ans = self._server_comm.all_reduce(input_list, mode)
+        else:
+            ans = self._nodes_comm.all_reduce(input_list, mode)
+
+        output = np.array(ans).reshape(input_shape)
+        return output
+
+    def all_gather(self, input, comm_world="worker"):
+        """
+        dummy all gather, do nothing
+        Args:
+            obj(any): obj to do all gather
+        """
+        if not self._is_initialized:
+            warnings.warn(self._err_init)
+            return input
+
+        if comm_world not in self._comm_world:
+            raise ValueError(self._err_world)
+
+        if comm_world == "worker":
+            output = self._worker_comm.all_gather(input)
+        elif comm_world == "server":
+            output = self._server_comm.all_gather(input)
+        else:
+            output = self._nodes_comm.all_gather(input)
+
+        return output
 
 
 class RoleMakerBase(object):
@@ -47,10 +361,6 @@ class RoleMakerBase(object):
         self._heter_trainer_device = "CPU"
         self._is_heter_parameter_server_mode = False
 
-        self._node_type = None
-        self._node_type_comm = None
-        self._all_comm = None
-
     def is_worker(self):
         """
         return is_worker() of current process
@@ -142,19 +452,11 @@ class RoleMakerBase(object):
             self._role, self._current_id, self._worker_endpoints,
             self._server_endpoints)
 
-    def _all_gather(self, comm_world, input):
-        """
-
-        Args:
-            input(int|float): input value
-
-        Returns:
-            return a list of values
-        """
-        print("warning: RoleMakerBase does not have all gather.")
+    def _all_gather(self, input, comm_world="worker"):
+        print("warning: RoleMakerBase does not have all gather worker.")
         return None
 
-    def _all_reduce(self, comm_world, input, mode="sum"):
+    def _all_reduce(self, input, mode="sum", comm_world="worker"):
         """
         Args:
             input(list/numpy.array): array of one dim
@@ -221,73 +523,25 @@ class PaddleCloudRoleMaker(RoleMakerBase):
     def __init__(self, is_collective=False, **kwargs):
         super(PaddleCloudRoleMaker, self).__init__()
         self._is_collective = is_collective
-        self._init_gloo = False  # default no init gloo
-        self._kwargs = kwargs
 
+        self._non_distributed = False
+
+        self._kwargs = kwargs
         self._role_is_generated = False
 
         self._server_endpoints = None
         self._worker_endpoints = None
 
-        self._node_type_comm = None
-        self._all_comm = None
-
-        self._non_distributed = False
-
-        if not self._is_collective:
-            self._hdfs_name = kwargs.get("hdfs_name", "")
-            self._hdfs_ugi = kwargs.get("hdfs_ugi", "")
-            self._hdfs_path = kwargs.get("path", "").rstrip("/")
-            self._init_timeout_seconds = kwargs.get("init_timeout_seconds",
-                                                    3600)
-            self._run_timeout_seconds = kwargs.get("run_timeout_seconds",
-                                                   9999999)
-            ip_port = kwargs.get("http_ip_port", "")
-            self._http_ip_port = []
-            self._http_server = None
-            # if ip_port is not empty, it will use http instead of hdfs
-            if ip_port != "":
-                self._http_ip_port = ip_port.split(":")
-                # it's for communication between processes
-                self._manager = Manager()
-                # global dict to store status
-                self._http_server_d = self._manager.dict()
-                # set running status of http server
-                self._http_server_d["running"] = False
-            self._iface = self.__get_default_iface()
-            # this environment variable can be empty
-            self._prefix = os.getenv("SYS_JOB_ID", "")
+        self._gloo = Gloo()  # gloo instance
 
     def _barrier(self, comm_world):
-        if isinstance(comm_world, fluid.core.Gloo):
-            comm_world.barrier()
-        else:
-            print("warning: must init Gloo before using _barrier() function")
-
-    def _all_gather(self, comm_world, input):
-        if isinstance(comm_world, fluid.core.Gloo):
-            self._barrier(comm_world)
-            output = comm_world.all_gather(input)
-            return output
-        else:
-            print("warning: must init Gloo before using _all_gather() function")
-            return None
-
-    def _all_reduce(self, comm_world, input, mode="sum"):
-        if isinstance(comm_world, fluid.core.Gloo):
-
-            input = np.array(input)
+        self._gloo.barrier(comm_world)
 
-            input_shape = input.shape
-            input_list = input.reshape(-1).tolist()
+    def _all_gather(self, input, comm_world="worker"):
+        return self._gloo.all_gather(input, comm_world)
 
-            self._barrier(comm_world)
-            ans = comm_world.all_reduce(input_list, mode)
-            output = np.array(ans).reshape(input_shape)
-            return output
-        else:
-            print("warning: must init Gloo before using _all_reduce() function")
-            return None
+    def _all_reduce(self, input, mode="sum", comm_world="worker"):
+        return self._gloo.all_reduce(input, mode, comm_world)
 
     def is_worker(self):
         """
@@ -349,7 +603,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         """
         if not self._role_is_generated:
             self.generate_role()
-        return self._trainers_num
+        return len(self.get_pserver_endpoints())
 
     def node_num(self):
         """
@@ -421,8 +675,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
             # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
             # format: string(ip:port,ip:port), eg. 127.0.0.1:6001,127.0.0.1:6002
             self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST")
-            self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
-                                               "").split(",")
+
             if self._server_endpoints is None:
                 # back to non_distributed execution.
                 self._server_endpoints = ""
@@ -436,6 +689,13 @@ class PaddleCloudRoleMaker(RoleMakerBase):
                 return
 
             self._server_endpoints = self._server_endpoints.split(",")
+
+            self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
+            if self._worker_endpoints:
+                self._worker_endpoints = self._worker_endpoints.split(",")
+            else:
+                self._worker_endpoints = []
+
             trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"])
             training_role = os.environ["TRAINING_ROLE"]
 
@@ -506,6 +766,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         self._current_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
         self._training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
         assert (self._training_role == "TRAINER")
+        self._role = Role.WORKER
         self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
         self._cur_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
         if self._worker_endpoints is None:
@@ -518,74 +779,64 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         self._node_num = len(
             set([x.split(':')[0] for x in self._worker_endpoints]))
 
-    def _init_gloo_env(self):
-        def init_gloo_instance(role="trainer"):
-            role = role.lower()
-            assert role in ["trainer", "pserver", "all"]
-            if role == "trainer":
-                all_list = self._worker_endpoints
-                rank = self._current_id
-            elif role == "pserver":
-                all_list = self._server_endpoints
-                rank = self._current_id
-            else:
-                all_list = self._worker_endpoints + self._server_endpoints
-                rank = all_list.index(self._cur_endpoint)
-            gloo = fluid.core.Gloo()
-            gloo.set_rank(rank)
-            gloo.set_size(len(all_list))
-            gloo.set_prefix(self._prefix)
-            gloo.set_iface(self._iface)
-            gloo.set_timeout_seconds(self._init_timeout_seconds,
-                                     self._run_timeout_seconds)
-            if len(self._http_ip_port) != 0:
-                gloo.set_http_store(self._http_ip_port[0],
-                                    int(self._http_ip_port[1]), role)
-            else:
-                gloo.set_hdfs_store(self._hdfs_path + "/" + role,
-                                    self._hdfs_name, self._hdfs_ugi)
-            gloo.init()
-            return gloo
-
-        # paddlecloud support gloo
-        if self._role == Role.WORKER:
-            if self._current_id == 0 and len(self._http_ip_port) != 0:
-                size_d = {
-                    "trainer": len(self._worker_endpoints),
-                    "pserver": len(self._server_endpoints),
-                    "all":
-                    len(self._worker_endpoints) + len(self._server_endpoints)
-                }
-                # child process for http server
-                self._http_server = Process(
-                    target=self.__start_kv_server,
-                    args=(self._http_server_d, size_d))
-                self._http_server.daemon = True
-                # set running status to True
-                self._http_server_d["running"] = True
-                # start child process
-                self._http_server.start()
-            self._node_type = 1
-            gloo = init_gloo_instance("trainer")
-            self._node_type_comm = gloo
+    def _gloo_init(self):
+        # PADDLE_WITH_GLOO 1: trainer barrier, 2: all barrier
+        use_gloo = int(os.getenv("PADDLE_WITH_GLOO", "0"))
+        if use_gloo not in [1, 2]:
+            return
+
+        # PADDLE_GLOO_RENDEZVOUS 1: HDFS 2: FILE 3: HTTP
+        rendezvous_type = int(os.getenv("PADDLE_GLOO_RENDEZVOUS", "0"))
+        prefix = os.getenv("SYS_JOB_ID", "")
+        if rendezvous_type not in [
+                Gloo.RENDEZVOUS.HDFS, Gloo.RENDEZVOUS.HTTP, Gloo.RENDEZVOUS.FILE
+        ]:
+            raise ValueError(self._gloo._err_type)
+
+        need_init_all = True if use_gloo == 2 else False
+
+        if rendezvous_type == Gloo.RENDEZVOUS.HDFS:
+            dfs_name = os.getenv("PADDLE_GLOO_FS_NAME", "")
+            dfs_ugi = os.getenv("PADDLE_GLOO_FS_UGI", "")
+            dfs_path = os.getenv("PADDLE_GLOO_FS_PATH", "")
+            kwargs = {
+                "dfs.name": dfs_name,
+                "dfs.ugi": dfs_ugi,
+                "dfs.path": dfs_path,
+                "store.prefix": prefix,
+            }
+        elif rendezvous_type == Gloo.RENDEZVOUS.HTTP:
+            ip = os.getenv("PADDLE_GLOO_HTTP_HOST", "")
+            port = os.getenv("PADDLE_GLOO_HTTP_PORT", "")
+            kwargs = {
+                "http.host": ip,
+                "http.port": port,
+                "store.prefix": prefix,
+            }
         else:
-            assert self._role == Role.SERVER
-            self._node_type = 0
-            gloo = init_gloo_instance("pserver")
-            self._node_type_comm = gloo
-
-        all_list = self._worker_endpoints + self._server_endpoints
-        self._rank = all_list.index(self._cur_endpoint)
-        self._size = len(all_list)
-
-        gloo = init_gloo_instance("all")
-        self._all_comm = gloo
-
-        if self._http_server is not None:
-            # set running status to False
-            self._http_server_d["running"] = False
-            # wait until child process exits
-            self._http_server.join()
+            dfs_path = os.getenv("PADDLE_GLOO_FS_PATH", "")
+            kwargs = {
+                "dfs.path": dfs_path,
+                "store.prefix": prefix,
+            }
+
+        if rendezvous_type == Gloo.RENDEZVOUS.HDFS:
+            type = "HDFS"
+        elif rendezvous_type == Gloo.RENDEZVOUS.HTTP:
+            type = "HTTP"
+        else:
+            type = "FILE"
+        print("Gloo init with {}: need_init_all: {}, args: {}".format(
+            type, need_init_all, kwargs))
+
+        self._gloo.init(
+            rendezvous=rendezvous_type,
+            role=self._role,
+            role_id=self.role_id(),
+            worker_num=self.worker_num(),
+            server_num=self.server_num(),
+            need_init_all=need_init_all,
+            kwargs=kwargs)
 
     def generate_role(self):
         """
@@ -594,57 +845,10 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         if not self._role_is_generated:
             if not self._is_collective:
                 self._ps_env()
-                if "PADDLE_WITH_GLOO" in os.environ:
-                    self._init_gloo = bool(os.environ["PADDLE_WITH_GLOO"])
-                if self._init_gloo:
-                    self._init_gloo_env()
             else:
                 self._collective_env()
             self._role_is_generated = True
-
-    def __get_default_iface(self):
-        """
-        get default physical interface
-        """
-        default1 = self.__get_default_iface_from_gateway()
-        default2 = self.__get_default_iface_from_interfaces()
-        return default2 if default1 == "lo" else default1
-
-    def __get_default_iface_from_gateway(self):
-        """
-        get default physical interface
-        """
-        import netifaces
-        gateways = netifaces.gateways()
-        if gateways.get(netifaces.AF_INET) != None:
-            gateway = gateways[netifaces.AF_INET]
-            if len(gateway) > 0 and len(gateway[0]) > 1:
-                return gateway[0][1]
-        return "lo"
-
-    def __get_default_iface_from_interfaces(self):
-        """
-        get default physical interface
-        """
-        import netifaces
-        for intf_name in netifaces.interfaces():
-            addresses = netifaces.ifaddresses(intf_name)
-            if netifaces.AF_INET in addresses:
-                ipv4_addresses = addresses[netifaces.AF_INET]
-                for ipv4_address in ipv4_addresses:
-                    if 'broadcast' in ipv4_address:
-                        return intf_name
-        return "lo"
-
-    def __start_kv_server(self, http_server_d, size_d):
-        from paddle.distributed.fleet.utils.http_server import KVServer
-        http_server = KVServer(int(self._http_ip_port[1]), size_d)
-        http_server.start()
-        wait_seconds = 5
-        while http_server_d.get("running",
-                                False) and not http_server.shoud_stop():
-            time.sleep(wait_seconds)
-        http_server.stop()
+            self._gloo_init()
 
 
 class UserDefinedRoleMaker(PaddleCloudRoleMaker):
@@ -677,7 +881,7 @@ class UserDefinedRoleMaker(PaddleCloudRoleMaker):
         self._worker_endpoints = self._kwargs.get("worker_endpoints")
         self._current_id = self._kwargs.get("current_id")
         self._trainers_num = len(self._worker_endpoints)
-        self._training_role = Role.Worker
+        self._training_role = Role.WORKER
         self._node_num = len(
             set([x.split(':')[0] for x in self._worker_endpoints]))
 
@@ -688,8 +892,6 @@ class UserDefinedRoleMaker(PaddleCloudRoleMaker):
         if not self._role_is_generated:
             if not self._is_collective:
                 self._user_defined_ps_env()
-                if self._init_gloo:
-                    self._init_gloo_env()
             else:
                 self._user_defined_collective_env()
             self._role_is_generated = True
diff --git a/python/paddle/distributed/fleet/base/util_factory.py b/python/paddle/distributed/fleet/base/util_factory.py
index 4fa247c3196..e822c3c92f4 100644
--- a/python/paddle/distributed/fleet/base/util_factory.py
+++ b/python/paddle/distributed/fleet/base/util_factory.py
@@ -57,34 +57,7 @@ class UtilBase(object):
         ), "fs_client must be the instance of paddle.distributed.fleet.utils.FS"
         self.fs_client = fs_client
 
-    def __check_comm_world(self, comm_world="worker"):
-        if not self.role_maker._role_is_generated:
-            self.role_maker.generate_role()
-
-        _comm_world = None
-        comm_world_upper = comm_world.upper()
-        if comm_world_upper == "WORKER":
-            if not self.role_maker.is_worker():
-                print(
-                    "warning: current role is not worker in collective_func(comm_world=\"worker\")"
-                )
-            _comm_world = self.role_maker._node_type_comm
-        elif comm_world_upper == "SERVER":
-            if not self.role_maker.is_server():
-                print(
-                    "warning: current role is not server in collective_func(comm_world=\"server\")"
-                )
-            _comm_world = self.role_maker._node_type_comm
-        elif comm_world_upper == "ALL":
-            _comm_world = self.role_maker._all_comm
-        else:
-            raise ValueError(
-                "not support comm_world, please choose one from [worker, server, all]"
-            )
-
-        return _comm_world
-
-    def all_reduce(self, input, mode, comm_world="worker"):
+    def all_reduce(self, input, mode="sum", comm_world="worker"):
         """
         All reduce `input` between specified collection. This is a distributed API.
 
@@ -130,8 +103,7 @@ class UtilBase(object):
                 if __name__ == "__main__":
                     train()
         """
-        _comm_world = self.__check_comm_world(comm_world)
-        return self.role_maker._all_reduce(_comm_world, input, mode)
+        return self.role_maker._all_reduce(input, mode, comm_world)
 
     def barrier(self, comm_world="worker"):
         """
@@ -170,8 +142,7 @@ class UtilBase(object):
                 if __name__ == "__main__":
                     train()
         """
-        _comm_world = self.__check_comm_world(comm_world)
-        self.role_maker._barrier(_comm_world)
+        self.role_maker._barrier(comm_world)
 
     def all_gather(self, input, comm_world="worker"):
         """
@@ -219,8 +190,8 @@ class UtilBase(object):
                 if __name__ == "__main__":
                     train()
         """
-        _comm_world = self.__check_comm_world(comm_world)
-        return self.role_maker._all_gather(_comm_world, input)
+
+        return self.role_maker._all_gather(input, comm_world)
 
     def _broadcast(self):
         pass
diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
index a527393f602..4b629bc35ce 100644
--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -55,7 +55,10 @@ launch a process on each of the given gpu card or cpu machine.
 """
 
 from __future__ import print_function
+
+import shutil
 import sys
+import tempfile
 from sys import version
 import subprocess
 import os
@@ -213,12 +216,20 @@ def launch_collective(args):
         cluster, pod = get_cluster_from_args(args, gpus)
         logger.debug("get cluster from args:{}".format(cluster))
 
+    global_envs = copy.copy(os.environ.copy())
+    gloo_rendezvous_dir = tempfile.mkdtemp()
+    # add gloo env
+    global_envs["PADDLE_WITH_GLOO"] = "1"
+    global_envs["PADDLE_GLOO_RENDEZVOUS"] = "2"
+    global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir
+
     procs = start_local_trainers(
         cluster,
         pod,
         training_script=args.training_script,
         training_script_args=args.training_script_args,
-        log_dir=args.log_dir)
+        log_dir=args.log_dir,
+        envs=global_envs)
 
     while True:
         alive = watch_local_trainers(procs, cluster.trainers_nranks())
@@ -230,6 +241,9 @@ def launch_collective(args):
 
         time.sleep(3)
 
+    if os.path.exists(gloo_rendezvous_dir):
+        shutil.rmtree(gloo_rendezvous_dir)
+
 
 def launch_ps(args):
     ports = None
@@ -315,6 +329,13 @@ def launch_ps(args):
 
     default_env = os.environ.copy()
     current_env = copy.copy(default_env)
+
+    gloo_rendezvous_dir = tempfile.mkdtemp()
+    # add gloo env
+    current_env["PADDLE_WITH_GLOO"] = "1"
+    current_env["PADDLE_GLOO_RENDEZVOUS"] = "2"
+    current_env["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir
+
     current_env.pop("http_proxy", None)
     current_env.pop("https_proxy", None)
     procs = []
@@ -419,6 +440,9 @@ def launch_ps(args):
         procs[i].proc.terminate()
     print("all parameter server are killed", file=sys.stderr)
 
+    if os.path.exists(gloo_rendezvous_dir):
+        shutil.rmtree(gloo_rendezvous_dir)
+
 
 def launch():
     args = _parse_args()
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index b6f4c75a276..17d3b96cf44 100644
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -398,8 +398,14 @@ def start_local_trainers(cluster,
                          pod,
                          training_script,
                          training_script_args,
-                         log_dir=None):
-    current_env = copy.copy(os.environ.copy())
+                         log_dir=None,
+                         envs=None):
+
+    if envs is None:
+        current_env = copy.copy(os.environ.copy())
+    else:
+        current_env = copy.copy(envs)
+
     #paddle broadcast ncclUniqueId use socket, and
     #proxy maybe make trainers unreachable, so delete them.
     #if we set them to "", grpc will log error message "bad uri"
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py
index 4ced9841ee4..3a90b363f27 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py
@@ -27,7 +27,7 @@ class TestFleetBase(unittest.TestCase):
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
         os.environ["PADDLE_TRAINERS_NUM"] = "2"
         os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
-                       "127.0.0.1:36001,127.0.0.2:36001"
+            "127.0.0.1:36001,127.0.0.2:36001"
 
     def test_init(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
@@ -88,7 +88,7 @@ class TestFleetBase(unittest.TestCase):
     def test_util(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        self.assertEqual(fleet.util, None)
+        self.assertEqual(fleet.util(), None)
 
     def test_barrier_worker(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
@@ -99,20 +99,17 @@ class TestFleetBase(unittest.TestCase):
     def test_init_worker(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        if fleet.is_worker():
-            fleet.init_worker()
 
-    def test_run_server(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-        if fleet.is_worker():
-            fleet.run_worker()
+        with self.assertRaises(ValueError):
+            if fleet.is_worker():
+                fleet.init_worker()
 
     def test_stop_worker(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        if fleet.is_worker():
-            fleet.stop_worker()
+        with self.assertRaises(ValueError):
+            if fleet.is_worker():
+                fleet.stop_worker()
 
     def test_distributed_optimizer(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
index cf9b3e1e9a1..d786fa1eba8 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
@@ -15,7 +15,11 @@
 
 from __future__ import print_function
 import os
+import platform
+import shutil
+import tempfile
 import unittest
+import paddle
 import paddle.distributed.fleet.base.role_maker as role_maker
 
 
@@ -42,9 +46,9 @@ class TestRoleMakerBase(unittest.TestCase):
         self.assertTrue(len(pserver_endpoints) == 0)
 
         print(role.to_string())
-        self.assertTrue(role._all_gather(role._node_type_comm, 1) is None)
-        self.assertTrue(role._all_reduce(role._node_type_comm, 1) is None)
-        role._barrier(role._node_type_comm)
+        self.assertTrue(role._all_gather(1, "worker") is None)
+        self.assertTrue(role._all_reduce(1, "sum", "worker") is None)
+        role._barrier("worker")
 
 
 class TestCloudRoleMaker(unittest.TestCase):
@@ -72,8 +76,8 @@ class TestCloudRoleMaker(unittest.TestCase):
             print("warning: no netifaces, skip test_tr_rolemaker")
             return
 
-        ro = role_maker.PaddleCloudRoleMaker(
-            is_collective=False, init_gloo=False)
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+
         self.assertTrue(ro.is_worker())
         self.assertFalse(ro.is_server())
         self.assertEqual(ro.worker_num(), 2)
@@ -108,8 +112,9 @@ class TestCloudRoleMaker(unittest.TestCase):
         self.assertEqual(ro.server_num(), 2)
         pserver_endpoints = ro.get_pserver_endpoints()
         self.assertEqual(pserver_endpoints[0], '127.0.0.1:36001')
-        self.assertTrue(ro._all_gather(ro._all_comm, 1) is None)
-        self.assertTrue(ro._all_reduce(ro._all_comm, 1) is None)
+
+        self.assertEqual(ro._all_gather(1, "worker"), 1)
+        self.assertEqual(ro._all_reduce(1, "sum", "worker"), 1)
 
     def test_traing_role(self):
         """Test training role."""
@@ -142,7 +147,7 @@ class TestUserDefinedRoleMaker(unittest.TestCase):
         ro = role_maker.UserDefinedRoleMaker(
             is_collective=False,
             init_gloo=False,
-            server_endpoints="127.0.0.1:36001,127.0.0.1:36001",
+            server_endpoints=["127.0.0.1:36001", "127.0.0.1:36001"],
             role=role_maker.Role.SERVER,
             current_id=0,
             worker_num=2)
@@ -161,14 +166,274 @@ class TestUserDefinedRoleMaker(unittest.TestCase):
         ro = role_maker.UserDefinedRoleMaker(
             is_collective=False,
             init_gloo=False,
-            server_endpoints="127.0.0.1:36001,127.0.0.1:36001",
+            server_endpoints=["127.0.0.1:36001", "127.0.0.1:36001"],
             role=role_maker.Role.WORKER,
             current_id=0,
             worker_num=2)
+
         self.assertIn("127.0.0.1:36001", ro.get_pserver_endpoints())
         self.assertTrue(ro.is_worker())
         self.assertEqual(ro.role_id(), 0)
 
 
+class TestGlooWithCloudRoleMaker(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_TRAINERS_NUM"] = "1"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+
+    def case(self, role, comm_world):
+        role._barrier(comm_world)
+
+        gather = role._all_gather(1, comm_world)
+        self.assertEqual(gather[0], 1)
+
+        all_reduce = role._all_reduce(1, "sum", comm_world)
+        self.assertEqual(1, all_reduce)
+
+    def mkdir(self):
+        tmp = tempfile.mkdtemp()
+        return tmp
+
+    def clean(self, tmp):
+        shutil.rmtree(tmp)
+
+    def test_hdfs_gloo(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role.generate_role()
+        self.case(role, "worker")
+        self.clean(tmp)
+
+    def test_fs_gloo(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role.generate_role()
+        self.case(role, "worker")
+        self.clean(tmp)
+
+    def test_fs_gloo2(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role.generate_role()
+        self.case(role, "server")
+        self.clean(tmp)
+
+    def test_fs_gloo3(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role.generate_role()
+        self.case(role, "server")
+        self.clean(tmp)
+
+    def test_fs_gloo4(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "3"
+        os.environ["PADDLE_GLOO_HTTP_HOST"] = "127.0.0.1"
+        os.environ["PADDLE_GLOO_HTTP_PORT"] = "30019"
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role.generate_role()
+        import time
+        time.sleep(3)
+
+    def test_fs_gloo5(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "2"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role.generate_role()
+        self.case(role, "server")
+        self.case(role, "all")
+        self.clean(tmp)
+
+    def test_fs_gloo6(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+
+        os.environ["PADDLE_WITH_GLOO"] = "2"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role.generate_role()
+        self.case(role, "server")
+        self.case(role, "all")
+        self.clean(tmp)
+
+    def test_fs_gloo7(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "5"
+
+        role = role_maker.PaddleCloudRoleMaker()
+        self.assertRaises(ValueError, role.generate_role)
+
+    def test_fs_gloo8(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+
+        os.environ["PADDLE_WITH_GLOO"] = "2"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        def net():
+            x = paddle.fluid.layers.data(name='x', shape=[13], dtype='float32')
+            y_predict = paddle.fluid.layers.fc(input=x, size=1, act=None)
+            y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
+            cost = paddle.fluid.layers.square_error_cost(
+                input=y_predict, label=y)
+            avg_cost = paddle.fluid.layers.mean(cost)
+            return avg_cost
+
+        from paddle.distributed import fleet
+
+        role = role_maker.PaddleCloudRoleMaker()
+        fleet.init(role)
+        avg_cost = net()
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = False
+
+        optimizer = paddle.optimizer.SGD(0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy)
+        optimizer.minimize(avg_cost)
+
+        comm_world = "server"
+        fleet.util().barrier(comm_world)
+
+        gather = fleet.util().all_gather(1, comm_world)
+        self.assertEqual(gather[0], 1)
+
+        all_reduce = fleet.util().all_reduce(1, "sum", comm_world)
+        self.assertEqual(1, all_reduce)
+
+        self.clean(tmp)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_util.py b/python/paddle/fluid/tests/unittests/test_fleet_util.py
index d506088fde0..1570912e740 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_util.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_util.py
@@ -59,7 +59,7 @@ class TestFleetUtil(unittest.TestCase):
         import paddle.distributed.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        default_util = fleet.util
+        default_util = fleet.util()
         self.assertEqual(default_util, None)
 
     def test_set_user_defined_util(self):
@@ -76,8 +76,8 @@ class TestFleetUtil(unittest.TestCase):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         my_util = UserDefinedUtil()
-        fleet.util = my_util
-        user_id = fleet.util.get_user_id()
+        fleet.set_util(my_util)
+        user_id = fleet.util().get_user_id()
         self.assertEqual(user_id, 10)
 
     def test_fs(self):
@@ -88,97 +88,6 @@ class TestFleetUtil(unittest.TestCase):
         self.assertFalse(fs.need_upload_download())
         fleet_util._set_file_system(fs)
 
-    def test_barrier(self):
-        try:
-            import netifaces
-        except:
-            print("warning: no netifaces, skip test_barrier")
-            return
-
-        gloo = fluid.core.Gloo()
-        gloo.set_rank(0)
-        gloo.set_size(1)
-        gloo.set_prefix("123")
-        gloo.set_iface("lo")
-        gloo.set_hdfs_store("./tmp_test_fleet_barrier", "", "")
-        gloo.init()
-
-        role = role_maker.UserDefinedRoleMaker(
-            is_collective=False,
-            init_gloo=False,
-            current_id=0,
-            role=role_maker.Role.SERVER,
-            worker_endpoints=["127.0.0.1:6003"],
-            server_endpoints=["127.0.0.1:6001"])
-        role._node_type_comm = gloo
-        role._role_is_generated = True
-        fleet_util._set_role_maker(role)
-
-        fleet_util.barrier("worker")
-
-    def test_all_reduce(self):
-        try:
-            import netifaces
-        except:
-            print("warning: no netifaces, skip test_all_reduce")
-            return
-
-        gloo = fluid.core.Gloo()
-        gloo.set_rank(0)
-        gloo.set_size(1)
-        gloo.set_prefix("123")
-        gloo.set_iface("lo")
-        gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
-        gloo.init()
-
-        role = role_maker.UserDefinedRoleMaker(
-            is_collective=False,
-            init_gloo=False,
-            current_id=0,
-            role=role_maker.Role.WORKER,
-            worker_endpoints=["127.0.0.1:6003"],
-            server_endpoints=["127.0.0.1:6001"])
-        role._node_type_comm = gloo
-        role._role_is_generated = True
-        fleet_util._set_role_maker(role)
-
-        output = fleet_util.all_reduce(1, "sum", comm_world="server")
-        print(output)
-
-    # self.assertEqual(output, 1)
-
-    def test_all_gather(self):
-        try:
-            import netifaces
-        except:
-            print("warning: no netifaces, skip test_all_gather")
-            return
-
-        gloo = fluid.core.Gloo()
-        gloo.set_rank(0)
-        gloo.set_size(1)
-        gloo.set_prefix("123")
-        gloo.set_iface("lo")
-        gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
-        gloo.init()
-
-        role = role_maker.UserDefinedRoleMaker(
-            is_collective=False,
-            init_gloo=False,
-            current_id=0,
-            role=role_maker.Role.SERVER,
-            worker_endpoints=["127.0.0.1:6003"],
-            server_endpoints=["127.0.0.1:6001"])
-        role._node_type_comm = gloo
-        role._all_comm = gloo
-        role._role_is_generated = True
-        fleet_util._set_role_maker(role)
-
-        output = fleet_util.all_gather(1, comm_world="all")
-        print(output)
-        # self.assertTrue(len(output) == 1 and output[0] == 1)
-        self.assertRaises(Exception, fleet_util.all_gather, 1, "test")
-
     def download_files(self):
         path = download(self.proto_data_url, self.module_name,
                         self.proto_data_md5)
-- 
GitLab


From d6b54de46753827c23cabe5f3307f7493db194d0 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Sun, 20 Sep 2020 13:18:26 +0800
Subject: [PATCH 143/261] =?UTF-8?q?=E3=80=90paddle.fleet=E3=80=91Fix/role?=
 =?UTF-8?q?=20maker=20api=20fix=20(#27326)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix fleet util and gloo

* fix worker endpoints

* fix

* fix UT

* fix gloo

* fix gloo

* update gloo

* update gloo

* update gloo

* update gloo

* update gloo

* fix gloo wrapper for hdfs

* add file gloo and UT

* fix UT

* fix UT

* fix UT

* hide public method of RoleMaker

* fix UT

* GPU fleetrun support gloo

* parameterserver fleetrun support gloo

* add UT

* add UT

* fix UT

* fix get server endpoint

* fix get server endpoint

* fix UT

* hide public method of rolemaker

* hide public method of rolemaker

* hide public method of rolemaker

* Update test_fleet_rolemaker_new.py

* hide public method of rolemaker

* hide public method of rolemaker
---
 .../distributed/fleet/base/fleet_base.py      |  24 +-
 .../distributed/fleet/base/role_maker.py      | 112 +++--
 .../distributed/fleet/base/util_factory.py    |   6 +-
 .../fleet/meta_optimizers/common.py           |   6 +-
 .../fleet/meta_optimizers/dgc_optimizer.py    |   4 +-
 .../graph_execution_optimizer.py              |  18 +-
 .../meta_optimizers/localsgd_optimizer.py     |  10 +-
 .../parameter_server_graph_optimizer.py       |   2 +-
 .../parameter_server_optimizer.py             |   4 +-
 .../meta_optimizers/pipeline_optimizer.py     |   8 +-
 .../fleet/runtime/parameter_server_runtime.py |  21 +-
 .../fleet/parameter_server/ir/public.py       |  30 +-
 .../fluid/tests/unittests/test_fleet_base.py  |  49 ++-
 .../tests/unittests/test_fleet_rolemaker_2.py |   2 +-
 .../unittests/test_fleet_rolemaker_new.py     | 414 ++++++++++++++++--
 15 files changed, 531 insertions(+), 179 deletions(-)

diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
index aeb8cac98e2..d00faac8385 100644
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -180,7 +180,7 @@ class Fleet(object):
                 raise ValueError(
                     "`role_maker` should be subclass of `RoleMakerBase`, but got {}".
                     format(type(role_maker)))
-        self._role_maker.generate_role()
+        self._role_maker._generate_role()
 
         self.strategy_compiler = StrategyCompiler()
         if paddle.fluid.framework.in_dygraph_mode():
@@ -207,7 +207,7 @@ class Fleet(object):
                 fleet.is_first_worker()
 
         """
-        return self._role_maker.is_first_worker()
+        return self._role_maker._is_first_worker()
 
     def worker_index(self):
         """
@@ -224,7 +224,7 @@ class Fleet(object):
                 fleet.worker_index()
 
         """
-        return self._role_maker.worker_index()
+        return self._role_maker._worker_index()
 
     def worker_num(self):
         """
@@ -241,7 +241,7 @@ class Fleet(object):
                 fleet.worker_num()
 
         """
-        return self._role_maker.worker_num()
+        return self._role_maker._worker_num()
 
     def is_worker(self):
         """
@@ -259,7 +259,7 @@ class Fleet(object):
                 fleet.is_worker()
 
         """
-        return self._role_maker.is_worker()
+        return self._role_maker._is_worker()
 
     def worker_endpoints(self, to_string=False):
         """
@@ -277,9 +277,9 @@ class Fleet(object):
 
         """
         if to_string:
-            return ",".join(self._role_maker.get_trainer_endpoints())
+            return ",".join(self._role_maker._get_trainer_endpoints())
         else:
-            return self._role_maker.get_trainer_endpoints()
+            return self._role_maker._get_trainer_endpoints()
 
     def server_num(self):
         """
@@ -294,7 +294,7 @@ class Fleet(object):
             fleet.init()
             fleet.server_num()
         """
-        return len(self._role_maker.get_pserver_endpoints())
+        return len(self._role_maker._get_pserver_endpoints())
 
     def server_index(self):
         """
@@ -311,7 +311,7 @@ class Fleet(object):
                 fleet.server_index()
 
         """
-        return self._role_maker.server_index()
+        return self._role_maker._server_index()
 
     def server_endpoints(self, to_string=False):
         """
@@ -330,9 +330,9 @@ class Fleet(object):
         """
 
         if to_string:
-            return ",".join(self._role_maker.get_pserver_endpoints())
+            return ",".join(self._role_maker._get_pserver_endpoints())
         else:
-            return self._role_maker.get_pserver_endpoints()
+            return self._role_maker._get_pserver_endpoints()
 
     def is_server(self):
         """
@@ -350,7 +350,7 @@ class Fleet(object):
                 fleet.is_server()
 
         """
-        return self._role_maker.is_server(
+        return self._role_maker._is_server(
         ) or self._role_maker._is_heter_worker()
 
     def set_util(self, util):
diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py
index d36c06047f5..81d5908ccd4 100644
--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -361,19 +361,19 @@ class RoleMakerBase(object):
         self._heter_trainer_device = "CPU"
         self._is_heter_parameter_server_mode = False
 
-    def is_worker(self):
+    def _is_worker(self):
         """
         return is_worker() of current process
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def is_server(self):
+    def _is_server(self):
         """
         return is_server() of current process
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def is_first_worker(self):
+    def _is_first_worker(self):
         """
         Check whether the node is the first instance of worker.
         Returns:
@@ -382,7 +382,7 @@ class RoleMakerBase(object):
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def worker_num(self):
+    def _worker_num(self):
         """
         Get current total worker number.
 
@@ -391,7 +391,7 @@ class RoleMakerBase(object):
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def server_num(self):
+    def _server_num(self):
         """
         Get current total server number.
 
@@ -400,7 +400,7 @@ class RoleMakerBase(object):
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def worker_index(self):
+    def _worker_index(self):
         """
         Get current worker id.
 
@@ -409,7 +409,7 @@ class RoleMakerBase(object):
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def server_index(self):
+    def _server_index(self):
         """
         Get current server id.
 
@@ -418,7 +418,7 @@ class RoleMakerBase(object):
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def role_id(self):
+    def _role_id(self):
         """
         Get current id.
 
@@ -427,7 +427,7 @@ class RoleMakerBase(object):
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def node_num(self):
+    def _node_num(self):
         """
         Get the training node number
         Returns:
@@ -435,13 +435,13 @@ class RoleMakerBase(object):
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def get_trainer_endpoints(self):
+    def _get_trainer_endpoints(self):
         """
         return trainer endpoints
         """
         return self._worker_endpoints
 
-    def get_pserver_endpoints(self):
+    def _get_pserver_endpoints(self):
         """
         return pserver endpoints
         """
@@ -543,90 +543,92 @@ class PaddleCloudRoleMaker(RoleMakerBase):
     def _all_reduce(self, input, mode="sum", comm_world="worker"):
         return self._gloo.all_reduce(input, mode, comm_world)
 
-    def is_worker(self):
+    def _is_worker(self):
         """
         whether current process is worker
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._role == Role.WORKER
 
-    def is_server(self):
+    def _is_server(self):
         """
         whether current process is server
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._role == Role.SERVER
 
-    def is_first_worker(self):
+    def _is_first_worker(self):
         """
         whether current process is worker of rank 0
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._role == Role.WORKER and self._current_id == 0
 
-    def worker_index(self):
+    def _worker_index(self):
         """
         get index of current worker
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._current_id
 
-    def server_index(self):
+    def _server_index(self):
         """
         get index of current server
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._current_id
 
-    def role_id(self):
+    def _role_id(self):
         """
         get index of current node
         """
+        if not self._role_is_generated:
+            self._generate_role()
         return self._current_id
 
-    def worker_num(self):
+    def _worker_num(self):
         """
         retrun the current number of worker
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._trainers_num
 
-    def server_num(self):
+    def _server_num(self):
         """
         return the current number of server
         """
         if not self._role_is_generated:
-            self.generate_role()
-        return len(self.get_pserver_endpoints())
+            self._generate_role()
+        return len(self._get_pserver_endpoints())
 
-    def node_num(self):
+    def _node_num(self):
         """
         return the training node number
         """
         if not self._role_is_generated:
-            self.generate_role()
-        return self._node_num
+            self._generate_role()
+        return self._nodes_num
 
-    def get_trainer_endpoints(self):
+    def _get_trainer_endpoints(self):
         """
         get endpoint of all trainers
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._worker_endpoints
 
-    def get_pserver_endpoints(self):
+    def _get_pserver_endpoints(self):
         """
         get endpoint of all pservers
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._server_endpoints
 
     def _is_non_distributed(self):
@@ -635,7 +637,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         (use python-run to launch fleet-code directly)
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._non_distributed
 
     def _heter_worker_num(self):
@@ -643,7 +645,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         get heter worker nums
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._heter_trainers_num
 
     def _is_heter_worker(self):
@@ -651,25 +653,9 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         whether current process is heter worker
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._role == Role.HETER_WORKER
 
-    def _get_rank(self):
-        """
-        get current rank in all workers and pservers
-        """
-        if not self._role_is_generated:
-            self.generate_role()
-        return self._rank
-
-    def _get_size(self):
-        """
-        get total num of all workers and pservers
-        """
-        if not self._role_is_generated:
-            self.generate_role()
-        return self._size
-
     def _ps_env(self):
         try:
             # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
@@ -682,7 +668,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
                 self._trainers_num = 1
                 self._role = Role.WORKER
                 self._current_id = 0
-                self._node_num = 1
+                self._nodes_num = 1
                 self._heter_trainers_num = 0
                 self._heter_trainer_endpoints = None
                 self._non_distributed = True
@@ -757,7 +743,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         self._trainers_num = trainers_num
         self._role = role
         self._current_id = current_id
-        self._node_num = len(
+        self._nodes_num = len(
             set([x.split(':')[0] for x in self._worker_endpoints]))
         self._heter_trainers_num = heter_trainers_num
         self._heter_trainer_endpoints = heter_trainer_eplist
@@ -776,7 +762,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
             self._non_distributed = True
         self._worker_endpoints = self._worker_endpoints.split(",")
         self._trainers_num = len(self._worker_endpoints)
-        self._node_num = len(
+        self._nodes_num = len(
             set([x.split(':')[0] for x in self._worker_endpoints]))
 
     def _gloo_init(self):
@@ -832,13 +818,13 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         self._gloo.init(
             rendezvous=rendezvous_type,
             role=self._role,
-            role_id=self.role_id(),
-            worker_num=self.worker_num(),
-            server_num=self.server_num(),
+            role_id=self._role_id(),
+            worker_num=self._worker_num(),
+            server_num=self._server_num(),
             need_init_all=need_init_all,
             kwargs=kwargs)
 
-    def generate_role(self):
+    def _generate_role(self):
         """
         generate role for role maker
         """
@@ -874,7 +860,7 @@ class UserDefinedRoleMaker(PaddleCloudRoleMaker):
             self._cur_endpoint = self._worker_endpoints[self._current_id]
         elif self._role == Role.SERVER:
             self._cur_endpoint = self._server_endpoints[self._current_id]
-        self._node_num = len(
+        self._nodes_num = len(
             set([x.split(':')[0] for x in self._worker_endpoints]))
 
     def _user_defined_collective_env(self):
@@ -882,10 +868,10 @@ class UserDefinedRoleMaker(PaddleCloudRoleMaker):
         self._current_id = self._kwargs.get("current_id")
         self._trainers_num = len(self._worker_endpoints)
         self._training_role = Role.WORKER
-        self._node_num = len(
+        self._nodes_num = len(
             set([x.split(':')[0] for x in self._worker_endpoints]))
 
-    def generate_role(self):
+    def _generate_role(self):
         """
         generate role for role maker
         """
diff --git a/python/paddle/distributed/fleet/base/util_factory.py b/python/paddle/distributed/fleet/base/util_factory.py
index e822c3c92f4..efaa854c087 100644
--- a/python/paddle/distributed/fleet/base/util_factory.py
+++ b/python/paddle/distributed/fleet/base/util_factory.py
@@ -237,8 +237,8 @@ class UtilBase(object):
         if not isinstance(files, list):
             raise TypeError("files should be a list of file need to be read.")
 
-        trainer_id = self.role_maker.worker_index()
-        trainers = self.role_maker.worker_num()
+        trainer_id = self.role_maker._worker_index()
+        trainers = self.role_maker._worker_num()
 
         remainder = len(files) % trainers
         blocksize = int(len(files) / trainers)
@@ -280,7 +280,7 @@ class UtilBase(object):
                 fleet_util._set_role_maker(role)
                 fleet_util.print_on_rank("I'm worker 0", 0)
         """
-        if self.role_maker.worker_index() != rank_id:
+        if self.role_maker._worker_index() != rank_id:
             return
         print(message)
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py
index 70b010978bb..8ff4114bf8e 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/common.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/common.py
@@ -57,12 +57,12 @@ class CollectiveHelper(object):
         if startup_program is None:
             self.startup_program = fluid.default_startup_program()
 
-        endpoints = self.role_maker.get_trainer_endpoints()
-        current_endpoint = endpoints[self.role_maker.worker_index()]
+        endpoints = self.role_maker._get_trainer_endpoints()
+        current_endpoint = endpoints[self.role_maker._worker_index()]
         for ring_id in range(self.nrings):
             self._init_communicator(
                 self.startup_program, current_endpoint, endpoints,
-                self.role_maker.worker_index(), ring_id, self.wait_port)
+                self.role_maker._worker_index(), ring_id, self.wait_port)
         self._broadcast_params()
 
     def _init_communicator(self, program, current_endpoint, endpoints, rank,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
index 3f6ed1ed2f2..6806a479d30 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
@@ -47,7 +47,7 @@ class DGCOptimizer(MetaOptimizerBase):
             sparsity=configs['sparsity'],
             parameter_list=opt._parameter_list,
             use_nesterov=opt._use_nesterov,
-            num_trainers=self.role_maker.worker_num(),
+            num_trainers=self.role_maker._worker_num(),
             regularization=opt.regularization,
             grad_clip=opt._grad_clip,
             name=opt._name)
@@ -60,7 +60,7 @@ class DGCOptimizer(MetaOptimizerBase):
             if not isinstance(self.inner_opt, Momentum):
                 logging.warn("dgc only works on Momentum optimizer")
                 return False
-            if self.role_maker.worker_num() <= 1:
+            if self.role_maker._worker_num() <= 1:
                 logging.warn("dgc only works on multi cards")
                 return False
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
index 6c1cc3d7a97..0ad9e5680ea 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
@@ -50,12 +50,12 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
 
     # should fix the variable
     def _setup_nccl_op(self, startup_program, main_program, build_strategy):
-        trainer_endpoints = self.role_maker.get_trainer_endpoints()
+        trainer_endpoints = self.role_maker._get_trainer_endpoints()
         trainers = trainer_endpoints
-        trainer_id = self.role_maker.worker_index()
-        current_endpoint = self.role_maker.get_trainer_endpoints()[trainer_id]
+        trainer_id = self.role_maker._worker_index()
+        current_endpoint = self.role_maker._get_trainer_endpoints()[trainer_id]
         trainer_endpoints_env = ",".join(trainer_endpoints)
-        trainers_num = self.role_maker.worker_num()
+        trainers_num = self.role_maker._worker_num()
         nccl_id_var = startup_program.global_block().create_var(
             name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
         for i in range(1, build_strategy.nccl_comm_num):
@@ -127,8 +127,8 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
             local_build_strategy.enable_sequential_execution = True
 
         exe_strategy = self.user_defined_strategy.execution_strategy
-        worker_num = self.role_maker.worker_num()
-        node_num = self.role_maker.node_num()
+        worker_num = self.role_maker._worker_num()
+        node_num = self.role_maker._node_num()
 
         if self.role_maker._is_collective:
             assert worker_num >= 1, "nccl2 worker_num must >= 1, now:{}" % worker_num
@@ -170,9 +170,9 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
         # TODO(guru4elephant): should be an independent optimizer
         self._setup_nccl_op(startup_program, main_program, local_build_strategy)
 
-        local_build_strategy.num_trainers = self.role_maker.worker_num()
-        local_build_strategy.trainer_id = self.role_maker.worker_index()
-        local_build_strategy.trainers_endpoints = self.role_maker.get_trainer_endpoints(
+        local_build_strategy.num_trainers = self.role_maker._worker_num()
+        local_build_strategy.trainer_id = self.role_maker._worker_index()
+        local_build_strategy.trainers_endpoints = self.role_maker._get_trainer_endpoints(
         )
         local_build_strategy.enable_backward_optimizer_op_deps = True
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
index 4ebac20888d..9f094978d84 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
@@ -38,7 +38,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
         if not self.user_defined_strategy.localsgd:
             return False
 
-        if self.role_maker.worker_num() <= 1:
+        if self.role_maker._worker_num() <= 1:
             return False
 
         return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \
@@ -168,7 +168,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
                         inputs={'X': [param]},
                         outputs={'Out': [param]},
                         attrs={
-                            'scale': 1.0 / self.role_maker.worker_num(),
+                            'scale': 1.0 / self.role_maker._worker_num(),
                             OP_ROLE_KEY: OpRole.Optimize
                         })
                     sub_block.append_op(
@@ -208,7 +208,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
         if not self.user_defined_strategy.adaptive_localsgd:
             return False
 
-        if self.role_maker.worker_num() <= 1:
+        if self.role_maker._worker_num() <= 1:
             return False
 
         return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \
@@ -275,7 +275,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
             inputs={'X': [avg_loss]},
             outputs={'Out': [avg_loss]},
             attrs={
-                'scale': 1.0 / self.role_maker.worker_num(),
+                'scale': 1.0 / self.role_maker._worker_num(),
                 OP_ROLE_KEY: OpRole.Optimize
             })
 
@@ -398,7 +398,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
                         inputs={'X': [param]},
                         outputs={'Out': [param]},
                         attrs={
-                            'scale': 1.0 / self.role_maker.worker_num(),
+                            'scale': 1.0 / self.role_maker._worker_num(),
                             OP_ROLE_KEY: OpRole.Optimize
                         })
                     sub_block.append_op(
diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py
index 7dc532c86ea..dfa765364f3 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py
@@ -31,7 +31,7 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer):
         if k_steps < 0:
             return False
 
-        if self.role_maker.is_server():
+        if self.role_maker._is_server():
             return False
 
         if self.role_maker._is_heter_parameter_server_mode:
diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
index 51d4d343165..38ad41f8836 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
@@ -239,10 +239,10 @@ class ParameterServerOptimizer(MetaOptimizerBase):
                                                      strategy, self.role_maker)
         compiled_config.strategy = strategy
 
-        if self.role_maker.is_worker() or self.role_maker._is_heter_worker():
+        if self.role_maker._is_worker() or self.role_maker._is_heter_worker():
             main_program, startup_program = self._build_trainer_programs(
                 compiled_config)
-        elif self.role_maker.is_server():
+        elif self.role_maker._is_server():
             main_program, startup_program = self._build_pserver_programs(
                 compiled_config)
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
index 87fa7077911..889fec838ed 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
@@ -126,11 +126,11 @@ class PipelineOptimizer(MetaOptimizerBase):
         optimize_ops, params_grads, prog_list = \
             self.wrapped_opt.minimize(loss, startup_program,
                                       parameter_list, no_grad_set)
-        if self.role_maker.worker_num() == 1:
+        if self.role_maker._worker_num() == 1:
             return optimize_ops, params_grads
 
-        endpoints = self.role_maker.get_trainer_endpoints()
-        current_endpoint = endpoints[self.role_maker.worker_index()]
+        endpoints = self.role_maker._get_trainer_endpoints()
+        current_endpoint = endpoints[self.role_maker._worker_index()]
         self.startup_program = startup_program
         if startup_program is None:
             self.startup_program = fluid.default_startup_program()
@@ -142,7 +142,7 @@ class PipelineOptimizer(MetaOptimizerBase):
         self.nranks = nranks
         self.nrings = len(self.main_program_list)
 
-        self.rank = self.role_maker.worker_index()
+        self.rank = self.role_maker._worker_index()
         self.endpoints = endpoints
         self.current_endpoint = current_endpoint
 
diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
index 227f8f60210..ae5c53b8a37 100644
--- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
+++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
@@ -104,9 +104,9 @@ class ParameterServerRuntime(RuntimeBase):
     def _init_worker(self):
         def sync_strategy_envs():
             kwargs = {}
-            kwargs["pserver_endpoints"] = self.role_maker.get_pserver_endpoints(
-            )
-            kwargs["trainer_id"] = self.role_maker.worker_index()
+            kwargs[
+                "pserver_endpoints"] = self.role_maker._get_pserver_endpoints()
+            kwargs["trainer_id"] = self.role_maker._worker_index()
             return kwargs
 
         def geo_strategy_envs():
@@ -150,7 +150,7 @@ class ParameterServerRuntime(RuntimeBase):
                 return "#".join(init_attrs)
 
             kwargs = {}
-            kwargs["trainers"] = self.role_maker.worker_num()
+            kwargs["trainers"] = self.role_maker._worker_num()
             kwargs["sparse_attrs"] = get_sparse_attrs()
             return kwargs
 
@@ -338,7 +338,7 @@ class ParameterServerRuntime(RuntimeBase):
                 block.append_op(
                     type='recv_save',
                     attrs={
-                        "trainer_id": self.role_maker.worker_index(),
+                        "trainer_id": self.role_maker._worker_index(),
                         "shape": var.shape,
                         "slice_shapes":
                         [",".join([str(i) for i in var.shape])],
@@ -378,14 +378,15 @@ class ParameterServerRuntime(RuntimeBase):
             block.append_op(
                 type='recv_save',
                 attrs={
-                    "trainer_id": self.role_maker.worker_index(),
+                    "trainer_id": self.role_maker._worker_index(),
                     "shape": var.shape,
                     "slice_shapes": slice_shapes,
                     "slice_varnames": var_ctx.split_varnames(),
                     "remote_varnames": var_ctx.split_varnames(),
                     "is_sparse": True,
                     "endpoints": var_ctx.split_endpoints(),
-                    "pserver_num": len(self.role_maker.get_pserver_endpoints()),
+                    "pserver_num":
+                    len(self.role_maker._get_pserver_endpoints()),
                     "file_path": os.path.join(dirname, var.name)
                 })
 
@@ -403,7 +404,7 @@ class ParameterServerRuntime(RuntimeBase):
                 block.append_op(
                     type='recv_save',
                     attrs={
-                        "trainer_id": self.role_maker.worker_index(),
+                        "trainer_id": self.role_maker._worker_index(),
                         "shape": var.shape,
                         "slice_shapes": slice_shapes,
                         "slice_varnames": slice_varnames,
@@ -411,7 +412,7 @@ class ParameterServerRuntime(RuntimeBase):
                         "is_sparse": True,
                         "endpoints": var_ctx.split_endpoints(),
                         "pserver_num":
-                        len(self.role_maker.get_pserver_endpoints()),
+                        len(self.role_maker._get_pserver_endpoints()),
                         "file_path": os.path.join(dirname, var.name)
                     })
 
@@ -422,7 +423,7 @@ class ParameterServerRuntime(RuntimeBase):
                 block.append_op(
                     type='recv_save',
                     attrs={
-                        "trainer_id": self.role_maker.worker_index(),
+                        "trainer_id": self.role_maker._worker_index(),
                         "shape": var.shape,
                         "slice_shapes":
                         [",".join([str(i) for i in var.shape])],
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
index 216478479a7..e348c67ae04 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
@@ -170,22 +170,40 @@ class CompileTimeStrategy(object):
         return trainer.mode == DistributedMode.ASYNC
 
     def get_role_id(self):
-        return self.role_maker.role_id()
+        try:
+            return self.role_maker._role_id()
+        except Exception:
+            return self.role_maker.role_id()
 
     def get_trainers(self):
-        return self.role_maker.worker_num()
+        try:
+            return self.role_maker._worker_num()
+        except Exception:
+            return self.role_maker.worker_num()
 
     def get_ps_endpoint(self):
-        return self.role_maker.get_pserver_endpoints()[self.get_role_id()]
+        try:
+            return self.role_maker._get_pserver_endpoints()[self.get_role_id()]
+        except Exception:
+            return self.role_maker.get_pserver_endpoints()[self.get_role_id()]
 
     def get_ps_endpoints(self):
-        return self.role_maker.get_pserver_endpoints()
+        try:
+            return self.role_maker._get_pserver_endpoints()
+        except Exception:
+            return self.role_maker.get_pserver_endpoints()
 
     def get_heter_worker_endpoints(self):
-        return self.role_maker._get_heter_worker_endpoints()
+        try:
+            return self.role_maker._get_heter_worker_endpoints()
+        except Exception:
+            return self.role_maker.get_heter_worker_endpoints()
 
     def get_heter_worker_endpoint(self):
-        return self.role_maker._get_heter_worker_endpoint()
+        try:
+            return self.role_maker._get_heter_worker_endpoint()
+        except Exception:
+            return self.role_maker.get_heter_worker_endpoint()
 
     def get_origin_programs(self):
         return self.origin_main_program, self.origin_startup_program
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py
index 3a90b363f27..45597e7253c 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py
@@ -24,10 +24,10 @@ import numpy as np
 class TestFleetBase(unittest.TestCase):
     def setUp(self):
         os.environ["POD_IP"] = "127.0.0.1"
-        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36000"
         os.environ["PADDLE_TRAINERS_NUM"] = "2"
         os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
-            "127.0.0.1:36001,127.0.0.2:36001"
+            "127.0.0.1:36001,127.0.0.2:36002"
 
     def test_init(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
@@ -58,32 +58,51 @@ class TestFleetBase(unittest.TestCase):
     def test_worker_endpoints(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        print(fleet.worker_endpoints(to_string=True))
+        self.assertEqual(
+            "127.0.0.1:36000", fleet.worker_endpoints(to_string=True))
+        self.assertEqual(["127.0.0.1:36000"], fleet.worker_endpoints())
 
     def test_server_num(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+
+        role = role_maker.PaddleCloudRoleMaker()
         fleet.init(role)
-        if fleet.is_server():
-            print("fleet server num: {}".format(fleet.server_num()))
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        self.assertEqual(2, fleet.server_num())
 
     def test_server_index(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+
+        role = role_maker.PaddleCloudRoleMaker()
         fleet.init(role)
-        if fleet.is_server():
-            print("fleet server index: {}".format(fleet.server_index()))
+        self.assertEqual(0, fleet.server_index())
 
     def test_server_endpoints(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+
+        role = role_maker.PaddleCloudRoleMaker()
         fleet.init(role)
         if fleet.is_server():
-            print("fleet server index: {}".format(
-                fleet.server_endpoints(to_string=True)))
+            self.assertEqual(
+                "127.0.0.1:36001,127.0.0.2:36002",
+                fleet.server_endpoints(to_string=True))
+            self.assertEqual(["127.0.0.1:36001", "127.0.0.2:36002"],
+                             fleet.server_endpoints())
 
     def test_is_server(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+
+        role = role_maker.PaddleCloudRoleMaker()
         fleet.init(role)
-        if fleet.is_server():
-            print("test fleet is server")
+        self.assertTrue(fleet.is_server())
 
     def test_util(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
index a831f6e838e..dae79071616 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
@@ -87,7 +87,7 @@ class TestCloudRoleMaker2(unittest.TestCase):
         role2._all_gather(1)
         role2._all_gather(1)
         role2._barrier_server()
-        role2.all_gather(1)
+        role2._all_gather(1)
         role3 = GeneralRoleMaker(path="./test_gloo_3")
         role3._worker_gather(1)
         role3._worker_gather(1)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
index d786fa1eba8..4dd254af251 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
@@ -30,19 +30,19 @@ class TestRoleMakerBase(unittest.TestCase):
 
     def test_rolemaker_base(self):
         role = role_maker.RoleMakerBase()
-        self.assertRaises(Exception, role.is_worker)
-        self.assertRaises(Exception, role.is_server)
-        self.assertRaises(Exception, role.is_first_worker)
-        self.assertRaises(Exception, role.worker_num)
-        self.assertRaises(Exception, role.server_num)
-        self.assertRaises(Exception, role.worker_index)
-        self.assertRaises(Exception, role.server_index)
-        self.assertRaises(Exception, role.role_id)
-        self.assertRaises(Exception, role.node_num)
-
-        trainer_endpoints = role.get_trainer_endpoints()
+        self.assertRaises(Exception, role._is_worker)
+        self.assertRaises(Exception, role._is_server)
+        self.assertRaises(Exception, role._is_first_worker)
+        self.assertRaises(Exception, role._worker_num)
+        self.assertRaises(Exception, role._server_num)
+        self.assertRaises(Exception, role._worker_index)
+        self.assertRaises(Exception, role._server_index)
+        self.assertRaises(Exception, role._role_id)
+        self.assertRaises(Exception, role._node_num)
+
+        trainer_endpoints = role._get_trainer_endpoints()
         self.assertTrue(len(trainer_endpoints) == 0)
-        pserver_endpoints = role.get_pserver_endpoints()
+        pserver_endpoints = role._get_pserver_endpoints()
         self.assertTrue(len(pserver_endpoints) == 0)
 
         print(role.to_string())
@@ -77,20 +77,32 @@ class TestCloudRoleMaker(unittest.TestCase):
             return
 
         ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
-
-        self.assertTrue(ro.is_worker())
-        self.assertFalse(ro.is_server())
-        self.assertEqual(ro.worker_num(), 2)
-        self.assertTrue(ro.is_first_worker())
-        worker_endpoints = ro.get_trainer_endpoints()
+        self.assertTrue(ro._is_worker())
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertFalse(ro._is_server())
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertEqual(ro._worker_num(), 2)
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertTrue(ro._is_first_worker())
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        worker_endpoints = ro._get_trainer_endpoints()
         self.assertEqual(worker_endpoints[0], '127.0.0.1:36001')
-        self.assertEqual(ro.role_id(), 0)
-        self.assertEqual(ro.node_num(), 2)
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertEqual(ro._role_id(), 0)
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertEqual(ro._node_num(), 2)
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertFalse(ro._is_non_distributed())
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertEqual(ro._heter_worker_num(), 0)
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertFalse(ro._is_heter_worker())
 
     def test_tr_rolemaker_collective(self):
         ro = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        self.assertEqual(ro.worker_num(), 2)
-        self.assertEqual(ro.node_num(), 2)
+        self.assertEqual(ro._worker_num(), 2)
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        self.assertEqual(ro._node_num(), 2)
 
     def test_ps_rolemaker(self):
         """Test ps rolemaker."""
@@ -106,11 +118,11 @@ class TestCloudRoleMaker(unittest.TestCase):
 
         ro = role_maker.PaddleCloudRoleMaker(
             is_collective=False, init_gloo=False)
-        self.assertEqual(ro.server_index(), 0)
-        self.assertFalse(ro.is_worker())
-        self.assertTrue(ro.is_server())
-        self.assertEqual(ro.server_num(), 2)
-        pserver_endpoints = ro.get_pserver_endpoints()
+        self.assertEqual(ro._server_index(), 0)
+        self.assertFalse(ro._is_worker())
+        self.assertTrue(ro._is_server())
+        self.assertEqual(ro._server_num(), 2)
+        pserver_endpoints = ro._get_pserver_endpoints()
         self.assertEqual(pserver_endpoints[0], '127.0.0.1:36001')
 
         self.assertEqual(ro._all_gather(1, "worker"), 1)
@@ -126,7 +138,7 @@ class TestCloudRoleMaker(unittest.TestCase):
             return
 
         ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
-        self.assertRaises(ValueError, ro.generate_role)
+        self.assertRaises(ValueError, ro._generate_role)
 
 
 class TestUserDefinedRoleMaker(unittest.TestCase):
@@ -151,10 +163,10 @@ class TestUserDefinedRoleMaker(unittest.TestCase):
             role=role_maker.Role.SERVER,
             current_id=0,
             worker_num=2)
-        self.assertEqual(ro.server_num(), 2)
-        ro.generate_role()
-        self.assertTrue(ro.is_server())
-        self.assertEqual(ro.role_id(), 0)
+        self.assertEqual(ro._server_num(), 2)
+        ro._generate_role()
+        self.assertTrue(ro._is_server())
+        self.assertEqual(ro._role_id(), 0)
 
     def test_tr_rolemaker(self):
         try:
@@ -171,9 +183,9 @@ class TestUserDefinedRoleMaker(unittest.TestCase):
             current_id=0,
             worker_num=2)
 
-        self.assertIn("127.0.0.1:36001", ro.get_pserver_endpoints())
-        self.assertTrue(ro.is_worker())
-        self.assertEqual(ro.role_id(), 0)
+        self.assertIn("127.0.0.1:36001", ro._get_pserver_endpoints())
+        self.assertTrue(ro._is_worker())
+        self.assertEqual(ro._role_id(), 0)
 
 
 class TestGlooWithCloudRoleMaker(unittest.TestCase):
@@ -216,7 +228,7 @@ class TestGlooWithCloudRoleMaker(unittest.TestCase):
         os.environ["PADDLE_GLOO_FS_PATH"] = tmp
 
         role = role_maker.PaddleCloudRoleMaker()
-        role.generate_role()
+        role._generate_role()
         self.case(role, "worker")
         self.clean(tmp)
 
@@ -234,7 +246,7 @@ class TestGlooWithCloudRoleMaker(unittest.TestCase):
         os.environ["PADDLE_GLOO_FS_PATH"] = tmp
 
         role = role_maker.PaddleCloudRoleMaker()
-        role.generate_role()
+        role._generate_role()
         self.case(role, "worker")
         self.clean(tmp)
 
@@ -256,7 +268,7 @@ class TestGlooWithCloudRoleMaker(unittest.TestCase):
         os.environ["PADDLE_GLOO_FS_PATH"] = tmp
 
         role = role_maker.PaddleCloudRoleMaker()
-        role.generate_role()
+        role._generate_role()
         self.case(role, "server")
         self.clean(tmp)
 
@@ -280,7 +292,7 @@ class TestGlooWithCloudRoleMaker(unittest.TestCase):
         os.environ["PADDLE_GLOO_FS_PATH"] = tmp
 
         role = role_maker.PaddleCloudRoleMaker()
-        role.generate_role()
+        role._generate_role()
         self.case(role, "server")
         self.clean(tmp)
 
@@ -302,7 +314,7 @@ class TestGlooWithCloudRoleMaker(unittest.TestCase):
         os.environ["PADDLE_GLOO_HTTP_PORT"] = "30019"
 
         role = role_maker.PaddleCloudRoleMaker()
-        role.generate_role()
+        role._generate_role()
         import time
         time.sleep(3)
 
@@ -326,7 +338,7 @@ class TestGlooWithCloudRoleMaker(unittest.TestCase):
         os.environ["PADDLE_GLOO_FS_PATH"] = tmp
 
         role = role_maker.PaddleCloudRoleMaker()
-        role.generate_role()
+        role._generate_role()
         self.case(role, "server")
         self.case(role, "all")
         self.clean(tmp)
@@ -354,7 +366,7 @@ class TestGlooWithCloudRoleMaker(unittest.TestCase):
         os.environ["PADDLE_GLOO_FS_PATH"] = tmp
 
         role = role_maker.PaddleCloudRoleMaker()
-        role.generate_role()
+        role._generate_role()
         self.case(role, "server")
         self.case(role, "all")
         self.clean(tmp)
@@ -377,7 +389,323 @@ class TestGlooWithCloudRoleMaker(unittest.TestCase):
         os.environ["PADDLE_GLOO_RENDEZVOUS"] = "5"
 
         role = role_maker.PaddleCloudRoleMaker()
-        self.assertRaises(ValueError, role.generate_role)
+        self.assertRaises(ValueError, role._generate_role)
+
+    def test_fs_gloo8(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+
+        os.environ["PADDLE_WITH_GLOO"] = "2"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        def net():
+            x = paddle.fluid.layers.data(name='x', shape=[13], dtype='float32')
+            y_predict = paddle.fluid.layers.fc(input=x, size=1, act=None)
+            y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
+            cost = paddle.fluid.layers.square_error_cost(
+                input=y_predict, label=y)
+            avg_cost = paddle.fluid.layers.mean(cost)
+            return avg_cost
+
+        from paddle.distributed import fleet
+
+        role = role_maker.PaddleCloudRoleMaker()
+        fleet.init(role)
+        avg_cost = net()
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = False
+
+        optimizer = paddle.optimizer.SGD(0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy)
+        optimizer.minimize(avg_cost)
+
+        comm_world = "server"
+        fleet.util().barrier(comm_world)
+
+        gather = fleet.util().all_gather(1, comm_world)
+        self.assertEqual(gather[0], 1)
+
+        all_reduce = fleet.util().all_reduce(1, "sum", comm_world)
+        self.assertEqual(1, all_reduce)
+
+        self.clean(tmp)
+
+
+class TestGlooWithCloudRoleMaker(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_TRAINERS_NUM"] = "1"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+
+    def case(self, role, comm_world):
+        role._barrier(comm_world)
+
+        gather = role._all_gather(1, comm_world)
+        self.assertEqual(gather[0], 1)
+
+        all_reduce = role._all_reduce(1, "sum", comm_world)
+        self.assertEqual(1, all_reduce)
+
+    def mkdir(self):
+        tmp = tempfile.mkdtemp()
+        return tmp
+
+    def clean(self, tmp):
+        shutil.rmtree(tmp)
+
+    def test_hdfs_gloo(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        self.case(role, "worker")
+        self.clean(tmp)
+
+    def test_fs_gloo(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        self.case(role, "worker")
+        self.clean(tmp)
+
+    def test_fs_gloo2(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        self.case(role, "server")
+        self.clean(tmp)
+
+    def test_fs_gloo3(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        self.case(role, "server")
+        self.clean(tmp)
+
+    def test_fs_gloo4(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "3"
+        os.environ["PADDLE_GLOO_HTTP_HOST"] = "127.0.0.1"
+        os.environ["PADDLE_GLOO_HTTP_PORT"] = "30019"
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        import time
+        time.sleep(3)
+
+    def test_fs_gloo5(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "2"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        self.case(role, "server")
+        self.case(role, "all")
+        self.clean(tmp)
+
+    def test_fs_gloo6(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+
+        os.environ["PADDLE_WITH_GLOO"] = "2"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        self.case(role, "server")
+        self.case(role, "all")
+        self.clean(tmp)
+
+    def test_fs_gloo7(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "5"
+
+        role = role_maker.PaddleCloudRoleMaker()
+        self.assertRaises(ValueError, role._generate_role)
+
+    def test_hdfs_gloo_v2(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = ""
+        os.environ["PADDLE_GLOO_FS_UGI"] = ""
+        os.environ["PADDLE_GLOO_FS_PATH"] = ""
+
+        role = role_maker.PaddleCloudRoleMaker()
+        self.assertRaises(ValueError, role._generate_role)
+
+    def test_fs_gloo_v2(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
+        os.environ["PADDLE_GLOO_FS_PATH"] = ""
+
+        role = role_maker.PaddleCloudRoleMaker()
+        self.assertRaises(ValueError, role._generate_role)
+
+    def test_http_gloo_v2(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "3"
+        os.environ["PADDLE_GLOO_HTTP_HOST"] = ""
+        os.environ["PADDLE_GLOO_HTTP_PORT"] = ""
+
+        role = role_maker.PaddleCloudRoleMaker()
+        self.assertRaises(ValueError, role._generate_role)
 
     def test_fs_gloo8(self):
         plats = platform.platform()
-- 
GitLab


From f936adbd2d9e2a34dd4797ef1769e2c38e8cfae2 Mon Sep 17 00:00:00 2001
From: MRXLT <xlt2024@gmail.com>
Date: Mon, 21 Sep 2020 11:16:34 +0800
Subject: [PATCH 144/261] fix adam (#27343)

* fix adam

* rmsprop support double
---
 paddle/fluid/operators/optimizers/rmsprop_op.cc |  3 ++-
 paddle/fluid/operators/optimizers/rmsprop_op.cu |  3 ++-
 python/paddle/optimizer/adam.py                 | 11 +++++------
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cc b/paddle/fluid/operators/optimizers/rmsprop_op.cc
index 99d1156ee6d..eeee008cdc5 100644
--- a/paddle/fluid/operators/optimizers/rmsprop_op.cc
+++ b/paddle/fluid/operators/optimizers/rmsprop_op.cc
@@ -143,4 +143,5 @@ http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    rmsprop, ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, float>);
+    rmsprop, ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cu b/paddle/fluid/operators/optimizers/rmsprop_op.cu
index 8b17d6a0204..bf11ee68675 100644
--- a/paddle/fluid/operators/optimizers/rmsprop_op.cu
+++ b/paddle/fluid/operators/optimizers/rmsprop_op.cu
@@ -15,4 +15,5 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    rmsprop, ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, float>);
+    rmsprop, ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index 708aaa788f6..24cebf8e6e6 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -282,14 +282,13 @@ class Adam(Optimizer):
         for param in self._parameter_list:
             if not param.trainable:
                 continue
-            if hasattr(
-                    param, "_is_sparse"
-            ) and param._is_sparse and self.regularization is not None:
-                raise RuntimeError(
-                    "Adam don't support weight_decay with sparse parameters, please set it to None."
-                )
             if param._grad_ivar() is not None:
                 grad_var = param._grad_ivar()
+                if hasattr(grad_var, "_is_sparse") and grad_var._is_sparse(
+                ) and self.regularization is not None:
+                    raise RuntimeError(
+                        "Adam don't support weight_decay with sparse parameters, please set it to None."
+                    )
                 params_grads.append((param, grad_var))
 
         optimize_ops = self._apply_optimize(
-- 
GitLab


From f3b4a64addfdae3f8c8f56ac919dc1e1ed1be229 Mon Sep 17 00:00:00 2001
From: Kaipeng Deng <dengkaipeng@baidu.com>
Date: Mon, 21 Sep 2020 11:23:46 +0800
Subject: [PATCH 145/261] fix CIFAR MNIST UCIHousing dataset. test=develop
 (#27368)

* fix CIFAR & MNIST dataset. test=develop
---
 python/paddle/tests/test_dataset_cifar.py  | 24 ++++++++++++++--------
 python/paddle/tests/test_datasets.py       |  6 ++++--
 python/paddle/text/datasets/uci_housing.py |  6 +++++-
 python/paddle/vision/datasets/cifar.py     |  1 +
 python/paddle/vision/datasets/mnist.py     | 11 +---------
 5 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/python/paddle/tests/test_dataset_cifar.py b/python/paddle/tests/test_dataset_cifar.py
index 2ecc41c3f0a..672de7ae8e9 100644
--- a/python/paddle/tests/test_dataset_cifar.py
+++ b/python/paddle/tests/test_dataset_cifar.py
@@ -27,8 +27,10 @@ class TestCifar10Train(unittest.TestCase):
         # long time, randomly check 1 sample
         idx = np.random.randint(0, 50000)
         data, label = cifar[idx]
-        self.assertTrue(len(data.shape) == 1)
-        self.assertTrue(data.shape[0] == 3072)
+        self.assertTrue(len(data.shape) == 3)
+        self.assertTrue(data.shape[0] == 3)
+        self.assertTrue(data.shape[1] == 32)
+        self.assertTrue(data.shape[2] == 32)
         self.assertTrue(0 <= int(label) <= 9)
 
 
@@ -41,8 +43,10 @@ class TestCifar10Test(unittest.TestCase):
         # long time, randomly check 1 sample
         idx = np.random.randint(0, 10000)
         data, label = cifar[idx]
-        self.assertTrue(len(data.shape) == 1)
-        self.assertTrue(data.shape[0] == 3072)
+        self.assertTrue(len(data.shape) == 3)
+        self.assertTrue(data.shape[0] == 3)
+        self.assertTrue(data.shape[1] == 32)
+        self.assertTrue(data.shape[2] == 32)
         self.assertTrue(0 <= int(label) <= 9)
 
 
@@ -55,8 +59,10 @@ class TestCifar100Train(unittest.TestCase):
         # long time, randomly check 1 sample
         idx = np.random.randint(0, 50000)
         data, label = cifar[idx]
-        self.assertTrue(len(data.shape) == 1)
-        self.assertTrue(data.shape[0] == 3072)
+        self.assertTrue(len(data.shape) == 3)
+        self.assertTrue(data.shape[0] == 3)
+        self.assertTrue(data.shape[1] == 32)
+        self.assertTrue(data.shape[2] == 32)
         self.assertTrue(0 <= int(label) <= 99)
 
 
@@ -69,8 +75,10 @@ class TestCifar100Test(unittest.TestCase):
         # long time, randomly check 1 sample
         idx = np.random.randint(0, 10000)
         data, label = cifar[idx]
-        self.assertTrue(len(data.shape) == 1)
-        self.assertTrue(data.shape[0] == 3072)
+        self.assertTrue(len(data.shape) == 3)
+        self.assertTrue(data.shape[0] == 3)
+        self.assertTrue(data.shape[1] == 32)
+        self.assertTrue(data.shape[2] == 32)
         self.assertTrue(0 <= int(label) <= 99)
 
 
diff --git a/python/paddle/tests/test_datasets.py b/python/paddle/tests/test_datasets.py
index 1e50ff60aa5..1e0d6dbacf6 100644
--- a/python/paddle/tests/test_datasets.py
+++ b/python/paddle/tests/test_datasets.py
@@ -103,12 +103,14 @@ class TestMNISTTest(unittest.TestCase):
 
 class TestMNISTTrain(unittest.TestCase):
     def test_main(self):
-        mnist = MNIST(mode='train', chw_format=False)
+        mnist = MNIST(mode='train')
         self.assertTrue(len(mnist) == 60000)
 
         for i in range(len(mnist)):
             image, label = mnist[i]
-            self.assertTrue(image.shape[0] == 784)
+            self.assertTrue(image.shape[0] == 1)
+            self.assertTrue(image.shape[1] == 28)
+            self.assertTrue(image.shape[2] == 28)
             self.assertTrue(label.shape[0] == 1)
             self.assertTrue(0 <= int(label) <= 9)
 
diff --git a/python/paddle/text/datasets/uci_housing.py b/python/paddle/text/datasets/uci_housing.py
index a0d465eb177..a8dfbc44a97 100644
--- a/python/paddle/text/datasets/uci_housing.py
+++ b/python/paddle/text/datasets/uci_housing.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 import six
 import numpy as np
 
+import paddle
 from paddle.io import Dataset
 from paddle.dataset.common import _check_exists_and_download
 
@@ -88,6 +89,8 @@ class UCIHousing(Dataset):
         # read dataset into memory
         self._load_data()
 
+        self.dtype = paddle.get_default_dtype()
+
     def _load_data(self, feature_num=14, ratio=0.8):
         data = np.fromfile(self.data_file, sep=' ')
         data = data.reshape(data.shape[0] // feature_num, feature_num)
@@ -103,7 +106,8 @@ class UCIHousing(Dataset):
 
     def __getitem__(self, idx):
         data = self.data[idx]
-        return np.array(data[:-1]), np.array(data[-1:])
+        return np.array(data[:-1]).astype(self.dtype), \
+                np.array(data[-1:]).astype(self.dtype)
 
     def __len__(self):
         return len(self.data)
diff --git a/python/paddle/vision/datasets/cifar.py b/python/paddle/vision/datasets/cifar.py
index 1193be26da5..631892ee4dc 100644
--- a/python/paddle/vision/datasets/cifar.py
+++ b/python/paddle/vision/datasets/cifar.py
@@ -139,6 +139,7 @@ class Cifar10(Dataset):
 
     def __getitem__(self, idx):
         image, label = self.data[idx]
+        image = np.reshape(image, [3, 32, 32])
         if self.transform is not None:
             image = self.transform(image)
         return image, label
diff --git a/python/paddle/vision/datasets/mnist.py b/python/paddle/vision/datasets/mnist.py
index a9856133392..597d4046441 100644
--- a/python/paddle/vision/datasets/mnist.py
+++ b/python/paddle/vision/datasets/mnist.py
@@ -44,8 +44,6 @@ class MNIST(Dataset):
             :attr:`download` is True. Default None
         label_path(str): path to label file, can be set None if
             :attr:`download` is True. Default None
-        chw_format(bool): If set True, the output shape is [1, 28, 28],
-            otherwise, output shape is [1, 784]. Default True.
         mode(str): 'train' or 'test' mode. Default 'train'.
         download(bool): whether to download dataset automatically if
             :attr:`image_path` :attr:`label_path` is not set. Default True
@@ -70,14 +68,12 @@ class MNIST(Dataset):
     def __init__(self,
                  image_path=None,
                  label_path=None,
-                 chw_format=True,
                  mode='train',
                  transform=None,
                  download=True):
         assert mode.lower() in ['train', 'test'], \
                 "mode should be 'train' or 'test', but got {}".format(mode)
         self.mode = mode.lower()
-        self.chw_format = chw_format
         self.image_path = image_path
         if self.image_path is None:
             assert download, "image_path is not set and downloading automatically is disabled"
@@ -139,10 +135,6 @@ class MNIST(Dataset):
                                                       cols)).astype('float32')
                     offset_img += struct.calcsize(fmt_images)
 
-                    images = images / 255.0
-                    images = images * 2.0
-                    images = images - 1.0
-
                     for i in range(buffer_size):
                         self.images.append(images[i, :])
                         self.labels.append(
@@ -150,8 +142,7 @@ class MNIST(Dataset):
 
     def __getitem__(self, idx):
         image, label = self.images[idx], self.labels[idx]
-        if self.chw_format:
-            image = np.reshape(image, [1, 28, 28])
+        image = np.reshape(image, [1, 28, 28])
         if self.transform is not None:
             image = self.transform(image)
         return image, label
-- 
GitLab


From bbc84e0fe0f4401c4a087f74fdf24863b4157b4d Mon Sep 17 00:00:00 2001
From: Leo Chen <chenqiuliang@baidu.com>
Date: Mon, 21 Sep 2020 13:28:08 +0800
Subject: [PATCH 146/261] Refine error msg in paddle/fluid/framework/details
 [part 1] (#25631)

* refine error msg in var_handle.h, test=develop

* refine all_reduce_op_handle

* fix some error msg

* refine variable_visitor

* refine threaded_ssa_graph_executor

* refine inplace related files

* refine executor related files

* refine fetch_op_handle.cc

* fix bug

* follow comments
---
 .../framework/details/all_reduce_op_handle.cc | 82 ++++++++++++++-----
 .../details/async_ssa_graph_executor.cc       | 15 +++-
 .../fast_threaded_ssa_graph_executor.cc       |  6 +-
 .../framework/details/fetch_op_handle.cc      |  8 +-
 .../details/parallel_ssa_graph_executor.cc    |  9 +-
 .../scope_buffered_ssa_graph_executor.cc      | 10 ++-
 .../details/share_tensor_buffer_functor.cc    | 58 ++++++++++---
 .../details/share_tensor_buffer_op_handle.cc  | 16 +++-
 .../framework/details/ssa_graph_executor.cc   |  6 +-
 .../details/threaded_ssa_graph_executor.cc    | 21 +++--
 .../details/threaded_ssa_graph_executor.h     |  4 +-
 paddle/fluid/framework/details/var_handle.h   | 11 ++-
 .../framework/details/variable_visitor.cc     | 71 ++++++++++------
 13 files changed, 237 insertions(+), 80 deletions(-)

diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 7fc08f3e0f2..939a2fc8fc9 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+
 #include <algorithm>
+
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
@@ -34,14 +36,24 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                      const std::vector<platform::Place> &places,
                                      const platform::NCCLCommunicator *ctxs)
     : NCCLOpHandleBase(node, places, ctxs), local_scopes_(local_scopes) {
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
+                    platform::errors::InvalidArgument(
+                        "The number of places and the number of local scopes "
+                        "should be equal, but got number of places is %d and "
+                        "number of local scopes is %d.",
+                        places_.size(), local_scopes_.size()));
 }
 #else
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                      const std::vector<Scope *> &local_scopes,
                                      const std::vector<platform::Place> &places)
     : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
+                    platform::errors::InvalidArgument(
+                        "The number of places and the number of local scopes "
+                        "should be equal, but got number of places is %d and "
+                        "number of local scopes is %d.",
+                        places_.size(), local_scopes_.size()));
 }
 #endif
 
@@ -60,13 +72,25 @@ void AllReduceOpHandle::AllReduceImpl(
     const std::vector<VarHandle *> &in_var_handles,
     const std::vector<VarHandle *> &out_var_handles) {
   size_t num_places = places_.size();
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), num_places,
-      "The NoDummyInputSize should be equal to the number of places.");
+  PADDLE_ENFORCE_EQ(in_var_handles.size(), num_places,
+                    platform::errors::InvalidArgument(
+                        "The NoDummyInputSize should be equal "
+                        "to the number of places, but got NoDummyInputSize is "
+                        "%d and the number of place is %d.",
+                        in_var_handles.size(), num_places));
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), out_var_handles.size(),
-      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
-  PADDLE_ENFORCE_EQ(local_exec_scopes_.size(), num_places);
+      platform::errors::InvalidArgument(
+          "The NoDummyInputSize and NoDummyOutputSize should be "
+          "equal, but got NoDummyInputSize is %d and NoDummyOutputSize is %d.",
+          in_var_handles.size(), out_var_handles.size()));
+  PADDLE_ENFORCE_EQ(
+      local_exec_scopes_.size(), num_places,
+      platform::errors::InvalidArgument(
+          "The number of local scopes should be equal "
+          "to the number of places, but got the number of local scopes is "
+          "%d and the number of place is %d.",
+          in_var_handles.size(), num_places));
 
   std::vector<const void *> lod_tensor_data;
   std::vector<platform::Place> places;
@@ -78,23 +102,36 @@ void AllReduceOpHandle::AllReduceImpl(
   for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
     auto &local_scope = local_exec_scopes_[i];
     auto var = local_scope->FindVar(in_var_handles[i]->name());
-    PADDLE_ENFORCE_NOT_NULL(var, "%s is not found int scope.",
-                            in_var_handles[i]->name());
+    PADDLE_ENFORCE_NOT_NULL(var, platform::errors::NotFound(
+                                     "Variable %s is not found in local scope.",
+                                     in_var_handles[i]->name()));
     auto &lod_tensor = var->Get<LoDTensor>();
 
     if (i == 0) {
       numel = static_cast<int64_t>(lod_tensor.numel());
       // only enforce place0, we will enforce other palce numel == place0 numel
       PADDLE_ENFORCE_GT(
-          numel, 0, platform::errors::InvalidArgument(
-                        "The numel of tensos=[%s] must > 0. But now numel=[%d]",
-                        in_var_handles[i]->name(), numel));
+          numel, 0,
+          platform::errors::PreconditionNotMet(
+              "The numel of tensor %s should be > 0, but got numel is %d.",
+              in_var_handles[i]->name(), numel));
       dtype = lod_tensor.type();
       is_gpu_place = platform::is_gpu_place(lod_tensor.place());
     }
-    PADDLE_ENFORCE_EQ(numel, static_cast<int64_t>(lod_tensor.numel()));
-    PADDLE_ENFORCE_EQ(dtype, lod_tensor.type());
-    PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()));
+    PADDLE_ENFORCE_EQ(
+        numel, static_cast<int64_t>(lod_tensor.numel()),
+        platform::errors::PreconditionNotMet(
+            "The size of tensors of the same variable in different local "
+            "scopes should be equal."));
+    PADDLE_ENFORCE_EQ(
+        dtype, lod_tensor.type(),
+        platform::errors::PreconditionNotMet(
+            "The dtype of tensors of the same variable in different local "
+            "scopes should be equal."));
+    PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()),
+                      platform::errors::PreconditionNotMet(
+                          "The place type of tensors of the same variable "
+                          "in different local scopes should be equal."));
 
     lod_tensor_data.emplace_back(lod_tensor.data<void>());
     places.emplace_back(lod_tensor.place());
@@ -102,8 +139,12 @@ void AllReduceOpHandle::AllReduceImpl(
     VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
              << ", out_name:" << out_var_handles[i]->name();
 
-    PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
-                      "The name of input and output should be equal.");
+    PADDLE_ENFORCE_EQ(
+        in_var_handles[i]->name(), out_var_handles[i]->name(),
+        platform::errors::InvalidArgument(
+            "The name of input and output of all_reduce op should be equal, "
+            "but got input is %s and output is %s.",
+            in_var_handles[i]->name(), out_var_handles[i]->name()));
   }
 
   std::vector<std::string> grad_var_names;
@@ -122,7 +163,9 @@ void AllReduceOpHandle::AllReduceFunc(
     const std::vector<std::string> &out_var_names) {
   if (is_gpu_place(places[0])) {
 #if defined(PADDLE_WITH_NCCL)
-    PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
+    PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_,
+                            platform::errors::InvalidArgument(
+                                "The nccl context should not be NULL."));
     ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype);
     std::vector<std::function<void()>> all_reduce_calls;
     for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
@@ -134,7 +177,8 @@ void AllReduceOpHandle::AllReduceFunc(
     }
     NCCLAllReduceFunc(all_reduce_calls);
 #else
-    PADDLE_THROW("Not compiled with CUDA.");
+    PADDLE_THROW(
+        platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
   } else {  // Special handle CPU only Operator's gradient. Like CRF
     auto &trg = *local_exec_scopes_[0]
diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
index d42bd0b16d7..12c0d674902 100644
--- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
@@ -89,8 +89,19 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor(
       places_(std::move(places)),
       graphs_(std::move(graphs)) {
   VLOG(3) << "build AsyncSSAGraphExecutor";
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
-  PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size());
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
+                    platform::errors::InvalidArgument(
+                        "The number of places and the number of local scopes "
+                        "should be equal, but got number of places is %d and "
+                        "number of local scopes is %d.",
+                        places_.size(), local_scopes_.size()));
+  PADDLE_ENFORCE_EQ(
+      local_scopes_.size(), local_exec_scopes_.size(),
+      platform::errors::InvalidArgument(
+          "The number of local scopes and the number of local execution scopes "
+          "should be equal, but got number of local scopes is %d and "
+          "number of local execution scopes is %d.",
+          local_scopes_.size(), local_exec_scopes_.size()));
 
   // set the correct size of thread pool to each device.
   strategy_.num_threads_ = strategy_.num_threads_ < places_.size()
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index e440dff2af6..7f1d3c9b340 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -12,12 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
+
 #include <deque>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/fetch_async_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
@@ -48,7 +50,9 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
       bootstrap_ops_.emplace_back(op);
     }
   }
-  PADDLE_ENFORCE_GT(op_deps_.size(), 0, "The graph doesn't have operators.");
+  PADDLE_ENFORCE_GT(op_deps_.size(), 0,
+                    platform::errors::PreconditionNotMet(
+                        "The graph doesn't have operators."));
   PrepareAtomicOpDeps();
 }
 
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index ae69960ef78..aedb8db46a5 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
+
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
@@ -138,8 +140,10 @@ void FetchOpHandle::RunImpl() {
     auto *var_handle = static_cast<VarHandle *>(inputs_[i]);
     auto &scope = scopes.at(var_handle->scope_idx());
     auto *var = scope->FindVar(var_handle->name());
-    PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
-                            var_handle->name());
+    PADDLE_ENFORCE_NOT_NULL(
+        var,
+        platform::errors::NotFound(
+            "Cannot find variable %s in execution scope.", var_handle->name()));
 
     if (var->IsType<LoDTensor>()) {
       auto &t = var->Get<framework::LoDTensor>();
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index e7d466c4af0..35834fe5d74 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
+
 #include <algorithm>
 #include <memory>
 #include <utility>
+
 #include "paddle/fluid/framework/ir/graph_helper.h"
 
 namespace paddle {
@@ -104,7 +106,12 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
       places_(places),
       graphs_(std::move(graphs)),
       feed_status_(places.size(), FeedStatus::kNone) {
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
+                    platform::errors::InvalidArgument(
+                        "The number of places and the number of local scopes "
+                        "should be equal, but got number of places is %d and "
+                        "number of local scopes is %d.",
+                        places_.size(), local_scopes_.size()));
 
   PADDLE_ENFORCE_EQ(places_.size(), graphs_.size(),
                     platform::errors::InvalidArgument(
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index fe86d002ca8..7cc1f541314 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -13,10 +13,12 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
+
 #include <stdexcept>
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/variable_helper.h"
@@ -37,7 +39,13 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
       var_infos_(std::move(var_infos)),
       places_(std::move(places)),
       scope_monitor_(places_, local_exec_scopes_) {
-  PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size());
+  PADDLE_ENFORCE_EQ(
+      local_scopes_.size(), local_exec_scopes_.size(),
+      platform::errors::InvalidArgument(
+          "The number of local scopes and the number of local execution scopes "
+          "should be equal, but got number of local scopes is %d and "
+          "number of local execution scopes is %d.",
+          local_scopes_.size(), local_exec_scopes_.size()));
   PrepareLocalExeScopes();
 }
 
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
index 6fdec553f3d..19f075018ce 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
+++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
+
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -29,7 +31,8 @@ static inline const Tensor &GetTensorFromVar(const Variable *var) {
   if (var->IsType<LoDTensor>()) {
     return var->Get<LoDTensor>();
   } else {
-    PADDLE_THROW("Variable must be type of LoDTensor");
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Variable must be type of LoDTensor."));
   }
 }
 
@@ -37,7 +40,8 @@ static inline Tensor *GetMutableTensorFromVar(Variable *var) {
   if (var->IsType<LoDTensor>()) {
     return var->GetMutable<LoDTensor>();
   } else {
-    PADDLE_THROW("Variable must be type of LoDTensor");
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Variable must be type of LoDTensor."));
   }
 }
 
@@ -50,7 +54,12 @@ ShareTensorBufferFunctor::ShareTensorBufferFunctor(
       op_type_(op_type),
       in_var_infos_(in_var_infos),
       out_var_names_(out_var_names) {
-  PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size());
+  PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size(),
+                    platform::errors::PreconditionNotMet(
+                        "The number of input variables and output variables "
+                        "should be equal, but got number of input variables is "
+                        "%d and number of output variables is %d.",
+                        in_var_infos_.size(), out_var_names_.size()));
   for (size_t i = 0; i < in_var_infos_.size(); ++i) {
     AddReuseVarPair(in_var_infos_[i], out_var_names_[i]);
   }
@@ -67,32 +76,59 @@ ShareTensorBufferFunctor::ReusedVars() const {
 
 void ShareTensorBufferFunctor::AddReuseVarPair(
     const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name) {
-  PADDLE_ENFORCE_NOT_NULL(in_var_info, "in_var_info cannot be nullptr");
+  PADDLE_ENFORCE_NOT_NULL(
+      in_var_info,
+      platform::errors::InvalidArgument(
+          "The input variables to be inplaced should not be NULL."));
   PADDLE_ENFORCE_NE(in_var_info->Name(), out_var_name,
-                    "in/out cannot have same name: %s", out_var_name);
+                    platform::errors::InvalidArgument(
+                        "The input variable and output variable to be inplaced "
+                        "cannot have the same name: %s.",
+                        out_var_name));
   in_var_infos_.emplace_back(in_var_info);
   out_var_names_.emplace_back(out_var_name);
 }
 
 void ShareTensorBufferFunctor::CallOnce() {
-  PADDLE_ENFORCE(in_out_vars_.empty(), "in_out_vars_ must be initialized here");
+  PADDLE_ENFORCE(in_out_vars_.empty(),
+                 platform::errors::InvalidArgument(
+                     "The input-output variable pairs to be "
+                     "inplaced should be initialized here."));
   for (size_t i = 0; i < in_var_infos_.size(); ++i) {
     auto *in_var = exec_scope_->FindVar(in_var_infos_[i]->Name());
     auto *out_var = exec_scope_->FindVar(out_var_names_[i]);
-    PADDLE_ENFORCE_NOT_NULL(in_var);
-    PADDLE_ENFORCE_NOT_NULL(out_var);
-    PADDLE_ENFORCE_NE(in_var, out_var);
+    PADDLE_ENFORCE_NOT_NULL(
+        in_var, platform::errors::NotFound(
+                    "The input variable(%s)to be inplaced should not be NULL.",
+                    in_var_infos_[i]->Name()));
+    PADDLE_ENFORCE_NOT_NULL(
+        out_var,
+        platform::errors::NotFound(
+            "The output variable(%s) to be inplaced should not be NULL.",
+            out_var_names_[i]));
+    PADDLE_ENFORCE_NE(
+        in_var, out_var,
+        platform::errors::PreconditionNotMet(
+            "The input variable and output variable to be inplaced "
+            "cannot be the same variable(%s).",
+            out_var_names_[i]));
     in_out_vars_.emplace_back(in_var, out_var);
   }
 }
 
 void ShareTensorBufferFunctor::operator()(Scope *exec_scope) {
   if (!exec_scope_) {
-    PADDLE_ENFORCE_NOT_NULL(exec_scope);
+    PADDLE_ENFORCE_NOT_NULL(exec_scope,
+                            platform::errors::InvalidArgument(
+                                "The given execution scope should not be NULL "
+                                "if the cached scope is NULL."));
     exec_scope_ = exec_scope;
     CallOnce();
   } else {
-    PADDLE_ENFORCE(exec_scope_ == exec_scope, "Scope must be the same");
+    PADDLE_ENFORCE_EQ(exec_scope_, exec_scope,
+                      platform::errors::InvalidArgument(
+                          "The given execution scope and the cached execution "
+                          "scope should be the same."));
   }
 
   for (size_t i = 0; i < in_var_infos_.size(); ++i) {
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
index f06507257f1..b805ad3b072 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
+++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
+
 #include <string>
 #include <unordered_set>
+
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
@@ -32,17 +34,25 @@ ComputationOpHandle *GetUniquePendingComputationOpHandle(
     for (ir::Node *pending_op : out_var->outputs) {
       auto &op = pending_op->Wrapper<OpHandleBase>();
       auto *compute_op = dynamic_cast<ComputationOpHandle *>(&op);
-      PADDLE_ENFORCE_NOT_NULL(compute_op);
+      PADDLE_ENFORCE_NOT_NULL(
+          compute_op,
+          platform::errors::PreconditionNotMet(
+              "The pending OpHandle should be ComputationOpHandle."));
 
       if (result_op == nullptr) {
         result_op = compute_op;
       } else {
-        PADDLE_ENFORCE_EQ(result_op, compute_op);
+        PADDLE_ENFORCE_EQ(
+            result_op, compute_op,
+            platform::errors::PreconditionNotMet(
+                "The pending OpHandle should be the unique one."));
       }
     }
   }
 
-  PADDLE_ENFORCE_NOT_NULL(result_op);
+  PADDLE_ENFORCE_NOT_NULL(result_op,
+                          platform::errors::PreconditionNotMet(
+                              "The pending OpHandle should not be NULL."));
   return result_op;
 }
 
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc
index 71123f708e3..2723a46dcfa 100644
--- a/paddle/fluid/framework/details/ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/ssa_graph_executor.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
+
 #include "paddle/fluid/framework/details/fetch_async_op_handle.h"
 
 namespace paddle {
@@ -27,8 +28,9 @@ void ClearFetchOp(ir::Graph* graph, std::vector<OpHandleBase*>* fetch_ops) {
     PADDLE_ENFORCE_EQ(dynamic_cast<FetchOpHandle*>(op) != nullptr ||
                           dynamic_cast<FetchAsyncOpHandle*>(op) != nullptr,
                       true,
-                      "The input ops of ClearFetchOp function should be "
-                      "FetchOpHandle or FetchAsyncOpHandle.");
+                      platform::errors::PreconditionNotMet(
+                          "The input ops of ClearFetchOp function should be "
+                          "FetchOpHandle or FetchAsyncOpHandle."));
     for (auto& out_var : op->Node()->outputs) {
       graph->RemoveNode(out_var);
     }
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 92c3a0cd6b9..2ed52b3bd94 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -138,7 +139,10 @@ inline FetchResultType ThreadedSSAGraphExecutor::RunImpl(
         }
       }
     }
-    PADDLE_ENFORCE(ready_ops.empty());
+    PADDLE_ENFORCE_EQ(
+        ready_ops.empty(), true,
+        platform::errors::Fatal("After the execution of computation graph, "
+                                "there are unexecuted operators left."));
   }
 
   // Wait FetchOps.
@@ -165,9 +169,8 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
     FetchResultType *fetch_data, bool return_merged) {
   std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
   std::unordered_set<VarHandleBase *> local_ready_vars;
-  std::unordered_set<std::string> fetch_tensor_set(fetch_tensors.begin(),
-                                                   fetch_tensors.end());
-  for (auto &fetch_var_name : fetch_tensor_set) {
+
+  for (auto &fetch_var_name : fetch_tensors) {
     for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
       auto it = var_map.find(fetch_var_name);
       if (it != var_map.end()) {
@@ -231,7 +234,11 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
       ready_ops->insert(static_cast<OpHandleBase *>(op));
     }
   }
-  PADDLE_ENFORCE_EQ(local_ready_vars.size(), 0);
+  PADDLE_ENFORCE_EQ(
+      local_ready_vars.size(), 0,
+      platform::errors::Fatal(
+          "The number of ready variables should be 0, but got %d.",
+          local_ready_vars.size()));
 }
 
 void ThreadedSSAGraphExecutor::InsertPendingOp(
@@ -277,7 +284,9 @@ void ThreadedSSAGraphExecutor::PrepareOpDeps() {
     }
   }
   op_deps_->num_ops_ = ready_ops.size() + pending_ops.size();
-  PADDLE_ENFORCE_GT(op_deps_->num_ops_, 0, "The graph doesn't have operators.");
+  PADDLE_ENFORCE_GT(
+      op_deps_->num_ops_, 0,
+      platform::errors::InvalidArgument("The graph doesn't have operators."));
 
   for (auto ready_var : ready_vars) {
     pending_vars.erase(ready_var);
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index b8b584f2720..45fa3adbf14 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <ThreadPool.h>  // ThreadPool in thrird party
+
 #include <deque>
 #include <functional>
 #include <list>
@@ -24,8 +26,6 @@
 #include <utility>
 #include <vector>
 
-#include <ThreadPool.h>  // ThreadPool in thrird party
-
 #include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
index 86428f8b761..bb38424d3ae 100644
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -54,8 +54,10 @@ struct VarHandleBase {
 
   void AddOutput(OpHandleBase* out, ir::Node* node) {
     if (pending_ops_.find(out) == pending_ops_.end()) {
-      PADDLE_ENFORCE(out != nullptr, "The output of %s should not be nullptr",
-                     this->Node()->Name());
+      PADDLE_ENFORCE_NOT_NULL(out,
+                              platform::errors::InvalidArgument(
+                                  "The output added to VarHandle %s is NULL.",
+                                  this->Node()->Name()));
       pending_ops_.insert(out);
       node_->outputs.push_back(node);
     }
@@ -120,7 +122,10 @@ struct VarHandle : public VarHandleBase {
   bool HasEvent() { return has_event_; }
 
   const cudaEvent_t& GetEvent() {
-    PADDLE_ENFORCE(HasEvent(), "The event is not set.");
+    PADDLE_ENFORCE_EQ(
+        HasEvent(), true,
+        platform::errors::PreconditionNotMet(
+            "The cuda event is not set, maybe InitCUDA() is not called."));
     return event_;
   }
 
diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc
index 134f759081a..fba0c1bf463 100644
--- a/paddle/fluid/framework/details/variable_visitor.cc
+++ b/paddle/fluid/framework/details/variable_visitor.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/variable_visitor.h"
+
 #include "paddle/fluid/framework/selected_rows.h"
 namespace paddle {
 namespace framework {
@@ -24,7 +25,9 @@ static void VisitVariable(Variable* var, Func* func) {
   } else if (var->IsType<SelectedRows>()) {
     (*func)(var->GetMutable<SelectedRows>());
   } else {
-    PADDLE_THROW("Not supported type %s", ToTypeName(var->Type()));
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "VisitVariable is not supported for type %s.",
+        ToTypeName(var->Type())));
   }
 }
 
@@ -35,7 +38,8 @@ static void VisitVariable(const Variable& var, Func* func) {
   } else if (var.IsType<SelectedRows>()) {
     (*func)(var.Get<SelectedRows>());
   } else {
-    PADDLE_THROW("Not supported type %s", ToTypeName(var.Type()));
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "VisitVariable is not supported for type %s.", ToTypeName(var.Type())));
   }
 }
 
@@ -50,7 +54,8 @@ struct TensorVisitor {
 
   template <typename T>
   void operator()() {
-    PADDLE_THROW("Not Support to get LoDTensor from %s", typeid(T).name());
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Getting tensor from type %s is not supported.", typeid(T).name()));
   }
 };
 
@@ -78,8 +83,8 @@ struct ShareDimsAndLoDVisitor {
 
   template <typename T>
   void operator()(const T&) {
-    PADDLE_ENFORCE("ShareDimsAndLoD is not supported by type %s",
-                   typeid(T).name());
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "ShareDimsAndLoD is not supported for type %s.", typeid(T).name()));
   }
 };
 
@@ -89,42 +94,54 @@ void VariableVisitor::ShareDimsAndLoD(const Variable& src, Variable* trg) {
 }
 
 struct EnforceShapeAndDTypeEQVisitor {
-  const Variable* trg_;
+  const Variable* dst_;
 
   void operator()(const LoDTensor& src) {
-    auto& tensor = trg_->Get<LoDTensor>();
-    PADDLE_ENFORCE_EQ(
-        src.place().which(), tensor.place().which(),
-        "The Places of the two Variable must be all on CPU or all on GPU.");
+    auto& tensor = dst_->Get<LoDTensor>();
+    PADDLE_ENFORCE_EQ(src.place().which(), tensor.place().which(),
+                      platform::errors::PreconditionNotMet(
+                          "The place type of the two variables is not equal."));
     PADDLE_ENFORCE_EQ(src.type(), tensor.type(),
-                      "The dtype of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.dims(), tensor.dims(),
-                      "The dims of the two Variable is not equal.");
+                      platform::errors::PreconditionNotMet(
+                          "The dtype of the two variables is not equal."));
+    PADDLE_ENFORCE_EQ(
+        src.dims(), tensor.dims(),
+        platform::errors::PreconditionNotMet(
+            "The layout of the two variables' tensors is not equal."));
     PADDLE_ENFORCE_EQ(src.lod(), tensor.lod(),
-                      "The lod of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.layout(), tensor.layout(),
-                      "The layout of the two Variable's tensor is not equal.");
+                      platform::errors::PreconditionNotMet(
+                          "The lod of the two variable is not equal."));
+    PADDLE_ENFORCE_EQ(
+        src.layout(), tensor.layout(),
+        platform::errors::PreconditionNotMet(
+            "The layout of the two variables' tensors tensor is not equal."));
   }
 
   void operator()(const SelectedRows& src) {
-    auto& selected_rows = trg_->Get<SelectedRows>();
-    PADDLE_ENFORCE_EQ(
-        src.place().which(), selected_rows.place().which(),
-        "The Places of the two Variable must be all on CPU or all on GPU.");
+    auto& selected_rows = dst_->Get<SelectedRows>();
+    PADDLE_ENFORCE_EQ(src.place().which(), selected_rows.place().which(),
+                      platform::errors::PreconditionNotMet(
+                          "The place type of the two variables is not equal."));
     PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(),
-                      "The dtype of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.value().layout(), selected_rows.value().layout(),
-                      "The layout of the two Variable's tensor is not equal.");
+                      platform::errors::PreconditionNotMet(
+                          "The dtype of the two variables is not equal."));
+    PADDLE_ENFORCE_EQ(
+        src.value().layout(), selected_rows.value().layout(),
+        platform::errors::PreconditionNotMet(
+            "The layout of the two variables' tensors is not equal."));
     PADDLE_ENFORCE_EQ(src.height(), selected_rows.height(),
-                      "The height of the two Variable is not equal.");
+                      platform::errors::PreconditionNotMet(
+                          "The height of the two variables is not equal."));
     PADDLE_ENFORCE_EQ(src.GetCompleteDims(), selected_rows.GetCompleteDims(),
-                      "The dims of the two Variable is not equal.");
+                      platform::errors::PreconditionNotMet(
+                          "The dims of the two variables is not equal."));
   }
 
   template <typename T>
   void operator()(const T&) {
-    PADDLE_ENFORCE("EnforceShapeAndDTypeEQ is not supported by type %s",
-                   typeid(T).name());
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "EnforceShapeAndDTypeEQ is not supported for type %s.",
+        typeid(T).name()));
   }
 };
 
-- 
GitLab


From aa7835efeeb94a75d54c5e569a0f90fe06513a51 Mon Sep 17 00:00:00 2001
From: guofei <52460041+gfwm2013@users.noreply.github.com>
Date: Mon, 21 Sep 2020 13:32:02 +0800
Subject: [PATCH 147/261] Correct the error in decorator.py (#27409)

test=develop
---
 python/paddle/reader/decorator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py
index aadfb3f49ed..91a2a78203c 100644
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
@@ -42,7 +42,7 @@ import paddle.compat as cpt
 # For more details, please refer to
 # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
 # https://bugs.python.org/issue33725
-if sys.version_info >= (3, 8):
+if sys.version_info >= (3, 8) and sys.platform == 'darwin':
     fork_context = multiprocessing.get_context('fork')
 else:
     fork_context = multiprocessing
-- 
GitLab


From 02606d45efa33b3b1d5932b5cbeb6d02844e0c1e Mon Sep 17 00:00:00 2001
From: huangxu96 <46740794+huangxu96@users.noreply.github.com>
Date: Mon, 21 Sep 2020 13:35:37 +0800
Subject: [PATCH 148/261] Quant op dev (#25932)

* Finished ChannelWiseQuantDequantAbsMaxOp and Passed unittests.

* Finished channel-wise quantize strategy in imperative quantization.

* Added Cuda code of ChannelWiseQuantDequantMaxAbsOP
Add Cuda code of ChannelWiseQuantDequantMaxAbsOp

* Add quant_axis for channel_wise quant.

* fixed a bug in unnitests, which will not trigger axis = 1 case and cannot meet the coverage rate requirement.

* Added some assert infomation and fixed some coding style mistakes.
---
 paddle/fluid/operators/fake_quantize_op.cc    | 135 ++++++
 paddle/fluid/operators/fake_quantize_op.cu    |  89 +++-
 paddle/fluid/operators/fake_quantize_op.h     |  31 ++
 paddle/fluid/operators/fused/fusion_gru_op.cc |   1 +
 paddle/fluid/pybind/op_function_generator.cc  |   1 +
 .../slim/quantization/imperative/qat.py       |  11 +-
 .../slim/quantization/imperative/quant_nn.py  | 112 ++++-
 .../contrib/slim/tests/test_imperative_qat.py |   1 -
 .../tests/test_imperative_qat_channelwise.py  | 428 ++++++++++++++++++
 .../tests/unittests/test_fake_quantize_op.py  |  65 +++
 10 files changed, 861 insertions(+), 13 deletions(-)
 create mode 100644 python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py

diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
index 04ac4a35208..e9b4c7dacf8 100644
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -174,7 +174,64 @@ struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
 
 template struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext,
                                                float>;
+template <typename T>
+struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, const int quant_axis,
+                  framework::Tensor* out) {
+    PADDLE_ENFORCE_EQ(
+        quant_axis == 0 || quant_axis == 1, true,
+        platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
+                                          "the received is %d",
+                                          quant_axis));
 
+    auto* scale_data = scale.data<T>();
+    auto* in_data = in.data<T>();
+    auto* out_data = out->mutable_data<T>(ctx.GetPlace());
+    auto in_dims = in.dims();
+    const int64_t channel = in_dims[quant_axis];
+    platform::Transform<platform::CPUDeviceContext> trans;
+    if (quant_axis == 0) {
+      const int64_t channel_size = in.numel() / channel;
+      for (int i = 0; i < channel; i++) {
+        T s = scale_data[i];
+        auto* start = in_data + i * channel_size;
+        auto* end = in_data + (i + 1) * channel_size;
+        trans(ctx, start, end, out_data + i * channel_size,
+              ClipFunctor<T>(-s, s));
+      }
+      for (int i = 0; i < channel; i++) {
+        T s = scale_data[i];
+        T inv_s = inverse(s);
+        framework::Tensor one_channel_out = out->Slice(i, i + 1);
+        auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
+        out_e.device(*ctx.eigen_device()) =
+            (bin_cnt * inv_s * out_e).round() * s / static_cast<T>(bin_cnt);
+      }
+    } else if (quant_axis == 1) {
+      const int64_t step_i = in.numel() / in_dims[0];
+      const int64_t step_j = in.numel() / (in_dims[0] * in_dims[1]);
+      for (int i = 0; i < in_dims[0]; i++) {
+        for (int j = 0; j < in_dims[1]; j++) {
+          T s = scale_data[j];
+          T inv_s = inverse(s);
+          auto* start = in_data + i * step_i + j * step_j;
+          auto* end = in_data + i * step_i + (j + 1) * step_j;
+          auto* cur_out_data = out_data + i * step_i + j * step_j;
+          trans(ctx, start, end, cur_out_data, ClipFunctor<T>(-s, s));
+          for (int k = 0; k < step_j; k++) {
+            cur_out_data[k] = std::round(bin_cnt * inv_s * cur_out_data[k]) *
+                              s / static_cast<T>(bin_cnt);
+          }
+        }
+      }
+    }
+  }
+};
+
+template struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext,
+                                                   float>;
 template <typename T>
 struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& ctx,
@@ -360,6 +417,75 @@ $$0 \leq c \lt \ the\ channel\ number\ of\ X$$
   }
 };
 
+class FakeChannelWiseQuantizeDequantizeAbsMaxOp
+    : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X",
+                   "FakeChannelWiseQuantizeDequantizeAbsMax");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
+                   "FakeChannelWiseQuantizeDequantizeAbsMax");
+    OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale",
+                   "FakeChannelWiseQuantizeDequantizeAbsMax");
+    int quant_axis = ctx->Attrs().Get<int>("quant_axis");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[quant_axis]});
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+class FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) Input is float data type.");
+    AddOutput("Out",
+              "(Tensor) Output of quantized and dequantized low level tensor, "
+              "saved as float data type.");
+    AddOutput("OutScale", "(Tensor) Current channel wise scale");
+    AddAttr<int>("quant_axis",
+                 "(int, default 0) The axis for quantization. "
+                 "For conv2d, depthwise_conv2d, conv2d_transpose "
+                 "and mul, the quant_axis is equal to the cout axis.")
+        .SetDefault(0)
+        .AddCustomChecker([](const int& quant_axis) {
+          PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, true,
+                            platform::errors::InvalidArgument(
+                                "'quant_axis' should be 0 or 1, but "
+                                "the received is %d",
+                                quant_axis));
+        });
+    AddAttr<int>("bit_length", "(int, default 8)")
+        .SetDefault(8)
+        .AddCustomChecker([](const int& bit_length) {
+          PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true,
+                            platform::errors::InvalidArgument(
+                                "'bit_length' should be between 1 and 16, but "
+                                "the received is %d",
+                                bit_length));
+        });
+    AddComment(R"DOC(
+The scale of FakeChannelWiseQuantize operator is a vector.
+In detail, each channel of the input X has a scale value.
+
+$$scale_c = max(abs(X_c))$$
+$$range = 2^{bit\_length - 1} - 1$$
+$$Out_c = round(\frac{X_c * range} {scale_c}) * \frac{scale_c} {range}$$
+In above three formulas, the range value of c is as follow:
+$$0 \leq c \lt \ the\ channel\ number\ of\ X$$
+)DOC");
+  }
+};
+
 class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel {
  public:
   FakeQuantizeRangeAbsMaxOp(const std::string& type,
@@ -666,3 +792,12 @@ REGISTER_OP_CPU_KERNEL(moving_average_abs_max_scale,
 REGISTER_OPERATOR(fake_quantize_dequantize_grad, ops::FakeQuantDequantGradOp);
 REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_grad,
                        ops::FakeQuantDequantGradKernel<CPU, float>);
+
+REGISTER_OPERATOR(fake_channel_wise_quantize_dequantize_abs_max,
+                  ops::FakeChannelWiseQuantizeDequantizeAbsMaxOp,
+                  ops::FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker,
+                  ops::FakeQuantDequantGradMaker<paddle::framework::OpDesc>,
+                  ops::FakeQuantDequantGradMaker<paddle::imperative::OpBase>);
+REGISTER_OP_CPU_KERNEL(
+    fake_channel_wise_quantize_dequantize_abs_max,
+    ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel<CPU, float>);
diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu
index 6ff3c7ec632..8bc14dde863 100644
--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -417,8 +417,90 @@ struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
   }
 };
 
-template struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext,
-                                               float>;
+// ChannelClipAndQuantDequantKernel for quant_axis is 0
+template <typename T>
+__global__ void ChannelClipAndQuantDequantKernelQuantAxis0(
+    const T* in, const T* scale, const int bin_cnt, const int n, const int c,
+    T* out) {
+  int tid = threadIdx.x;
+
+  int channel_size = n / c;
+  const T* in_c = in + blockIdx.x * channel_size;
+  T* out_c = out + blockIdx.x * channel_size;
+
+  T s = scale[blockIdx.x];
+  T inv_s = inverse(s);
+
+  for (int i = tid; i < channel_size; i += blockDim.x) {
+    T x = in_c[i];
+    T v = x > s ? s : x;
+    v = v < -s ? -s : v;
+    v = bin_cnt * inv_s * v;
+    out_c[i] = round(v) * s / bin_cnt;
+  }
+}
+
+// ChannelClipAndQuantDequantKernel for quant_axis is 1
+template <typename T>
+__global__ void ChannelClipAndQuantDequantKernelQuantAxis1(
+    const T* in, const T* scale, const int bin_cnt, const int n, const int cin,
+    const int cout, T* out) {
+  T s = scale[blockIdx.x % cout];
+  T inv_s = inverse(s);
+
+  int wh_size = n / (cin * cout);
+  const T* in_c = in + blockIdx.x * wh_size;
+  T* out_c = out + blockIdx.x * wh_size;
+
+  for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
+    T x = in_c[i];
+    T v = x > s ? s : x;
+    v = v < -s ? -s : v;
+    v = bin_cnt * inv_s * v;
+    out_c[i] = round(v) * s / bin_cnt;
+  }
+}
+
+template <typename T>
+struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, const int quant_axis,
+                  framework::Tensor* out) {
+    // At present, channelwise quantization supports conv2d, depthwise_conv2d
+    // conv2d_transpose and mul
+    PADDLE_ENFORCE_EQ(
+        quant_axis == 0 || quant_axis == 1, true,
+        platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
+                                          "the received is %d",
+                                          quant_axis));
+
+    int num = in.numel();
+    auto in_dims = in.dims();
+
+    const T* in_data = in.data<T>();
+    const T* scale_data = scale.data<T>();
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    if (quant_axis == 0) {
+      int grid = in_dims[0];
+      int block = 1024;
+      ChannelClipAndQuantDequantKernelQuantAxis0<
+          T><<<grid, block, 0, ctx.stream()>>>(in_data, scale_data, bin_cnt,
+                                               num, in_dims[0], out_data);
+    } else if (quant_axis == 1) {
+      int grid = in_dims[0] * in_dims[1];
+      int block = 1024;
+
+      ChannelClipAndQuantDequantKernelQuantAxis1<
+          T><<<grid, block, 0, ctx.stream()>>>(
+          in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data);
+    }
+  }
+};
+
+template struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext,
+                                                   float>;
 
 }  // namespace operators
 }  // namespace paddle
@@ -443,3 +525,6 @@ REGISTER_OP_CUDA_KERNEL(
     ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CUDA, float>);
 REGISTER_OP_CUDA_KERNEL(fake_quantize_dequantize_grad,
                         ops::FakeQuantDequantGradKernel<CUDA, float>);
+REGISTER_OP_CUDA_KERNEL(
+    fake_channel_wise_quantize_dequantize_abs_max,
+    ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel<CUDA, float>);
diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h
index 5c6e0b1f6e2..2f5afbe0eed 100644
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -72,6 +72,13 @@ struct ChannelClipAndFakeQuantFunctor {
                   const int quant_axis, framework::Tensor* out);
 };
 
+template <typename DeviceContext, typename T>
+struct ChannelClipFakeQuantDequantFunctor {
+  void operator()(const DeviceContext& ctx, const framework::Tensor& in,
+                  const framework::Tensor& scale, const int bin_cnt,
+                  const int quant_axis, framework::Tensor* out);
+};
+
 template <typename DeviceContext, typename T>
 struct FindMovingAverageAbsMaxFunctor {
   void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum,
@@ -154,6 +161,30 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename DeviceContext, typename T>
+class FakeChannelWiseQuantizeDequantizeAbsMaxKernel
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<framework::Tensor>("X");
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto* out_scale = context.Output<framework::Tensor>("OutScale");
+    T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    out->mutable_data<T>(dev_ctx.GetPlace());
+
+    int bit_length = context.Attr<int>("bit_length");
+    int bin_cnt = std::pow(2, bit_length - 1) - 1;
+    int quant_axis = context.Attr<int>("quant_axis");
+
+    FindChannelAbsMaxFunctor<DeviceContext, T>()(dev_ctx, *in, quant_axis,
+                                                 out_scale_data);
+
+    ChannelClipFakeQuantDequantFunctor<DeviceContext, T>()(
+        dev_ctx, *in, *out_scale, bin_cnt, quant_axis, out);
+  }
+};
+
 template <typename DeviceContext, typename T>
 class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc
index 40139066096..e3776a80b31 100644
--- a/paddle/fluid/operators/fused/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fused/fusion_gru_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/fused/fusion_gru_op.h"
 #include <cstring>  // for memcpy
 #include <string>
+#include <vector>
 #include "paddle/fluid/operators/jit/kernels.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/fc.h"
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index f751136640c..d3052ebd351 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -111,6 +111,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
     {"fake_quantize_dequantize_moving_average_abs_max",
      {"Out", "OutScale", "OutAccum", "OutState"}},
     {"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}},
+    {"fake_channel_wise_quantize_dequantize_abs_max", {"Out", "OutScale"}},
     {"check_finite_and_unscale", {"Out", "FoundInfinite"}},
     {"update_loss_scaling",
      {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}},
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
index 7b276293638..8d7ebcf4caa 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -99,7 +99,12 @@ class ImperativeQuantAware(object):
         self._activation_bits = activation_bits
         self._moving_rate = moving_rate
 
-        quant_type = {'abs_max', 'moving_average_abs_max'}
+        quant_type = {
+            'abs_max', 'moving_average_abs_max', 'channel_wise_abs_max'
+        }
+
+        assert activation_quantize_type != 'channel_wise_abs_max', \
+            "The activation quantization type does not support 'channel_wise_abs_max'."
         if activation_quantize_type not in quant_type:
             raise ValueError(
                 "Unknown activation_quantize_type : '%s'. It can only be "
@@ -108,8 +113,8 @@ class ImperativeQuantAware(object):
         if weight_quantize_type not in quant_type:
             raise ValueError(
                 "Unknown weight_quantize_type: '%s'. It can only be "
-                "'abs_max' or 'moving_average_abs_max' now." %
-                (str(weight_quantize_type)))
+                "'abs_max' or 'moving_average_abs_max' or 'channel_wise_abs_max' now."
+                % (str(weight_quantize_type)))
         self._activation_quantize_type = activation_quantize_type
         self._weight_quantize_type = weight_quantize_type
 
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py
index e22c980b0a7..2e35ac288c7 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py
@@ -24,7 +24,7 @@ from paddle.fluid.data_feeder import check_variable_and_dtype
 
 __all__ = [
     'FakeQuantMovingAverage', 'FakeQuantAbsMax', 'QuantizedConv2D',
-    'QuantizedLinear'
+    'QuantizedLinear', 'FakeChannelWiseQuantDequantAbsMax'
 ]
 
 
@@ -209,6 +209,89 @@ class FakeQuantAbsMax(layers.Layer):
         return quant_out
 
 
+class FakeChannelWiseQuantDequantAbsMax(layers.Layer):
+    def __init__(self,
+                 name=None,
+                 channel_num=None,
+                 quant_bits=8,
+                 quant_axis=0,
+                 dtype='float32',
+                 quant_on_weight=False):
+        assert quant_on_weight == True, "Channel_wise only can be used on weight quantization."
+        super(FakeChannelWiseQuantDequantAbsMax, self).__init__()
+        self._quant_bits = quant_bits
+        self._quant_axis = quant_axis
+        self._dtype = dtype
+        self._name = name
+        self._channel_num = channel_num
+        scale_prefix = "{}.scale".format(
+            name) if name else 'quant_dequant.scale'
+        self._scale_name = unique_name.generate(scale_prefix)
+        if quant_on_weight:
+            scale_attr = ParamAttr(
+                name=self._scale_name,
+                initializer=Constant(0.0),
+                trainable=False)
+            self._scale = self.create_parameter(
+                shape=[self._channel_num], attr=scale_attr, dtype=self._dtype)
+            self._scale.stop_gradient = True
+        else:
+            self._scale = None
+
+    def forward(self, input):
+        if in_dygraph_mode():
+            attrs = ('bit_length', self._quant_bits, 'quant_axis',
+                     self._quant_axis)
+            quant_out = _varbase_creator(
+                type=input.type,
+                name="{}.quantized.dequantized".format(input.name),
+                shape=input.shape,
+                dtype=input.dtype,
+                persistable=False)
+
+            out_scale = self._scale
+            if out_scale is None:
+                out_scale = _varbase_creator(
+                    type=core.VarDesc.VarType.LOD_TENSOR,
+                    name=self._scale_name,
+                    shape=[self._channel_num],
+                    dtype=self._dtype,
+                    persistable=False)
+                out_scale.stop_gradient = True
+
+            out, _, = core.ops.fake_channel_wise_quantize_dequantize_abs_max(
+                input, quant_out, out_scale, *attrs)
+            return out
+
+        check_variable_and_dtype(input, 'input', ['float32'],
+                                 "FakeChannelWiseQuantDequantAbsMax")
+        attrs = {'bit_length': self._quant_bits, 'quant_axis': self._quant_axis}
+        inputs = {"X": [input]}
+        quant_out = self._helper.create_variable(
+            name="{}.quantized.dequantized".format(input.name),
+            dtype=input.dtype,
+            type=core.VarDesc.VarType.LOD_TENSOR,
+            persistable=False,
+            stop_gradient=False)
+        out_scale = self._scale
+        if not out_scale:
+            out_scale = self._helper.create_variable(
+                name=self._scale_name,
+                dtype=self._dtype,
+                type=core.VarDesc.VarType.LOD_TENSOR,
+                persistable=False,
+                stop_gradient=True)
+        outputs = {"Out": [quant_out], "OutScale": [out_scale]}
+
+        self._helper.append_op(
+            type="fake_channel_wise_quantize_dequantize_abs_max",
+            inputs=inputs,
+            outputs=outputs,
+            attrs=attrs)
+
+        return quant_out
+
+
 def _get_fake_quant_type(quant_type, **kwargs):
     call_args = {
         "name": kwargs.get("name", None),
@@ -220,10 +303,17 @@ def _get_fake_quant_type(quant_type, **kwargs):
         call_args["quant_on_weight"] = kwargs.get("quant_on_weight", False)
     elif quant_type == 'moving_average_abs_max':
         call_args["moving_rate"] = kwargs.get("moving_rate", 0.9)
-
+    elif quant_type == 'channel_wise_abs_max':
+        call_args["quant_on_weight"] = kwargs.get("quant_on_weight", False)
+        call_args["channel_num"] = kwargs.get("channel_num", None)
+        call_args["quant_axis"] = kwargs.get("quant_axis", 0)
+        assert call_args["channel_num"] is not None, (
+            "You need to input channel_num"
+            "when you use channel_wise_abs_max strategy.")
     fake_quant_map = {
         'abs_max': FakeQuantAbsMax,
-        'moving_average_abs_max': FakeQuantMovingAverage
+        'moving_average_abs_max': FakeQuantMovingAverage,
+        'channel_wise_abs_max': FakeChannelWiseQuantDequantAbsMax
     }
 
     return fake_quant_map[quant_type](**call_args)
@@ -255,19 +345,23 @@ class QuantizedConv2D(layers.Layer):
         self.weight = getattr(layer, 'weight')
         self.bias = getattr(layer, 'bias')
         # For FakeQuant
+        self._conv2d_quant_axis = 0
         self._fake_quant_weight = _get_fake_quant_type(
             weight_quantize_type,
             name=self.weight.name,
             moving_rate=moving_rate,
             quant_bits=weight_bits,
             dtype=self._dtype,
-            quant_on_weight=True)
+            quant_on_weight=True,
+            channel_num=self.weight.shape[self._conv2d_quant_axis],
+            quant_axis=self._conv2d_quant_axis)
         self._fake_quant_input = _get_fake_quant_type(
             activation_quantize_type,
             name=layer.full_name(),
             moving_rate=moving_rate,
             quant_bits=activation_bits,
-            dtype=self._dtype)
+            dtype=self._dtype,
+            quant_on_weight=False)
 
     def forward(self, input):
         quant_input = self._fake_quant_input(input)
@@ -341,19 +435,23 @@ class QuantizedLinear(layers.Layer):
         self.weight = getattr(layer, 'weight')
         self.bias = getattr(layer, 'bias')
         # For FakeQuant
+        self._linear_quant_axis = 1
         self._fake_quant_weight = _get_fake_quant_type(
             weight_quantize_type,
             name=self.weight.name,
             moving_rate=moving_rate,
             quant_bits=weight_bits,
             dtype=self._dtype,
-            quant_on_weight=True)
+            quant_on_weight=True,
+            channel_num=self.weight.shape[self._linear_quant_axis],
+            quant_axis=self._linear_quant_axis)
         self._fake_quant_input = _get_fake_quant_type(
             activation_quantize_type,
             name=layer.full_name(),
             moving_rate=moving_rate,
             quant_bits=activation_bits,
-            dtype=self._dtype)
+            dtype=self._dtype,
+            quant_on_weight=False)
 
     def forward(self, input):
         quant_input = self._fake_quant_input(input)
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
index f076d274b64..0d047a0cd3b 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
@@ -181,7 +181,6 @@ class TestImperativeQat(unittest.TestCase):
 
                     img = fluid.dygraph.to_variable(x_data)
                     label = fluid.dygraph.to_variable(y_data)
-
                     out = lenet(img)
                     acc = fluid.layers.accuracy(out, label)
                     loss = fluid.layers.cross_entropy(out, label)
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py
new file mode 100644
index 00000000000..17c613281a8
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py
@@ -0,0 +1,428 @@
+#   copyright (c) 2018 paddlepaddle authors. all rights reserved.
+#
+# licensed under the apache license, version 2.0 (the "license");
+# you may not use this file except in compliance with the license.
+# you may obtain a copy of the license at
+#
+#     http://www.apache.org/licenses/license-2.0
+#
+# unless required by applicable law or agreed to in writing, software
+# distributed under the license is distributed on an "as is" basis,
+# without warranties or conditions of any kind, either express or implied.
+# see the license for the specific language governing permissions and
+# limitations under the license.
+
+from __future__ import print_function
+
+import os
+import numpy as np
+import random
+import unittest
+import logging
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid.optimizer import AdamOptimizer
+from paddle.fluid.framework import IrGraph
+from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware
+from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
+from paddle.fluid.dygraph.container import Sequential
+from paddle.fluid.dygraph.nn import Conv2D
+from paddle.fluid.dygraph.nn import Pool2D
+from paddle.fluid.dygraph.nn import Linear
+from paddle.fluid.log_helper import get_logger
+
+os.environ["CPU_NUM"] = "1"
+if core.is_compiled_with_cuda():
+    fluid.set_flags({"FLAGS_cudnn_deterministic": True})
+
+_logger = get_logger(
+    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+
+
+def StaticLenet(data, num_classes=10, classifier_activation='softmax'):
+    conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1")
+    conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2")
+    fc_w1_attr = fluid.ParamAttr(name="fc_w_1")
+    fc_w2_attr = fluid.ParamAttr(name="fc_w_2")
+    fc_w3_attr = fluid.ParamAttr(name="fc_w_3")
+    conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1")
+    conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2")
+    fc_b1_attr = fluid.ParamAttr(name="fc_b_1")
+    fc_b2_attr = fluid.ParamAttr(name="fc_b_2")
+    fc_b3_attr = fluid.ParamAttr(name="fc_b_3")
+    conv1 = fluid.layers.conv2d(
+        data,
+        num_filters=6,
+        filter_size=3,
+        stride=1,
+        padding=1,
+        param_attr=conv2d_w1_attr,
+        bias_attr=conv2d_b1_attr)
+    pool1 = fluid.layers.pool2d(
+        conv1, pool_size=2, pool_type='max', pool_stride=2)
+    conv2 = fluid.layers.conv2d(
+        pool1,
+        num_filters=16,
+        filter_size=5,
+        stride=1,
+        padding=0,
+        param_attr=conv2d_w2_attr,
+        bias_attr=conv2d_b2_attr)
+    pool2 = fluid.layers.pool2d(
+        conv2, pool_size=2, pool_type='max', pool_stride=2)
+
+    fc1 = fluid.layers.fc(input=pool2,
+                          size=120,
+                          param_attr=fc_w1_attr,
+                          bias_attr=fc_b1_attr)
+    fc2 = fluid.layers.fc(input=fc1,
+                          size=84,
+                          param_attr=fc_w2_attr,
+                          bias_attr=fc_b2_attr)
+    fc3 = fluid.layers.fc(input=fc2,
+                          size=num_classes,
+                          act=classifier_activation,
+                          param_attr=fc_w3_attr,
+                          bias_attr=fc_b3_attr)
+
+    return fc3
+
+
+class ImperativeLenet(fluid.dygraph.Layer):
+    def __init__(self, num_classes=10, classifier_activation='softmax'):
+        super(ImperativeLenet, self).__init__()
+        conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1")
+        conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2")
+        fc_w1_attr = fluid.ParamAttr(name="fc_w_1")
+        fc_w2_attr = fluid.ParamAttr(name="fc_w_2")
+        fc_w3_attr = fluid.ParamAttr(name="fc_w_3")
+        conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1")
+        conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2")
+        fc_b1_attr = fluid.ParamAttr(name="fc_b_1")
+        fc_b2_attr = fluid.ParamAttr(name="fc_b_2")
+        fc_b3_attr = fluid.ParamAttr(name="fc_b_3")
+        self.features = Sequential(
+            Conv2D(
+                num_channels=1,
+                num_filters=6,
+                filter_size=3,
+                stride=1,
+                padding=1,
+                param_attr=conv2d_w1_attr,
+                bias_attr=conv2d_b1_attr),
+            Pool2D(
+                pool_size=2, pool_type='max', pool_stride=2),
+            Conv2D(
+                num_channels=6,
+                num_filters=16,
+                filter_size=5,
+                stride=1,
+                padding=0,
+                param_attr=conv2d_w2_attr,
+                bias_attr=conv2d_b2_attr),
+            Pool2D(
+                pool_size=2, pool_type='max', pool_stride=2))
+
+        self.fc = Sequential(
+            Linear(
+                input_dim=400,
+                output_dim=120,
+                param_attr=fc_w1_attr,
+                bias_attr=fc_b1_attr),
+            Linear(
+                input_dim=120,
+                output_dim=84,
+                param_attr=fc_w2_attr,
+                bias_attr=fc_b2_attr),
+            Linear(
+                input_dim=84,
+                output_dim=num_classes,
+                act=classifier_activation,
+                param_attr=fc_w3_attr,
+                bias_attr=fc_b3_attr))
+
+    def forward(self, inputs):
+        x = self.features(inputs)
+
+        x = fluid.layers.flatten(x, 1)
+        x = self.fc(x)
+        return x
+
+
+class TestImperativeQat(unittest.TestCase):
+    """
+    QAT = quantization-aware training
+    """
+
+    def test_qat_save(self):
+        imperative_qat = ImperativeQuantAware(
+            weight_quantize_type='channel_wise_abs_max',
+            activation_quantize_type='moving_average_abs_max')
+
+        with fluid.dygraph.guard():
+            lenet = ImperativeLenet()
+            imperative_qat.quantize(lenet)
+            adam = AdamOptimizer(
+                learning_rate=0.001, parameter_list=lenet.parameters())
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.train(), batch_size=32, drop_last=True)
+            test_reader = paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=32)
+
+            epoch_num = 1
+            for epoch in range(epoch_num):
+                lenet.train()
+                for batch_id, data in enumerate(train_reader()):
+                    x_data = np.array([x[0].reshape(1, 28, 28)
+                                       for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape(-1, 1)
+
+                    img = fluid.dygraph.to_variable(x_data)
+                    label = fluid.dygraph.to_variable(y_data)
+                    out = lenet(img)
+                    acc = fluid.layers.accuracy(out, label)
+                    loss = fluid.layers.cross_entropy(out, label)
+                    avg_loss = fluid.layers.mean(loss)
+                    avg_loss.backward()
+                    adam.minimize(avg_loss)
+                    lenet.clear_gradients()
+                    if batch_id % 100 == 0:
+                        _logger.info(
+                            "Train | At epoch {} step {}: loss = {:}, acc= {:}".
+                            format(epoch, batch_id,
+                                   avg_loss.numpy(), acc.numpy()))
+
+                lenet.eval()
+                for batch_id, data in enumerate(test_reader()):
+                    x_data = np.array([x[0].reshape(1, 28, 28)
+                                       for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape(-1, 1)
+
+                    img = fluid.dygraph.to_variable(x_data)
+                    label = fluid.dygraph.to_variable(y_data)
+
+                    out = lenet(img)
+                    acc_top1 = fluid.layers.accuracy(
+                        input=out, label=label, k=1)
+                    acc_top5 = fluid.layers.accuracy(
+                        input=out, label=label, k=5)
+
+                    if batch_id % 100 == 0:
+                        _logger.info(
+                            "Test | At epoch {} step {}: acc1 = {:}, acc5 = {:}".
+                            format(epoch, batch_id,
+                                   acc_top1.numpy(), acc_top5.numpy()))
+
+            # save weights
+            model_dict = lenet.state_dict()
+            fluid.save_dygraph(model_dict, "save_temp")
+
+            # test the correctness of `paddle.jit.save`
+            data = next(test_reader())
+            test_data = np.array([x[0].reshape(1, 28, 28)
+                                  for x in data]).astype('float32')
+            test_img = fluid.dygraph.to_variable(test_data)
+            lenet.eval()
+            before_save = lenet(test_img)
+
+        # save inference quantized model
+        path = "./mnist_infer_model"
+        paddle.jit.save(
+            layer=lenet,
+            model_path=path,
+            input_spec=[
+                paddle.static.InputSpec(
+                    shape=[None, 1, 28, 28], dtype='float32')
+            ])
+
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+        else:
+            place = core.CPUPlace()
+        exe = fluid.Executor(place)
+        [inference_program, feed_target_names, fetch_targets] = (
+            fluid.io.load_inference_model(
+                dirname=path,
+                executor=exe,
+                model_filename="__model__",
+                params_filename="__variables__"))
+        after_save, = exe.run(inference_program,
+                              feed={feed_target_names[0]: test_data},
+                              fetch_list=fetch_targets)
+
+        self.assertTrue(
+            np.allclose(after_save, before_save.numpy()),
+            msg='Failed to save the inference quantized model.')
+
+    def test_qat_acc(self):
+        def _build_static_lenet(main, startup, is_test=False, seed=1000):
+            with fluid.unique_name.guard():
+                with fluid.program_guard(main, startup):
+                    main.random_seed = seed
+                    startup.random_seed = seed
+                    img = fluid.layers.data(
+                        name='image', shape=[1, 28, 28], dtype='float32')
+                    label = fluid.layers.data(
+                        name='label', shape=[1], dtype='int64')
+                    prediction = StaticLenet(img)
+                    if not is_test:
+                        loss = fluid.layers.cross_entropy(
+                            input=prediction, label=label)
+                        avg_loss = fluid.layers.mean(loss)
+                    else:
+                        avg_loss = prediction
+            return img, label, avg_loss
+
+        reader = paddle.batch(
+            paddle.dataset.mnist.test(), batch_size=32, drop_last=True)
+        weight_quantize_type = 'channel_wise_abs_max'
+        activation_quant_type = 'moving_average_abs_max'
+        param_init_map = {}
+        seed = 1000
+        lr = 0.1
+
+        # imperative train
+        _logger.info(
+            "--------------------------dynamic graph qat--------------------------"
+        )
+        imperative_qat = ImperativeQuantAware(
+            weight_quantize_type=weight_quantize_type,
+            activation_quantize_type=activation_quant_type)
+
+        with fluid.dygraph.guard():
+            np.random.seed(seed)
+            fluid.default_main_program().random_seed = seed
+            fluid.default_startup_program().random_seed = seed
+            lenet = ImperativeLenet()
+            fixed_state = {}
+            for name, param in lenet.named_parameters():
+                p_shape = param.numpy().shape
+                p_value = param.numpy()
+                if name.endswith("bias"):
+                    value = np.zeros_like(p_value).astype('float32')
+                else:
+                    value = np.random.normal(
+                        loc=0.0, scale=0.01, size=np.product(p_shape)).reshape(
+                            p_shape).astype('float32')
+                fixed_state[name] = value
+                param_init_map[param.name] = value
+            lenet.set_dict(fixed_state)
+
+            imperative_qat.quantize(lenet)
+            adam = AdamOptimizer(
+                learning_rate=lr, parameter_list=lenet.parameters())
+            dynamic_loss_rec = []
+            lenet.train()
+            for batch_id, data in enumerate(reader()):
+                x_data = np.array([x[0].reshape(1, 28, 28)
+                                   for x in data]).astype('float32')
+                y_data = np.array(
+                    [x[1] for x in data]).astype('int64').reshape(-1, 1)
+
+                img = fluid.dygraph.to_variable(x_data)
+                label = fluid.dygraph.to_variable(y_data)
+
+                out = lenet(img)
+                loss = fluid.layers.cross_entropy(out, label)
+                avg_loss = fluid.layers.mean(loss)
+                avg_loss.backward()
+                adam.minimize(avg_loss)
+                lenet.clear_gradients()
+                dynamic_loss_rec.append(avg_loss.numpy()[0])
+                if batch_id % 100 == 0:
+                    _logger.info('{}: {}'.format('loss', avg_loss.numpy()))
+
+        paddle.jit.save(
+            layer=lenet,
+            model_path="./dynamic_mnist",
+            input_spec=[
+                paddle.static.InputSpec(
+                    shape=[None, 1, 28, 28], dtype='float32')
+            ])
+
+        # static graph train
+        _logger.info(
+            "--------------------------static graph qat--------------------------"
+        )
+        static_loss_rec = []
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+        else:
+            place = core.CPUPlace()
+        exe = fluid.Executor(place)
+
+        main = fluid.Program()
+        infer = fluid.Program()
+        startup = fluid.Program()
+        static_img, static_label, static_loss = _build_static_lenet(
+            main, startup, False, seed)
+        infer_img, _, infer_pre = _build_static_lenet(infer, startup, True,
+                                                      seed)
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, startup):
+                opt = AdamOptimizer(learning_rate=lr)
+                opt.minimize(static_loss)
+
+        scope = core.Scope()
+        with fluid.scope_guard(scope):
+            exe.run(startup)
+        for param in main.all_parameters():
+            param_tensor = scope.var(param.name).get_tensor()
+            param_tensor.set(param_init_map[param.name], place)
+
+        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
+        infer_graph = IrGraph(core.Graph(infer.desc), for_test=True)
+        transform_pass = QuantizationTransformPass(
+            scope=scope,
+            place=place,
+            activation_quantize_type=activation_quant_type,
+            weight_quantize_type=weight_quantize_type,
+            quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul'])
+        transform_pass.apply(main_graph)
+        transform_pass.apply(infer_graph)
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.fuse_all_reduce_ops = False
+        binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
+            loss_name=static_loss.name, build_strategy=build_strategy)
+
+        feeder = fluid.DataFeeder(
+            feed_list=[static_img, static_label], place=place)
+        with fluid.scope_guard(scope):
+            for batch_id, data in enumerate(reader()):
+                loss_v, = exe.run(binary,
+                                  feed=feeder.feed(data),
+                                  fetch_list=[static_loss])
+                static_loss_rec.append(loss_v[0])
+                if batch_id % 100 == 0:
+                    _logger.info('{}: {}'.format('loss', loss_v))
+
+        save_program = infer_graph.to_program()
+        with fluid.scope_guard(scope):
+            fluid.io.save_inference_model("./static_mnist", [infer_img.name],
+                                          [infer_pre], exe, save_program)
+        rtol = 1e-05
+        atol = 1e-08
+        for i, (loss_d,
+                loss_s) in enumerate(zip(dynamic_loss_rec, static_loss_rec)):
+            diff = np.abs(loss_d - loss_s)
+            if diff > (atol + rtol * np.abs(loss_s)):
+                _logger.info(
+                    "diff({}) at {}, dynamic loss = {}, static loss = {}".
+                    format(diff, i, loss_d, loss_s))
+                break
+
+        self.assertTrue(
+            np.allclose(
+                np.array(dynamic_loss_rec),
+                np.array(static_loss_rec),
+                rtol=rtol,
+                atol=atol,
+                equal_nan=True),
+            msg='Failed to do the imperative qat.')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
index 7835fd3f53d..01f0abe0f21 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
@@ -306,5 +306,70 @@ class TestFakeQuantDequantAbsOp(OpTest):
         self.check_grad(["X"], "Out", user_defined_grads=gradient)
 
 
+class TestChannelWiseFakeQuantDequantOp(OpTest):
+    def setUp(self):
+        self.set_arg()
+        assert self.quant_axis in [0, 1], "quant_axis should be 0 or 1."
+
+        self.op_type = "fake_channel_wise_quantize_dequantize_abs_max"
+        self.attrs = {'bit_length': 8, 'quant_axis': self.quant_axis}
+
+        scales = []
+        outputs = self.inputs['X'].copy()
+        range_v = (1 << (self.attrs['bit_length'] - 1)) - 1
+        if self.quant_axis == 0:
+            for i in range(self.inputs['X'].shape[0]):
+                scale_v = np.max(np.abs(self.inputs['X'][i])).astype("float32")
+                scales.append(scale_v)
+                outputs[i] = np.round(outputs[i] * range_v /
+                                      scale_v) * scale_v / range_v
+        elif self.quant_axis == 1:
+            for i in range(self.inputs['X'].shape[1]):
+                scale_v = np.max(np.abs(self.inputs['X'][:, i])).astype(
+                    "float32")
+                scales.append(scale_v)
+                outputs[:, i] = np.round(outputs[:, i] * range_v /
+                                         scale_v) * scale_v / range_v
+
+        self.outputs = {
+            'Out': outputs,
+            'OutScale': np.array(scales).astype("float32"),
+        }
+
+    def set_arg(self):
+        self.quant_axis = 0
+        self.inputs = {
+            'X': np.random.random((3, 4, 64, 64)).astype("float32"),
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        x = self.inputs["X"]
+        gradient = [np.ones(x.shape) / np.product(x.shape)]
+        self.check_grad(["X"], "Out", user_defined_grads=gradient)
+
+
+class TestChannelWiseFakeQuantDequantOp1(TestChannelWiseFakeQuantDequantOp):
+    def set_arg(self):
+        self.quant_axis = 1
+        self.inputs = {
+            'X': np.random.random((15, 20, 5, 5)).astype("float32"),
+        }
+
+
+class TestChannelWiseFakeQuantDequantOp2(TestChannelWiseFakeQuantDequantOp):
+    def set_arg(self):
+        self.quant_axis = 0
+        self.inputs = {'X': np.random.random((30, 15)).astype("float32"), }
+
+
+class TestChannelWiseFakeQuantDequantOp3(TestChannelWiseFakeQuantDequantOp):
+    def set_arg(self):
+        self.quant_axis = 1
+        self.inputs = {'X': np.random.random((30, 15)).astype("float32"), }
+
+
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab


From 39546aa2f32788e1b55394739d46e47cd37fc232 Mon Sep 17 00:00:00 2001
From: Wilber <jiweibo@baidu.com>
Date: Mon, 21 Sep 2020 13:39:17 +0800
Subject: [PATCH 149/261] Add pass compatible and unit test. (#27377)

---
 .../ir/embedding_fc_lstm_fuse_pass.cc         | 12 ++-
 paddle/fluid/framework/ir/fc_fuse_pass.cc     |  8 ++
 paddle/fluid/framework/ir/fc_gru_fuse_pass.cc | 22 ++++-
 .../fluid/framework/ir/fc_lstm_fuse_pass.cc   | 15 ++++
 .../framework/ir/squared_mat_sub_fuse_pass.cc | 30 +++++--
 .../framework/ir/squared_mat_sub_fuse_pass.h  |  2 +-
 .../inference/api/paddle_pass_builder.cc      |  3 +-
 python/paddle/fluid/layers/tensor.py          |  2 +
 .../ir/inference/test_fc_fuse_pass.py         | 54 ++++++++++++
 .../ir/inference/test_fc_gru_fuse_pass.py     | 86 +++++++++++++++++++
 .../ir/inference/test_fc_lstm_fuse_pass.py    | 52 +++++++++++
 .../test_squared_mat_sub_fuse_pass.py         | 63 ++++++++++++++
 ...test_transpose_flatten_concat_fuse_pass.py |  4 +-
 13 files changed, 342 insertions(+), 11 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_fc_gru_fuse_pass.py
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_fc_lstm_fuse_pass.py
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py

diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
index c50b7476c6a..02e3e2542f6 100644
--- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
@@ -23,6 +23,8 @@
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
+#include "paddle/fluid/framework/op_version_registry.h"
+
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -34,7 +36,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
 
   // Build pattern
   PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x"))
-                  ->assert_is_op_input("lookup_table")
+                  ->assert_is_op_input("lookup_table_v2")
                   ->assert_var_not_persistable();
   patterns::Embedding embedding_pattern(pattern, name_scope);
   // TODO(jczaja): Intermediate can only be for val that are not used anywhere
@@ -256,3 +258,11 @@ void EmbeddingFCLSTMFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(embedding_fc_lstm_fuse_pass,
               paddle::framework::ir::EmbeddingFCLSTMFusePass);
+REGISTER_PASS_CAPABILITY(embedding_fc_lstm_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("lookup_table_v2", 0)
+            .EQ("mul", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("lstm", 0)
+            .EQ("fused_embedding_fc_lstm", 0));
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc
index 066a8fb9757..d60510a4074 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
@@ -18,6 +18,7 @@
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -182,3 +183,10 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const {
 
 REGISTER_PASS(fc_fuse_pass, paddle::framework::ir::FCFusePass)
     .RequirePassAttr("use_gpu");
+REGISTER_PASS_CAPABILITY(fc_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("mul", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("relu", 0)
+            .EQ("fc", 0));
diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
index a2185cdc559..f5fea90ac2f 100644
--- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
@@ -16,6 +16,7 @@
 #include <string>
 #include <unordered_set>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -125,7 +126,6 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
     auto* x_n = subgraph.at(x);
     GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, gru_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(gru, gru, gru_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, gru_pattern);
@@ -136,10 +136,17 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
                               gru_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(BatchHidden, BatchHidden, gru_pattern);
 
+    // TODO(wilber): Support origin_mode=True.
+    if (gru->Op()->GetAttrIfExists<bool>("origin_mode") == true) {
+      LOG(INFO) << "fc_gru_fuse_pass not supported when origin_mode=True.";
+      return;
+    }
+
     if (with_fc_bias) {
       GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);
       GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
       GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
+      GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern);
 
       gru_creater(gru, x_n, w, Weight, Bias, Hidden, fc_bias);
       // Remove unneeded nodes.
@@ -188,3 +195,16 @@ void FCGRUFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass);
 REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass);
+REGISTER_PASS_CAPABILITY(mul_gru_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("mul", 0)
+            .EQ("gru", 0)
+            .EQ("fusion_gru", 0));
+REGISTER_PASS_CAPABILITY(fc_gru_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("mul", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("gru", 0)
+            .EQ("fusion_gru", 0));
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
index 12c7fc051e2..a3c57e14e1a 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -16,6 +16,7 @@
 #include <string>
 #include <unordered_set>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -196,3 +197,17 @@ void FCLstmFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulLstmFusePass);
 REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass);
+
+REGISTER_PASS_CAPABILITY(fc_lstm_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("mul", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("lstm", 0)
+            .EQ("fusion_lstm", 0));
+REGISTER_PASS_CAPABILITY(mul_lstm_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("mul", 0)
+            .EQ("lstm", 0)
+            .EQ("fusion_lstm", 0));
diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
index 035b198bdcc..d74843611cd 100644
--- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
@@ -17,6 +17,7 @@
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -77,7 +78,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
   };
 
   auto is_fusion_input_var = [=](Node* x, const std::string& arg_name) {
-    bool basic = var_is_op_input(x, "matmul", arg_name) &&
+    bool basic = (var_is_op_input(x, "matmul_v2", arg_name) ||
+                  var_is_op_input(x, "matmul", arg_name)) &&
                  var_is_op_input(x, "square", "X");
     if (!basic) {
       return false;
@@ -88,7 +90,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
     }
     auto* squared_x = squared_x_op->outputs[0];
     bool next_is_matmul_from_arg =
-        var_is_op_input(squared_x, "matmul", arg_name) &&
+        (var_is_op_input(squared_x, "matmul_v2", arg_name) ||
+         var_is_op_input(squared_x, "matmul", arg_name)) &&
         squared_x->outputs.size() == 1 &&
         squared_x->outputs[0]->outputs.size() == 1;
     if (!next_is_matmul_from_arg) {
@@ -103,7 +106,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
   auto is_fusion_first_mul_out = [=](Node* x) -> bool {
     bool input_is_matmul_op = x && x->inputs.size() == 1 &&
                               x->inputs[0]->IsOp() &&
-                              x->inputs[0]->Op()->Type() == "matmul";
+                              (x->inputs[0]->Op()->Type() == "matmul_v2" ||
+                               x->inputs[0]->Op()->Type() == "matmul");
     if (!input_is_matmul_op) {
       return false;
     }
@@ -167,7 +171,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
 
   auto* matmul_xy_op = pattern->NewNode(
       [=](Node* x) {
-        return x && x->IsOp() && x->Op()->Type() == "matmul" &&
+        return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" ||
+                                  x->Op()->Type() == "matmul") &&
                is_fusion_first_mul_out(x->outputs[0]);
       },
       name_scope + "/matmul_xy_op");
@@ -189,7 +194,9 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
 
   auto is_fusion_mat_squared_x_y_op_out = [=](Node* x) -> bool {
     bool basic = x && x->IsVar() && x->inputs.size() == 1 &&
-                 x->inputs[0]->IsOp() && x->inputs[0]->Op()->Type() == "matmul";
+                 x->inputs[0]->IsOp() &&
+                 (x->inputs[0]->Op()->Type() == "matmul_v2" ||
+                  x->inputs[0]->Op()->Type() == "matmul");
     if (!basic) {
       return false;
     }
@@ -206,7 +213,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
 
   auto* matmul_squared_x_y_op = pattern->NewNode(
       [=](Node* x) {
-        return x && x->IsOp() && x->Op()->Type() == "matmul" &&
+        return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" ||
+                                  x->Op()->Type() == "matmul") &&
                is_fusion_mat_squared_x_y_op_out(x->outputs[0]);
       },
       name_scope + "/matmul_squared_x_y_op");
@@ -378,3 +386,13 @@ void SquaredMatSubFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(squared_mat_sub_fuse_pass,
               paddle::framework::ir::SquaredMatSubFusePass);
+REGISTER_PASS_CAPABILITY(squared_mat_sub_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("matmul", 0)
+            .EQ("matmul_v2", 0)
+            .EQ("square", 0)
+            .EQ("elementwise_mul", 0)
+            .EQ("elementwise_sub", 0)
+            .EQ("fill_constant", 0)
+            .EQ("fusion_squared_mat_sub", 0));
diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
index b6165a512ac..56b7ec9b843 100644
--- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
+++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
@@ -24,7 +24,7 @@ namespace framework {
 namespace ir {
 
 /**
- * Fuse ( (A.^2 * B.^2) - (A * B).^2 ) .* scalar
+ * Fuse ( (A * B).^2 - (A.^2 * B.^2) ) .* scalar
  */
 class SquaredMatSubFusePass : public FusePassBase {
  public:
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index c19e77d2714..19f52422b44 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -156,7 +156,8 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
                   // "seqpool_concat_fuse_pass",    //
                   "seqpool_cvm_concat_fuse_pass",  //
                   // "embedding_fc_lstm_fuse_pass", //
-                  "fc_lstm_fuse_pass",                       //
+                  // TODO(wilber): fix correctness problem.
+                  // "fc_lstm_fuse_pass",                       //
                   "mul_lstm_fuse_pass",                      //
                   "fc_gru_fuse_pass",                        //
                   "mul_gru_fuse_pass",                       //
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 89acfc6075b..0ce7c098e2d 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -680,8 +680,10 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
     if not isinstance(value, Variable):
         if dtype in ['int64', 'int32']:
             attrs['str_value'] = str(int(value))
+            attrs['value'] = int(value)
         else:
             attrs['str_value'] = str(float(value))
+            attrs['value'] = float(value)
 
     if in_dygraph_mode():
         shape = utils.convert_shape_to_list(shape)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py
new file mode 100644
index 00000000000..a62adcea3f9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import AnalysisConfig
+from paddle.fluid.core import PassVersionChecker
+
+
+class FcFusePassTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 128, 768], dtype="float32")
+            data_y = fluid.data(name="y", shape=[-1, 128, 768], dtype="float32")
+            fc_out1 = fluid.layers.fc(input=data,
+                                      size=3072,
+                                      num_flatten_dims=2,
+                                      act="relu")
+            fc_out2 = fluid.layers.fc(input=fc_out1,
+                                      size=768,
+                                      num_flatten_dims=2)
+
+        self.feeds = {"data": np.random.random((4, 128, 768)).astype("float32")}
+        self.fetch_list = [fc_out2]
+
+    def test_check_output(self):
+        use_gpu = [False]
+        if core.is_compiled_with_cuda():
+            use_gpu.append(True)
+        for i in range(len(use_gpu)):
+            self.check_output_with_option(use_gpu[i])
+
+        self.assertTrue(PassVersionChecker.IsCompatible('fc_fuse_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_gru_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_gru_fuse_pass.py
new file mode 100644
index 00000000000..f7b43470d40
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_gru_fuse_pass.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+
+
+class FcGruFusePassTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            dict_dim, emb_dim = 128, 64
+            data = fluid.data(
+                name='step_data', shape=[None], dtype='int64', lod_level=1)
+            emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
+            hidden_dim = 512
+            x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
+            hidden = fluid.layers.dynamic_gru(
+                input=x,
+                size=hidden_dim,
+                bias_attr=True,
+                origin_mode=False,
+                is_reverse=True)
+
+        batch = 16
+        lod_tensor = fluid.LoDTensor()
+        lod_tensor.set(np.random.randint(
+            0, dict_dim, size=[batch]).astype("int64"),
+                       fluid.CPUPlace())
+        lod_tensor.set_lod([[0, batch]])
+        self.feeds = {"step_data": lod_tensor}
+        self.fetch_list = [hidden]
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+        self.assertTrue(PassVersionChecker.IsCompatible('fc_gru_fuse_pass'))
+
+
+class MulGruFusePassTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            dict_dim, emb_dim = 128, 64
+            data = fluid.data(
+                name='step_data', shape=[None], dtype='int64', lod_level=1)
+            emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
+            hidden_dim = 512
+            x = fluid.layers.fc(input=emb, size=hidden_dim * 3, bias_attr=False)
+            hidden = fluid.layers.dynamic_gru(
+                input=x,
+                size=hidden_dim,
+                bias_attr=True,
+                origin_mode=False,
+                is_reverse=True)
+
+        batch = 16
+        lod_tensor = fluid.LoDTensor()
+        lod_tensor.set(np.random.randint(
+            0, dict_dim, size=[batch]).astype("int64"),
+                       fluid.CPUPlace())
+        lod_tensor.set_lod([[0, batch]])
+        self.feeds = {"step_data": lod_tensor}
+        self.fetch_list = [hidden]
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+        self.assertTrue(PassVersionChecker.IsCompatible('mul_gru_fuse_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_lstm_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_lstm_fuse_pass.py
new file mode 100644
index 00000000000..fbb4373dae2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_lstm_fuse_pass.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+
+
+class MulLstmFusePassTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            dict_dim, emb_dim = 128, 64
+            hidden_dim = 512
+
+            data = fluid.data(
+                name='data', shape=[1], dtype='int64', lod_level=1)
+            emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
+            x = fluid.layers.fc(input=emb, size=hidden_dim * 4, bias_attr=False)
+            forward, cell = fluid.layers.dynamic_lstm(
+                input=x, size=hidden_dim * 4)
+
+        batch = 16
+        lod_tensor = fluid.LoDTensor()
+        lod_tensor.set(np.random.randint(
+            0, dict_dim, size=[batch]).astype("int64"),
+                       fluid.CPUPlace())
+        lod_tensor.set_lod([[0, batch]])
+        self.feeds = {"data": lod_tensor}
+        self.fetch_list = [forward, cell]
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+        self.assertTrue(PassVersionChecker.IsCompatible('mul_lstm_fuse_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py
new file mode 100644
index 00000000000..5fa242df4e4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import AnalysisConfig
+from paddle.fluid.core import PassVersionChecker
+
+
+class SquaredMatSubFusePassTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data_a = fluid.data(name="data_a", shape=[128, 1], dtype="float32")
+            data_b = fluid.data(name="data_b", shape=[256, 1], dtype="float32")
+
+            fc_a = fluid.layers.fc(data_a, size=256)
+            fc_b = fluid.layers.fc(data_b, size=64)
+
+            data_a_square = paddle.square(fc_a)
+            data_b_square = paddle.square(fc_b)
+
+            matmul_ab = paddle.matmul(fc_a, fc_b)
+            matmul_ab_square = paddle.square(matmul_ab)
+            matmul_square_ab = paddle.matmul(data_a_square, data_b_square)
+
+            scale = paddle.fill_constant(shape=[1], value=0.5, dtype='float32')
+
+            sub_val = paddle.elementwise_sub(matmul_ab_square, matmul_square_ab)
+            squared_mat_sub_out = fluid.layers.elementwise_mul(sub_val, scale)
+
+        self.feeds = {
+            "data_a": np.random.random((128, 1)).astype("float32"),
+            "data_b": np.random.random((256, 1)).astype("float32")
+        }
+        self.fetch_list = [squared_mat_sub_out]
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('squared_mat_sub_fuse_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py
index 34a52e7aed3..83d4b7091cb 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py
@@ -75,7 +75,9 @@ class TransposeFlattenConcatFusePassWithAxisTest(InferencePassTest):
             use_gpu = True
             self.check_output_with_option(use_gpu)
 
-        PassVersionChecker.IsCompatible('transpose_flatten_concat_fuse_pass')
+        self.assertTrue(
+            PassVersionChecker.IsCompatible(
+                'transpose_flatten_concat_fuse_pass'))
 
 
 if __name__ == "__main__":
-- 
GitLab


From c7e5cf16ba4529964a5d24ab8e2554a16070c411 Mon Sep 17 00:00:00 2001
From: zhangchunle <clzhang_cauc@163.com>
Date: Mon, 21 Sep 2020 14:19:57 +0800
Subject: [PATCH 150/261] rm setup.py;test=document_fix (#27421)

---
 setup.py | 577 -------------------------------------------------------
 1 file changed, 577 deletions(-)
 delete mode 100644 setup.py

diff --git a/setup.py b/setup.py
deleted file mode 100644
index af558c2ef0b..00000000000
--- a/setup.py
+++ /dev/null
@@ -1,577 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import subprocess
-import os
-import os.path
-import errno
-import re
-import shutil
-import sys
-import fnmatch
-import errno
-import platform
-
-from contextlib import contextmanager
-from setuptools import Command
-from setuptools import setup, Distribution, Extension
-from setuptools.command.install import install as InstallCommandBase
-
-
-class BinaryDistribution(Distribution):
-    def has_ext_modules(foo):
-        return True
-
-
-RC = 0
-
-ext_name = '.dll' if os.name == 'nt' else ('.dylib' if sys.platform == 'darwin'
-                                           else '.so')
-
-
-def git_commit():
-    try:
-        cmd = ['git', 'rev-parse', 'HEAD']
-        git_commit = subprocess.Popen(
-            cmd, stdout=subprocess.PIPE,
-            cwd="@PADDLE_SOURCE_DIR@").communicate()[0].strip()
-    except:
-        git_commit = 'Unknown'
-    git_commit = git_commit.decode()
-    return str(git_commit)
-
-
-def _get_version_detail(idx):
-    assert idx < 3, "vesion info consists of %(major)d.%(minor)d.%(patch)d, \
-        so detail index must less than 3"
-
-    if re.match('@TAG_VERSION_REGEX@', '@PADDLE_VERSION@'):
-        version_details = '@PADDLE_VERSION@'.split('.')
-
-        if len(version_details) >= 3:
-            return version_details[idx]
-
-    return 0
-
-
-def get_major():
-    return int(_get_version_detail(0))
-
-
-def get_minor():
-    return int(_get_version_detail(1))
-
-
-def get_patch():
-    return str(_get_version_detail(2))
-
-
-def is_taged():
-    try:
-        cmd = [
-            'git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null'
-        ]
-        git_tag = subprocess.Popen(
-            cmd, stdout=subprocess.PIPE,
-            cwd="@PADDLE_SOURCE_DIR@").communicate()[0].strip()
-        git_tag = git_tag.decode()
-    except:
-        return False
-
-    if str(git_tag).replace('v', '') == '@PADDLE_VERSION@':
-        return True
-    else:
-        return False
-
-
-def write_version_py(filename='paddle/version.py'):
-    cnt = '''# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
-#
-full_version    = '%(major)d.%(minor)d.%(patch)s'
-major           = '%(major)d'
-minor           = '%(minor)d'
-patch           = '%(patch)s'
-rc              = '%(rc)d'
-istaged         = %(istaged)s
-commit          = '%(commit)s'
-with_mkl        = '%(with_mkl)s'
-
-def show():
-    if istaged:
-        print('full_version:', full_version)
-        print('major:', major)
-        print('minor:', minor)
-        print('patch:', patch)
-        print('rc:', rc)
-    else:
-        print('commit:', commit)
-
-def mkl():
-    return with_mkl
-'''
-    commit = git_commit()
-    with open(filename, 'w') as f:
-        f.write(cnt % {
-            'major': get_major(),
-            'minor': get_minor(),
-            'patch': get_patch(),
-            'rc': RC,
-            'version': '${PADDLE_VERSION}',
-            'commit': commit,
-            'istaged': is_taged(),
-            'with_mkl': '@WITH_MKL@'
-        })
-
-
-write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version.py')
-
-
-def write_distributed_training_mode_py(
-        filename='paddle/fluid/incubate/fleet/parameter_server/version.py'):
-    cnt = '''from __future__ import print_function
-
-# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
-
-from paddle.fluid.incubate.fleet.base.mode import Mode
-
-BUILD_MODE=Mode.%(mode)s
-
-def is_transpiler():
-    return Mode.TRANSPILER == BUILD_MODE
-
-'''
-
-    dirname = os.path.dirname(filename)
-
-    try:
-        os.makedirs(dirname)
-    except OSError as e:
-        if e.errno != errno.EEXIST:
-            raise
-
-    with open(filename, 'w') as f:
-        f.write(cnt %
-                {'mode': 'PSLIB' if '${WITH_PSLIB}' == 'ON' else 'TRANSPILER'})
-
-
-write_distributed_training_mode_py(
-    filename='@PADDLE_BINARY_DIR@/python/paddle/fluid/incubate/fleet/parameter_server/version.py'
-)
-
-packages = [
-    'paddle',
-    'paddle.libs',
-    'paddle.utils',
-    'paddle.dataset',
-    'paddle.reader',
-    'paddle.distributed',
-    'paddle.incubate',
-    'paddle.incubate.complex',
-    'paddle.incubate.complex.tensor',
-    'paddle.distributed.fleet',
-    'paddle.distributed.fleet.base',
-    'paddle.distributed.fleet.meta_optimizers',
-    'paddle.distributed.fleet.runtime',
-    'paddle.distributed.fleet.dataset',
-    'paddle.distributed.fleet.metrics',
-    'paddle.distributed.fleet.proto',
-    'paddle.distributed.fleet.utils',
-    'paddle.framework',
-    'paddle.jit',
-    'paddle.fluid',
-    'paddle.fluid.inference',
-    'paddle.fluid.dygraph',
-    'paddle.fluid.dygraph.dygraph_to_static',
-    'paddle.fluid.dygraph.amp',
-    'paddle.fluid.proto',
-    'paddle.fluid.proto.profiler',
-    'paddle.fluid.distributed',
-    'paddle.fluid.layers',
-    'paddle.fluid.dataloader',
-    'paddle.fluid.contrib',
-    'paddle.fluid.contrib.decoder',
-    'paddle.fluid.contrib.quantize',
-    'paddle.fluid.contrib.reader',
-    'paddle.fluid.contrib.slim',
-    'paddle.fluid.contrib.slim.quantization',
-    'paddle.fluid.contrib.slim.quantization.imperative',
-    'paddle.fluid.contrib.utils',
-    'paddle.fluid.contrib.extend_optimizer',
-    'paddle.fluid.contrib.mixed_precision',
-    'paddle.fluid.contrib.layers',
-    'paddle.fluid.transpiler',
-    'paddle.fluid.transpiler.details',
-    'paddle.fluid.incubate',
-    'paddle.fluid.incubate.data_generator',
-    'paddle.fluid.incubate.fleet',
-    'paddle.fluid.incubate.checkpoint',
-    'paddle.fluid.incubate.fleet.base',
-    'paddle.fluid.incubate.fleet.parameter_server',
-    'paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler',
-    'paddle.fluid.incubate.fleet.parameter_server.pslib',
-    'paddle.fluid.incubate.fleet.parameter_server.ir',
-    'paddle.fluid.incubate.fleet.collective',
-    'paddle.fluid.incubate.fleet.utils',
-    'paddle.hapi',
-    'paddle.vision',
-    'paddle.vision.models',
-    'paddle.vision.transforms',
-    'paddle.vision.datasets',
-    'paddle.text',
-    'paddle.text.datasets',
-    'paddle.incubate',
-    'paddle.io',
-    'paddle.optimizer',
-    'paddle.nn',
-    'paddle.nn.functional',
-    'paddle.nn.layer',
-    'paddle.nn.initializer',
-    'paddle.nn.utils',
-    'paddle.metric',
-    'paddle.static',
-    'paddle.static.nn',
-    'paddle.tensor',
-]
-
-with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
-    setup_requires = f.read().splitlines()
-
-# Note(wangzhongpu):
-# When compiling paddle under python36, the dependencies belonging to python2.7 will be imported, resulting in errors when installing paddle
-if sys.version_info >= (3, 6) and sys.version_info < (3, 7):
-    setup_requires_tmp = []
-    for setup_requires_i in setup_requires:
-        if "<\"3.6\"" in setup_requires_i or "<\"3.5\"" in setup_requires_i or "<=\"3.5\"" in setup_requires_i:
-            continue
-        setup_requires_tmp += [setup_requires_i]
-    setup_requires = setup_requires_tmp
-if sys.version_info >= (3, 5) and sys.version_info < (3, 6):
-    setup_requires_tmp = []
-    for setup_requires_i in setup_requires:
-        if "<\"3.5\"" in setup_requires_i:
-            continue
-        setup_requires_tmp += [setup_requires_i]
-    setup_requires = setup_requires_tmp
-if sys.version_info >= (3, 7):
-    setup_requires_tmp = []
-    for setup_requires_i in setup_requires:
-        if "<\"3.6\"" in setup_requires_i or "<=\"3.6\"" in setup_requires_i or "<\"3.5\"" in setup_requires_i or "<=\"3.5\"" in setup_requires_i or "<\"3.7\"" in setup_requires_i:
-            continue
-        setup_requires_tmp += [setup_requires_i]
-    setup_requires = setup_requires_tmp
-
-if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
-    setup_requires += ['opencv-python']
-
-# the prefix is sys.prefix which should always be usr
-paddle_bins = ''
-
-if not '${WIN32}':
-    paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
-package_data = {
-    'paddle.fluid':
-    ['${FLUID_CORE_NAME}' + ('.so' if os.name != 'nt' else '.pyd')]
-}
-if '${HAS_NOAVX_CORE}' == 'ON':
-    package_data['paddle.fluid'] += [
-        'core_noavx' + ('.so' if os.name != 'nt' else '.pyd')
-    ]
-
-package_dir = {
-    '': '${PADDLE_BINARY_DIR}/python',
-    # The paddle.fluid.proto will be generated while compiling.
-    # So that package points to other directory.
-    'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
-    'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
-    'paddle.fluid': '${PADDLE_BINARY_DIR}/python/paddle/fluid',
-}
-
-# put all thirdparty libraries in paddle.libs
-libs_path = '${PADDLE_BINARY_DIR}/python/paddle/libs'
-
-package_data['paddle.libs'] = []
-package_data['paddle.libs'] = [('libwarpctc'
-                                if os.name != 'nt' else 'warpctc') + ext_name]
-shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
-
-if '${WITH_MKL}' == 'ON':
-    shutil.copy('${MKLML_SHARED_LIB}', libs_path)
-    shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path)
-    package_data['paddle.libs'] += [
-        ('libmklml_intel' if os.name != 'nt' else 'mklml') + ext_name,
-        ('libiomp5' if os.name != 'nt' else 'libiomp5md') + ext_name
-    ]
-else:
-    if os.name == 'nt':
-        # copy the openblas.dll
-        shutil.copy('${OPENBLAS_SHARED_LIB}', libs_path)
-        package_data['paddle.libs'] += ['openblas' + ext_name]
-
-if '${WITH_LITE}' == 'ON':
-    shutil.copy('${LITE_SHARED_LIB}', libs_path)
-    package_data['paddle.libs'] += ['libpaddle_full_api_shared' + ext_name]
-
-if '${WITH_PSLIB}' == 'ON':
-    shutil.copy('${PSLIB_LIB}', libs_path)
-    if os.path.exists('${PSLIB_VERSION_PY}'):
-        shutil.copy(
-            '${PSLIB_VERSION_PY}',
-            '${PADDLE_BINARY_DIR}/python/paddle/fluid/incubate/fleet/parameter_server/pslib/'
-        )
-    package_data['paddle.libs'] += ['libps' + ext_name]
-
-if '${WITH_MKLDNN}' == 'ON':
-    if '${CMAKE_BUILD_TYPE}' == 'Release' and os.name != 'nt':
-        # only change rpath in Release mode.
-        # TODO(typhoonzero): use install_name_tool to patch mkl libs once
-        # we can support mkl on mac.
-        #
-        # change rpath of libdnnl.so.1, add $ORIGIN/ to it.
-        # The reason is that all thirdparty libraries in the same directory,
-        # thus, libdnnl.so.1 will find libmklml_intel.so and libiomp5.so.
-        command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
-        if os.system(command) != 0:
-            raise Exception("patch libdnnl.so failed, command: %s" % command)
-    shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
-    if os.name != 'nt':
-        shutil.copy('${MKLDNN_SHARED_LIB_1}', libs_path)
-        package_data['paddle.libs'] += ['libmkldnn.so.0', 'libdnnl.so.1']
-    else:
-        package_data['paddle.libs'] += ['mkldnn.dll']
-
-if '${WITH_XPU}' == 'ON':
-    # only change rpath in Release mode,
-    if '${CMAKE_BUILD_TYPE}' == 'Release':
-        if os.name != 'nt':
-            if "@APPLE@" == "1":
-                command = "install_name_tool -id \"@loader_path/\" ${XPU_API_LIB}"
-            else:
-                command = "patchelf --set-rpath '$ORIGIN/' ${XPU_API_LIB}"
-            if os.system(command) != 0:
-                raise Exception("patch ${XPU_API_LIB} failed, command: %s" %
-                                command)
-    shutil.copy('${XPU_API_LIB}', libs_path)
-    shutil.copy('${XPU_RT_LIB}', libs_path)
-    shutil.copy('${XPU_SIM_LIB}', libs_path)
-    package_data['paddle.libs'] += [
-        '${XPU_API_LIB_NAME}', '${XPU_RT_LIB_NAME}', '${XPU_SIM_LIB_NAME}'
-    ]
-
-# copy libfuild_framework.so to libs
-if os.name != 'nt' and sys.platform != 'darwin':
-    paddle_framework_lib = '${FLUID_FRAMEWORK_SHARED_LIB}'
-    shutil.copy(paddle_framework_lib, libs_path)
-    package_data['paddle.libs'] += [
-        ('libpaddle_framework'
-         if os.name != 'nt' else 'paddle_framework') + ext_name
-    ]
-
-# remove unused paddle/libs/__init__.py
-if os.path.isfile(libs_path + '/__init__.py'):
-    os.remove(libs_path + '/__init__.py')
-package_dir['paddle.libs'] = libs_path
-
-# change rpath of ${FLUID_CORE_NAME}.ext, add $ORIGIN/../libs/ to it.
-# The reason is that libwarpctc.ext, libiomp5.ext etc are in paddle.libs, and
-# ${FLUID_CORE_NAME}.ext is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries.
-# This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213
-if '${CMAKE_BUILD_TYPE}' == 'Release':
-    if os.name != 'nt':
-        # only change rpath in Release mode, since in Debug mode, ${FLUID_CORE_NAME}.xx is too large to be changed.
-        if "@APPLE@" == "1":
-            command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'
-        else:
-            command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'
-        # The dynamic library compiled under aarch64 is greater than 64M,
-        # and an oversize error will be reported when using patchelf.
-        if platform.machine() != 'aarch64':
-            if os.system(command) != 0:
-                raise Exception(
-                    "patch ${FLUID_CORE_NAME}.%s failed, command: %s" %
-                    (ext_name, command))
-
-ext_modules = [Extension('_foo', ['stub.cc'])]
-if os.name == 'nt':
-    # fix the path separator under windows
-    fix_package_dir = {}
-    for k, v in package_dir.items():
-        fix_package_dir[k] = v.replace('/', '\\')
-    package_dir = fix_package_dir
-    ext_modules = []
-elif sys.platform == 'darwin':
-    ext_modules = []
-
-
-def find_files(pattern, root):
-    for dirpath, _, files in os.walk(root):
-        for filename in fnmatch.filter(files, pattern):
-            yield os.path.join(dirpath, filename)
-
-
-headers = (
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/framework')) +
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/imperative')) +
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/memory')) +
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/platform')) +
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/string')) +
-    list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/platform')) +
-    list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/framework')) +
-    list(find_files('*.pb', '${cudaerror_INCLUDE_DIR}'))
-    +  # errorMessage.pb for errormessage
-    ['${EIGEN_INCLUDE_DIR}/Eigen/Core'] +  # eigen
-    list(find_files('*', '${EIGEN_INCLUDE_DIR}/Eigen/src')) +  # eigen
-    list(find_files('*', '${EIGEN_INCLUDE_DIR}/unsupported/Eigen')) +  # eigen
-    list(find_files('*', '${GFLAGS_INSTALL_DIR}/include')) +  # gflags
-    list(find_files('*', '${GLOG_INSTALL_DIR}/include')) +  # glog
-    list(find_files('*', '${BOOST_INCLUDE_DIR}/boost')) +  # boost
-    list(find_files('*', '${XXHASH_INSTALL_DIR}/include')) +  # xxhash
-    list(find_files('*', '${PROTOBUF_INCLUDE_DIR}')) +  # protobuf
-    list(find_files('*', '${DLPACK_INCLUDE_DIR}')) +  # dlpack
-    list(find_files('*.h', '${THREADPOOL_INCLUDE_DIR}')))  # threadpool
-
-if '${WITH_MKLDNN}' == 'ON':
-    headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include'))  # mkldnn
-
-if '${WITH_GPU}' == 'ON':
-    headers += list(find_files(
-        '*.pb', '${cudaerror_INCLUDE_DIR}'))  # errorMessage.pb for errormessage
-
-
-class InstallCommand(InstallCommandBase):
-    def finalize_options(self):
-        ret = InstallCommandBase.finalize_options(self)
-        self.install_headers = os.path.join(self.install_purelib, 'paddle',
-                                            'include')
-        self.install_lib = self.install_platlib
-        return ret
-
-
-class InstallHeaders(Command):
-    """Override how headers are copied.
-    """
-    description = 'install C/C++ header files'
-
-    user_options = [
-        ('install-dir=', 'd', 'directory to install header files to'),
-        ('force', 'f', 'force installation (overwrite existing files)'),
-    ]
-
-    boolean_options = ['force']
-
-    def initialize_options(self):
-        self.install_dir = None
-        self.force = 0
-        self.outfiles = []
-
-    def finalize_options(self):
-        self.set_undefined_options(
-            'install', ('install_headers', 'install_dir'), ('force', 'force'))
-
-    def mkdir_and_copy_file(self, header):
-        if 'pb.h' in header:
-            install_dir = re.sub('${PADDLE_BINARY_DIR}/', '', header)
-        elif 'third_party' not in header:
-            # framework
-            install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header)
-        else:
-            # third_party
-            install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header)
-            patterns = [
-                'eigen3/src/extern_eigen3', 'boost/src/extern_boost',
-                'dlpack/src/extern_dlpack/include', 'install/protobuf/include',
-                'install/gflags/include', 'install/glog/include',
-                'install/xxhash/include', 'install/mkldnn/include',
-                'threadpool/src/extern_threadpool'
-            ]
-            for pattern in patterns:
-                install_dir = re.sub(pattern, '', install_dir)
-        install_dir = os.path.join(self.install_dir,
-                                   os.path.dirname(install_dir))
-        if not os.path.exists(install_dir):
-            self.mkpath(install_dir)
-        return self.copy_file(header, install_dir)
-
-    def run(self):
-        # only copy third_party/cudaErrorMessage.pb for cudaErrorMessage on mac or windows
-        if os.name == 'nt' or sys.platform == 'darwin':
-            if '${WITH_GPU}' == 'ON':
-                self.mkdir_and_copy_file(
-                    '${cudaerror_INCLUDE_DIR}/cudaErrorMessage.pb')
-            return
-        hdrs = self.distribution.headers
-        if not hdrs:
-            return
-        self.mkpath(self.install_dir)
-        for header in hdrs:
-            (out, _) = self.mkdir_and_copy_file(header)
-            self.outfiles.append(out)
-
-    def get_inputs(self):
-        return self.distribution.headers or []
-
-    def get_outputs(self):
-        return self.outfiles
-
-
-# we redirect setuptools log for non-windows
-if sys.platform != 'win32':
-
-    @contextmanager
-    def redirect_stdout():
-        f_log = open('${SETUP_LOG_FILE}', 'w')
-        origin_stdout = sys.stdout
-        sys.stdout = f_log
-        yield
-        f_log = sys.stdout
-        sys.stdout = origin_stdout
-        f_log.close()
-else:
-
-    @contextmanager
-    def redirect_stdout():
-        yield
-
-
-if '${WITH_GPU}' == 'ON':
-    os.environ['PACKAGE_NAME'] = "paddlepaddle-gpu"
-else:
-    os.environ['PACKAGE_NAME'] = "paddlepaddle"
-
-with redirect_stdout():
-    setup(
-        name='${PACKAGE_NAME}',
-        version='${PADDLE_VERSION}',
-        description='Parallel Distributed Deep Learning',
-        install_requires=setup_requires,
-        packages=packages,
-        ext_modules=ext_modules,
-        package_data=package_data,
-        package_dir=package_dir,
-        scripts=paddle_bins,
-        distclass=BinaryDistribution,
-        headers=headers,
-        cmdclass={
-            'install_headers': InstallHeaders,
-            'install': InstallCommand,
-        },
-        entry_points={
-            'console_scripts':
-            ['fleetrun = paddle.distributed.fleet.launch:launch']
-        })
-
-# As there are a lot of files in purelib which causes many logs,
-# we don't print them on the screen, and you can open `setup.py.log`
-# for the full logs.
-if os.path.exists('${SETUP_LOG_FILE}'):
-    os.system('grep -v "purelib" ${SETUP_LOG_FILE}')
-- 
GitLab


From 669efb98de55c617387603acebf875a669432706 Mon Sep 17 00:00:00 2001
From: LutaoChu <30695251+LutaoChu@users.noreply.github.com>
Date: Mon, 21 Sep 2020 16:10:26 +0800
Subject: [PATCH 151/261] Fix bug: shapes of Topk outputs are wrong when the
 parameter k is Tensor

Fix bug: shapes of Topk outputs are wrong when the parameter k is Tensor
---
 paddle/fluid/operators/top_k_v2_op.cc         | 15 ++++++++++++---
 .../fluid/tests/unittests/test_top_k_v2_op.py | 19 +++++++++++++++----
 2 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc
index cc72d83411f..0e3fcced19e 100644
--- a/paddle/fluid/operators/top_k_v2_op.cc
+++ b/paddle/fluid/operators/top_k_v2_op.cc
@@ -32,7 +32,6 @@ class TopkV2Op : public framework::OperatorWithKernel {
 
     auto input_dims = ctx->GetInputDim("X");
     const int& dim_size = input_dims.size();
-    const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
     int axis = static_cast<int>(ctx->Attrs().Get<int>("axis"));
     PADDLE_ENFORCE_EQ((axis < dim_size) && (axis >= (-1 * dim_size)), true,
                       "the axis of topk"
@@ -41,8 +40,18 @@ class TopkV2Op : public framework::OperatorWithKernel {
 
     if (axis < 0) axis += dim_size;
 
-    PADDLE_ENFORCE_GE(
-        k, 1, "the attribute of k in the topk must >= 1, but received %d .", k);
+    int k;
+    auto k_is_tensor = ctx->HasInput("K");
+    if (k_is_tensor) {
+      k = -1;
+    } else {
+      k = static_cast<int>(ctx->Attrs().Get<int>("k"));
+      PADDLE_ENFORCE_EQ(k >= 1, true,
+                        "the attribute of k in the topk must >= 1 or be a "
+                        "Tensor, but received %d .",
+                        k);
+    }
+
     PADDLE_ENFORCE_GE(input_dims.size(), 1,
                       "input of topk must have >= 1d shape");
 
diff --git a/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py
index 54e7765c0fb..b9d96f329b5 100644
--- a/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py
@@ -63,28 +63,28 @@ class TestTopkOp(OpTest):
         self.check_grad(set(['X']), 'Out')
 
 
-class TestTopOp1(TestTopkOp):
+class TestTopkOp1(TestTopkOp):
     def init_args(self):
         self.k = 3
         self.axis = 0
         self.largest = True
 
 
-class TestTopOp2(TestTopkOp):
+class TestTopkOp2(TestTopkOp):
     def init_args(self):
         self.k = 3
         self.axis = 0
         self.largest = False
 
 
-class TestTopOp3(TestTopkOp):
+class TestTopkOp3(TestTopkOp):
     def init_args(self):
         self.k = 4
         self.axis = 0
         self.largest = False
 
 
-class TestTopOp4(TestTopkOp):
+class TestTopkOp4(TestTopkOp):
     def init_args(self):
         self.k = 4
         self.axis = 0
@@ -189,6 +189,8 @@ class TestTopKAPI(unittest.TestCase):
             result1 = paddle.topk(input_tensor, k=2)
             result2 = paddle.topk(input_tensor, k=2, axis=-1)
             result3 = paddle.topk(input_tensor, k=k_tensor, axis=1)
+            self.assertEqual(result3[0].shape, (6, -1, 8))
+            self.assertEqual(result3[1].shape, (6, -1, 8))
             result4 = paddle.topk(input_tensor, k=2, axis=1, largest=False)
             result5 = paddle.topk(input_tensor, k=2, axis=-1, largest=False)
             result6 = paddle.topk(large_input_tensor, k=1, axis=-1)
@@ -239,6 +241,15 @@ class TestTopKAPI(unittest.TestCase):
             self.run_dygraph(place)
             self.run_static(place)
 
+    def test_errors(self):
+        paddle.disable_static()
+        x = paddle.to_tensor([1, 2, 3])
+        with self.assertRaises(BaseException):
+            paddle.topk(x, k=-1)
+
+        with self.assertRaises(BaseException):
+            paddle.topk(x, k=0)
+
 
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab


From aba759ba16422abf8cd39ae7e19d24f5997b9ade Mon Sep 17 00:00:00 2001
From: Leo Chen <chenqiuliang@baidu.com>
Date: Mon, 21 Sep 2020 17:55:29 +0800
Subject: [PATCH 152/261] [Feature] Enhance inplace addto strategy for gradient
 accumulation in static graph (#27112)

* support use add instead of sum to do gradient accumulation

* add inplace addto pass

* add grad_add op and inplace addto pass

* remove debug code

* code refine

* fix bug when sereral sum ops inserts at same op_idx

* fix Flags type

* add addto attribute for conv3d

* fix ut

* code clean

* fix type
---
 paddle/fluid/framework/details/CMakeLists.txt |   1 +
 .../fluid/framework/details/build_strategy.h  |   4 +
 .../fluid/framework/details/op_handle_base.cc |   7 +
 .../fluid/framework/details/op_handle_base.h  |   6 +
 .../details/share_tensor_buffer_functor.cc    |  12 +-
 .../details/share_tensor_buffer_functor.h     |  10 +-
 .../details/share_tensor_buffer_op_handle.cc  |   9 +-
 .../details/share_tensor_buffer_op_handle.h   |   5 +-
 .../ir/memory_optimize_pass/CMakeLists.txt    |   2 +
 .../buffer_shared_inplace_op_pass.cc          |   6 +-
 .../inplace_addto_op_pass.cc                  | 221 ++++++++++++++++++
 .../memory_optimize_pass/memory_reuse_pass.cc |  11 +-
 .../memory_optimize_pass/memory_reuse_pass.h  |  14 +-
 paddle/fluid/framework/operator.h             |   8 +
 paddle/fluid/framework/parallel_executor.cc   |  19 ++
 paddle/fluid/operators/conv_cudnn_op.cu       |  27 ++-
 paddle/fluid/operators/conv_op.cc             |  10 +
 .../elementwise/elementwise_add_op.cc         |  18 ++
 .../elementwise/elementwise_add_op.cu         |   7 +
 paddle/fluid/platform/flags.cc                |  15 ++
 .../pybind/global_value_getter_setter.cc      |   3 +-
 paddle/fluid/pybind/pybind.cc                 |   6 +
 python/paddle/fluid/__init__.py               |   1 +
 python/paddle/fluid/backward.py               |  94 ++++++--
 .../unittests/test_inplace_addto_strategy.py  | 114 +++++++++
 25 files changed, 589 insertions(+), 41 deletions(-)
 create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc
 create mode 100644 python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py

diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index a3cc4d1721e..8281ec21438 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -74,6 +74,7 @@ set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto
     eager_deletion_pass
     buffer_shared_inplace_op_pass
     buffer_shared_cross_op_memory_reuse_pass
+    inplace_addto_op_pass
     set_reader_device_info_utils
     add_reader_dependency_pass)
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 5388df6bc50..01d496d4ea7 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -19,6 +19,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "boost/optional.hpp"
 #include "paddle/fluid/framework/ir/pass_builder.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -119,6 +120,9 @@ struct BuildStrategy {
   // Turn on inplace by default.
   bool enable_inplace_{true};
 
+  // Turn off inplace addto by default.
+  bool enable_addto_{false};
+
   // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
   // num_trainers is 1, so the current fields of build_strategy doesn't tell if
   // it's distributed model.
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 35fe5d631fb..459bcff5c0b 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/op_handle_base.h"
+
 #include <map>
 #include <unordered_set>
 
@@ -88,6 +89,12 @@ void OpHandleBase::Run(bool use_cuda) {
   PADDLE_ENFORCE(!use_cuda);
 #endif
 
+  // skip running current op, used with inplace_addto_op_pass
+  if (skip_running_) {
+    VLOG(4) << "skip running: " << Name();
+    return;
+  }
+
   RunImpl();
 }
 
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index c5aa1295aad..097f54d5d58 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -18,6 +18,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/details/var_handle.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -52,6 +53,10 @@ class OpHandleBase {
 
   virtual Priority GetPriority() const { return kNormal; }
 
+  virtual bool GetSkipRunning() const { return skip_running_; }
+
+  virtual void SetSkipRunning(bool skip_runing) { skip_running_ = skip_runing; }
+
   virtual std::string Name() const = 0;
 
   void Run(bool use_cuda);
@@ -131,6 +136,7 @@ class OpHandleBase {
   std::map<platform::Place, platform::DeviceContext *> dev_ctxes_;
 
   std::vector<Scope *> local_exec_scopes_;
+  bool skip_running_ = false;
 
 #ifdef PADDLE_WITH_CUDA
   std::unordered_map<int, cudaEvent_t> events_;
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
index 19f075018ce..5fbaf3cbfe0 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
+++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
@@ -48,12 +48,13 @@ static inline Tensor *GetMutableTensorFromVar(Variable *var) {
 ShareTensorBufferFunctor::ShareTensorBufferFunctor(
     Scope *scope, size_t scope_idx, const std::string &op_type,
     const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
-    const std::vector<std::string> &out_var_names)
+    const std::vector<std::string> &out_var_names, bool share_dims)
     : scope_(scope),
       scope_idx_(scope_idx),
       op_type_(op_type),
       in_var_infos_(in_var_infos),
-      out_var_names_(out_var_names) {
+      out_var_names_(out_var_names),
+      share_dims_(share_dims) {
   PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size(),
                     platform::errors::PreconditionNotMet(
                         "The number of input variables and output variables "
@@ -151,6 +152,13 @@ void ShareTensorBufferFunctor::operator()(Scope *exec_scope) {
     } else {
       out_tensor->ShareBufferWith(in_tensor);
 
+      // NOTE(zhiqiu): In the case of inplace addto, if the operator of
+      // the in_out_vars is skipped during running, we should set the dims of
+      // output as the same as input.
+      if (share_dims_) {
+        out_tensor->Resize(in_tensor.dims());
+      }
+
       VLOG(2) << "Share tensor buffer when running " << op_type_ << " : "
               << in_var_info->Name() << " -> " << out_var_names_[i];
     }
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.h b/paddle/fluid/framework/details/share_tensor_buffer_functor.h
index 774dcd056e5..be49d1c432b 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_functor.h
+++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.h
@@ -19,6 +19,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
 #include "paddle/fluid/framework/scope.h"
@@ -40,11 +41,13 @@ class ShareTensorBufferFunctor {
   ShareTensorBufferFunctor(
       Scope *scope, size_t scope_idx, const std::string &op_type,
       const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
-      const std::vector<std::string> &out_var_names);
+      const std::vector<std::string> &out_var_names, bool share_dims = false);
 
   void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info,
                        const std::string &out_var_name);
 
+  void SetShareDims(bool share_dims) { share_dims_ = share_dims; }
+
   void operator()(Scope *exec_scope);
 
   std::unordered_map<std::string, std::string> ReusedVars() const;
@@ -66,6 +69,11 @@ class ShareTensorBufferFunctor {
   std::vector<std::string> out_var_names_;
 
   std::vector<std::pair<const Variable *, Variable *>> in_out_vars_;
+
+  // NOTE(zhiqiu): In the case of inplace addto, if the operator of
+  // the in_out_vars is skipped during running, we should set the dims of output
+  // as the same as input.
+  bool share_dims_{false};
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
index b805ad3b072..be3f5515a97 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
+++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
@@ -59,9 +59,10 @@ ComputationOpHandle *GetUniquePendingComputationOpHandle(
 ShareTensorBufferOpHandle::ShareTensorBufferOpHandle(
     ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type,
     const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
-    const std::vector<std::string> &out_var_names)
+    const std::vector<std::string> &out_var_names, bool share_dims)
     : OpHandleBase(node),
-      functor_(scope, scope_idx, op_type, in_var_infos, out_var_names) {}
+      functor_(scope, scope_idx, op_type, in_var_infos, out_var_names,
+               share_dims) {}
 
 std::unordered_map<std::string, std::string>
 ShareTensorBufferOpHandle::ReusedVars() const {
@@ -73,6 +74,10 @@ void ShareTensorBufferOpHandle::AddReuseVarPair(
   functor_.AddReuseVarPair(in_var_info, out_var_name);
 }
 
+void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) {
+  functor_.SetShareDims(share_dims);
+}
+
 void ShareTensorBufferOpHandle::InitCUDA() {
 #ifdef PADDLE_WITH_CUDA
   int dev_id =
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h
index b22f5621fe4..a02c346485e 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h
+++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h
@@ -17,6 +17,7 @@
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
@@ -31,7 +32,7 @@ class ShareTensorBufferOpHandle : public OpHandleBase {
       ir::Node *node, Scope *scope, size_t scope_idx,
       const std::string &op_type,
       const std::vector<const ir::MemOptVarInfo *> &in_vars_infos,
-      const std::vector<std::string> &out_var_names);
+      const std::vector<std::string> &out_var_names, bool share_dims = false);
 
   std::unordered_map<std::string, std::string> ReusedVars() const;
 
@@ -42,6 +43,8 @@ class ShareTensorBufferOpHandle : public OpHandleBase {
   void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info,
                        const std::string &out_var_name);
 
+  void SetShareDims(bool share_dims);
+
   const ShareTensorBufferFunctor &Functor() const { return functor_; }
 
  protected:
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
index 726a2d90fcf..a8c0973cac4 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
@@ -13,4 +13,6 @@ cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handl
 cc_library(buffer_shared_inplace_op_pass SRCS buffer_shared_inplace_op_pass.cc DEPS memory_reuse_pass)
 cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_memory_reuse_pass.cc DEPS memory_reuse_pass) 
 
+cc_library(inplace_addto_op_pass SRCS inplace_addto_op_pass.cc DEPS memory_reuse_pass)
+
 cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op)
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc
index 0b42f2ebd55..ce7f27d2755 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc
@@ -16,6 +16,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
@@ -141,11 +142,12 @@ void BufferSharedInplaceOpPass::Run(Graph *graph) const {
         VLOG(4) << "Inplace performed in op " << op_type << ": "
                 << in_var_handle_ptr->Name() << " -> "
                 << out_var_handle_ptr->Name()
-                << ". Debug String is: " << op->GetOp()->DebugString();
+                << ". Debug String is: " << op->GetOp()->DebugString()
+                << ". ReuseType: " << ReuseType();
       } else {
         VLOG(3) << "Inplace failed in op " << op_type << ": "
                 << in_var_handle_ptr->Name() << " -> "
-                << out_var_handle_ptr->Name();
+                << out_var_handle_ptr->Name() << ". ReuseType: " << ReuseType();
       }
     }
   }
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc
new file mode 100644
index 00000000000..81c63f46bda
--- /dev/null
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc
@@ -0,0 +1,221 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
+#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
+#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h"
+#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class InplaceAddToOpPass : public MemoryReusePass {
+ protected:
+  std::string ReuseType() const override { return "inplace_addto"; }
+
+  void Run(Graph *graph) const override;
+
+ private:
+  // 1. Add last living op of in_var, add any last living op of out_var
+  // 2. Set reference count of in_var to be 2
+  void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
+                             details::VarHandle *in_var,
+                             details::VarHandle *out_var) const override {
+    size_t scope_idx = op->GetScopeIdx();
+    auto *last_live_ops_of_vars_ =
+        &Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
+    auto *var_infos_ = &(Get<MemOptVarInfoMapList>(kMemOptVarInfoMapList));
+    auto out_var_op_iter =
+        (*last_live_ops_of_vars_)[scope_idx].find(out_var->Name());
+
+    // In Reduce mode, some output variable(gradient of parameter) does not have
+    // last live ops
+    details::ComputationOpHandle *last_live_op_of_in_var = nullptr;
+    if (out_var_op_iter == (*last_live_ops_of_vars_)[scope_idx].end()) {
+      last_live_op_of_in_var = op;
+    } else {
+      PADDLE_ENFORCE_EQ(
+          out_var_op_iter->second.ops().empty(), false,
+          platform::errors::InvalidArgument(
+              "Var(%s)'s last live op should not empty.", out_var->Name()));
+      last_live_op_of_in_var = *(out_var_op_iter->second.ops().begin());
+    }
+
+    auto *last_live_ops_of_in_var =
+        (*last_live_ops_of_vars_)[scope_idx][in_var->Name()].mutable_ops();
+    // last_live_ops_of_in_var->clear();
+    last_live_ops_of_in_var->insert(last_live_op_of_in_var);
+
+    auto in_var_info_iter = (*var_infos_)[scope_idx].find(in_var->Name());
+    PADDLE_ENFORCE_NE(
+        in_var_info_iter, (*var_infos_)[scope_idx].end(),
+        platform::errors::NotFound("Cannot find variable %s.", in_var->Name()));
+
+    in_var_info_iter->second->SetRefCnt(2);  // before inplace, it is 1
+  }
+};
+
+void InplaceAddToOpPass::Run(Graph *graph) const {
+  const auto &last_live_ops =
+      Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
+
+  bool use_cuda = Get<bool>(kUseCuda);
+
+  // Currently, only perform InplaceAddToOpPass on cuda place
+  if (!use_cuda) {
+    return;
+  }
+
+  // Step 1: Build a reverse map of last_live_ops
+  // i.e.: op -> vars
+  std::unordered_map<details::ComputationOpHandle *,
+                     std::unordered_map<std::string, ir::Node *>>
+      candidate_ops;
+  for (auto &each_scope_ops : last_live_ops) {
+    for (auto &pair : each_scope_ops) {
+      // If variable has more than 1 last lived ops, this variable cannot
+      // be inplaced.
+      if (pair.second.ops().size() != 1) {
+        continue;
+      }
+
+      auto *op = *(pair.second.ops().begin());
+      const std::string &op_type = op->GetOp()->Type();
+      const framework::OpDesc *op_desc = op->Node()->Op();
+      PADDLE_ENFORCE_NOT_NULL(
+          op_desc, platform::errors::NotFound("Op(%s) can not find opdesc.",
+                                              op->Name()));
+
+      // only grad op should be processed.
+      if (op_type != "grad_add") {
+        continue;
+      }
+
+      const std::string &var_name = pair.first;
+      auto in_nodes = this->FindNodesByName(var_name, op->Node()->inputs);
+      if (in_nodes.size() == 1) {
+        candidate_ops[op][var_name] = *in_nodes.begin();
+      }
+      VLOG(4) << "Find op " << op_type << " with input(" << var_name
+              << ") that can do inplace add to";
+    }
+  }
+
+  // Step 2: Check which vars can be inplaced indeed
+  for (auto &op_vars_pair : candidate_ops) {
+    auto *op = op_vars_pair.first;
+
+    // The original gradient accumulation is g = sum(g_0, g_1,..., g_n), and it
+    // could be changed as follws if inplace addto is enabled:
+    // g_sum_0 = g_0
+    // g_sum_1 = grad_add(g_sum_0, g_1)
+    // g_sum_2 = grad_add(g_sum_1, g_2)
+    // ...
+    // g_sum_n = grad_add(g_sum_n-1, g_n)
+
+    // here we will add inplace for each grad_add, for example, for the first
+    // grad_add, g_sum_0 -> g1, g_sum_1 -> g1, and set grad_add as skipped.
+
+    const std::string &op_type = op->GetOp()->Type();
+
+    PADDLE_ENFORCE_EQ(op->Node()->inputs.size(), 2,
+                      platform::errors::InvalidArgument(
+                          "The size of inputs of %s should be 2, but got %d",
+                          op_type, op->Node()->inputs.size()));
+
+    PADDLE_ENFORCE_EQ(op->Node()->outputs.size(), 1,
+                      platform::errors::InvalidArgument(
+                          "The size of outputs of %s should be 1, but got %d",
+                          op_type, op->Node()->outputs.size()));
+
+    auto *left_var_ptr = dynamic_cast<details::VarHandle *>(
+        &(op->Node()->inputs[0]->Wrapper<details::VarHandleBase>()));
+    auto *right_var_ptr = dynamic_cast<details::VarHandle *>(
+        &(op->Node()->inputs[1]->Wrapper<details::VarHandleBase>()));
+    auto *out_var_ptr = dynamic_cast<details::VarHandle *>(
+        &(op->Node()->outputs[0]->Wrapper<details::VarHandleBase>()));
+
+    if (left_var_ptr == nullptr || right_var_ptr == nullptr ||
+        out_var_ptr == nullptr) {
+      continue;
+    }
+
+    // auto *left_generated_op = dynamic_cast<details::ComputationOpHandle *>(
+    //     left_var_ptr->GeneratedOp());
+
+    auto *right_generated_op = dynamic_cast<details::ComputationOpHandle *>(
+        right_var_ptr->GeneratedOp());
+
+    auto *out_generated_op = dynamic_cast<details::ComputationOpHandle *>(
+        out_var_ptr->GeneratedOp());
+
+    // NOTE(zhiqiu): currently, only conv2d_grad supports addto strategy
+    if (right_generated_op->Name() != "conv2d_grad") {
+      continue;
+    }
+
+    // NOTE(zhiqiu): Normally, if we inplace a->b, we should let a generated
+    // before b. However, in the situation of inplace addto, we do not care
+    // the order, since a+b is equal to b+a. Is there any exception for that?
+
+    // AddDependencyVar(right_generated_op, left_generated_op);
+    // no need, as discussed above.
+
+    // step (a): inplace right_var->left_var of grad_add
+
+    this->AddReuseVar(right_generated_op, left_var_ptr, right_var_ptr);
+    UpdateLastLiveOpOfVar(right_generated_op, left_var_ptr, right_var_ptr);
+    VLOG(4) << "Inplace performed in op " << right_generated_op->GetOp()->Type()
+            << ": " << left_var_ptr->Name() << " -> " << right_var_ptr->Name()
+            << ". Debug String is: "
+            << right_generated_op->GetOp()->DebugString()
+            << ". ReuseType: " << ReuseType();
+
+    // step (b): inplace out -> right_var of grad_add
+
+    this->AddReuseVar(out_generated_op, right_var_ptr, out_var_ptr, true);
+
+    VLOG(4) << "Inplace performed in op " << op_type << ": "
+            << left_var_ptr->Name() << " -> " << out_var_ptr->Name()
+            << ". Debug String is: " << op->GetOp()->DebugString()
+            << ". ReuseType: " << ReuseType();
+
+    // step (c): make right_var cannot inplace afterwards. canbe done
+    // aotomatically since CollectReusedVars is called before any reuse.
+
+    // step (d): make right_var's generated op use addto
+    right_generated_op->GetOp()->SetAttr("use_addto", true);
+
+    // step (e): make grad_add skip running
+    op->SetSkipRunning(true);
+  }
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(inplace_addto_op_pass, paddle::framework::ir::InplaceAddToOpPass)
+    .RequirePassAttr(paddle::framework::ir::kMemOptVarInfoMapList)
+    .RequirePassAttr(paddle::framework::ir::kLastLiveOpsOfVars)
+    .RequirePassAttr(paddle::framework::ir::kUseCuda);
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc
index 221b0a76e7e..3e3b9864a7b 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h"
+
 #include <functional>
 #include <map>
 #include <string>
@@ -73,6 +74,7 @@ bool MemoryReusePass::TryReuseVar(details::VarHandle *in_var,
           out_var->Name()));
   if (IsVarPairReusable(*in_var, *out_var)) {
     AddReuseVar(op, in_var, out_var);
+    UpdateLastLiveOpOfVar(op, in_var, out_var);
     return true;
   } else {
     return false;
@@ -324,7 +326,8 @@ bool MemoryReusePass::IsVarPairReusable(
 
 void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op,
                                   details::VarHandle *in_var,
-                                  details::VarHandle *out_var) const {
+                                  details::VarHandle *out_var,
+                                  bool share_dims) const {
   PADDLE_ENFORCE_GT(
       (*var_infos_)[op->GetScopeIdx()].count(in_var->Name()), 0,
       platform::errors::NotFound("Var(%s) does not in mem opt var infos.",
@@ -344,13 +347,15 @@ void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op,
     share_buffer_op->AddInput(in_var);
   }
 
+  if (share_dims) {
+    share_buffer_op->SetShareDims(true);
+  }
+
   share_buffer_op->AddReuseVarPair(
       (*var_infos_)[op->GetScopeIdx()].at(in_var->Name()).get(),
       out_var->Name());
   reused_in_var_names_[op->GetScopeIdx()].insert(in_var->Name());
   reused_out_var_names_[op->GetScopeIdx()].insert(out_var->Name());
-
-  UpdateLastLiveOpOfVar(op, in_var, out_var);
 }
 
 // 1. Set last living op of in_var to be any last living op of out_var
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h
index 82274419184..1c0c6ae6020 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h
@@ -18,6 +18,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
@@ -92,6 +93,12 @@ class MemoryReusePass : public Pass {
 
   int64_t GetMemorySize(const details::VarHandle &var) const;
 
+  void AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var,
+                   details::VarHandle *out_var, bool share_dims = false) const;
+  virtual void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
+                                     details::VarHandle *in_var,
+                                     details::VarHandle *out_var) const;
+
  private:
   VarDesc *GetVarDesc(const details::VarHandle &var) const;
 
@@ -109,13 +116,6 @@ class MemoryReusePass : public Pass {
 
   void CollectReusedVars() const;
 
-  void AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var,
-                   details::VarHandle *out_var) const;
-
-  void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
-                             details::VarHandle *in_var,
-                             details::VarHandle *out_var) const;
-
  private:
   mutable Graph *graph_;
   mutable bool use_cuda_;
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index ebecbf0498c..bd52d7ffef5 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -157,6 +157,14 @@ class OperatorBase {
         platform::errors::NotFound("(%s) is not found in AttributeMap.", name));
     return BOOST_GET_CONST(T, attrs_.at(name));
   }
+  void SetAttr(const std::string& name, const Attribute& v) {
+    PADDLE_ENFORCE_EQ(
+        HasAttr(name), true,
+        platform::errors::NotFound(
+            "The attribute %s is not found in operator %s", name, Type()));
+
+    attrs_[name] = v;
+  }
   const AttributeMap& Attrs() const { return attrs_; }
 
   const VariableNameMap& Inputs() const { return inputs_; }
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 12e0f97f126..535ec9cd7d9 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/parallel_executor.h"
+
 #include <algorithm>
 #include <memory>
 #include <string>
 #include <tuple>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/async_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
@@ -108,6 +110,11 @@ class ParallelExecutorPrivate {
    *                                       them.
    */
   inline void SetSkipMemoryReuse(size_t scope_idx, const std::string &name) {
+    if (mem_opt_var_infos_.size() == 0) {
+      VLOG(4) << "The mem_opt_var_infos_ is empty, maybe no memory "
+                 "optimization strategy is enabled";
+      return;
+    }
     auto iter = mem_opt_var_infos_[scope_idx].find(name);
     if (iter != mem_opt_var_infos_[scope_idx].end()) {
       iter->second->SetSkipMemoryReuse(true);
@@ -308,6 +315,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
   }
 
   bool need_mem_opt = build_strategy_.enable_inplace_ ||
+                      build_strategy_.enable_addto_ ||
                       build_strategy_.memory_optimize_.get() || is_gc_enabled;
 
   if (!need_mem_opt) return graph;
@@ -320,6 +328,16 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
   graph = ref_cnt_pass->Apply(graph);
   VLOG(10) << "ReferenceCountPass Applied";
 
+  if (build_strategy_.enable_addto_) {
+    auto addto_pass = ir::PassRegistry::Instance().Get("inplace_addto_op_pass");
+    addto_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_);
+    addto_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars);
+    addto_pass->SetNotOwned(ir::kUseCuda, &use_cuda_);
+    VLOG(10) << "Start to apply inplace_addto_op_pass";
+    graph = addto_pass->Apply(graph);
+    VLOG(10) << "inplace_addto_op_pass Applied";
+  }
+
   if (build_strategy_.enable_inplace_) {
     auto inplace_pass =
         ir::PassRegistry::Instance().Get("buffer_shared_inplace_pass");
@@ -1068,3 +1086,4 @@ USE_PASS(reference_count_pass);
 USE_PASS(eager_deletion_pass);
 USE_PASS(buffer_shared_inplace_pass);
 USE_PASS(buffer_shared_cross_op_memory_reuse_pass);
+USE_PASS(inplace_addto_op_pass);
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu
index 7f705755915..00af724ac7f 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu
+++ b/paddle/fluid/operators/conv_cudnn_op.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -287,7 +288,9 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
 #endif
 
     // ------------------- cudnn conv forward ---------------------
-    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
+    ScalingParamType<T> alpha = 1.0f;
+    ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
+    VLOG(4) << "Conv: use_addto = " << ctx.Attr<bool>("use_addto");
     for (int i = 0; i < groups; i++) {
       workspace_handle.RunFunc(
           [&](void* workspace_ptr) {
@@ -609,9 +612,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     }
 
     // ------------------- cudnn conv backward data ---------------------
-    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
+    ScalingParamType<T> alpha = 1.0f;
+    ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
+    VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr<bool>("use_addto");
+
     if (input_grad) {
-      // Because beta is zero, it is unnecessary to reset input_grad.
+      // When beta is 0, it is unnecessary to reset input_grad.
+      // When beta is 1, the output cannot be reset since addt strategy used.
       for (int i = 0; i < groups; i++) {
         workspace_handle.RunFunc(
             [&](void* cudnn_workspace_ptr) {
@@ -653,6 +660,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
             ctx, &transformed_input_grad_channel, input_grad);
       }
     }
+
+    // filter_grad do not use inplace addto.
+    ScalingParamType<T> beta_filter = 0.0f;
     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
       // Because beta is zero, it is unnecessary to reset filter_grad.
@@ -665,7 +675,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                       input_data + i * group_offset_in, args2.odesc.desc(),
                       output_grad_data + i * group_offset_out,
                       args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr,
-                      workspace_size, &beta, args2.wdesc.desc(),
+                      workspace_size, &beta_filter, args2.wdesc.desc(),
                       filter_grad_data + i * group_offset_filter));
             },
             workspace_size);
@@ -1017,7 +1027,14 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
     int group_offset_out = o_c / groups * o_h * o_w * o_d;
     int group_offset_filter = W->numel() / groups;
 
-    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
+    ScalingParamType<T> alpha = 1.0f;
+    ScalingParamType<T> beta = 0.0f;
+
+    // NOTE(zhiqiu): inplace addto is not supportted in double grad yet.
+    // ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
+    // 0.0f;
+    // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr<bool>("use_addto");
+
     auto wkspace_handle = dev_ctx.cudnn_workspace_handle();
 
     if (ddO) {
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 9ed169fe350..bf97b9d03c4 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -305,6 +305,11 @@ void Conv2DOpMaker::Make() {
       .SetDefault(0.0f);
   AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel")
       .SetDefault(0.0f);
+  AddAttr<bool>(
+      "use_addto",
+      "(bool, default false) If use addto strategy or not, only used in "
+      "cudnn kernel")
+      .SetDefault(false);
   AddAttr<bool>("fuse_residual_connection",
                 "(bool, default false) Only used in mkldnn kernel. Used "
                 "whenever convolution output is as an input to residual "
@@ -460,6 +465,11 @@ void Conv3DOpMaker::Make() {
       .SetDefault(0.0f);
   AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel")
       .SetDefault(0.0f);
+  AddAttr<bool>(
+      "use_addto",
+      "(bool, default false) If use addto strategy or not, only used in "
+      "cudnn kernel")
+      .SetDefault(false);
   AddAttr<bool>("fuse_residual_connection",
                 "(bool, default false) Only used in mkldnn kernel. Used "
                 "whenever convolution output is as an input to residual "
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc
index 534a19bd94a..97624944ca1 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc
@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
+
 #include <memory>
 #include <string>
+
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 
 namespace paddle {
@@ -129,3 +132,18 @@ REGISTER_OP_CPU_KERNEL(
                                         int>,
     ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
                                         int64_t>);
+
+// A specialization elementwise_add operator, used in gradient accumulation with
+// inplace addto.
+REGISTER_OPERATOR(
+    grad_add, paddle::operators::ElementwiseOp,
+    paddle::operators::ElementwiseAddOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+
+REGISTER_OP_CPU_KERNEL(
+    grad_add,
+    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
index 71019872802..a4cbd14388b 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -111,3 +111,10 @@ REGISTER_OP_CUDA_KERNEL(
     ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, int64_t>,
     ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext,
                                         plat::float16>);
+
+REGISTER_OP_CUDA_KERNEL(
+    grad_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, double>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, int>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, int64_t>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index af8798a4b7c..9116edd01b0 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -521,3 +521,18 @@ DEFINE_int32(
 DEFINE_bool(sort_sum_gradient, false,
             "Sum gradients by the reverse order of "
             "the forward execution sequence.");
+
+/**
+ * Performance related FLAG
+ * Name: max_inplace_grad_add
+ * Since Version: 2.0.0
+ * Value Range: int32, default=0
+ * Example:
+ * Note: The maximum number of inplace grad_add.
+ */
+DEFINE_int32(
+    max_inplace_grad_add, 0,
+    "The maximum number of inplace grad_add. When doing "
+    "gradient accumulation, if the number of gradients need to that "
+    "less FLAGS_max_inplace_grad_add, than it will be use several grad_add"
+    "instead of sum. Default is 0.");
diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc
index 318178d5eb9..894740e25c0 100644
--- a/paddle/fluid/pybind/global_value_getter_setter.cc
+++ b/paddle/fluid/pybind/global_value_getter_setter.cc
@@ -62,6 +62,7 @@ DECLARE_bool(use_system_allocator);
 // others
 DECLARE_bool(benchmark);
 DECLARE_int32(inner_op_parallelism);
+DECLARE_int32(max_inplace_grad_add);
 DECLARE_string(tracer_profile_fname);
 #ifdef PADDLE_WITH_CUDA
 // cudnn
@@ -348,7 +349,7 @@ static void RegisterGlobalVarGetterSetter() {
       FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb,
       FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory,
       FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname,
-      FLAGS_paddle_num_threads, FLAGS_use_mkldnn);
+      FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add);
 
 #ifdef PADDLE_WITH_CUDA
   REGISTER_PUBLIC_GLOBAL_VAR(
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 330254ecaaf..04087cb241c 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <Python.h>
+
 #include <algorithm>
 #include <cstdlib>
 #include <map>
@@ -22,6 +23,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
@@ -2528,6 +2530,10 @@ All parameter, weight, gradient are variables in Paddle.
           "enable_inplace",
           [](const BuildStrategy &self) { return self.enable_inplace_; },
           [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; })
+      .def_property(
+          "enable_addto",
+          [](const BuildStrategy &self) { return self.enable_addto_; },
+          [](BuildStrategy &self, bool b) { self.enable_addto_ = b; })
       .def_property(
           "fuse_all_reduce_ops",
           [](const BuildStrategy &self) {
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 9f748b7956f..e8cc6ce9901 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -197,6 +197,7 @@ def __bootstrap__():
         'free_when_no_cache_hit',
         'call_stack_level',
         'sort_sum_gradient',
+        'max_inplace_grad_add',
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index d51cacd1a5c..478fecf74e4 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -251,12 +251,19 @@ def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
         begin_idx = 0
     if end_idx is None:
         end_idx = len(op_descs)
-    for i in range(begin_idx, end_idx):
-        op_desc = op_descs[i]
-        if isinstance(op_desc, tuple):
-            op_desc = op_desc[0]
-        op_desc._rename_input(old_name, new_name)
-        op_desc._rename_output(old_name, new_name)
+    if isinstance(op_descs, (list, tuple)):
+        for i in range(begin_idx, end_idx):
+            op_desc = op_descs[i]
+            if isinstance(op_desc, tuple):
+                op_desc = op_desc[0]
+            op_desc._rename_input(old_name, new_name)
+            op_desc._rename_output(old_name, new_name)
+    if isinstance(op_descs, collections.OrderedDict):
+        for key, value in op_descs.items():
+            if isinstance(value, (list, tuple)):
+                for op_desc in value:
+                    op_desc._rename_input(old_name, new_name)
+                    op_desc._rename_output(old_name, new_name)
 
 
 def _create_op_desc_(op_type, inputs, outputs, attrs):
@@ -369,6 +376,41 @@ def _append_grad_suffix_(name):
     return cpt.to_text(name) + core.grad_var_suffix()
 
 
+def _accumulate_gradients_by_sum_op_(var_name, renamed_vars, pending_sum_ops,
+                                     op_idx):
+    """
+    Use sum op to accumulate_gradients, the gradients are stored in renamed_vars.
+    """
+    if op_idx not in pending_sum_ops.keys():
+        pending_sum_ops[op_idx] = []
+    pending_sum_ops[op_idx].append(
+        _create_op_desc_("sum", {"X": renamed_vars[var_name]},
+                         {"Out": [var_name]}, {"use_mkldnn": False}))
+    renamed_vars[var_name] = [var_name]
+
+
+def _accumulate_gradients_by_add_ops_(var_name, renamed_vars, pending_sum_ops,
+                                      op_idx):
+    """
+    Use several inplace add op to accumulate_gradients, the gradients are stored in renamed_vars.
+    """
+    if op_idx not in pending_sum_ops.keys():
+        pending_sum_ops[op_idx] = []
+    out_name = renamed_vars[var_name][0]
+    for i in range(1, len(renamed_vars[var_name])):
+        x_name = out_name
+        y_name = renamed_vars[var_name][i]
+        if i != len(renamed_vars[var_name]) - 1:
+            out_name = var_name + '@ADD@' + str(i)
+        else:
+            out_name = var_name
+        pending_sum_ops[op_idx].append(
+            _create_op_desc_("grad_add", {"X": [x_name],
+                                          "Y": [y_name]}, {"Out": [out_name]},
+                             {"use_mkldnn": False}))
+    renamed_vars[var_name] = [var_name]
+
+
 def _addup_repetitive_outputs_(op_descs, block_idx):
     """
     In backward part, an variable may be the output of more than one ops.
@@ -376,7 +418,9 @@ def _addup_repetitive_outputs_(op_descs, block_idx):
     In these cases, the variable should be the accumulation of all the outputs.
     `sum_op`s are added to implement the accumulate.
     """
-    pending_sum_ops = []
+    _MAX_ADD_NUM_ = core.globals()['FLAGS_max_inplace_grad_add']
+    #pending_sum_ops = []
+    pending_sum_ops = collections.OrderedDict()
     var_rename_count = collections.defaultdict(int)
     renamed_vars = collections.defaultdict(list)
     renamed_var_start_idx = collections.defaultdict(list)
@@ -385,10 +429,13 @@ def _addup_repetitive_outputs_(op_descs, block_idx):
             if "@GRAD" not in var_name:
                 continue
             if len(renamed_vars[var_name]) > 1:
-                pending_sum_ops.append((_create_op_desc_(
-                    "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]},
-                    {"use_mkldnn": False}), idx))
-                renamed_vars[var_name] = [var_name]
+                if len(renamed_vars[var_name]) > _MAX_ADD_NUM_:
+                    _accumulate_gradients_by_sum_op_(var_name, renamed_vars,
+                                                     pending_sum_ops, idx)
+                else:
+                    _accumulate_gradients_by_add_ops_(var_name, renamed_vars,
+                                                      pending_sum_ops, idx)
+
         for param_idx, param_name in enumerate(op_desc.output_names()):
             arg_names = op_desc.output(param_name)
             for arg_idx, var_name in enumerate(arg_names):
@@ -440,13 +487,26 @@ def _addup_repetitive_outputs_(op_descs, block_idx):
                     renamed_vars[var_name].append(new_name)
 
     for var_name, inputs in six.iteritems(renamed_vars):
-        if len(inputs) > 1:
-            pending_sum_ops.append(
-                (_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]},
-                                  {"use_mkldnn": False}), len(op_descs)))
+        if len(renamed_vars[var_name]) > 1:
+            if len(renamed_vars[var_name]) > _MAX_ADD_NUM_:
+                _accumulate_gradients_by_sum_op_(var_name, renamed_vars,
+                                                 pending_sum_ops, len(op_descs))
+            else:
+                _accumulate_gradients_by_add_ops_(var_name, renamed_vars,
+                                                  pending_sum_ops,
+                                                  len(op_descs))
+
     # sum_op descs are sorted according to their insert position
-    for p in reversed(pending_sum_ops):
-        op_descs.insert(p[1], p[0])
+    for key, value in collections.OrderedDict(
+            reversed(list(pending_sum_ops.items()))).items():
+
+        # NOTE(zhiqiu): Since reversed, the idx of op_descs to be inserted will remains correct.
+        # For example, [0, 1, 2], and we want to insert 'a' at idx 1, 'b' at idx 2, and the expected result is [0, 1, 'a', 2, 'b'].
+        # If reversed, we first insert 'b' at idx 2, it becomes [0, 1, 2, 'b'], and then insert 'a' at idx 1, it becomes [0, 1, 'a', 2, 'b'].
+        # If not reverse, we first insert 'a' at idx 1, it becomes [0, 1, 'a', 2], and then insert 'b' at idx 2, it becomes [0, 1, 'a', 'b', 2].
+        idx = key
+        for i, op in enumerate(value):
+            op_descs.insert(idx + i, op)
 
     return op_descs
 
diff --git a/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py
new file mode 100644
index 00000000000..c75acd7c15b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py
@@ -0,0 +1,114 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+from paddle.fluid.backward import calc_gradient
+import numpy as np
+
+
+class ConvBNLayer(fluid.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 groups=1,
+                 act=None,
+                 use_cudnn=False):
+        super(ConvBNLayer, self).__init__()
+
+        self._conv = fluid.dygraph.Conv2D(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=groups,
+            act=None,
+            bias_attr=False,
+            use_cudnn=use_cudnn)
+
+        self._batch_norm = fluid.dygraph.BatchNorm(num_filters, act=act)
+
+    def forward(self, inputs):
+        y = self._conv(inputs)
+        y = self._batch_norm(y)
+        return y
+
+
+def create_program():
+    main = fluid.Program()
+    startup = fluid.Program()
+    with fluid.program_guard(main, startup):
+        x = fluid.data(name='img', shape=[-1, 3, 224, 224])
+        x.stop_gradient = False
+        x = fluid.layers.prelu(x, mode="channel")
+        conv = ConvBNLayer(
+            num_channels=3,
+            num_filters=3,
+            filter_size=1,
+            act='relu',
+            use_cudnn=True)
+        y = conv(x) + x
+
+        loss = fluid.layers.reduce_sum(y)
+
+        sgd = fluid.optimizer.SGD(learning_rate=0.01)
+        sgd.minimize(loss)
+
+    return loss, main, startup, conv._conv.weight
+
+
+class TestInplaceAddto(unittest.TestCase):
+    def test_result(self):
+        def run_program(enable_addto):
+            np.random.seed(10)
+            paddle.manual_seed(10)
+            paddle.framework.random._manual_program_seed(10)
+            if fluid.core.is_compiled_with_cuda():
+                fluid.set_flags({"FLAGS_cudnn_deterministic": True})
+            fluid.set_flags({"FLAGS_max_inplace_grad_add": 2})
+            loss, main, startup, w = create_program()
+            place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+            ) else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+
+            strategy = fluid.BuildStrategy()
+            strategy.enable_addto = enable_addto
+            compiled = fluid.CompiledProgram(main).with_data_parallel(
+                loss_name=loss.name, build_strategy=strategy)
+
+            exe.run(startup)
+            img = np.random.uniform(-128, 128,
+                                    [8, 3, 224, 224]).astype(np.float32)
+            for i in range(2):
+                res = exe.run(compiled,
+                              feed={'img': img},
+                              fetch_list=[loss.name, w.name])
+            return res
+
+        res1, w1 = run_program(True)
+        res2, w2 = run_program(False)
+        print(res1, res2)
+        self.assertTrue(np.array_equal(res1, res2))
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab


From 37f7414fd854590f1b36a8fdd1d0d0ebea4276cc Mon Sep 17 00:00:00 2001
From: tianshuo78520a <707759223@qq.com>
Date: Mon, 21 Sep 2020 18:27:48 +0800
Subject: [PATCH 153/261] Optimization error message ;test=document_fix
 (#27424)

---
 tools/check_api_approvals.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh
index b787ae62501..943b8c01e8c 100644
--- a/tools/check_api_approvals.sh
+++ b/tools/check_api_approvals.sh
@@ -39,9 +39,9 @@ fi
 
 api_spec_diff=`python ${PADDLE_ROOT}/tools/check_api_source_without_core_ops.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.source.md5  ${PADDLE_ROOT}/paddle/fluid/API_PR.source.md5` 
 if [ "$api_spec_diff" != "" ]; then
+    echo_line="${echo_line}Related APIs: ${api_spec_diff}\n"
     echo_line="You must have one RD (zhiqiu (Recommend) or phlrain) approval for the api change for the opreator-related api without 'core.ops'.\n"
     echo_line="${echo_line}For more details, please click [https://github.com/PaddlePaddle/Paddle/wiki/paddle_api_development_manual.md]\n"
-    echo_line="${echo_line}Related APIs: ${api_spec_diff}\n"
     check_approval 1 6888866 43953930
 fi
 
-- 
GitLab


From d93661942ed69527ea53259a4b6e65e8aef3cbea Mon Sep 17 00:00:00 2001
From: Shang Zhizhou <shangzhizhou@baidu.com>
Date: Mon, 21 Sep 2020 19:10:04 +0800
Subject: [PATCH 154/261] fix bug sequececonv_eltadd_relu_fuse_pass (#27404)

* fix bug sequececonv_eltadd_relu_fuse_pass, output error when sequence_conv's padding_start > 0

* fix seqconv_eltadd_relu_fuse_pass unitest error
---
 .../ir/seqconv_eltadd_relu_fuse_pass.cc       |   7 +
 .../fused/fusion_seqconv_eltadd_relu_op.cc    |  18 ++-
 .../test_seqconv_eltadd_relu_fuse_pass.py     | 140 ++++++++++++++++++
 3 files changed, 159 insertions(+), 6 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_seqconv_eltadd_relu_fuse_pass.py

diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
index 1485a84d001..75ab04f1b91 100644
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
@@ -16,6 +16,7 @@
 #include <string>
 #include <unordered_set>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -98,3 +99,9 @@ void SeqConvEltAddReluFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(seqconv_eltadd_relu_fuse_pass,
               paddle::framework::ir::SeqConvEltAddReluFusePass);
+REGISTER_PASS_CAPABILITY(seqconv_eltadd_relu_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("sequence_conv", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("relu", 0));
diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
index a6c9a137b54..c5a291f10b2 100644
--- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
@@ -192,6 +192,9 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> {
           copy_size += src_mat_w_sz;
         }
         // fill data
+        if (context_start > 0) {
+          src_data += context_start * src_mat_w;
+        }
         for (int j = 0; j < seq_len - up_pad - down_pad; ++j) {
           std::memcpy(dst_data, src_data, copy_size);
           dst_data += col_mat_w;
@@ -201,18 +204,15 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> {
         std::memset(dst_data, 0, down_pad * col_mat_w_sz);
         copy_size -= src_mat_w_sz;
         for (int j = 0; j < down_pad; ++j) {
+          if (copy_size < 0) {
+            copy_size = 0;
+          }
           std::memcpy(dst_data, src_data, copy_size);
           dst_data += col_mat_w;
           src_data += src_mat_w;
           copy_size -= src_mat_w_sz;
         }
       } else {
-        PADDLE_ENFORCE_GE(context_length, up_pad + down_pad + 1,
-                          platform::errors::InvalidArgument(
-                              "context length must be bigger or equal than "
-                              "up_pad + down_pad + 1, but received context "
-                              "length is: %d, up_pad is: %d, down_pad is: %d.",
-                              context_length, up_pad, down_pad));
         std::memset(dst_data, 0, seq_len * col_mat_w_sz);
         dst_data = dst_data + up_pad * src_mat_w;
         int zero_sz = up_pad * src_mat_w_sz;
@@ -226,9 +226,15 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> {
         // from bottom
         dst_data = col_data + ed * col_mat_w;
         src_data = x_data + st * src_mat_w;
+        if (context_start > 0) {
+          src_data += context_start * src_mat_w;
+        }
         zero_sz = down_pad * src_mat_w_sz;
         for (int j = 1; j <= std::min(down_pad, seq_len); ++j) {
           int copy_size = std::min(cur_src_sz, col_mat_w_sz - zero_sz);
+          if (copy_size < 0) {
+            copy_size = 0;
+          }
           std::memcpy(dst_data - (zero_sz + copy_size) / sizeof(T),
                       src_data + std::max(seq_len - j - up_pad, 0) * src_mat_w,
                       copy_size);
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_seqconv_eltadd_relu_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_seqconv_eltadd_relu_fuse_pass.py
new file mode 100644
index 00000000000..eadda5ba06a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_seqconv_eltadd_relu_fuse_pass.py
@@ -0,0 +1,140 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import AnalysisConfig
+from paddle.fluid.core import PassVersionChecker
+
+
+class SeqconvEltaddReluFusePassTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(name="data", shape=[100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.sequence_conv(
+                input=data,
+                num_filters=16,
+                filter_size=4,
+                padding_start=0,
+                act="relu",
+                bias_attr=param_attr)
+
+        np_data = np.random.random((80, 100)).astype('float32')
+        x_lod_tensor = fluid.create_lod_tensor(np_data, [[10, 20, 30, 20]],
+                                               fluid.CPUPlace())
+        self.feeds = {"data": x_lod_tensor}
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('seqconv_eltadd_relu_fuse_pass'))
+
+
+class SeqconvEltaddReluFusePassTestPaddingStartPositive(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(name="data", shape=[-1, 4], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.sequence_conv(
+                input=data,
+                num_filters=16,
+                filter_size=3,
+                padding_start=2,
+                act="relu",
+                bias_attr=param_attr)
+
+        np_data = np.array([[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3],
+                            [4, 4, 4, 4], [5, 5, 5, 5], [6, 6, 6, 6],
+                            [7, 7, 7, 7]]).astype('float32')
+        x_lod_tensor = fluid.create_lod_tensor(np_data, [[5, 2]],
+                                               fluid.CPUPlace())
+        self.feeds = {"data": x_lod_tensor}
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('seqconv_eltadd_relu_fuse_pass'))
+
+
+class SeqconvEltaddReluFusePassTestPaddingStartNegative(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(name="data", shape=[100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.sequence_conv(
+                input=data,
+                num_filters=16,
+                filter_size=4,
+                padding_start=-1,
+                act="relu",
+                bias_attr=param_attr)
+
+        np_data = np.random.random((80, 100)).astype('float32')
+        x_lod_tensor = fluid.create_lod_tensor(np_data, [[10, 20, 30, 20]],
+                                               fluid.CPUPlace())
+        self.feeds = {"data": x_lod_tensor}
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('seqconv_eltadd_relu_fuse_pass'))
+
+
+class SeqconvEltaddReluFusePassTestPaddingStartNone(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(name="data", shape=[100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.sequence_conv(
+                input=data,
+                num_filters=16,
+                filter_size=4,
+                act="relu",
+                bias_attr=param_attr)
+
+        np_data = np.random.random((80, 100)).astype('float32')
+        x_lod_tensor = fluid.create_lod_tensor(np_data, [[10, 20, 30, 20]],
+                                               fluid.CPUPlace())
+        self.feeds = {"data": x_lod_tensor}
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('seqconv_eltadd_relu_fuse_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab


From e217e965a9fd3bcb022381ce81431b48f060d5c8 Mon Sep 17 00:00:00 2001
From: Double_V <liuvv0203@163.com>
Date: Mon, 21 Sep 2020 19:14:37 +0800
Subject: [PATCH 155/261] fix pool bug (#27366)

---
 python/paddle/nn/functional/pooling.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py
index 042625a3dbd..1eb9167d035 100755
--- a/python/paddle/nn/functional/pooling.py
+++ b/python/paddle/nn/functional/pooling.py
@@ -713,7 +713,7 @@ def max_pool2d(x,
                 'data_format', data_format)
             return output
 
-    op_type = 'max_pool2d_with_index' if data_format == "NCHW" else "max_pool2d"
+    op_type = 'max_pool2d_with_index' if data_format == "NCHW" else "pool2d"
     helper = LayerHelper(op_type, **locals())
     dtype = helper.input_dtype()
     pool_out = helper.create_variable_for_type_inference(dtype)
@@ -839,7 +839,7 @@ def max_pool3d(x,
                 'data_format', data_format)
             return output
 
-    op_type = "max_pool3d_with_index" if data_format == "NCDHW" else "max_pool3d"
+    op_type = "max_pool3d_with_index" if data_format == "NCDHW" else "pool3d"
     helper = LayerHelper(op_type, **locals())
     dtype = helper.input_dtype()
     pool_out = helper.create_variable_for_type_inference(dtype)
-- 
GitLab


From fc61efd736a04b97c0a8ce4f75a6ddfc577a9663 Mon Sep 17 00:00:00 2001
From: danleifeng <52735331+danleifeng@users.noreply.github.com>
Date: Mon, 21 Sep 2020 20:03:37 +0800
Subject: [PATCH 156/261] fix port env bug(int);test=develop (#27405)

---
 python/paddle/distributed/fleet/launch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
index 4b629bc35ce..17fa0a0c7c3 100644
--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -156,7 +156,7 @@ def get_cluster_from_args(args, gpus):
     else:
         start_port = 6070
         if os.environ.get('FLAGS_START_PORT') is not None:
-            start_port = os.environ.get('FLAGS_START_PORT')
+            start_port = int(os.environ.get('FLAGS_START_PORT'))
 
         free_ports = [x for x in range(start_port, start_port + len(gpus))]
 
-- 
GitLab


From 4bd7aa25668b06a00e528900a9f3485e785beb0f Mon Sep 17 00:00:00 2001
From: Kaipeng Deng <dengkaipeng@baidu.com>
Date: Mon, 21 Sep 2020 20:09:09 +0800
Subject: [PATCH 157/261] use paddle.get_default_dtype in vision datasets.
 test=develop (#27426)

---
 python/paddle/vision/datasets/cifar.py   | 5 ++++-
 python/paddle/vision/datasets/flowers.py | 5 ++++-
 python/paddle/vision/datasets/folder.py  | 3 +++
 python/paddle/vision/datasets/mnist.py   | 5 ++++-
 python/paddle/vision/datasets/voc2012.py | 5 ++++-
 5 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/python/paddle/vision/datasets/cifar.py b/python/paddle/vision/datasets/cifar.py
index 631892ee4dc..c531f3d0e4e 100644
--- a/python/paddle/vision/datasets/cifar.py
+++ b/python/paddle/vision/datasets/cifar.py
@@ -19,6 +19,7 @@ import numpy as np
 import six
 from six.moves import cPickle as pickle
 
+import paddle
 from paddle.io import Dataset
 from paddle.dataset.common import _check_exists_and_download
 
@@ -113,6 +114,8 @@ class Cifar10(Dataset):
         # read dataset into memory
         self._load_data()
 
+        self.dtype = paddle.get_default_dtype()
+
     def _init_url_md5_flag(self):
         self.data_url = CIFAR10_URL
         self.data_md5 = CIFAR10_MD5
@@ -142,7 +145,7 @@ class Cifar10(Dataset):
         image = np.reshape(image, [3, 32, 32])
         if self.transform is not None:
             image = self.transform(image)
-        return image, label
+        return image.astype(self.dtype), np.array(label).astype('int64')
 
     def __len__(self):
         return len(self.data)
diff --git a/python/paddle/vision/datasets/flowers.py b/python/paddle/vision/datasets/flowers.py
index 1c0f41123e2..2251333fd8d 100644
--- a/python/paddle/vision/datasets/flowers.py
+++ b/python/paddle/vision/datasets/flowers.py
@@ -21,6 +21,7 @@ import numpy as np
 import scipy.io as scio
 from PIL import Image
 
+import paddle
 from paddle.io import Dataset
 from paddle.dataset.common import _check_exists_and_download
 
@@ -104,6 +105,8 @@ class Flowers(Dataset):
         # read dataset into memory
         self._load_anno()
 
+        self.dtype = paddle.get_default_dtype()
+
     def _load_anno(self):
         self.name2mem = {}
         self.data_tar = tarfile.open(self.data_file)
@@ -124,7 +127,7 @@ class Flowers(Dataset):
         if self.transform is not None:
             image = self.transform(image)
 
-        return image, label.astype('int64')
+        return image.astype(self.dtype), label.astype('int64')
 
     def __len__(self):
         return len(self.indexes)
diff --git a/python/paddle/vision/datasets/folder.py b/python/paddle/vision/datasets/folder.py
index 8a3053abefc..19d913504bd 100644
--- a/python/paddle/vision/datasets/folder.py
+++ b/python/paddle/vision/datasets/folder.py
@@ -15,6 +15,7 @@
 import os
 import sys
 
+import paddle
 from paddle.io import Dataset
 from paddle.utils import try_import
 
@@ -143,6 +144,8 @@ class DatasetFolder(Dataset):
         self.samples = samples
         self.targets = [s[1] for s in samples]
 
+        self.dtype = paddle.get_default_dtype()
+
     def _find_classes(self, dir):
         """
         Finds the class folders in a dataset.
diff --git a/python/paddle/vision/datasets/mnist.py b/python/paddle/vision/datasets/mnist.py
index 597d4046441..16c39e56ef0 100644
--- a/python/paddle/vision/datasets/mnist.py
+++ b/python/paddle/vision/datasets/mnist.py
@@ -19,6 +19,7 @@ import gzip
 import struct
 import numpy as np
 
+import paddle
 from paddle.io import Dataset
 from paddle.dataset.common import _check_exists_and_download
 
@@ -95,6 +96,8 @@ class MNIST(Dataset):
         # read dataset into memory
         self._parse_dataset()
 
+        self.dtype = paddle.get_default_dtype()
+
     def _parse_dataset(self, buffer_size=100):
         self.images = []
         self.labels = []
@@ -145,7 +148,7 @@ class MNIST(Dataset):
         image = np.reshape(image, [1, 28, 28])
         if self.transform is not None:
             image = self.transform(image)
-        return image, label
+        return image.astype(self.dtype), label.astype('int64')
 
     def __len__(self):
         return len(self.labels)
diff --git a/python/paddle/vision/datasets/voc2012.py b/python/paddle/vision/datasets/voc2012.py
index ae14ea30163..5fc9d7c3815 100644
--- a/python/paddle/vision/datasets/voc2012.py
+++ b/python/paddle/vision/datasets/voc2012.py
@@ -19,6 +19,7 @@ import tarfile
 import numpy as np
 from PIL import Image
 
+import paddle
 from paddle.io import Dataset
 from paddle.dataset.common import _check_exists_and_download
 
@@ -96,6 +97,8 @@ class VOC2012(Dataset):
         # read dataset into memory
         self._load_anno()
 
+        self.dtype = paddle.get_default_dtype()
+
     def _load_anno(self):
         self.name2mem = {}
         self.data_tar = tarfile.open(self.data_file)
@@ -127,7 +130,7 @@ class VOC2012(Dataset):
         label = np.array(label)
         if self.transform is not None:
             data = self.transform(data)
-        return data, label
+        return data.astype(self.dtype), label.astype(self.dtype)
 
     def __len__(self):
         return len(self.data)
-- 
GitLab


From 1d3b27cae8a7d88db80358a2810279874835fc68 Mon Sep 17 00:00:00 2001
From: ceci3 <ceci3@users.noreply.github.com>
Date: Mon, 21 Sep 2020 20:22:41 +0800
Subject: [PATCH 158/261] add double grad compute for batch norm (#27296)

* add double grad compute for batch norm,test=develop

* fix unittest, test=develop

* remove unuse tensor,test=develop

* add format,test=develop

* update, test=develop
---
 paddle/fluid/operators/batch_norm_op.cc       | 405 ++++++++++++++-
 paddle/fluid/operators/batch_norm_op.cu       |  44 ++
 paddle/fluid/operators/batch_norm_op.h        |  61 +++
 paddle/fluid/operators/instance_norm_op.cc    |   8 +-
 paddle/fluid/operators/norm_utils.cu.h        | 486 ++++++++++++++++++
 python/paddle/fluid/layers/nn.py              |   4 +-
 .../unittests/test_imperative_double_grad.py  |   2 +-
 .../tests/unittests/test_norm_nn_grad.py      |  62 +++
 8 files changed, 1066 insertions(+), 6 deletions(-)
 create mode 100644 paddle/fluid/operators/norm_utils.cu.h

diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index c92f72e653d..dcfe8bb1bb4 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -831,6 +831,401 @@ void BatchNormGradMaker<T>::Apply(GradOpPtr<T> op) const {
   op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
 }
 
+template <typename T>
+void BatchNormDoubleGradMaker<T>::Apply(GradOpPtr<T> op) const {
+  op->SetType("batch_norm_grad_grad");
+  op->SetInput("X", this->Input("X"));
+  op->SetInput("Scale", this->Input("Scale"));
+  op->SetInput("SavedMean", this->Input("SavedMean"));
+  op->SetInput("SavedVariance", this->Input("SavedVariance"));
+  if (BOOST_GET_CONST(bool, this->GetAttr("use_global_stats"))) {
+    op->SetInput("Variance", this->Input("Variance"));
+  }
+  op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
+  op->SetInput("DDScale", this->OutputGrad(framework::GradVarName("Scale")));
+  op->SetInput("DDBias", this->OutputGrad(framework::GradVarName("Bias")));
+  op->SetInput("DY", this->Input(framework::GradVarName("Y")));
+
+  op->SetAttrMap(this->Attrs());
+  op->SetOutput("DX", this->InputGrad("X"));
+  op->SetOutput("DScale", this->InputGrad("Scale"));
+  op->SetOutput("DDY", this->InputGrad(framework::GradVarName("Y")));
+}
+
+void BatchNormDoubleGradOp::InferShape(
+    framework::InferShapeContext *ctx) const {
+  OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BatchNormDoubleGrad");
+  OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale",
+                 "BatchNormDoubleGrad");
+  OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean",
+                 "BatchNormDoubleGrad");
+  OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance",
+                 "BatchNormDoubleGrad");
+
+  const bool use_global_stats = ctx->Attrs().Get<bool>("use_global_stats");
+  if (use_global_stats) {
+    OP_INOUT_CHECK(ctx->HasInput("Variance"), "Input", "VarianceOut",
+                   "BatchNormDoubleGrad");
+  }
+
+  OP_INOUT_CHECK(ctx->HasInput("DDX"), "Input", "DDX", "BatchNormDoubleGrad");
+  OP_INOUT_CHECK(ctx->HasInput("DY"), "Input", "DY", "BatchNormDoubleGrad");
+
+  // check output
+  OP_INOUT_CHECK(ctx->HasOutput("DX"), "Output", "DX", "BatchNormDoubleGrad");
+
+  const auto x_dims = ctx->GetInputDim("X");
+  const int C = x_dims[1];
+  if (ctx->HasOutput("DX")) {
+    ctx->SetOutputDim("DX", x_dims);
+  }
+  if (ctx->HasOutput("DScale")) {
+    ctx->SetOutputDim("DScale", {C});
+  }
+  if (ctx->HasOutput("DDY")) {
+    ctx->ShareDim("X", "DDY");
+  }
+}
+
+framework::OpKernelType BatchNormDoubleGradOp::GetExpectedKernelType(
+    const framework::ExecutionContext &ctx) const {
+  const auto *var = ctx.InputVar("DY");
+  if (var == nullptr) {
+    PADDLE_THROW(
+        platform::errors::NotFound("cannot find gradient variable of Y"));
+  }
+  const Tensor *t = nullptr;
+  if (var->IsType<Tensor>()) {
+    t = &var->Get<Tensor>();
+  } else if (var->IsType<LoDTensor>()) {
+    t = &var->Get<LoDTensor>();
+  }
+  if (t == nullptr) {
+    PADDLE_THROW(
+        platform::errors::InvalidArgument("gradient variable of Y is empty"));
+  }
+  return framework::OpKernelType(
+      OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+}
+
+template <typename T>
+class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *X = ctx.Input<Tensor>("X");
+    const auto *Scale = ctx.Input<Tensor>("Scale");
+    const auto *dY = ctx.Input<Tensor>("DY");
+    const auto *Saved_mean = ctx.Input<Tensor>("SavedMean");
+    const auto *Saved_variance = ctx.Input<Tensor>("SavedVariance");
+    const float epsilon = ctx.Attr<float>("epsilon");
+    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
+    const bool is_test = ctx.Attr<bool>("is_test");
+
+    PADDLE_ENFORCE_EQ(
+        is_test, false,
+        platform::errors::InvalidArgument(
+            "`is_test = True` CANNOT be used in train program. If "
+            "you want to use global status in pre_train model, "
+            "please set `use_global_stats = True`"));
+
+    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_layout_str);
+
+    const auto *ddX = ctx.Input<Tensor>("DDX");
+    const auto *ddScale = ctx.Input<Tensor>("DDScale");
+    const auto *ddBias = ctx.Input<Tensor>("DDBias");
+
+    auto *dX = ctx.Output<Tensor>("DX");
+    auto *dScale = ctx.Output<Tensor>("DScale");
+    auto *ddY = ctx.Output<Tensor>("DDY");
+    dX->mutable_data<T>(ctx.GetPlace());
+    ddY->mutable_data<T>(ctx.GetPlace());
+
+    auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
+
+    const auto &x_dims = X->dims();
+    const int C =
+        (data_layout == DataLayout::kNCHW ? x_dims[1]
+                                          : x_dims[x_dims.size() - 1]);
+    const int sample_size = X->numel() / C;
+    math::SetConstant<platform::CPUDeviceContext, T> set_constant;
+
+    const T *mean_data = Saved_mean->data<T>();
+    const T *inv_var_data = Saved_variance->data<T>();
+
+    Tensor inv_var_tensor;
+    if (use_global_stats) {
+      const auto *running_variance = ctx.Input<Tensor>("Variance");
+      inv_var_tensor.Resize({C});
+
+      T *running_inv_var_data = inv_var_tensor.mutable_data<T>(ctx.GetPlace());
+      EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
+      ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
+
+      inv_var_tmp = (var_arr + epsilon).sqrt().inverse();
+      inv_var_data = running_inv_var_data;
+    }
+
+    // transpose NCHW -> NHWC for easy calculate
+    Tensor transformed_x(X->type());
+    Tensor transformed_dy(dY->type());
+    Tensor transformed_ddx(ddX->type());
+
+    Tensor transformed_dx(dX->type());
+    Tensor transformed_ddy(ddY->type());
+    if (data_layout == DataLayout::kNCHW && x_dims.size() > 2) {
+      VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
+      // Input Tensor
+      ResizeToChannelLast<platform::CPUDeviceContext, T>(ctx, X,
+                                                         &transformed_x);
+      TransToChannelLast<platform::CPUDeviceContext, T>(ctx, X, &transformed_x);
+      ResizeToChannelLast<platform::CPUDeviceContext, T>(ctx, dY,
+                                                         &transformed_dy);
+      TransToChannelLast<platform::CPUDeviceContext, T>(ctx, dY,
+                                                        &transformed_dy);
+      ResizeToChannelLast<platform::CPUDeviceContext, T>(ctx, ddX,
+                                                         &transformed_ddx);
+      TransToChannelLast<platform::CPUDeviceContext, T>(ctx, ddX,
+                                                        &transformed_ddx);
+      // Output Tensor
+      ResizeToChannelLast<platform::CPUDeviceContext, T>(ctx, dX,
+                                                         &transformed_dx);
+      ResizeToChannelLast<platform::CPUDeviceContext, T>(ctx, ddY,
+                                                         &transformed_ddy);
+    } else {
+      transformed_x.ShareDataWith(*X);
+      transformed_dy.ShareDataWith(*dY);
+      transformed_ddx.ShareDataWith(*ddX);
+
+      transformed_dx.ShareDataWith(*dX);
+      transformed_ddy.ShareDataWith(*ddY);
+    }
+
+    ConstEigenArrayMap<T> x_arr(transformed_x.data<T>(), C, sample_size);
+    ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
+    ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
+
+    Tensor mean_tile;
+    mean_tile.Resize({C, sample_size});
+    mean_tile.mutable_data<T>(ctx.GetPlace());
+    EigenArrayMap<T> mean_tile_data(mean_tile.mutable_data<T>(ctx.GetPlace()),
+                                    C, sample_size);
+
+    Tensor inv_var_tile;
+    inv_var_tile.Resize({C, sample_size});
+    inv_var_tile.mutable_data<T>(ctx.GetPlace());
+    EigenArrayMap<T> inv_var_tile_data(
+        inv_var_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
+
+    mean_tile_data = mean_arr.replicate(1, sample_size);
+    inv_var_tile_data = inv_var_arr.replicate(1, sample_size);
+
+    Tensor Scale_data;
+    if (!Scale) {
+      Scale_data.mutable_data<T>({C}, ctx.GetPlace());
+      set_constant(dev_ctx, &Scale_data, static_cast<T>(1));
+    }
+    ConstEigenVectorArrayMap<T> scale_arr(
+        Scale ? Scale->data<T>() : Scale_data.data<T>(), C);
+
+    Tensor scale_tile;
+    scale_tile.Resize({C, sample_size});
+    scale_tile.mutable_data<T>(ctx.GetPlace());
+    EigenArrayMap<T> scale_tile_data(scale_tile.mutable_data<T>(ctx.GetPlace()),
+                                     C, sample_size);
+    scale_tile_data = scale_arr.replicate(1, sample_size);
+
+    ConstEigenArrayMap<T> dy_arr(transformed_dy.data<T>(), C, sample_size);
+    ConstEigenArrayMap<T> ddx_arr(transformed_ddx.data<T>(), C, sample_size);
+
+    Tensor x_sub_mean_mul_invstd;
+    x_sub_mean_mul_invstd.Resize({C, sample_size});
+    x_sub_mean_mul_invstd.mutable_data<T>(ctx.GetPlace());
+    EigenArrayMap<T> x_sub_mean_mul_invstd_arr(
+        x_sub_mean_mul_invstd.mutable_data<T>(ctx.GetPlace()), C, sample_size);
+    x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data;
+
+    if (dX) {
+      dX->mutable_data<T>(ctx.GetPlace());
+      EigenArrayMap<T> dx_arr(transformed_dx.mutable_data<T>(ctx.GetPlace()), C,
+                              sample_size);
+      dx_arr.setZero();
+      if (use_global_stats) {
+        // math: dx = (ddscale * dy) * inv_var
+        if (ddScale) {
+          ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
+          Tensor ddscale_tile;
+          ddscale_tile.Resize({C, sample_size});
+          EigenArrayMap<T> ddscale_tile_data(
+              ddscale_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
+          ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
+
+          dx_arr = dy_arr * ddscale_tile_data * inv_var_tile_data;
+        }
+      } else {
+        // math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx,
+        // axis=(n,h,w)) *
+        //          np.sum(dy, axis=(n,h,w)) -
+        //          np.sum(dy * ddx, axis=(n,h,w)) + 3 * np.mean(dy * (x -
+        //          mean),
+        //          axis=(n,h,w)) * inv_var.pow(2) *
+        //          np.sum(ddx * (x - mean), axis=(n,h,w))) + inv_var.pow(3) /
+        //          NxHxW *
+        //          np.sum(ddx * (x - mean)) *
+        //          (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
+        //          np.sum(dy,
+        //          axis=(n,h,w)) * (x - mean) *
+        //          (np.mean(ddx, axis=(n,h,w)) - ddx) + ddr * (dy * inv_var -
+        //          inv_var
+        //          *
+        //          np.mean(dy, axis=(n,h,w)) -
+        //          inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
+        //          axis=(n,h,w))))
+
+        if (ddX) {
+          dx_arr +=
+              (x_sub_mean_mul_invstd_arr * inv_var_tile_data *
+               inv_var_tile_data / sample_size)
+                  .colwise() *
+              (ddx_arr.rowwise().sum() * dy_arr.rowwise().sum() / sample_size -
+               (dy_arr * ddx_arr).rowwise().sum() +
+               3. * (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() *
+                   (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
+                   sample_size);
+
+          dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() *
+                    (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
+                    sample_size *
+                    (dy_arr.rowwise().sum() / sample_size - dy_arr);
+
+          dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() *
+                    (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
+                    sample_size *
+                    (ddx_arr.rowwise().sum() / sample_size - ddx_arr);
+
+          dx_arr = scale_tile_data * dx_arr;
+        }
+        if (ddScale) {
+          ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
+          Tensor ddscale_tile;
+          ddscale_tile.Resize({C, sample_size});
+          EigenArrayMap<T> ddscale_tile_data(
+              ddscale_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
+          ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
+
+          dx_arr += (dy_arr * inv_var_tile_data -
+                     (dy_arr.rowwise().sum().replicate(1, sample_size) /
+                      sample_size) *
+                         inv_var_tile_data -
+                     x_sub_mean_mul_invstd_arr * inv_var_tile_data *
+                         (dy_arr * x_sub_mean_mul_invstd_arr)
+                             .rowwise()
+                             .sum()
+                             .replicate(1, sample_size) /
+                         sample_size) *
+                    ddscale_tile_data;
+        }
+      }
+      if (data_layout == DataLayout::kNCHW) {
+        VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
+        TransToChannelFirst<paddle::platform::CPUDeviceContext, T>(
+            ctx, &transformed_dx, dX);
+      }
+    }
+    if (dScale) {
+      dScale->mutable_data<T>(ctx.GetPlace());
+      EigenVectorArrayMap<T> dscale_arr(dScale->mutable_data<T>(ctx.GetPlace()),
+                                        C);
+      dscale_arr.setZero();
+      if (use_global_stats) {
+        // math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var
+        if (ddX) {
+          dscale_arr = (ddx_arr * dy_arr * inv_var_tile_data).rowwise().sum();
+        }
+      } else {
+        // math: dscale = inv_var * (dy - np.mean(dy, axis=(n,h,w) - (x-mean) *
+        //            inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) *
+        //            ddx
+        if (ddX) {
+          Tensor first_grad;
+          first_grad.Resize({C, sample_size});
+          EigenArrayMap<T> first_grad_arr(
+              first_grad.mutable_data<T>(ctx.GetPlace()), C, sample_size);
+          first_grad_arr.setZero();
+
+          first_grad_arr +=
+              inv_var_tile_data *
+              (dy_arr -
+               dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size -
+               x_sub_mean_mul_invstd_arr *
+                   (dy_arr * x_sub_mean_mul_invstd_arr)
+                       .rowwise()
+                       .sum()
+                       .replicate(1, sample_size) /
+                   sample_size);
+          dscale_arr = (first_grad_arr * ddx_arr).rowwise().sum();
+        }
+      }
+    }
+
+    if (ddY) {
+      ddY->mutable_data<T>(ctx.GetPlace());
+      EigenArrayMap<T> ddy_arr(transformed_ddy.mutable_data<T>(ctx.GetPlace()),
+                               C, sample_size);
+      ddy_arr.setZero();
+      if (use_global_stats) {
+        // math: ddy = r * ddx * inv_var
+        if (ddX) {
+          ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data;
+        }
+      } else {
+        // math: ddy = (x - mean) * inv_var * ddscale + ddbias +
+        //           scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) *
+        //           np.mean(ddx * (x - mean), axis=(n,h,w)))
+        if (ddX) {
+          ddy_arr +=
+              scale_tile_data * inv_var_tile_data *
+              (ddx_arr -
+               ddx_arr.rowwise().sum().replicate(1, sample_size) / sample_size -
+               x_sub_mean_mul_invstd_arr *
+                   (ddx_arr * x_sub_mean_mul_invstd_arr)
+                       .rowwise()
+                       .sum()
+                       .replicate(1, sample_size) /
+                   sample_size);
+        }
+        if (ddScale && ddBias) {
+          ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
+          Tensor ddscale_tile;
+          ddscale_tile.Resize({C, sample_size});
+          EigenArrayMap<T> ddscale_tile_data(
+              ddscale_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
+          ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
+
+          ConstEigenVectorArrayMap<T> ddbias_arr(ddBias->data<T>(), C);
+          Tensor ddbias_tile;
+          ddbias_tile.Resize({C, sample_size});
+          EigenArrayMap<T> ddbias_tile_data(
+              ddbias_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
+          ddbias_tile_data = ddbias_arr.replicate(1, sample_size);
+
+          ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data;
+          ddy_arr += ddbias_tile_data;
+        }
+      }
+      if (data_layout == DataLayout::kNCHW) {
+        VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
+        TransToChannelFirst<paddle::platform::CPUDeviceContext, T>(
+            ctx, &transformed_ddy, ddY);
+      }
+    }
+  }
+};
+
+DECLARE_INPLACE_OP_INFERER(BatchNormDoubleGradOpInplaceInferer, {"DY", "DDY"});
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -839,7 +1234,11 @@ REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
                   ops::BatchNormOpInferVarType,
                   ops::BatchNormGradMaker<paddle::framework::OpDesc>,
                   ops::BatchNormGradMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp);
+REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp,
+                  ops::BatchNormDoubleGradMaker<paddle::framework::OpDesc>,
+                  ops::BatchNormDoubleGradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(batch_norm_grad_grad, ops::BatchNormDoubleGradOp,
+                  ops::BatchNormDoubleGradOpInplaceInferer);
 
 REGISTER_OP_CPU_KERNEL(
     batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,
@@ -848,3 +1247,7 @@ REGISTER_OP_CPU_KERNEL(
     batch_norm_grad,
     ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    batch_norm_grad_grad,
+    ops::BatchNormDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BatchNormDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu
index be834772679..2d5b395ac68 100644
--- a/paddle/fluid/operators/batch_norm_op.cu
+++ b/paddle/fluid/operators/batch_norm_op.cu
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/operators/batch_norm_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/norm_utils.cu.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
 
@@ -840,6 +841,45 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
   }
 };
 
+template <typename T>
+class BatchNormDoubleGradKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *X = ctx.Input<Tensor>("X");
+    const auto *Scale = ctx.Input<Tensor>("Scale");
+    const auto *dY = ctx.Input<Tensor>("DY");
+    const auto *Saved_mean = ctx.Input<Tensor>("SavedMean");
+    const auto *Saved_variance = ctx.Input<Tensor>("SavedVariance");
+    const double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
+    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
+    const bool is_test = ctx.Attr<bool>("is_test");
+
+    PADDLE_ENFORCE_EQ(
+        is_test, false,
+        platform::errors::InvalidArgument(
+            "`is_test = True` CANNOT be used in train program. If "
+            "you want to use global status in pre_train model, "
+            "please set `use_global_stats = True`"));
+
+    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_layout_str);
+
+    const auto *ddX = ctx.Input<Tensor>("DDX");
+    const auto *ddScale = ctx.Input<Tensor>("DDScale");
+    const auto *ddBias = ctx.Input<Tensor>("DDBias");
+
+    auto *dX = ctx.Output<Tensor>("DX");
+    auto *dScale = ctx.Output<Tensor>("DScale");
+    auto *ddY = ctx.Output<Tensor>("DDY");
+
+    NormDoubleGradFunctor<platform::CUDADeviceContext, T>(
+        ctx, data_layout, X, Scale, dY, Saved_mean, Saved_variance, epsilon,
+        use_global_stats, ddX, ddScale, ddBias, dX, dScale, ddY);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -853,3 +893,7 @@ REGISTER_OP_CUDA_KERNEL(
     batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
     ops::BatchNormGradKernel<plat::CUDADeviceContext, double>,
     ops::BatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(
+    batch_norm_grad_grad,
+    ops::BatchNormDoubleGradKernel<plat::CUDADeviceContext, float>,
+    ops::BatchNormDoubleGradKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h
index 9f844b7c078..1440b74290c 100644
--- a/paddle/fluid/operators/batch_norm_op.h
+++ b/paddle/fluid/operators/batch_norm_op.h
@@ -103,6 +103,42 @@ inline void TransToChannelFirst(const framework::ExecutionContext& context,
   }
 }
 
+template <typename DeviceContext, typename T>
+inline void ResizeToChannelLast(const framework::ExecutionContext& context,
+                                const Tensor* input,
+                                Tensor* transformed_input) {
+  int dim = input->dims().size() - 2;
+  if (dim == 3) {
+    transformed_input->Resize(input->dims());
+
+    auto in_dims_vec = framework::vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[2];
+    in_dims_vec[2] = input->dims()[3];
+    in_dims_vec[3] = input->dims()[4];
+    in_dims_vec[4] = input->dims()[1];
+    transformed_input->Resize(framework::make_ddim(in_dims_vec));
+    transformed_input->mutable_data<T>(context.GetPlace());
+
+  } else if (dim == 2) {
+    transformed_input->Resize(input->dims());
+
+    auto in_dims_vec = framework::vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[2];
+    in_dims_vec[2] = input->dims()[3];
+    in_dims_vec[3] = input->dims()[1];
+    transformed_input->Resize(framework::make_ddim(in_dims_vec));
+    transformed_input->mutable_data<T>(context.GetPlace());
+  } else if (dim == 1) {
+    transformed_input->Resize(input->dims());
+
+    auto in_dims_vec = framework::vectorize(input->dims());
+    in_dims_vec[1] = input->dims()[2];
+    in_dims_vec[2] = input->dims()[1];
+    transformed_input->Resize(framework::make_ddim(in_dims_vec));
+    transformed_input->mutable_data<T>(context.GetPlace());
+  }
+}
+
 template <typename DeviceContext, typename T>
 inline void TransToChannelLast(const framework::ExecutionContext& context,
                                const Tensor* input, Tensor* transformed_input) {
@@ -154,6 +190,16 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
       const framework::OpKernelType& expected_kernel_type) const override;
 };
 
+class BatchNormDoubleGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
 class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override;
@@ -168,6 +214,15 @@ class BatchNormGradMaker : public framework::SingleGradOpMaker<T> {
   void Apply(GradOpPtr<T> op) const override;
 };
 
+template <typename T>
+class BatchNormDoubleGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override;
+};
+
 class BatchNormOpInferVarType
     : public framework::PassInDtypeAndVarTypeToOutput {
  protected:
@@ -190,5 +245,11 @@ class BatchNormGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override;
 };
 
+template <typename DeviceContext, typename T>
+class BatchNormDoubleGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override;
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc
index f72f7e8b85b..a5b270c1dfe 100644
--- a/paddle/fluid/operators/instance_norm_op.cc
+++ b/paddle/fluid/operators/instance_norm_op.cc
@@ -595,9 +595,13 @@ class InstanceNormDoubleGradKernel<platform::CPUDeviceContext, T>
 
         first_grad_arr +=
             inv_var_tile_data *
-            (dy_arr - dy_arr.colwise().sum() / sample_size -
+            (dy_arr -
+             dy_arr.colwise().sum().replicate(sample_size, 1) / sample_size -
              x_sub_mean_mul_invstd_arr *
-                 (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() /
+                 (dy_arr * x_sub_mean_mul_invstd_arr)
+                     .colwise()
+                     .sum()
+                     .replicate(sample_size, 1) /
                  sample_size);
         first_grad_arr = first_grad_arr * ddx_arr;
         for (int nc = 0; nc < NxC; ++nc) {
diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h
new file mode 100644
index 00000000000..07333f1ae11
--- /dev/null
+++ b/paddle/fluid/operators/norm_utils.cu.h
@@ -0,0 +1,486 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <cfloat>
+#include <string>
+#include <vector>
+#include "cub/cub.cuh"
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using DataLayout = framework::DataLayout;
+
+// math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx,
+// axis=(n,h,w)) *
+//          np.sum(dy, axis=(n,h,w)) -
+//          np.sum(dy * ddx, axis=(n,h,w)) + 3 * np.mean(dy * (x -
+//          mean),
+//          axis=(n,h,w)) * inv_var.pow(2) *
+//          np.sum(ddx * (x - mean), axis=(n,h,w))) + inv_var.pow(3) /
+//          NxHxW *
+//          np.sum(ddx * (x - mean)) *
+//          (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
+//          np.sum(dy,
+//          axis=(n,h,w)) * (x - mean) *
+//          (np.mean(ddx, axis=(n,h,w)) - ddx) + ddr * (dy * inv_var -
+//          inv_var
+//          *
+//          np.mean(dy, axis=(n,h,w)) -
+//          inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
+//          axis=(n,h,w))))
+
+template <typename T, int BlockDim, framework::DataLayout layout>
+__global__ void DoubleGradComputeDX(const T *x, const T *mean,
+                                    const T *variance, const T *ddx,
+                                    const T *dy, const T *scale,
+                                    const T *ddscale, const int N, const int C,
+                                    const int sample_size, const double epsilon,
+                                    T *dx) {
+  const int outer_size = C;
+  const int inner_size = N * sample_size;
+
+  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage dy_storage;
+  __shared__ typename BlockReduce::TempStorage ddx_storage;
+  __shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage;
+  __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage;
+  __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage;
+  __shared__ T dy_sum_val;
+  __shared__ T ddx_sum_val;
+  __shared__ T dy_mul_ddx_sum_val;
+  __shared__ T dy_mul_x_sub_mean_sum_val;
+  __shared__ T ddx_mul_x_sub_mean_sum_val;
+
+  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
+    T mean_val = mean[i];
+    T var_val = variance[i];
+    T dy_sum = 0;
+    T ddx_sum = 0;
+    T dy_mul_ddx_sum = 0;
+    T dy_mul_x_sub_mean_sum = 0;
+    T ddx_mul_x_sub_mean_sum = 0;
+    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+      const int index =
+          layout == framework::DataLayout::kNCHW
+              ? (j / sample_size * C + i) * sample_size + j % sample_size
+              : j * outer_size + i;
+      T ddx_i = ddx[index];
+      T dy_i = dy[index];
+      T tmp = x[index] - mean_val;
+
+      dy_sum += dy_i;
+      ddx_sum += ddx_i;
+      dy_mul_ddx_sum += (ddx_i * dy_i);
+
+      dy_mul_x_sub_mean_sum += (dy_i * tmp);
+      ddx_mul_x_sub_mean_sum += (ddx_i * tmp);
+    }
+
+    dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
+    ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum());
+    dy_mul_ddx_sum =
+        BlockReduce(dy_mul_ddx_storage).Reduce(dy_mul_ddx_sum, cub::Sum());
+    dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage)
+                                .Reduce(dy_mul_x_sub_mean_sum, cub::Sum());
+    ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage)
+                                 .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum());
+
+    if (threadIdx.x == 0) {
+      dy_sum_val = dy_sum;
+      ddx_sum_val = ddx_sum;
+      dy_mul_ddx_sum_val = dy_mul_ddx_sum;
+      dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum;
+      ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum;
+    }
+    __syncthreads();
+
+    if (ddx != nullptr) {
+      for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+        const int index =
+            layout == framework::DataLayout::kNCHW
+                ? (j / sample_size * C + i) * sample_size + j % sample_size
+                : j * outer_size + i;
+        dx[index] +=
+            ((x[index] - mean_val) * var_val * var_val * var_val / inner_size *
+                 (ddx_sum_val * dy_sum_val / inner_size - dy_mul_ddx_sum_val +
+                  3. * dy_mul_x_sub_mean_sum_val * var_val *
+                      ddx_mul_x_sub_mean_sum_val * var_val / inner_size) +
+             ddx_mul_x_sub_mean_sum_val * var_val / inner_size * var_val *
+                 var_val * (dy_sum_val / inner_size - dy[index]) +
+             dy_mul_x_sub_mean_sum_val * var_val / inner_size * var_val *
+                 var_val * (ddx_sum_val / inner_size - ddx[index])) *
+            scale[i];
+      }
+    }
+    __syncthreads();
+    if (ddscale != nullptr) {
+      for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+        const int index =
+            layout == framework::DataLayout::kNCHW
+                ? (j / sample_size * C + i) * sample_size + j % sample_size
+                : j * outer_size + i;
+        dx[index] += (dy[index] * var_val - dy_sum_val / inner_size * var_val -
+                      (x[index] - mean_val) * var_val *
+                          dy_mul_x_sub_mean_sum_val * var_val / inner_size) *
+                     ddscale[i];
+      }
+    }
+  }
+}
+
+// math: ddy = (x - mean) * inv_var * ddscale + ddbias +
+//           scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) *
+//           np.mean(ddx * (x - mean), axis=(n,h,w)))
+template <typename T, int BlockDim, framework::DataLayout layout>
+__global__ void DoubleGradComputeDDY(const T *x, const T *mean,
+                                     const T *variance, const T *ddscale,
+                                     const T *ddbias, const T *ddx,
+                                     const T *scale, const int N, const int C,
+                                     const int sample_size,
+                                     const double epsilon, T *ddy) {
+  const int outer_size = C;
+  const int inner_size = N * sample_size;
+
+  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage ddx_storage;
+  __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage;
+  __shared__ T ddx_sum_val;
+  __shared__ T ddx_mul_x_sub_mean_sum_val;
+
+  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
+    T mean_val = mean[i];
+    T var_val = variance[i];
+    T ddx_sum = 0;
+    T ddx_mul_x_sub_mean_sum = 0;
+    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+      const int index =
+          layout == framework::DataLayout::kNCHW
+              ? (j / sample_size * C + i) * sample_size + j % sample_size
+              : j * outer_size + i;
+      T ddx_i = ddx[index];
+      ddx_sum += ddx_i;
+      ddx_mul_x_sub_mean_sum += (ddx_i * (x[index] - mean_val));
+    }
+    ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum());
+    ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage)
+                                 .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum());
+
+    if (threadIdx.x == 0) {
+      ddx_sum_val = ddx_sum;
+      ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum;
+    }
+    __syncthreads();
+
+    if (ddx != nullptr) {
+      for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+        const int index =
+            layout == framework::DataLayout::kNCHW
+                ? (j / sample_size * C + i) * sample_size + j % sample_size
+                : j * outer_size + i;
+        ddy[index] += scale[i] * var_val *
+                      (ddx[index] - ddx_sum_val / inner_size -
+                       (x[index] - mean_val) * var_val *
+                           ddx_mul_x_sub_mean_sum_val * var_val / inner_size);
+      }
+    }
+    __syncthreads();
+    if (ddscale != nullptr) {
+      for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+        const int index =
+            layout == framework::DataLayout::kNCHW
+                ? (j / sample_size * C + i) * sample_size + j % sample_size
+                : j * outer_size + i;
+        ddy[index] += (x[index] - mean_val) * var_val * ddscale[i];
+      }
+    }
+    __syncthreads();
+    if (ddbias != nullptr) {
+      for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+        const int index =
+            layout == framework::DataLayout::kNCHW
+                ? (j / sample_size * C + i) * sample_size + j % sample_size
+                : j * outer_size + i;
+        ddy[index] += ddbias[i];
+      }
+    }
+  }
+}
+
+// math: dscale = inv_var * (dy - np.mean(dy, axis=(n,h,w) - (x-mean) *
+//            inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) *
+//            ddx
+template <typename T, int BlockDim, framework::DataLayout layout>
+__global__ void DoubleGradComputeDScale(const T *x, const T *mean,
+                                        const T *variance, const T *ddx,
+                                        const T *dy, const int N, const int C,
+                                        const int sample_size,
+                                        const double epsilon, T *dscale) {
+  const int outer_size = C;
+  const int inner_size = N * sample_size;
+
+  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage dy_storage;
+  __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage;
+  __shared__ typename BlockReduce::TempStorage dscale_tmp_storage;
+  __shared__ T dy_sum_val;
+  __shared__ T dy_mul_x_sub_mean_sum_val;
+
+  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
+    T dy_sum = 0;
+    T dy_mul_x_sub_mean_sum = 0;
+    T mean_val = mean[i];
+    T var_val = variance[i];
+    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+      const int index =
+          layout == framework::DataLayout::kNCHW
+              ? (j / sample_size * C + i) * sample_size + j % sample_size
+              : j * outer_size + i;
+      T dy_i = dy[index];
+      dy_sum += dy_i;
+      dy_mul_x_sub_mean_sum += (dy_i * (x[index] - mean_val));
+    }
+    dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
+    dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage)
+                                .Reduce(dy_mul_x_sub_mean_sum, cub::Sum());
+
+    if (threadIdx.x == 0) {
+      dy_sum_val = dy_sum;
+      dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum;
+    }
+    __syncthreads();
+
+    if (ddx != nullptr) {
+      T dscale_tmp = 0;
+      for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+        const int index =
+            layout == framework::DataLayout::kNCHW
+                ? (j / sample_size * C + i) * sample_size + j % sample_size
+                : j * outer_size + i;
+        dscale_tmp += ddx[index] * var_val *
+                      (dy[index] - dy_sum_val / inner_size -
+                       dy_mul_x_sub_mean_sum_val * (x[index] - mean_val) *
+                           var_val * var_val / inner_size);
+      }
+      dscale_tmp =
+          BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum());
+
+      if (threadIdx.x == 0) {
+        dscale[i] += dscale_tmp;
+      }
+      __syncthreads();
+    }
+  }
+}
+
+// math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var
+template <typename T, int BlockDim, framework::DataLayout layout>
+__global__ void DoubleGradComputeDScaleWithGlobal(
+    const T *ddx, const T *variance, const T *dy, const double epsilon,
+    const int N, const int C, const int sample_size, T *dscale) {
+  int outer_size = C;
+  int inner_size = N * sample_size;
+  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage ddx_mul_dy_storage;
+  __shared__ T ddx_mul_dy_sum_val;
+  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
+    T inv_var_i = 1.0 / sqrt(variance[i] + epsilon);
+    T ddx_mul_dy_sum = 0;
+    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+      const int index =
+          layout == framework::DataLayout::kNCHW
+              ? (j / sample_size * C + i) * sample_size + j % sample_size
+              : j * outer_size + i;
+      T ddx_i = ddx[index];
+      T dy_i = dy[index];
+      ddx_mul_dy_sum += (ddx_i * dy_i);
+    }
+    ddx_mul_dy_sum =
+        BlockReduce(ddx_mul_dy_storage).Reduce(ddx_mul_dy_sum, cub::Sum());
+    if (threadIdx.x == 0) {
+      ddx_mul_dy_sum_val = ddx_mul_dy_sum;
+    }
+    __syncthreads();
+
+    if (ddx != nullptr) {
+      dscale[i] = inv_var_i * ddx_mul_dy_sum_val;
+    }
+  }
+}
+
+// math: dx = ddscale * dy * inv_var
+// math: ddy = scale * ddx * inv_var
+template <typename T, framework::DataLayout layout>
+__global__ void DoubleGradComputeDataWithGlobal(
+    const T *dy, const T *scale, const T *variance, const double epsilon,
+    const int C, const int sample_size, const int num, T *dx) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  if (scale != nullptr) {
+    for (int i = gid; i < num; i += stride) {
+      const int c =
+          layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C;
+      T inv_var = 1.0 / sqrt(variance[c] + epsilon);
+      dx[i] = dy[i] * scale[c] * inv_var;
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
+                           const DataLayout data_layout, const Tensor *X,
+                           const Tensor *Scale, const Tensor *dY,
+                           const Tensor *Saved_mean,
+                           const Tensor *Saved_variance, const double epsilon,
+                           const bool use_global_stats, const Tensor *ddX,
+                           const Tensor *ddScale, const Tensor *ddBias,
+                           Tensor *dX, Tensor *dScale, Tensor *ddY) {
+  const T *x_data = X->data<T>();
+  const T *dy_data = dY->data<T>();
+  const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data<T>());
+
+  const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data<T>());
+  const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data<T>());
+
+  auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+  math::SetConstant<platform::CUDADeviceContext, T> set_constant;
+
+  auto &x_dims = X->dims();
+  const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
+                                                  : x_dims[x_dims.size() - 1]);
+  const int N = x_dims[0];
+  const int num = X->numel();
+  const int sample_size = num / N / C;
+  Tensor scale_tmp;
+  if (!Scale) {
+    scale_tmp.mutable_data<T>({C}, ctx.GetPlace());
+    set_constant(dev_ctx, &scale_tmp, static_cast<T>(1));
+  }
+  const T *scale_data = Scale ? Scale->data<T>() : scale_tmp.data<T>();
+
+  const int block = 512;
+  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+  const int max_blocks = std::max(max_threads / block, 1);
+  int grid = std::min(C, max_blocks);
+  int grid1 = (num + block - 1) / block;
+
+  const T *mean_data, *variance_data;
+  if (use_global_stats) {
+    const auto *running_var = ctx.Input<Tensor>("Variance");
+    const auto *running_var_data = running_var->template data<T>();
+    variance_data = running_var_data;
+  } else {
+    const T *smean_data = Saved_mean->data<T>();
+    const T *svariance_data = Saved_variance->data<T>();
+    mean_data = smean_data;
+    variance_data = svariance_data;
+  }
+
+  if (dX) {
+    T *dx_data = dX->mutable_data<T>(ctx.GetPlace());
+    set_constant(dev_ctx, dX, static_cast<T>(0));
+    if (use_global_stats) {
+      if (data_layout == DataLayout::kNHWC) {
+        DoubleGradComputeDataWithGlobal<
+            T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>(
+            dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
+            dx_data);
+      } else {
+        DoubleGradComputeDataWithGlobal<
+            T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>(
+            dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
+            dx_data);
+      }
+    } else {
+      if (data_layout == DataLayout::kNHWC) {
+        DoubleGradComputeDX<
+            T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
+            x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
+            ddscale_data, N, C, sample_size, epsilon, dx_data);
+      } else {
+        DoubleGradComputeDX<
+            T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
+            x_data, mean_data, variance_data, ddx_data, dy_data, scale_data,
+            ddscale_data, N, C, sample_size, epsilon, dx_data);
+      }
+    }
+  }
+  if (dScale) {
+    T *dscale_data = dScale->mutable_data<T>(ctx.GetPlace());
+    set_constant(dev_ctx, dScale, static_cast<T>(0));
+    if (use_global_stats) {
+      if (data_layout == DataLayout::kNHWC) {
+        DoubleGradComputeDScaleWithGlobal<
+            T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
+            ddx_data, variance_data, dy_data, epsilon, N, C, sample_size,
+            dscale_data);
+      } else {
+        DoubleGradComputeDScaleWithGlobal<
+            T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
+            ddx_data, variance_data, dy_data, epsilon, N, C, sample_size,
+            dscale_data);
+      }
+    } else {
+      if (data_layout == DataLayout::kNHWC) {
+        DoubleGradComputeDScale<
+            T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
+            x_data, mean_data, variance_data, ddx_data, dy_data, N, C,
+            sample_size, epsilon, dscale_data);
+      } else {
+        DoubleGradComputeDScale<
+            T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
+            x_data, mean_data, variance_data, ddx_data, dy_data, N, C,
+            sample_size, epsilon, dscale_data);
+      }
+    }
+  }
+  if (ddY) {
+    T *ddy_data = ddY->mutable_data<T>(ctx.GetPlace());
+    set_constant(dev_ctx, ddY, static_cast<T>(0));
+    if (use_global_stats) {
+      if (data_layout == DataLayout::kNHWC) {
+        DoubleGradComputeDataWithGlobal<
+            T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>(
+            ddx_data, scale_data, variance_data, epsilon, C, sample_size, num,
+            ddy_data);
+      } else {
+        DoubleGradComputeDataWithGlobal<
+            T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>(
+            ddx_data, scale_data, variance_data, epsilon, C, sample_size, num,
+            ddy_data);
+      }
+    } else {
+      if (data_layout == DataLayout::kNHWC) {
+        DoubleGradComputeDDY<
+            T, block, DataLayout::kNHWC><<<grid, block, 0, dev_ctx.stream()>>>(
+            x_data, mean_data, variance_data, ddscale_data, ddbias_data,
+            ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data);
+      } else {
+        DoubleGradComputeDDY<
+            T, block, DataLayout::kNCHW><<<grid, block, 0, dev_ctx.stream()>>>(
+            x_data, mean_data, variance_data, ddscale_data, ddbias_data,
+            ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data);
+      }
+    }
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 4a750f301a0..3e7d10f8d1a 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -3167,7 +3167,7 @@ def instance_norm(input,
 
     param_shape = [channel_num]
 
-    if param_attr and bias_attr:
+    if param_attr != False and bias_attr != False:
         # create parameter
         scale = helper.create_parameter(
             attr=helper.param_attr,
@@ -3190,7 +3190,7 @@ def instance_norm(input,
     instance_norm_out = helper.create_variable_for_type_inference(dtype)
 
     inputs = {"X": input}
-    if param_attr and bias_attr:
+    if param_attr != False and bias_attr != False:
         inputs["Scale"] = scale
         inputs["Bias"] = bias
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
index 720c9f95c25..39c6fca89cc 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
@@ -346,7 +346,7 @@ class TestRaiseNoDoubleGradOp(TestCase):
         with fluid.dygraph.guard():
             x = fluid.layers.ones(shape=[2, 3, 2, 2], dtype='float32')
             x.stop_gradient = False
-            y = paddle.fluid.layers.batch_norm(x)
+            y = paddle.fluid.layers.group_norm(x, groups=1)
 
             dx = fluid.dygraph.grad(
                 outputs=[y], inputs=[x], create_graph=True,
diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
index c44ea454271..a89b9fde7f9 100644
--- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
@@ -68,5 +68,67 @@ class TestInstanceNormDoubleGradCheckWithoutParamBias(
                 [x], z, x_init=x_arr, atol=atol, place=place, eps=eps)
 
 
+class TestBatchNormDoubleGradCheck(unittest.TestCase):
+    def setUp(self):
+        self.init_test()
+
+    def init_test(self):
+        self.data_layout = 'NCHW'
+        self.use_global_stats = False
+        self.shape = [2, 3, 4, 5]
+
+    @prog_scope()
+    def func(self, place):
+        prog = fluid.Program()
+        with fluid.program_guard(prog):
+            np.random.seed()
+            dtype = "float32"
+            eps = 0.005
+            atol = 1e-4
+            x = layers.create_parameter(dtype=dtype, shape=self.shape, name='x')
+            z = fluid.layers.batch_norm(
+                input=x,
+                data_layout=self.data_layout,
+                use_global_stats=self.use_global_stats)
+            x_arr = np.random.uniform(-1, 1, self.shape).astype(dtype)
+            gradient_checker.double_grad_check(
+                [x], z, x_init=x_arr, atol=atol, place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
+class TestBatchNormDoubleGradCheckCase1(TestBatchNormDoubleGradCheck):
+    def init_test(self):
+        self.data_layout = 'NHWC'
+        self.use_global_stats = False
+        self.shape = [2, 3, 4, 5]
+
+
+class TestBatchNormDoubleGradCheckCase2(TestBatchNormDoubleGradCheck):
+    def init_test(self):
+        self.data_layout = 'NCHW'
+        self.use_global_stats = True
+        self.shape = [2, 3, 4, 5]
+
+
+class TestBatchNormDoubleGradCheckCase3(TestBatchNormDoubleGradCheck):
+    def init_test(self):
+        self.data_layout = 'NHWC'
+        self.use_global_stats = True
+        self.shape = [2, 3, 4, 5]
+
+
+class TestBatchNormDoubleGradCheckCase4(TestBatchNormDoubleGradCheck):
+    def init_test(self):
+        self.data_layout = 'NCHW'
+        self.use_global_stats = False
+        self.shape = [2, 2, 3, 4, 5]
+
+
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab


From f11a53ee7626138e3d096c4c58953f78fa39dc3e Mon Sep 17 00:00:00 2001
From: LutaoChu <30695251+LutaoChu@users.noreply.github.com>
Date: Mon, 21 Sep 2020 21:26:30 +0800
Subject: [PATCH 159/261] Optimize argsort Op performance on GPU
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* argsort op acceleration on GPU when the input size is equal to the length of the ‘axis’ dimension
---
 paddle/fluid/operators/argsort_op.cu          |  86 ++++++++++++---
 .../fluid/tests/unittests/test_argsort_op.py  | 100 +++++++++++++-----
 2 files changed, 142 insertions(+), 44 deletions(-)

diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu
index cbd7e33bc6b..7fc2a92b7d9 100644
--- a/paddle/fluid/operators/argsort_op.cu
+++ b/paddle/fluid/operators/argsort_op.cu
@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <thrust/copy.h>
 #include <thrust/execution_policy.h>
+#include <thrust/sequence.h>
 #include <thrust/sort.h>
 #include "cub/cub.cuh"
 #include "paddle/fluid/framework/op_registry.h"
@@ -58,6 +60,16 @@ static __global__ void FillIndex(T* indices, T num_rows, T num_cols) {
   }
 }
 
+template <typename T, typename IndType>
+static __global__ void FillFlattenGrad(const T* dO, const IndType* indices,
+                                       int64_t size, T* dX) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  int stride = blockDim.x * gridDim.x;
+  for (int i = index; i < size; i += stride) {
+    dX[indices[i]] = dO[i];
+  }
+}
+
 template <typename T, typename IndType>
 static __global__ void FillGrad(const T* dO, const IndType* indices, T* dX,
                                 IndType num_rows, IndType num_cols) {
@@ -193,6 +205,23 @@ void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO,
 }
 
 template <typename T>
+void ArgFlattenAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO,
+                      const Tensor* indices, int64_t size, Tensor* dX) {
+  auto cu_stream = ctx.stream();
+
+  const int64_t block_size =
+      std::min(size, static_cast<int64_t>(ctx.GetMaxThreadsPerBlock()));
+  int64_t max_threads = ctx.GetMaxPhysicalThreadCount();
+  const int64_t max_blocks =
+      std::max(((max_threads - 1) / block_size + 1), static_cast<int64_t>(1));
+  const int64_t grid_size =
+      std::min(max_blocks, (size + block_size - 1) / block_size);
+
+  FillFlattenGrad<<<grid_size, block_size, 0, cu_stream>>>(
+      dO->data<T>(), indices->data<int64_t>(), size, dX->data<T>());
+}
+
+template <typename DeviceContext, typename T>
 class ArgsortOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -205,8 +234,25 @@ class ArgsortOpCUDAKernel : public framework::OpKernel<T> {
     auto in_dims = input->dims();
     axis = (axis < 0) ? (in_dims.size() + axis) : axis;
 
-    int64_t numel = input->numel();
-    int64_t groups = numel / in_dims[axis];
+    const T* in_data = input->data<T>();
+    auto size = input->numel();
+    T* out_data = output->mutable_data<T>(ctx.GetPlace());
+    int64_t* ids_data = indices->mutable_data<int64_t>(ctx.GetPlace());
+
+    // Use thrust for parallel acceleration when the input size is equal to the
+    // length of the ‘axis’ dimension.
+    // Compared to the following 'Special case for full sort', ascending sort is
+    // 34 times faster and descending sort is 31 times faster.
+    if (size == in_dims[axis]) {
+      thrust::sequence(thrust::device, ids_data, ids_data + size);
+      thrust::copy(thrust::device, in_data, in_data + size, out_data);
+      thrust::sort_by_key(thrust::device, out_data, out_data + size, ids_data);
+      if (descending) {
+        thrust::reverse(thrust::device, out_data, out_data + size);
+        thrust::reverse(thrust::device, ids_data, ids_data + size);
+      }
+      return;
+    }
 
     // Special case for full sort, speedup ~190x.
     if (axis == -1 || axis + 1 == in_dims.size()) {
@@ -276,23 +322,28 @@ class ArgsortGradOpCUDAKernel : public framework::OpKernel<T> {
     int axis = ctx.Attr<int>("axis");
 
     dX->mutable_data<T>(ctx.GetPlace());
-    auto dxt = framework::EigenVector<T>::Flatten(*dX);
-    auto& place = *ctx.template device_context<platform::CUDADeviceContext>()
-                       .eigen_device();
-    dxt.device(place) = dxt.constant(static_cast<T>(0));
     if (dO->numel() == 0) return;
 
-    auto in_dims = indices->dims();
+    auto in_dims = dX->dims();
     axis = (axis < 0) ? (in_dims.size() + axis) : axis;
 
-    int64_t numel = indices->numel();
+    int64_t size = dX->numel();
+    const auto& dev_ctx = ctx.cuda_device_context();
+
+    // Parallel acceleration when the input size is equal to the length of the
+    // ‘axis’ dimension.
+    // Compared to 'special case for full sort' below, the gradient calculation
+    // is 10 times faster.
+    if (size == in_dims[axis]) {
+      ArgFlattenAssign<T>(dev_ctx, dO, indices, size, dX);
+      return;
+    }
 
     // Special case for full sort, speedup ~190x.
     if (axis == -1 || axis + 1 == in_dims.size()) {
       const int64_t input_height = framework::product(
           framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
       const int64_t input_width = in_dims[in_dims.size() - 1];
-      const auto& dev_ctx = ctx.cuda_device_context();
       ArgFullAssign<T, int64_t>(dev_ctx, dO, indices, dX, input_height,
                                 input_width);
     } else {
@@ -316,7 +367,6 @@ class ArgsortGradOpCUDAKernel : public framework::OpKernel<T> {
       Tensor trans_ind;
       trans_ind.mutable_data<int64_t>(trans_dims, ctx.GetPlace());
       int ndims = trans.size();
-      const auto& dev_ctx = ctx.cuda_device_context();
       // Do transpose
       TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *dO,
                                                    &trans_dO, trans);
@@ -345,11 +395,17 @@ class ArgsortGradOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 REGISTER_OP_CUDA_KERNEL(
-    argsort, paddle::operators::ArgsortOpCUDAKernel<float>,
-    paddle::operators::ArgsortOpCUDAKernel<double>,
-    paddle::operators::ArgsortOpCUDAKernel<int>,
-    paddle::operators::ArgsortOpCUDAKernel<int64_t>,
-    paddle::operators::ArgsortOpCUDAKernel<paddle::platform::float16>);
+    argsort,
+    paddle::operators::ArgsortOpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                           float>,
+    paddle::operators::ArgsortOpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                           double>,
+    paddle::operators::ArgsortOpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                           int>,
+    paddle::operators::ArgsortOpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                           int64_t>,
+    paddle::operators::ArgsortOpCUDAKernel<paddle::platform::CUDADeviceContext,
+                                           paddle::platform::float16>);
 REGISTER_OP_CUDA_KERNEL(
     argsort_grad, paddle::operators::ArgsortGradOpCUDAKernel<float>,
     paddle::operators::ArgsortGradOpCUDAKernel<double>,
diff --git a/python/paddle/fluid/tests/unittests/test_argsort_op.py b/python/paddle/fluid/tests/unittests/test_argsort_op.py
index 2a8e0e6c7f0..e324f0ec3d3 100644
--- a/python/paddle/fluid/tests/unittests/test_argsort_op.py
+++ b/python/paddle/fluid/tests/unittests/test_argsort_op.py
@@ -348,57 +348,99 @@ class TestArgsortErrorOnGPU(TestArgsortErrorOnCPU):
 
 
 class TestArgsort(unittest.TestCase):
+    def init(self):
+        self.input_shape = [10000, ]
+        self.axis = 0
+
     def setUp(self):
+        self.init()
         if core.is_compiled_with_cuda():
             self.place = core.CUDAPlace(0)
         else:
             self.place = core.CPUPlace()
-        self.data = np.random.rand(2, 3, 4).astype("float32")
+        self.data = np.random.rand(*self.input_shape)
 
-    def test_api_0(self):
+    def test_api(self):
         with fluid.program_guard(fluid.Program()):
-            input = fluid.data(name="input", shape=[2, 3, 4], dtype="float32")
-            output = paddle.argsort(x=input)
-            exe = fluid.Executor(self.place)
-            result, = exe.run(feed={'input': self.data}, fetch_list=[output])
-            np_result = np.argsort(self.data)
-            self.assertEqual((result == np_result).all(), True)
+            input = fluid.data(
+                name="input", shape=self.input_shape, dtype="float64")
+
+            output = paddle.argsort(input, axis=self.axis)
+            output2 = paddle.argsort(input, axis=self.axis, descending=True)
 
-    def test_api_1(self):
-        with fluid.program_guard(fluid.Program()):
-            input = fluid.data(name="input", shape=[2, 3, 4], dtype="float32")
-            output = paddle.argsort(x=input, axis=1)
             exe = fluid.Executor(self.place)
-            result, = exe.run(feed={'input': self.data}, fetch_list=[output])
-            np_result = np.argsort(self.data, axis=1)
+            result, result2 = exe.run(feed={'input': self.data},
+                                      fetch_list=[output, output2])
+
+            np_result = np.argsort(self.data, axis=self.axis)
             self.assertEqual((result == np_result).all(), True)
 
+            np_result2 = np.argsort(-self.data, axis=self.axis)
+            self.assertEqual((result2 == np_result2).all(), True)
+
+
+class TestArgsort2(TestArgsort):
+    def init(self):
+        self.input_shape = [10000, 1]
+        self.axis = 0
+
+
+class TestArgsort3(TestArgsort):
+    def init(self):
+        self.input_shape = [1, 10000]
+        self.axis = 1
+
+
+class TestArgsort4(TestArgsort):
+    def init(self):
+        self.input_shape = [2, 3, 4]
+        self.axis = 1
+
+
+class TestArgsortImperative(unittest.TestCase):
+    def init(self):
+        self.input_shape = [10000, ]
+        self.axis = 0
 
-class TestArgsortDygraph(unittest.TestCase):
     def setUp(self):
-        self.input_data = np.random.rand(10, 10)
+        self.init()
+        self.input_data = np.random.rand(*self.input_shape)
         if core.is_compiled_with_cuda():
             self.place = core.CUDAPlace(0)
         else:
             self.place = core.CPUPlace()
 
-    def test_api_0(self):
+    def test_api(self):
         paddle.disable_static(self.place)
-        var_x = paddle.to_variable(self.input_data)
-        out = paddle.argsort(var_x)
-        self.assertEqual((np.argsort(self.input_data) == out.numpy()).all(),
-                         True)
-        paddle.enable_static()
+        var_x = paddle.to_tensor(self.input_data)
+        out = paddle.argsort(var_x, axis=self.axis)
+        expect = np.argsort(self.input_data, axis=self.axis)
+        self.assertEqual((expect == out.numpy()).all(), True)
+
+        out2 = paddle.argsort(var_x, axis=self.axis, descending=True)
+        expect2 = np.argsort(-self.input_data, axis=self.axis)
+        self.assertEqual((expect2 == out2.numpy()).all(), True)
 
-    def test_api_1(self):
-        paddle.disable_static(self.place)
-        var_x = paddle.to_variable(self.input_data)
-        out = paddle.argsort(var_x, axis=-1)
-        self.assertEqual(
-            (np.argsort(
-                self.input_data, axis=-1) == out.numpy()).all(), True)
         paddle.enable_static()
 
 
+class TestArgsortImperative2(TestArgsortImperative):
+    def init(self):
+        self.input_shape = [10000, 1]
+        self.axis = 0
+
+
+class TestArgsortImperative3(TestArgsortImperative):
+    def init(self):
+        self.input_shape = [1, 10000]
+        self.axis = 1
+
+
+class TestArgsortImperative2(TestArgsortImperative):
+    def init(self):
+        self.input_shape = [2, 3, 4]
+        self.axis = 1
+
+
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab


From 13a4c74efd34e3ec8445792642c633f20ae331fb Mon Sep 17 00:00:00 2001
From: furnace <34057289+windstamp@users.noreply.github.com>
Date: Mon, 21 Sep 2020 22:00:01 +0800
Subject: [PATCH 160/261] add mv op(c++, python, unit test) (#27024)

---
 paddle/fluid/operators/mv_op.cc               | 125 ++++++++++++++++++
 paddle/fluid/operators/mv_op.cu               |  95 +++++++++++++
 paddle/fluid/operators/mv_op.h                | 105 +++++++++++++++
 python/paddle/__init__.py                     |   1 +
 .../fluid/tests/unittests/test_mv_op.py       |  94 +++++++++++++
 python/paddle/tensor/__init__.py              |   1 +
 python/paddle/tensor/linalg.py                |  64 ++++++++-
 7 files changed, 484 insertions(+), 1 deletion(-)
 create mode 100644 paddle/fluid/operators/mv_op.cc
 create mode 100644 paddle/fluid/operators/mv_op.cu
 create mode 100644 paddle/fluid/operators/mv_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_mv_op.py

diff --git a/paddle/fluid/operators/mv_op.cc b/paddle/fluid/operators/mv_op.cc
new file mode 100644
index 00000000000..1339982adaa
--- /dev/null
+++ b/paddle/fluid/operators/mv_op.cc
@@ -0,0 +1,125 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/mv_op.h"
+namespace paddle {
+namespace operators {
+
+class MVOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The matrix input of mv op");
+    AddInput("Vec", "The vector input of mv op");
+    AddOutput("Out", "The output of mv op");
+    AddComment(R"DOC(
+MV Operator.
+
+This operator is used to perform matrix vector multiplication
+of the input tensors `X` and `Vec`.
+)DOC");
+  }
+};
+
+class MVOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *context) const override {
+    OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "mv");
+    OP_INOUT_CHECK(context->HasInput("Vec"), "Input", "Vec", "mv");
+    OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "mv");
+
+    auto dim_x = context->GetInputDim("X");
+    auto dim_y = context->GetInputDim("Vec");
+    PADDLE_ENFORCE_EQ(
+        dim_x.size(), 2,
+        platform::errors::InvalidArgument(
+            "The rank of input X should be 2, but is %d", dim_x.size()));
+    PADDLE_ENFORCE_EQ(
+        dim_y.size(), 1,
+        platform::errors::InvalidArgument(
+            "The rank of input Vec should be 1, but is %d", dim_y.size()));
+    PADDLE_ENFORCE_EQ(dim_x[1] == dim_y[0], true,
+                      platform::errors::InvalidArgument(
+                          "The length of input X' second dim should equal the "
+                          "length of input Vec,"
+                          " but X[%d, %d], Vec[%d]",
+                          dim_x[0], dim_x[1], dim_y[0]));
+
+    framework::DDim dim_out = framework::make_ddim({dim_x[0]});
+
+    context->SetOutputDim("Out", dim_out);
+    context->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+template <typename T>
+class MVOpGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> retv) const override {
+    retv->SetType("mv_grad");
+    retv->SetInput("X", this->Input("X"));
+    retv->SetInput("Vec", this->Input("Vec"));
+    retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    retv->SetOutput(framework::GradVarName("Vec"), this->InputGrad("Vec"));
+  }
+};
+
+class MVOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *context) const override {
+    OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "mv");
+    OP_INOUT_CHECK(context->HasInput("Vec"), "Input", "Vec", "mv");
+    OP_INOUT_CHECK(context->HasInput(framework::GradVarName("Out")), "Input",
+                   "Out@GRAD", "mv");
+    auto x_dims = context->GetInputDim("X");
+    auto vec_dims = context->GetInputDim("Vec");
+
+    auto x_grad_name = framework::GradVarName("X");
+    auto vec_grad_name = framework::GradVarName("Vec");
+
+    if (context->HasOutput(x_grad_name)) {
+      context->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (context->HasOutput(vec_grad_name)) {
+      context->SetOutputDim(vec_grad_name, vec_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OPERATOR(mv, ops::MVOp, ops::MVOpMaker,
+                  ops::MVOpGradMaker<paddle::framework::OpDesc>,
+                  ops::MVOpGradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(mv_grad, ops::MVOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    mv, ops::MVKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MVKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    mv_grad, ops::MVGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MVGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/mv_op.cu b/paddle/fluid/operators/mv_op.cu
new file mode 100644
index 00000000000..9a16fe025cd
--- /dev/null
+++ b/paddle/fluid/operators/mv_op.cu
@@ -0,0 +1,95 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/mv_op.h"
+#include "paddle/fluid/platform/gpu_launch_param_config.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void MVGradCUDAKernel(const int m, const int n, const T *dout,
+                                 const T *vec, T *dx) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  for (; idx < m * n; idx += blockDim.x * gridDim.x) {
+    int i = idx / n;
+    int j = idx % n;
+    dx[idx] = dout[i] * vec[j];
+  }
+}
+
+// Using dimensional constraints on matrix multiplication, it is
+// straight-forward to check the following table for when X and Y
+// are both matrices.
+//
+// dX = | dOut Vec^T
+// dVec = | X^T dOut
+template <typename T>
+class MVGradKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *x = context.Input<framework::Tensor>("X");
+    auto *vec = context.Input<framework::Tensor>("Vec");
+    auto *dout =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto *dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto *dvec =
+        context.Output<framework::Tensor>(framework::GradVarName("Vec"));
+
+    auto dim_x = x->dims();
+    int m = dim_x[0];
+    int n = dim_x[1];
+
+    dx->Resize(framework::make_ddim({m * n}));
+
+    // get data ptr
+    const T *x_data = x->data<T>();
+    const T *vec_data = vec->data<T>();
+    const T *dout_data = dout->data<T>();
+
+    T *dx_data = dx->mutable_data<T>(context.GetPlace());
+    T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
+
+    auto &dev_ctx =
+        context.template device_context<platform::CUDADeviceContext>();
+    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
+
+    // calculate dx
+    auto stream = context.cuda_device_context().stream();
+    auto config = GetGpuLaunchConfig1D(dev_ctx, m * n);
+    MVGradCUDAKernel<
+        T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
+        m, n, dout_data, vec_data, dx_data);
+
+    dx->Resize(framework::make_ddim({m, n}));
+
+    // calculate dvec
+    blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
+              static_cast<T>(0), dvec_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    mv, ops::MVKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MVKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    mv_grad, ops::MVGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MVGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/mv_op.h b/paddle/fluid/operators/mv_op.h
new file mode 100644
index 00000000000..3c63f3640ff
--- /dev/null
+++ b/paddle/fluid/operators/mv_op.h
@@ -0,0 +1,105 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class MVKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *x = context.Input<framework::Tensor>("X");
+    auto *vec = context.Input<framework::Tensor>("Vec");
+
+    auto *out = context.Output<framework::Tensor>("Out");
+
+    auto dim_x = x->dims();
+
+    // get data ptr
+    const T *x_data = x->data<T>();
+    const T *vec_data = vec->data<T>();
+    T *out_data = out->mutable_data<T>(context.GetPlace());
+
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+
+    blas.GEMV(false, dim_x[0], dim_x[1], static_cast<T>(1), x_data, vec_data,
+              static_cast<T>(0), out_data);
+  }
+};
+
+// Using dimensional constraints on matrix multiplication, it is
+// straight-forward to check the following table for when X and Y
+// are both matrices.
+//
+// dX = | dOut vec^T
+// dVec = | X^T dOut
+template <typename DeviceContext, typename T>
+class MVGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *x = context.Input<framework::Tensor>("X");
+    auto *vec = context.Input<framework::Tensor>("Vec");
+    auto *dout =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto *dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto *dvec =
+        context.Output<framework::Tensor>(framework::GradVarName("Vec"));
+
+    auto dim_x = x->dims();
+    int m = dim_x[0];
+    int n = dim_x[1];
+
+    dx->Resize(framework::make_ddim({m * n}));
+
+    // get data ptr
+    const T *x_data = x->data<T>();
+    const T *vec_data = vec->data<T>();
+    const T *dout_data = dout->data<T>();
+
+    T *dx_data = dx->mutable_data<T>(context.GetPlace());
+    T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
+
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+
+    // calculate dx
+    for (int i = 0; i < m; ++i) {
+      for (int j = 0; j < n; ++j)
+        dx_data[i * n + j] = dout_data[i] * vec_data[j];
+    }
+
+    dx->Resize(framework::make_ddim({m, n}));
+
+    // calculate dvec
+    blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
+              static_cast<T>(0), dvec_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 661471599cb..1e0dc0e07b4 100755
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -90,6 +90,7 @@ from .tensor.linalg import cholesky  #DEFINE_ALIAS
 # from .tensor.linalg import tensordot        #DEFINE_ALIAS
 from .tensor.linalg import bmm  #DEFINE_ALIAS
 from .tensor.linalg import histogram  #DEFINE_ALIAS
+from .tensor.linalg import mv  #DEFINE_ALIAS
 from .tensor.logic import equal  #DEFINE_ALIAS
 from .tensor.logic import greater_equal  #DEFINE_ALIAS
 from .tensor.logic import greater_than  #DEFINE_ALIAS
diff --git a/python/paddle/fluid/tests/unittests/test_mv_op.py b/python/paddle/fluid/tests/unittests/test_mv_op.py
new file mode 100644
index 00000000000..6b930e59aa5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_mv_op.py
@@ -0,0 +1,94 @@
+#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+import paddle.fluid.core as core
+from op_test import OpTest
+
+
+class TestMVOp(OpTest):
+    def setUp(self):
+        self.op_type = "mv"
+        self.init_config()
+        self.inputs = {'X': self.x, 'Vec': self.vec}
+        self.outputs = {'Out': np.dot(self.x, self.vec)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X', 'Vec'], 'Out')
+
+    def init_config(self):
+        self.x = np.random.random((5, 100)).astype("float64")
+        self.vec = np.random.random((100)).astype("float64")
+
+
+class TestMVAPI(unittest.TestCase):
+    def test_dygraph_api_out(self):
+        paddle.disable_static()
+
+        self.x_data = np.random.random((5, 100)).astype("float64")
+        self.x = paddle.to_tensor(self.x_data)
+        self.vec_data = np.random.random((100)).astype("float64")
+        self.vec = paddle.to_tensor(self.vec_data)
+        z = paddle.mv(self.x, self.vec)
+        np_z = z.numpy()
+        z_expected = np.array(np.dot(self.x_data, self.vec_data))
+        self.assertTrue(np.allclose(np_z, z_expected))
+
+        paddle.enable_static()
+
+    def test_static_graph(self):
+        paddle.enable_static()
+
+        self.input_x = np.random.rand(5, 100).astype("float64")
+        self.input_vec = np.random.rand(100).astype("float64")
+
+        data_x = paddle.static.data("x", shape=[5, 100], dtype="float64")
+        data_vec = paddle.static.data("vec", shape=[100], dtype="float64")
+        result_vec = paddle.mv(data_x, data_vec)
+        self.place = paddle.CPUPlace()
+        exe = paddle.static.Executor(self.place)
+        res, = exe.run(feed={"x": self.input_x,
+                             "vec": self.input_vec},
+                       fetch_list=[result_vec])
+        z_expected = np.array(np.dot(self.input_x, self.input_vec))
+        self.assertTrue(np.allclose(res, z_expected))
+
+
+class TestMVError(unittest.TestCase):
+    def test_input(self):
+        def test_shape():
+            paddle.enable_static()
+
+            self.input_x = np.random.rand(5, 100).astype("float64")
+            self.input_vec = np.random.rand(100).astype("float64")
+
+            data_x = paddle.static.data("x", shape=[5, 100], dtype="float64")
+            data_vec = paddle.static.data(
+                "vec", shape=[100, 2], dtype="float64")
+            result_vec = paddle.mv(data_x, data_vec)
+
+        self.assertRaises(ValueError, test_shape)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index a713663e182..2df9473c4b2 100755
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -56,6 +56,7 @@ from .linalg import cholesky  #DEFINE_ALIAS
 # from .linalg import tensordot        #DEFINE_ALIAS
 from .linalg import bmm  #DEFINE_ALIAS
 from .linalg import histogram  #DEFINE_ALIAS
+from .linalg import mv  #DEFINE_ALIAS
 from .logic import equal  #DEFINE_ALIAS
 from .logic import greater_equal  #DEFINE_ALIAS
 from .logic import greater_than  #DEFINE_ALIAS
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 67e3ce21ffb..f27cfba487d 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -32,7 +32,8 @@ __all__ = [
     'cholesky',
     #       'tensordot',
     'bmm',
-    'histogram'
+    'histogram',
+    'mv'
 ]
 
 
@@ -920,3 +921,64 @@ def histogram(input, bins=100, min=0, max=0):
                'min': min,
                'max': max})
     return out
+
+
+def mv(x, vec, name=None):
+    """
+    Performs a matrix-vector product of the matrix x and the vector vec.
+
+    Args:
+        x (Variable): A tensor with shape :math:`[M, N]` , The data type of the input Tensor x
+            should be one of float32, float64.
+        vec (Variable): A tensor with shape :math:`[N]` , The data type of the input Tensor x
+            should be one of float32, float64.
+        name(str, optional): The default value is None.  Normally there is no need for user to set this
+            property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: The tensor which is producted by x and vec.
+
+    Examples:
+        .. code-block:: python
+
+            # x: [M, N], vec: [N]
+            # paddle.mv(x, vec)  # out: [M]
+
+            import numpy as np
+            import paddle
+
+            paddle.disable_static()
+            x_data = np.array([[2, 1, 3], [3, 0, 1]]).astype("float64")
+            x = paddle.to_tensor(x_data)
+            vec_data = np.array([3, 5, 1])
+            vec = paddle.to_tensor(vec_data).astype("float64")
+            out = paddle.mv(x, vec)
+            paddle.enable_static()
+    """
+    if in_dygraph_mode():
+        out = core.ops.mv(x, vec)
+        return out
+
+    def __check_input(x, vec):
+        var_names = {'x': x, 'vec': vec}
+        for name, val in var_names.items():
+            check_variable_and_dtype(val, name, ['float32', 'float64'], 'mv')
+        x_shape = list(x.shape)
+        vec_shape = list(vec.shape)
+        if len(x_shape) != 2:
+            raise ValueError(
+                "x should be 2-dimensional. But received x's dimention: {}".
+                format(x_shape))
+        if len(vec_shape) != 1:
+            raise ValueError(
+                "vec should be 1-dimensional. But received vec's dimention: {}".
+                format(vec_shape))
+
+    __check_input(x, vec)
+
+    helper = LayerHelper('mv', **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='mv', inputs={'X': x,
+                           'Vec': vec}, outputs={'Out': out})
+    return out
-- 
GitLab


From 3fe176df350168992dd88d0b8755dd1e70ea9f8f Mon Sep 17 00:00:00 2001
From: guofei <52460041+gfwm2013@users.noreply.github.com>
Date: Mon, 21 Sep 2020 23:35:19 +0800
Subject: [PATCH 161/261] Fix test_gast_with_compatibility.py due to the
 problem of gast in python3.8 (#27433)

test=develop
---
 .../unittests/test_gast_with_compatibility.py | 96 +++++++++++--------
 1 file changed, 55 insertions(+), 41 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py b/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py
index c7476a8a742..c176ff09e02 100644
--- a/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py
+++ b/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import ast
 import gast
+import sys
 import textwrap
 import unittest
 
@@ -143,47 +144,60 @@ class TestPythonCompatibility(unittest.TestCase):
         """
         self._check_compatibility(source, target)
 
-    def test_with(self):
-        """
-        The fileds `context_expr/optional_vars` of `ast.With` in PY2
-        is moved into `ast.With.items.withitem` in PY3.
-        """
-        source = """
-        with guard():
-            a = 1
-        """
-        target = """
-        with guard_new():
-            a = 1
-        """
-        self._check_compatibility(source, target)
-
-    def test_subscript_Index(self):
-        source = """
-            x = y()[10]
-        """
-        target = """
-            x = y()[20]
-        """
-        self._check_compatibility(source, target)
-
-    def test_subscript_Slice(self):
-        source = """
-            x = y()[10:20]
-        """
-        target = """
-            x = y()[20:40]
-        """
-        self._check_compatibility(source, target)
-
-    def test_call(self):
-        source = """
-            y = foo(*arg)
-        """
-        target = """
-            y = foo(*arg_new)
-        """
-        self._check_compatibility(source, target)
+    # The 0.3.3 version of gast has a bug in python3.8 that
+    # would cause the following tests to fail. But this 
+    # problem doesn't affect the use of Paddle's related 
+    # functions, therefore, the following tests would be 
+    # disable in python3.8.
+    #
+    # This problem had been fixed and updated to version 
+    # 0.4.1 of gast.
+    #
+    # More information please refer to:
+    # https://github.com/serge-sans-paille/gast/issues/49
+    if sys.version_info < (3, 8):
+
+        def test_with(self):
+            """
+            The fileds `context_expr/optional_vars` of `ast.With` in PY2
+            is moved into `ast.With.items.withitem` in PY3.
+            """
+            source = """
+            with guard():
+                a = 1
+            """
+            target = """
+            with guard_new():
+                a = 1
+            """
+            self._check_compatibility(source, target)
+
+        def test_subscript_Index(self):
+            source = """
+                x = y()[10]
+            """
+            target = """
+                x = y()[20]
+            """
+            self._check_compatibility(source, target)
+
+        def test_subscript_Slice(self):
+            source = """
+                x = y()[10:20]
+            """
+            target = """
+                x = y()[20:40]
+            """
+            self._check_compatibility(source, target)
+
+        def test_call(self):
+            source = """
+                y = foo(*arg)
+            """
+            target = """
+                y = foo(*arg_new)
+            """
+            self._check_compatibility(source, target)
 
 
 if __name__ == '__main__':
-- 
GitLab


From 81823370962df89d6ecf8fd73f5a27d31ad2d3de Mon Sep 17 00:00:00 2001
From: Pei Yang <peiyang@baidu.com>
Date: Tue, 22 Sep 2020 10:03:58 +0800
Subject: [PATCH 162/261] clear pass logs (#27434)

---
 paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc  | 3 ++-
 paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
index 23f794c11c2..9f6032ffa5b 100644
--- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
@@ -176,7 +176,8 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern,
               return false;
             }
             if (x->IsVar() && x->Var() && x->Var()->GetShape().size() > 2) {
-              LOG(WARNING) << "repeated fc relu only supports input dims = 2";
+              VLOG(3) << "repeated fc relu only supports input dims = 2, so it "
+                         "is not applied.";
               return false;
             }
             int fc_idx = FindFCIdx(x);
diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
index 74ba0093a17..8bdf3940928 100644
--- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
+++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
@@ -35,8 +35,6 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const {
   const std::string pattern_name = "shufflechannel_pattern";
   FusePassBase::Init(pattern_name, graph);
 
-  LOG(WARNING) << "There is fluid.layers.shuffle_channel API already, you can "
-                  "use it instead of (reshape + transpose +reshape)";
   GraphPatternDetector gpd;
   auto* x = gpd.mutable_pattern()
                 ->NewNode("x")
@@ -85,6 +83,9 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const {
     // Delete the unneeded nodes.
     GraphSafeRemoveNodes(graph, {reshape1_op, reshape1_out, transpose_op,
                                  transpose_out, reshape2_op});
+    LOG_FIRST_N(WARNING, 1)
+        << "There is fluid.layers.shuffle_channel API already, maybe you can "
+           "use it instead of (reshape + transpose + reshape)";
   };
 
   gpd(graph, handler);
-- 
GitLab


From afe94903c31c9ca66f23bc54166b160192935e2a Mon Sep 17 00:00:00 2001
From: Wilber <jiweibo@baidu.com>
Date: Tue, 22 Sep 2020 10:22:12 +0800
Subject: [PATCH 163/261] Rename fluid_inference to paddle_inference. (#27422)

---
 cmake/inference_lib.cmake                     | 54 +++++++++----------
 go/README_cn.md                               |  2 +-
 paddle/fluid/inference/api/demo_ci/run.sh     |  2 +-
 .../api/demo_ci/run_windows_demo.bat          |  2 +-
 paddle/fluid/train/demo/README.md             |  4 +-
 paddle/fluid/train/demo/run.sh                |  4 +-
 paddle/fluid/train/imdb_demo/README.md        |  4 +-
 paddle/scripts/paddle_build.bat               |  6 +--
 paddle/scripts/paddle_build.sh                | 18 +++----
 paddle/scripts/windows_build/build.bat        |  4 +-
 10 files changed, 50 insertions(+), 50 deletions(-)

diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 20f27715e00..e3c2409f103 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -13,11 +13,11 @@
 # limitations under the License.
 
 # make package for paddle fluid shared and static library
-set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
-  "A path setting fluid shared and static libraries")
+set(PADDLE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_install_dir" CACHE STRING
+  "A path setting paddle shared and static libraries")
 
-set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING
-  "A path setting fluid inference shared and static libraries")
+set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_dir" CACHE STRING
+  "A path setting paddle inference shared and static libraries")
   
 # TODO(zhaolong)
 # At present, the size of static lib in Windows exceeds the system limit,
@@ -142,14 +142,14 @@ set(inference_lib_deps third_party paddle_fluid paddle_fluid_c paddle_fluid_shar
 add_custom_target(inference_lib_dist DEPENDS ${inference_lib_deps})
 
 
-set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/threadpool")
+set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/threadpool")
 copy(inference_lib_dist
         SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h
         DSTS ${dst_dir})
 
 # Only GPU need cudaErrorMessage.pb
 IF(WITH_GPU)
-        set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data")
+        set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data")
         copy(inference_lib_dist
                 SRCS ${cudaerror_INCLUDE_DIR}
                 DSTS ${dst_dir})
@@ -158,9 +158,9 @@ ENDIF()
 # CMakeCache Info
 copy(inference_lib_dist
         SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
-        DSTS ${FLUID_INFERENCE_INSTALL_DIR})
+        DSTS ${PADDLE_INFERENCE_INSTALL_DIR})
 
-copy_part_of_thrid_party(inference_lib_dist ${FLUID_INFERENCE_INSTALL_DIR})
+copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR})
 
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 if(WIN32)
@@ -177,39 +177,39 @@ endif(WIN32)
 if(WIN32 AND NOT WITH_STATIC_LIB)
         copy(inference_lib_dist
                 SRCS  ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
-                DSTS  ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib
-                      ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib)
+                DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib
+                      ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
 else()
         copy(inference_lib_dist
                 SRCS  ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
-                DSTS  ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib)
+                DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
 endif()
 
 copy(inference_lib_dist
         SRCS  ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
-        DSTS  ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/internal)
+        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/internal)
 copy(inference_lib_dist
         SRCS  ${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io/crypto/cipher.h
-        DSTS  ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/crypto/)
+        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/crypto/)
 include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io)
 
 # CAPI inference library for only inference
-set(FLUID_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_c_install_dir" CACHE STRING
-"A path setting CAPI fluid inference shared")
-copy_part_of_thrid_party(inference_lib_dist ${FLUID_INFERENCE_C_INSTALL_DIR})
+set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING
+"A path setting CAPI paddle inference shared")
+copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_C_INSTALL_DIR})
 
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_fluid_c.*)
 
 copy(inference_lib_dist
       SRCS  ${src_dir}/inference/capi/paddle_c_api.h  ${paddle_fluid_c_lib}
-      DSTS  ${FLUID_INFERENCE_C_INSTALL_DIR}/paddle/include ${FLUID_INFERENCE_C_INSTALL_DIR}/paddle/lib)
+      DSTS  ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib)
 
 # fluid library for both train and inference
 set(fluid_lib_deps inference_lib_dist)
 add_custom_target(fluid_lib_dist ALL DEPENDS ${fluid_lib_deps})
 
-set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
+set(dst_dir "${PADDLE_INSTALL_DIR}/paddle/fluid")
 set(module "inference")
 if(WIN32 AND NOT WITH_STATIC_LIB)
         copy(fluid_lib_dist
@@ -273,22 +273,22 @@ copy(fluid_lib_dist
         DSTS ${dst_dir}/${module}
         )
 
-set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3")
+set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/eigen3")
 copy(inference_lib_dist
         SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
         DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported)
 
-set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost")
+set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/boost")
 copy(inference_lib_dist
         SRCS ${BOOST_INCLUDE_DIR}/boost
         DSTS ${dst_dir})
 
-set(dst_dir "${FLUID_INSTALL_DIR}/third_party/dlpack")
+set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/dlpack")
 copy(inference_lib_dist
         SRCS ${DLPACK_INCLUDE_DIR}/dlpack
         DSTS ${dst_dir})
 
-set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
+set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/install/zlib")
 copy(inference_lib_dist
         SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
         DSTS ${dst_dir} ${dst_dir}/lib)
@@ -296,8 +296,8 @@ copy(inference_lib_dist
 
 # CMakeCache Info
 copy(fluid_lib_dist
-        SRCS ${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
-        DSTS ${FLUID_INSTALL_DIR} ${FLUID_INSTALL_DIR}
+        SRCS ${PADDLE_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
+        DSTS ${PADDLE_INSTALL_DIR} ${PADDLE_INSTALL_DIR}
         )
 
 # paddle fluid version
@@ -323,6 +323,6 @@ function(version version_file)
     endif()
     
 endfunction()
-version(${FLUID_INSTALL_DIR}/version.txt)
-version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt)
-version(${FLUID_INFERENCE_C_INSTALL_DIR}/version.txt)
+version(${PADDLE_INSTALL_DIR}/version.txt)
+version(${PADDLE_INFERENCE_INSTALL_DIR}/version.txt)
+version(${PADDLE_INFERENCE_C_INSTALL_DIR}/version.txt)
diff --git a/go/README_cn.md b/go/README_cn.md
index 57af05ce0af..8ffc31adf85 100644
--- a/go/README_cn.md
+++ b/go/README_cn.md
@@ -1,7 +1,7 @@
 # Paddle 预测golang API
 
 ## 安装
-首先cmake编译时打开`-DON_INFER=ON`,在编译目录下得到``fluid_inference_c_install_dir``,将该目录移动到当前目录中并重命名为`paddle_c`
+首先cmake编译时打开`-DON_INFER=ON`,在编译目录下得到``paddle_inference_c_install_dir``,将该目录移动到当前目录中并重命名为`paddle_c`
 
 ## 在Go中使用Paddle预测
 首先创建预测配置
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index d8d9e218781..6b7fb0f619a 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -7,7 +7,7 @@ DATA_DIR=$4 # dataset
 TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, default to /usr/local/TensorRT/include
 TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib
 
-inference_install_dir=${PADDLE_ROOT}/build/fluid_inference_install_dir
+inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir
 
 cd `dirname $0`
 current_dir=`pwd`
diff --git a/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat b/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat
index 5199b83413a..523dafa6649 100644
--- a/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat
+++ b/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat
@@ -21,7 +21,7 @@ if /i "%use_mkl%"=="N" (
 )
 
 :set_paddle_infernece_lib
-SET /P paddle_infernece_lib="Please input the path of paddle inference library, such as D:\fluid_inference_install_dir   =======>"
+SET /P paddle_infernece_lib="Please input the path of paddle inference library, such as D:\paddle_inference_install_dir   =======>"
 set tmp_var=!paddle_infernece_lib!
 call:remove_space
 set paddle_infernece_lib=!tmp_var!
diff --git a/paddle/fluid/train/demo/README.md b/paddle/fluid/train/demo/README.md
index bd53ab4b0c0..8a44c25aea9 100644
--- a/paddle/fluid/train/demo/README.md
+++ b/paddle/fluid/train/demo/README.md
@@ -7,7 +7,7 @@
 # WITH_MKLDNN=ON|OFF
 
 PADDLE_LIB=/paddle/lib/dir
-cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \
+cmake .. -DPADDLE_INSTALL_DIR=$PADDLE_LIB \
          -DCMAKE_BUILD_TYPE=Release \
          -DWITH_GPU=OFF \
          -DWITH_STYLE_CHECK=OFF \
@@ -41,7 +41,7 @@ cd build
 # WITH_MKLDNN=ON|OFF
 PADDLE_LIB=/paddle/lib/dir
 
-# PADDLE_LIB is the same with FLUID_INSTALL_DIR when building the lib
+# PADDLE_LIB is the same with PADDLE_INSTALL_DIR when building the lib
 cmake .. -DPADDLE_LIB=$PADDLE_LIB \
          -DWITH_MKLDNN=OFF \
          -DWITH_MKL=OFF
diff --git a/paddle/fluid/train/demo/run.sh b/paddle/fluid/train/demo/run.sh
index f7efb3b3b7d..2955e7574da 100755
--- a/paddle/fluid/train/demo/run.sh
+++ b/paddle/fluid/train/demo/run.sh
@@ -14,12 +14,12 @@ function download() {
 download
 
 # build demo trainer
-fluid_install_dir=${PADDLE_ROOT}/build/fluid_install_dir
+paddle_install_dir=${PADDLE_ROOT}/build/paddle_install_dir
 
 mkdir -p build
 cd build
 rm -rf *
-cmake .. -DPADDLE_LIB=$fluid_install_dir \
+cmake .. -DPADDLE_LIB=$paddle_install_dir \
          -DWITH_MKLDNN=$TURN_ON_MKL \
          -DWITH_MKL=$TURN_ON_MKL
 make
diff --git a/paddle/fluid/train/imdb_demo/README.md b/paddle/fluid/train/imdb_demo/README.md
index ecc985e13f8..28fd66710f8 100644
--- a/paddle/fluid/train/imdb_demo/README.md
+++ b/paddle/fluid/train/imdb_demo/README.md
@@ -11,7 +11,7 @@ PADDLE_ROOT=./Paddle
 cd Paddle
 mkdir build
 cd build
-cmake -DFLUID_INFERENCE_INSTALL_DIR=$PADDLE_ROOT \
+cmake -DPADDLE_INFERENCE_INSTALL_DIR=$PADDLE_ROOT \
       -DCMAKE_BUILD_TYPE=Release \
       -DWITH_PYTHON=OFF \
       -DWITH_MKL=OFF \
@@ -40,7 +40,7 @@ see: [IMDB Dataset of 50K Movie Reviews | Kaggle](https://www.kaggle.com/lakshmi
     mkdir build
     cd build
     rm -rf *
-    PADDLE_LIB=path/to/Paddle/build/fluid_install_dir
+    PADDLE_LIB=path/to/Paddle/build/paddle_install_dir
     cmake .. -DPADDLE_LIB=$PADDLE_LIB  -DWITH_MKLDNN=OFF -DWITH_MKL=OFF
     make
 ```
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 9e150763dbb..99450d1e15c 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -213,10 +213,10 @@ echo    ========================================
 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%#
 set end=%end:~4,10%
 call :timestamp "%start%" "%end%" "Build"
-tree /F %cd%\fluid_inference_install_dir\paddle
-%cache_dir%\tools\busybox64.exe du -h -d 0 %cd%\fluid_inference_install_dir\paddle\lib > lib_size.txt
+tree /F %cd%\paddle_inference_install_dir\paddle
+%cache_dir%\tools\busybox64.exe du -h -d 0 %cd%\paddle_inference_install_dir\paddle\lib > lib_size.txt
 set /p libsize=< lib_size.txt
-for /F %%i in ("%libsize%") do echo "Windows FLuid_Inference Size: %%i"
+for /F %%i in ("%libsize%") do echo "Windows Paddle_Inference Size: %%i"
 %cache_dir%\tools\busybox64.exe du -h -d 0 %cd%\python\dist > whl_size.txt
 set /p whlsize=< whl_size.txt
 for /F %%i in ("%whlsize%") do echo "Windows PR whl Size: %%i"
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index ac89116fc49..f87925056ff 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -362,12 +362,12 @@ function build_size() {
     Calculate /paddle/build size and PR whl size
     ============================================
 EOF
-    if [ "$1" == "fluid_inference" ]; then
+    if [ "$1" == "paddle_inference" ]; then
         cd ${PADDLE_ROOT}/build
-        cp -r fluid_inference_install_dir fluid_inference
-        tar -czf fluid_inference.tgz fluid_inference
-        buildSize=$(du -h --max-depth=0 ${PADDLE_ROOT}/build/fluid_inference.tgz |awk '{print $1}')
-        echo "FLuid_Inference Size: $buildSize"
+        cp -r paddle_inference_install_dir paddle_inference
+        tar -czf paddle_inference.tgz paddle_inference
+        buildSize=$(du -h --max-depth=0 ${PADDLE_ROOT}/build/paddle_inference.tgz |awk '{print $1}')
+        echo "Paddle_Inference Size: $buildSize"
     else
         SYSTEM=`uname -s`
         if [ "$SYSTEM" == "Darwin" ]; then
@@ -1446,7 +1446,7 @@ EOF
     fi
     endTime_s=`date +%s`
     echo "Build Time: $[ $endTime_s - $startTime_s ]s"
-    build_size "fluid_inference"
+    build_size "paddle_inference"
 }
 
 function tar_fluid_lib() {
@@ -1456,10 +1456,10 @@ function tar_fluid_lib() {
     ========================================
 EOF
     cd ${PADDLE_ROOT}/build
-    cp -r fluid_install_dir fluid
+    cp -r paddle_install_dir fluid
     tar -czf fluid.tgz fluid
-    cp -r fluid_inference_install_dir fluid_inference
-    tar -czf fluid_inference.tgz fluid_inference
+    cp -r paddle_inference_install_dir paddle_inference
+    tar -czf paddle_inference.tgz paddle_inference
 }
 
 function test_fluid_lib() {
diff --git a/paddle/scripts/windows_build/build.bat b/paddle/scripts/windows_build/build.bat
index 65d44877d12..6f99c23ccd2 100644
--- a/paddle/scripts/windows_build/build.bat
+++ b/paddle/scripts/windows_build/build.bat
@@ -118,8 +118,8 @@ call:Build
 echo PACKAGE INFERENCE LIBRARY
 
 mkdir inference_dist
-%PYTHON_DIR%\python.exe -c "import shutil;shutil.make_archive('inference_dist/fluid_inference_install_dir', 'zip', root_dir='fluid_inference_install_dir')"
-%PYTHON_DIR%\python.exe -c "import shutil;shutil.make_archive('inference_dist/fluid_install_dir', 'zip', root_dir='fluid_install_dir')"
+%PYTHON_DIR%\python.exe -c "import shutil;shutil.make_archive('inference_dist/paddle_inference_install_dir', 'zip', root_dir='paddle_inference_install_dir')"
+%PYTHON_DIR%\python.exe -c "import shutil;shutil.make_archive('inference_dist/paddle_install_dir', 'zip', root_dir='paddle_install_dir')"
 
 echo BUILD INFERENCE LIBRARY COMPLETE
 goto :END
-- 
GitLab


From f4c750d721a1226738bea382f6c0cf725cca8481 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Tue, 22 Sep 2020 10:28:42 +0800
Subject: [PATCH 164/261] Add the cpu version of segment sum mean max min op

Add the cpu version of segment sum mean max min op
---
 paddle/fluid/operators/CMakeLists.txt         |   2 +-
 paddle/fluid/operators/math/CMakeLists.txt    |   1 +
 .../fluid/operators/math/segment_pooling.cc   | 148 +++++++++++++
 paddle/fluid/operators/math/segment_pooling.h |  46 ++++
 paddle/fluid/operators/segment_pool_op.cc     | 166 ++++++++++++++
 paddle/fluid/operators/segment_pool_op.h      | 130 +++++++++++
 .../fluid/tests/unittests/test_segment_ops.py | 202 ++++++++++++++++++
 7 files changed, 694 insertions(+), 1 deletion(-)
 create mode 100644 paddle/fluid/operators/math/segment_pooling.cc
 create mode 100644 paddle/fluid/operators/math/segment_pooling.h
 create mode 100644 paddle/fluid/operators/segment_pool_op.cc
 create mode 100644 paddle/fluid/operators/segment_pool_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_segment_ops.py

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index f0a04d850df..53e6f4aa6e4 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -92,7 +92,7 @@ cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEP
 
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows
 lod_tensor maxouting unpooling pooling lod_rank_table context_project
-sequence_pooling executor device_memory_aligment generator)
+sequence_pooling segment_pooling executor device_memory_aligment generator)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse)
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 10d335b828b..24ed4fcf668 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -76,6 +76,7 @@ math_library(prelu)
 math_library(bert_encoder_functor)
 math_library(tree2col DEPS math_function)
 math_library(matrix_inverse)
+math_library(segment_pooling)
 
 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function)
 cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
diff --git a/paddle/fluid/operators/math/segment_pooling.cc b/paddle/fluid/operators/math/segment_pooling.cc
new file mode 100644
index 00000000000..3c77d3d4cf8
--- /dev/null
+++ b/paddle/fluid/operators/math/segment_pooling.cc
@@ -0,0 +1,148 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/segment_pooling.h"
+#include <string>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T, typename IndexT>
+class SegmentPoolFunctor<platform::CPUDeviceContext, T, IndexT> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& segments, framework::Tensor* output,
+                  framework::Tensor* index,
+                  const std::string pooltype = "SUM") {
+    const IndexT* segment_ids = segments.data<IndexT>();
+    auto curent_id = segment_ids[0];
+    int64_t last_idx = 0;
+    int64_t w = input.numel() / input.dims()[0];
+    auto& place = *context.eigen_device();
+    for (int64_t idx = 1; idx <= segments.numel(); ++idx) {
+      if (idx < segments.numel()) {
+        if (segment_ids[idx] == curent_id) continue;
+        PADDLE_ENFORCE_GE(segment_ids[idx], curent_id,
+                          platform::errors::InvalidArgument(
+                              "The segment ids should be sorted, but got "
+                              "segment_ids[%d]:%d > segment_ids[%d]:%d.",
+                              idx - 1, curent_id, idx, segment_ids[idx]));
+      }
+
+      Tensor out_t = output->Slice(curent_id, curent_id + 1);
+      Tensor in_t = input.Slice(last_idx, idx);
+
+      int64_t h = idx - last_idx;
+      auto in_e =
+          framework::EigenMatrix<T>::From(in_t, framework::make_ddim({h, w}));
+      auto out_e = framework::EigenVector<T>::Flatten(out_t);
+
+      auto reduce_dim = Eigen::array<int, 1>({{0}});
+      if (pooltype == "MEAN") {
+        out_e.device(place) = in_e.mean(reduce_dim);
+      } else if (pooltype == "SUM") {
+        out_e.device(place) = in_e.sum(reduce_dim);
+      } else if (pooltype == "MAX") {
+        out_e.device(place) = in_e.maximum(reduce_dim);
+      } else if (pooltype == "MIN") {
+        out_e.device(place) = in_e.minimum(reduce_dim);
+      } else {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Unsupported segment pooling type, only MEAN, SUM, MAX, MIN "
+            "available, but got %s.",
+            pooltype));
+      }
+
+      last_idx = idx;
+      if (idx < segments.numel()) curent_id = segment_ids[idx];
+    }
+  }
+};
+
+template <typename T, typename IndexT>
+class SegmentPoolGradFunctor<platform::CPUDeviceContext, T, IndexT> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& out_grad,
+                  const framework::Tensor& segments, framework::Tensor* in_grad,
+                  const framework::Tensor* index = nullptr,
+                  const std::string pooltype = "SUM") {
+    const IndexT* segment_ids = segments.data<IndexT>();
+    auto& place = *context.eigen_device();
+    auto curent_id = segment_ids[0];
+    int64_t last_idx = 0;
+    int64_t w = in_grad->numel() / in_grad->dims()[0];
+    for (int64_t idx = 1; idx <= segments.numel(); ++idx) {
+      if (idx < segments.numel()) {
+        if (segment_ids[idx] == curent_id) continue;
+        PADDLE_ENFORCE_GE(segment_ids[idx], curent_id,
+                          platform::errors::InvalidArgument(
+                              "The segment ids should be sorted, but got "
+                              "segment_ids[%d]:%d > segment_ids[%d]:%d.",
+                              idx - 1, curent_id, idx, segment_ids[idx]));
+      }
+
+      Tensor out_g_t = out_grad.Slice(curent_id, curent_id + 1);
+      Tensor in_g_t = in_grad->Slice(last_idx, idx);
+
+      int64_t h = idx - last_idx;
+      auto in_g_e = framework::EigenMatrix<T>::From(in_g_t, {h, w});
+      auto out_g_e = framework::EigenMatrix<T>::From(out_g_t, {1, w});
+      Eigen::DSizes<int, 2> bcast(h, 1);
+
+      if (pooltype == "MEAN") {
+        in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
+      } else if (pooltype == "SUM") {
+        in_g_e.device(place) = out_g_e.broadcast(bcast);
+      } else if (pooltype == "MAX" || pooltype == "MIN") {
+        Tensor out_t = output.Slice(curent_id, curent_id + 1);
+        Tensor in_t = input.Slice(last_idx, idx);
+        auto in_e = framework::EigenMatrix<T>::From(in_t, {h, w});
+        auto out_e = framework::EigenMatrix<T>::From(out_t, {1, w});
+        in_g_e.device(place) =
+            (in_e == out_e.broadcast(bcast)).template cast<T>() *
+            out_g_e.broadcast(bcast);
+      } else {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Unsupported segment pooling type, only MEAN, SUM, MAX, MIN "
+            "available, but got %s.",
+            pooltype));
+      }
+
+      last_idx = idx;
+      if (idx < segments.numel()) curent_id = segment_ids[idx];
+    }
+  }
+};
+
+using CPU = platform::CPUDeviceContext;
+template class SegmentPoolFunctor<CPU, float, int>;
+template class SegmentPoolFunctor<CPU, float, int64_t>;
+template class SegmentPoolFunctor<CPU, double, int>;
+template class SegmentPoolFunctor<CPU, double, int64_t>;
+template class SegmentPoolGradFunctor<CPU, float, int>;
+template class SegmentPoolGradFunctor<CPU, float, int64_t>;
+template class SegmentPoolGradFunctor<CPU, double, int>;
+template class SegmentPoolGradFunctor<CPU, double, int64_t>;
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/segment_pooling.h b/paddle/fluid/operators/math/segment_pooling.h
new file mode 100644
index 00000000000..561fad6921f
--- /dev/null
+++ b/paddle/fluid/operators/math/segment_pooling.h
@@ -0,0 +1,46 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T, typename IndexT>
+class SegmentPoolFunctor {
+ public:
+  /* mean pool has summed_ids output */
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& segments, framework::Tensor* output,
+                  framework::Tensor* summed_ids = nullptr,
+                  const std::string pooltype = "SUM");
+};
+
+template <typename DeviceContext, typename T, typename IndexT>
+class SegmentPoolGradFunctor {
+ public:
+  /* mean pool has summed_ids output */
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& out_grad,
+                  const framework::Tensor& segments, framework::Tensor* in_grad,
+                  const framework::Tensor* summed_ids = nullptr,
+                  const std::string pooltype = "SUM");
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/segment_pool_op.cc b/paddle/fluid/operators/segment_pool_op.cc
new file mode 100644
index 00000000000..322cd97f01c
--- /dev/null
+++ b/paddle/fluid/operators/segment_pool_op.cc
@@ -0,0 +1,166 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/segment_pool_op.h"
+#include <memory>
+#include <string>
+
+namespace paddle {
+namespace operators {
+
+class SegmentPoolOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SegmentPool");
+    OP_INOUT_CHECK(ctx->HasInput("SegmentIds"), "Input", "SegmentIds",
+                   "SegmentPool");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SegmentPool");
+    auto dims = ctx->GetInputDim("X");
+    dims[0] = -1;
+    ctx->SetOutputDim("Out", dims);
+
+    if (ctx->Attrs().Get<std::string>("pooltype") == "MEAN") {
+      OP_INOUT_CHECK(ctx->HasOutput("SummedIds"), "Output", "SummedIds",
+                     "SegmentPool");
+      ctx->SetOutputDim("SummedIds", {-1, 1});
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
+        ctx.device_context());
+  }
+};
+
+class SegmentPoolOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) The input data of SegmentPoolOp");
+    AddInput("SegmentIds",
+             "(Tensor) 1-D tensor which have the same size with the fist "
+             "dimension of input X.");
+    AddOutput("Out", "(Tensor) The output of SegmentPoolOp.");
+    AddOutput("SummedIds",
+              "(Tensor) This tensor is used to counts of segment ids for the "
+              "backward of the mean pool.")
+        .AsIntermediate();
+    AddAttr<std::string>(
+        "pooltype",
+        "(string, default 'SUM') the pooling type of SegmentPoolOp.")
+        .SetDefault("SUM")
+        .InEnum({"SUM", "MEAN", "MIN", "MAX"});
+    AddComment(R"DOC(
+Segment Pool Operator.
+
+This operator will pool the elements of input `X` which with the same index
+in `SegmentIds`.
+
+For SUM operation, it computes a tensor such that $Out_i = \sum_{j} X_{j}$
+where sum is over j such that `SegmentIds[j] == i`.
+
+For MEAN operation, it computes a tensor such that
+$Out_i = \frac{1}{n_i}  \sum_{j} X_{j}$ where sum is over j such that
+`SegmentIds[j] == i` and $n_i$ is the number of all index `SegmentIds[j] == i`.
+
+For MIN operation, it computes a tensor such that $Out_i = \min_{j} X_{j}$
+where min is over j such that `SegmentIds[j] == i`.
+
+For MAX operation, it computes a tensor such that $Out_i = \max_{j} X_{j}$
+where max is over j such that `SegmentIds[j] == i`.
+    )DOC");
+  }
+};
+
+class SegmentPoolGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   framework::GradVarName("Out"), "SegmentPoolGrad");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SegmentPoolGrad");
+    auto og_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    auto x_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(og_dims.size(), x_dims.size(),
+                      platform::errors::InvalidArgument(
+                          "The rank of output grad must equal to Input(X). But "
+                          "received: input rank %u, input shape [%s].",
+                          og_dims.size(), og_dims));
+    for (int64_t i = 1; i < og_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(
+          og_dims[i], x_dims[i],
+          platform::errors::InvalidArgument(
+              "The dimension mismatch between Input(OUT@GRAD) and "
+              "Input(X). Received Input(OUT@GRAD): input rank %u, "
+              "input shape [%s]; received Input(X): input rank %u, "
+              "input shape [%s].",
+              og_dims.size(), og_dims, x_dims.size(), x_dims));
+    }
+
+    ctx->ShareDim("X", /*->*/ framework::GradVarName("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.device_context());
+  }
+};
+
+template <typename T>
+class SegmentPoolGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op_desc_ptr) const override {
+    op_desc_ptr->SetType("segment_pool_grad");
+    op_desc_ptr->SetInput("X", this->Input("X"));
+    op_desc_ptr->SetInput("SegmentIds", this->Input("SegmentIds"));
+    op_desc_ptr->SetInput("Out", this->Output("Out"));
+    if (BOOST_GET_CONST(std::string, this->GetAttr("pooltype")) == "MEAN") {
+      op_desc_ptr->SetInput("SummedIds", this->Output("SummedIds"));
+    }
+    op_desc_ptr->SetInput(framework::GradVarName("Out"),
+                          this->OutputGrad("Out"));
+    op_desc_ptr->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op_desc_ptr->SetAttrMap(this->Attrs());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(segment_pool, ops::SegmentPoolOp, ops::SegmentPoolOpMaker,
+                  ops::SegmentPoolGradOpMaker<paddle::framework::OpDesc>,
+                  ops::SegmentPoolGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(segment_pool_grad, ops::SegmentPoolGradOp);
+
+REGISTER_OP_CPU_KERNEL(
+    segment_pool,
+    ops::SegmentPoolKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SegmentPoolKernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_CPU_KERNEL(
+    segment_pool_grad,
+    ops::SegmentPoolGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SegmentPoolGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h
new file mode 100644
index 00000000000..a505946b9f5
--- /dev/null
+++ b/paddle/fluid/operators/segment_pool_op.h
@@ -0,0 +1,130 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/segment_pooling.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T, typename IndexT>
+void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) {
+  auto* input = context.Input<Tensor>("X");
+  auto* segment = context.Input<Tensor>("SegmentIds");
+  auto* output = context.Output<Tensor>("Out");
+  std::string pooltype = context.Attr<std::string>("pooltype");
+  Tensor* summed_ids = nullptr;
+
+  int64_t num_indices = segment->numel();
+  PADDLE_ENFORCE_EQ(
+      num_indices, input->dims()[0],
+      platform::errors::InvalidArgument(
+          "Segment_ids should be the same size as dimension 0 of input X."));
+  PADDLE_ENFORCE_EQ(num_indices, segment->dims()[0],
+                    platform::errors::InvalidArgument(
+                        "Segment_ids should be 1-D tensor, or it's other "
+                        "dimension size is 1. Segment_ids's shape is: [%s].",
+                        segment->dims()));
+
+  if (input->numel() == 0 || segment->numel() == 0) {
+    return;
+  }
+
+  bool cpu_place = context.GetPlace().type() == typeid(platform::CPUPlace);
+  if (cpu_place) {
+    auto dims = input->dims();
+    auto* segment_ids = segment->data<IndexT>();
+    dims[0] = static_cast<int64_t>(segment_ids[segment->numel() - 1] + 1);
+    PADDLE_ENFORCE_GT(
+        dims[0], 0,
+        platform::errors::InvalidArgument(
+            "Segment ids must be >= 0, but got last id %d", dims[0]));
+    output->Resize({dims});
+    output->mutable_data<T>(context.GetPlace());
+    math::SetConstant<DeviceContext, T> set_zero;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    set_zero(dev_ctx, output, static_cast<T>(0));
+  }
+
+  SegmentPoolFunctor<DeviceContext, T, IndexT> pool;
+
+  pool(context.template device_context<DeviceContext>(), *input, *segment,
+       output, summed_ids, pooltype);
+}
+
+template <typename DeviceContext, typename T>
+class SegmentPoolKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* segment = context.Input<Tensor>("SegmentIds");
+    auto index_type = segment->type();
+    if (index_type == framework::proto::VarType::INT32) {
+      SegmentKernelLaunchHelper<DeviceContext, T, int>(context);
+    } else if (index_type == framework::proto::VarType::INT64) {
+      SegmentKernelLaunchHelper<DeviceContext, T, int64_t>(context);
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Unsupported index type, Expected int, int64, but got %s.",
+          index_type));
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class SegmentPoolGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Input<Tensor>("Out");
+    auto* segment = context.Input<Tensor>("SegmentIds");
+    auto* out_g = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* in_g = context.Output<Tensor>(framework::GradVarName("X"));
+    std::string pooltype = context.Attr<std::string>("pooltype");
+
+    const Tensor* summed_ids = nullptr;
+    if (pooltype == "MEAN") {
+      summed_ids = context.Input<Tensor>("SummedIds");
+    }
+
+    in_g->mutable_data<T>(context.GetPlace());
+    math::SetConstant<DeviceContext, T> set_zero;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    set_zero(dev_ctx, in_g, static_cast<T>(0));
+
+    auto index_type = segment->type();
+    if (index_type == framework::proto::VarType::INT32) {
+      SegmentPoolGradFunctor<DeviceContext, T, int> pool;
+      pool(context.template device_context<DeviceContext>(), *input, *output,
+           *out_g, *segment, in_g, summed_ids, pooltype);
+    } else if (index_type == framework::proto::VarType::INT64) {
+      SegmentPoolGradFunctor<DeviceContext, T, int64_t> pool;
+      pool(context.template device_context<DeviceContext>(), *input, *output,
+           *out_g, *segment, in_g, summed_ids, pooltype);
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Unsupported index type, Expected int, int64, but got %s.",
+          index_type));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/tests/unittests/test_segment_ops.py b/python/paddle/fluid/tests/unittests/test_segment_ops.py
new file mode 100644
index 00000000000..b58d66676b0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_segment_ops.py
@@ -0,0 +1,202 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+from op_test import OpTest
+
+
+def compute_segment_sum(x, segment_ids):
+    length = segment_ids[-1] + 1
+    target_shape = list(x.shape)
+    target_shape[0] = length
+    results = np.zeros(target_shape, dtype=x.dtype)
+    for index, ids in enumerate(segment_ids):
+        results[ids, :] += x[index, :]
+    return results
+
+
+def compute_segment_mean(x, segment_ids):
+    length = segment_ids[-1] + 1
+    target_shape = list(x.shape)
+    target_shape[0] = length
+    results = np.zeros(target_shape, dtype=x.dtype)
+    count = np.zeros(length, dtype=x.dtype) + 1e-8
+    for index, ids in enumerate(segment_ids):
+        results[ids, :] += x[index, :]
+        count[ids] += 1
+    results = results / count.reshape([-1, 1])
+    return results
+
+
+def compute_segment_min_max(x, segment_ids, pooltype="MAX"):
+    length = segment_ids[-1] + 1
+    target_shape = list(x.shape)
+    target_shape[0] = length
+    gradient = np.zeros_like(x)
+    results = np.zeros(target_shape, dtype=x.dtype)
+    last_idx = 0
+    current_id = segment_ids[0]
+    for idx in range(1, len(segment_ids) + 1):
+        if idx < len(segment_ids):
+            if segment_ids[idx] == current_id:
+                continue
+        sub_x = x[last_idx:idx, :]
+        if pooltype == "MAX":
+            results[current_id] = np.amax(sub_x, axis=0)
+        elif pooltype == "MIN":
+            results[current_id] = np.amin(sub_x, axis=0)
+        else:
+            raise ValueError("Invalid pooltype, only MAX, MIN supported!")
+        gradient[last_idx:idx, :][sub_x == results[current_id]] = 1
+        last_idx = idx
+        if idx < len(segment_ids):
+            current_id = segment_ids[idx]
+
+    return results, gradient / results.size
+
+
+class TestSegmentOps(OpTest):
+    def set_data(self):
+        x = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
+        segment_ids = self.set_segment(len(x), len(x) // 5 + 1)
+        return x, segment_ids
+
+    def set_segment(self, origin_len, reduce_len):
+        segment = np.zeros(reduce_len, dtype='int64')
+        segment = np.random.randint(0, reduce_len, size=[origin_len])
+        segment = np.sort(segment)
+        return segment.astype('int64')
+
+    def compute(self, x, segment_ids):
+        return compute_segment_sum(x, segment_ids)
+
+    def prepare(self):
+        self.op_type = "segment_pool"
+        self.dtype = np.float64
+        self.shape = [30, 15]
+        self.attrs = {"pooltype": "SUM"}
+
+    def setUp(self):
+        self.prepare()
+        x, segment_ids = self.set_data()
+        result = self.compute(x, segment_ids)
+        self.inputs = {
+            'X': x.astype(self.dtype),
+            'SegmentIds': segment_ids.astype(np.int64)
+        }
+        self.outputs = {'Out': result.astype(self.dtype)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestSegmentSum2(TestSegmentOps):
+    def prepare(self):
+        super(TestSegmentSum2, self).prepare()
+        self.shape = [40, 20]
+        self.dtype = np.float32
+
+    def setUp(self):
+        self.prepare()
+        x, segment_ids = self.set_data()
+        result = self.compute(x, segment_ids)
+        self.inputs = {
+            'X': x.astype(self.dtype),
+            'SegmentIds': segment_ids.astype(np.int32)
+        }
+        self.outputs = {'Out': result.astype(self.dtype)}
+
+
+class TestSegmentMax(TestSegmentOps):
+    def compute(self, x, segment_ids):
+        return compute_segment_min_max(x, segment_ids, pooltype="MAX")
+
+    def prepare(self):
+        super(TestSegmentMax, self).prepare()
+        self.shape = [40, 20]
+        self.attrs = {'pooltype': "MAX"}
+
+    def setUp(self):
+        self.prepare()
+        x, segment_ids = self.set_data()
+        result, self.gradient = self.compute(x, segment_ids)
+        self.inputs = {
+            'X': x.astype(self.dtype),
+            'SegmentIds': segment_ids.astype(np.int32)
+        }
+        self.outputs = {'Out': result.astype(self.dtype)}
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out", user_defined_grads=[self.gradient])
+
+
+class TestSegmentMax2(TestSegmentMax):
+    def prepare(self):
+        super(TestSegmentMax2, self).prepare()
+        self.dtype = np.float32
+
+
+class TestSegmentMin(TestSegmentMax):
+    def compute(self, x, segment_ids):
+        return compute_segment_min_max(x, segment_ids, pooltype="MIN")
+
+    def prepare(self):
+        super(TestSegmentMin, self).prepare()
+        self.attrs = {'pooltype': "MIN"}
+
+
+class TestSegmentMin2(TestSegmentMin):
+    def prepare(self):
+        super(TestSegmentMin2, self).prepare()
+        self.dtype = np.float32
+
+
+class TestSegmentMean(TestSegmentOps):
+    def compute(self, x, segment_ids):
+        return compute_segment_mean(x, segment_ids)
+
+    def prepare(self):
+        super(TestSegmentMean, self).prepare()
+        self.shape = [40, 20]
+        self.attrs = {'pooltype': "MEAN"}
+
+    def setUp(self):
+        self.prepare()
+        x, segment_ids = self.set_data()
+        result = self.compute(x, segment_ids)
+        self.inputs = {'X': x, 'SegmentIds': segment_ids}
+        self.outputs = {
+            'Out': result,
+            'SummedIds': compute_segment_sum(
+                np.ones([len(x), 1]).astype(self.dtype), segment_ids)
+        }
+
+
+class TestSegmentMean2(TestSegmentMean):
+    def prepare(self):
+        super(TestSegmentMean2, self).prepare()
+        self.dtype = np.float32
+        self.shape = [30, 20]
+        self.attrs = {'pooltype': "MEAN"}
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab


From 18f2ea66bd8d4c3f9d9f6c59556851316f421e6d Mon Sep 17 00:00:00 2001
From: Kaipeng Deng <dengkaipeng@baidu.com>
Date: Tue, 22 Sep 2020 13:19:18 +0800
Subject: [PATCH 165/261] remove paddle.readers API (#26727)

* remove paddle.readers API. test=develop
---
 python/paddle/__init__.py                              |  1 -
 python/paddle/dataset/flowers.py                       |  2 +-
 python/paddle/fluid/io.py                              |  6 +++---
 .../unittests/test_decoupled_py_reader_data_check.py   |  2 +-
 .../test_imperative_star_gan_with_gradient_penalty.py  |  2 +-
 .../unittests/test_multiprocess_reader_exception.py    |  3 ++-
 .../tests/unittests/test_py_reader_combination.py      |  5 +++--
 python/paddle/io/__init__.py                           | 10 ----------
 python/paddle/reader/__init__.py                       |  2 +-
 python/paddle/tensor/__init__.py                       |  1 -
 python/paddle/tensor/random.py                         |  3 ---
 11 files changed, 12 insertions(+), 25 deletions(-)

diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 1e0dc0e07b4..29e739a0edf 100755
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -204,7 +204,6 @@ from .tensor.math import prod  #DEFINE_ALIAS
 from .tensor.random import standard_normal
 from .tensor.random import normal
 from .tensor.random import uniform  #DEFINE_ALIAS
-from .tensor.random import shuffle  #DEFINE_ALIAS
 from .tensor.random import randn  #DEFINE_ALIAS
 from .tensor.random import rand  #DEFINE_ALIAS
 from .tensor.random import randint  #DEFINE_ALIAS
diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
index 969ad3c922f..bb60c58211c 100644
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -37,7 +37,7 @@ from .common import download
 import tarfile
 import scipy.io as scio
 from paddle.dataset.image import *
-from paddle.reader import *
+from paddle.reader import map_readers, xmap_readers
 from paddle import compat as cpt
 import os
 import numpy as np
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 6e5f7fd035a..fe5b683bdea 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -26,13 +26,13 @@ from functools import reduce
 import numpy as np
 
 import paddle
-import paddle.reader
-from paddle.reader import *
 from paddle.fluid import layers
 from paddle.fluid.executor import Executor, global_scope
 from paddle.fluid.evaluator import Evaluator
 from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable, \
     program_guard, dygraph_not_support
+from paddle.reader import cache, map_readers, buffered, compose, chain, shuffle, \
+    ComposeNotAligned, firstn, xmap_readers, multiprocess_reader
 from .wrapped_decorator import signature_safe_contextmanager
 from paddle.fluid.compiler import CompiledProgram
 from paddle.fluid.log_helper import get_logger
@@ -62,7 +62,7 @@ __all__ = [
     'set_program_state',
     'get_program_parameter',
     'get_program_persistable_vars',
-] + reader.__all__ + paddle.reader.__all__
+] + reader.__all__
 
 _logger = get_logger(
     __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py
index 4d767709ef5..b2cb3141aad 100644
--- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py
+++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py
@@ -37,7 +37,7 @@ class TestClass(unittest.TestCase):
                     low=0, high=9, size=label_shape).astype('int64')
                 yield img, label
 
-        reader = fluid.io.cache(fake_reader)
+        reader = paddle.reader.cache(fake_reader)
         batch_reader = fluid.io.batch(reader, batch_size=batch_size)
 
         places = [fluid.CPUPlace()]
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py
index e94157fa047..1ab37aaed23 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py
@@ -592,7 +592,7 @@ class TestStarGANWithGradientPenalty(unittest.TestCase):
         cfg = Config(place)
 
         dataset = create_mnist_dataset(cfg)
-        dataset = fluid.io.cache(dataset)
+        dataset = paddle.reader.cache(dataset)
 
         static_graph_model = StaticGraphTrainModel(cfg)
         static_loss = []
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py
index 39cb6651a4b..9634f5af30a 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py
@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import paddle
 import paddle.fluid as fluid
-from paddle.fluid.io import multiprocess_reader
+from paddle.reader import multiprocess_reader
 import unittest
 import numpy as np
 import six
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_combination.py b/python/paddle/fluid/tests/unittests/test_py_reader_combination.py
index 2d977caa033..624927d809f 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_combination.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_combination.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import paddle
 import paddle.fluid as fluid
 import unittest
 import numpy as np
@@ -60,8 +61,8 @@ class TestPyReaderCombination(unittest.TestCase):
             py_reader2 = fluid.io.PyReader(
                 feed_list=[image, label], capacity=16, iterable=True)
 
-            reader1 = fluid.io.cache(self.create_reader(self.n1))
-            reader2 = fluid.io.cache(self.create_reader(self.n2))
+            reader1 = paddle.reader.cache(self.create_reader(self.n1))
+            reader2 = paddle.reader.cache(self.create_reader(self.n2))
             py_reader1.decorate_batch_generator(reader1, places=place)
             py_reader2.decorate_batch_generator(reader2, places=place)
 
diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py
index b67779cb2a2..6f0b0f3c9c1 100644
--- a/python/paddle/io/__init__.py
+++ b/python/paddle/io/__init__.py
@@ -31,15 +31,6 @@ __all__ = [
     'set_program_state',
     'load_inference_model',
     'save_inference_model',
-    'batch',
-    'shuffle',
-    'buffered',
-    'cache',
-    'chain',
-    'firstn',
-    'compose',
-    'map_readers',
-    'xmap_readers'
 ]
 
 from ..fluid.io import DataLoader
@@ -47,4 +38,3 @@ from ..fluid.dataloader import Dataset, IterableDataset, BatchSampler, get_worke
         TensorDataset, Sampler, SequenceSampler, RandomSampler, DistributedBatchSampler
 from ..fluid.io import load, save, load_program_state, set_program_state, \
         load_inference_model, save_inference_model, batch
-from ..reader import shuffle, buffered, cache, chain, firstn, compose, map_readers, xmap_readers
diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py
index 29337cf0668..881cfd81314 100644
--- a/python/paddle/reader/__init__.py
+++ b/python/paddle/reader/__init__.py
@@ -66,4 +66,4 @@ An example implementation for multiple item data reader creator:
 import paddle.reader.decorator
 from paddle.reader.decorator import *
 
-__all__ = decorator.__all__
+__all__ = []
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 2df9473c4b2..cec989fba8b 100755
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -171,7 +171,6 @@ from .math import prod  #DEFINE_ALIAS
 from .random import standard_normal
 from .random import normal
 from .random import uniform  #DEFINE_ALIAS
-from .random import shuffle  #DEFINE_ALIAS
 from .random import randn  #DEFINE_ALIAS
 from .random import rand  #DEFINE_ALIAS
 from .random import randint  #DEFINE_ALIAS
diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py
index b38a1d0f5b7..9ffd81995ed 100644
--- a/python/paddle/tensor/random.py
+++ b/python/paddle/tensor/random.py
@@ -21,14 +21,11 @@ from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtyp
 from ..fluid.layers import utils
 import paddle
 
-from ..fluid.io import shuffle  #DEFINE_ALIAS
-
 __all__ = [
     'bernoulli',
     'standard_normal',
     'normal',
     'uniform',
-    'shuffle',
     'randn',
     'rand',
     'randint',
-- 
GitLab


From b7371fa55dab7013edce0c35401837d4c8ffd571 Mon Sep 17 00:00:00 2001
From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Date: Tue, 22 Sep 2020 13:28:44 +0800
Subject: [PATCH 166/261] judge whether remove build dir to accelerate
 compile,test=develop (#27334)

---
 paddle/scripts/paddle_build.bat | 58 ++++++++++++++++++++++++---------
 1 file changed, 43 insertions(+), 15 deletions(-)

diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 99450d1e15c..60e4496bc54 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -20,14 +20,51 @@ rem       Paddle CI Task On Windows Platform
 rem =================================================
 
 rem -------clean up environment-----------
-wmic process where name="op_function_generator.exe" call terminate  2>NUL
 set work_dir=%cd%
-mkdir build
+wmic process where name="op_function_generator.exe" call terminate  2>NUL
+
+rem ------initialize common variable------
+if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0"
+if not defined BRANCH set BRANCH=develop
+if not defined WITH_MKL set WITH_MKL=ON
+if not defined WITH_GPU set WITH_GPU=OFF
+if not defined WITH_AVX set WITH_AVX=ON
+if not defined WITH_TESTING set WITH_TESTING=ON
+if not defined WITH_PYTHON set WITH_PYTHON=ON
+if not defined ON_INFER set ON_INFER=ON
+if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON
+if not defined WITH_CACHE set WITH_CACHE=ON
+if not defined WITH_TPCACHE set WITH_TPCACHE=ON
+
+rem -------set cache build work directory-----------
+if "%WITH_CACHE%"=="OFF" (
+    rmdir build /s/q
+    goto :mkbuild
+)
+
+for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%#
+set day_now=%datetime:~6,2%
+set day_before=-1
+set /p day_before=<day.txt
+if %day_now% NEQ %day_before% (
+    echo %day_now% > day.txt
+    type day.txt
+    rmdir build /s/q
+)
+git diff origin/develop --stat --name-only | findstr "cmake CMakeLists.txt paddle_build.bat"
+if %ERRORLEVEL% EQU 0 (
+    rmdir build /s/q
+)
+
+:mkbuild
+if not exist build (
+    mkdir build
+)
 cd /d build
-tree .
+dir .
 dir paddle\fluid\pybind\Release
 
-rem ------initialize the virtual environment------
+rem ------initialize the python environment------
 if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37
 set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH%
 
@@ -38,7 +75,7 @@ rem %PYTHON_EXECUTABLE% -m pip install virtualenv
 rem %PYTHON_EXECUTABLE% -m virtualenv paddle_winci
 rem call paddle_winci\Scripts\activate.bat
 
-rem ------pre install requirement----------
+rem ------pre install python requirement----------
 where python
 where pip
 pip install --upgrade pip --user
@@ -62,16 +99,6 @@ set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
 :: set maximum cache size to 20G
 clcache.exe -M 21474836480
 
-rem ------initialize common variable------
-if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0"
-if not defined BRANCH set BRANCH=develop
-if not defined WITH_AVX set WITH_AVX=ON
-if not defined WITH_TESTING set WITH_TESTING=ON
-if not defined WITH_PYTHON set WITH_PYTHON=ON
-if not defined ON_INFER set ON_INFER=ON
-if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON
-if not defined WITH_TPCACHE set WITH_TPCACHE=ON
-
 rem ------set cache third_party------
 set cache_dir=%work_dir:Paddle=cache%
 dir %cache_dir%
@@ -418,6 +445,7 @@ taskkill /f /im rc.exe 2>NUL
 wmic process where name="op_function_generator.exe" call terminate 2>NUL
 taskkill /f /im python.exe  2>NUL
 call paddle_winci\Scripts\deactivate.bat 2>NUL
+del %PADDLE_WHL_FILE_WIN%
 taskkill /f /im python.exe  2>NUL
 echo Windows CI run successfully!
 exit /b 0
-- 
GitLab


From dd4c2d86a5f6df51091983b52badcad744548793 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?=
 <39303645+Shixiaowei02@users.noreply.github.com>
Date: Tue, 22 Sep 2020 13:42:29 +0800
Subject: [PATCH 167/261] enhance error messages, test=develop (#27423)

---
 paddle/fluid/inference/capi/pd_predictor.cc   |  3 +-
 .../fused/fusion_seqpool_cvm_concat_op.cc     | 31 +++++++++++++------
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/inference/capi/pd_predictor.cc b/paddle/fluid/inference/capi/pd_predictor.cc
index 0509a619021..31915496893 100644
--- a/paddle/fluid/inference/capi/pd_predictor.cc
+++ b/paddle/fluid/inference/capi/pd_predictor.cc
@@ -130,7 +130,8 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config,
   VLOG(3) << "The inputs' size is " << input_names.size();
   PADDLE_ENFORCE_EQ(
       input_names.size(), in_size,
-      "The number of input and the number of model's input must match. ");
+      paddle::platform::errors::InvalidArgument(
+          "The number of input and the number of model's input must match."));
   for (int i = 0; i < in_size; ++i) {
     auto input_t = predictor->GetInputTensor(inputs[i].name);
     std::vector<int> tensor_shape;
diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc
index f64e4f134d6..ecb7db46a9d 100644
--- a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc
@@ -24,20 +24,27 @@ void FusionSeqPoolCVMConcatOp::InferShape(
     framework::InferShapeContext* ctx) const {
   PADDLE_ENFORCE_GE(
       ctx->Inputs("X").size(), 1UL,
-      "Inputs(X) of FusionSeqPoolCVMConcatOp should not be empty.");
-  PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                 "Output(Out) of FusionSeqPoolCVMConcatOp should not be null.");
+      paddle::platform::errors::InvalidArgument(
+          "Inputs(X) of FusionSeqPoolCVMConcatOp should not be empty."));
+  PADDLE_ENFORCE(
+      ctx->HasOutput("Out"),
+      paddle::platform::errors::InvalidArgument(
+          "Output(Out) of FusionSeqPoolCVMConcatOp should not be null."));
   int axis = ctx->Attrs().Get<int>("axis");
   PADDLE_ENFORCE_EQ(
-      axis, 1, "FusionSeqPoolCVMConcatOp only supports concat axis=1 yet.");
+      axis, 1,
+      paddle::platform::errors::InvalidArgument(
+          "FusionSeqPoolCVMConcatOp only supports concat axis=1 yet."));
   bool use_cvm = ctx->Attrs().Get<bool>("use_cvm");
   PADDLE_ENFORCE_EQ(
       use_cvm, true,
-      "FusionSeqPoolCVMConcatOp only supports use_cvm is true yet.");
+      paddle::platform::errors::InvalidArgument(
+          "FusionSeqPoolCVMConcatOp only supports use_cvm is true yet."));
 
   auto ins_dims = ctx->GetInputsDim("X");
   const size_t n = ins_dims.size();
-  PADDLE_ENFORCE_GT(n, 0UL, "Input tensors count should > 0.");
+  PADDLE_ENFORCE_GT(n, 0UL, paddle::platform::errors::InvalidArgument(
+                                "Input tensors count should > 0."));
   if (n == 1) {
     LOG(WARNING) << "Only have one input, may waste memory";
   }
@@ -45,7 +52,8 @@ void FusionSeqPoolCVMConcatOp::InferShape(
   // The output height should be confirmed in Compute,
   // since input lod is not accessible here.
   PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2,
-                    "The dims size of first input should be 2.");
+                    paddle::platform::errors::InvalidArgument(
+                        "The dims size of first input should be 2."));
   ctx->SetOutputDim("Out", {-1, ins_dims[0][axis] * static_cast<int>(n)});
 }
 
@@ -99,7 +107,8 @@ class FusionSeqPoolCVMConcatKernel : public framework::OpKernel<T> {
 
     int w = ins[0]->numel() / x0_dims[0];
     PADDLE_ENFORCE_EQ(y_dims[1] % w, 0,
-                      "The output of dims[1] should be dividable of w");
+                      paddle::platform::errors::InvalidArgument(
+                          "The output of dims[1] should be dividable of w"));
     jit::seq_pool_attr_t attr(w, jit::SeqPoolType::kSum);
     if (pooltype == "AVERAGE") {
       attr.type = jit::SeqPoolType::kAvg;
@@ -117,9 +126,11 @@ class FusionSeqPoolCVMConcatKernel : public framework::OpKernel<T> {
       const T* src = ins[i]->data<T>();
       T* dst = y_data + i * w;
       PADDLE_ENFORCE_EQ(static_cast<int>(ins[i]->numel() / x_dims[0]), w,
-                        "Width of all inputs should be equal.");
+                        paddle::platform::errors::InvalidArgument(
+                            "Width of all inputs should be equal."));
       PADDLE_ENFORCE_EQ(x_lod.size(), bs + 1,
-                        "Batchsize of all inputs should be equal.");
+                        paddle::platform::errors::InvalidArgument(
+                            "Batchsize of all inputs should be equal."));
       for (size_t j = 0; j < bs; ++j) {
         attr.h = static_cast<int>(x_lod[j + 1] - x_lod[j]);
         seqpool(src, dst, &attr);
-- 
GitLab


From 7ba6279a2517884622f5fa56da1f91ab61e812f0 Mon Sep 17 00:00:00 2001
From: Aurelius84 <zhangliujie@baidu.com>
Date: Tue, 22 Sep 2020 14:08:03 +0800
Subject: [PATCH 168/261] [Dy2stat] Refine error msg of @to_static if not in
 imperative mode (#27371)

* refine error mesg
---
 .../dygraph_to_static/program_translator.py   | 17 +++++++++++
 python/paddle/fluid/dygraph/layers.py         |  1 +
 .../dygraph_to_static/test_declarative.py     | 19 ++++++++++++
 .../test_program_translator.py                | 29 +++++++++++++++++++
 4 files changed, 66 insertions(+)

diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
index 5218c0aac95..3b3b9bbe96f 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
@@ -370,6 +370,7 @@ class StaticLayer(object):
         Returns:
             Traced ConcreteProgram and executable translated Layer.
         """
+
         # 1. unify args/kwargs and replace Tensor with InputSpec
         if len(args) != len(self._function_spec.args_name):
             args, kwargs = self._function_spec.unified_args_and_kwargs(args,
@@ -522,6 +523,19 @@ def _switch_declarative_mode_guard_(is_declarative=True):
     _in_declarative_mode_ = original_val
 
 
+def _verify_init_in_dynamic_mode(class_instance):
+    """
+    Verifies the instance is initialized in dynamic mode.
+    """
+    if isinstance(class_instance, layers.Layer):
+        if not class_instance._init_in_dynamic_mode:
+            raise RuntimeError(
+                " `paddle.jit.to_static` is only available in dynamic mode. Please call `paddle.disable_static()` before "
+                "initializing your Layer class `{}` . Because parameters of Layer class should be initialized firstly "
+                "in dynamic mode while applying transformation.".format(
+                    class_instance))
+
+
 class ConcreteProgram(object):
 
     __slots__ = [
@@ -554,6 +568,9 @@ class ConcreteProgram(object):
             func_spec(FunctionSpec): A FunctionSpec instance for decorated function.
             input_spec(list[InputSpec]): 
         """
+        # verify the instance is initialized in imperative mode.
+        _verify_init_in_dynamic_mode(class_instance)
+
         # Transforms dygraph function into static function and caches it.
         dygraph_function = func_spec.dygraph_function
         static_func = convert_to_static(dygraph_function)
diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
index 7075024369f..9c79deaab73 100644
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -91,6 +91,7 @@ class Layer(core.Layer):
         self._helper = LayerObjectHelper(self._full_name)
         self._built = False
         self._dtype = dtype
+        self._init_in_dynamic_mode = framework.in_dygraph_mode()
 
         self._parameters = collections.OrderedDict()
         # Buffers the variable (not parameter) created in layer
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
index 5582a65304d..450ef7557bc 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
@@ -358,5 +358,24 @@ class TestDecorateModelDirectly(unittest.TestCase):
         self.assertListEqual(list(input_shape), [-1, 16, 10])
 
 
+class TestErrorWithInitFromStaticMode(unittest.TestCase):
+    def test_raise_error(self):
+        # disable imperative
+        paddle.enable_static()
+
+        net = SimpleNet()
+        with self.assertRaisesRegexp(RuntimeError,
+                                     "only available in dynamic mode"):
+            net.forward.concrete_program
+
+        with self.assertRaisesRegexp(RuntimeError,
+                                     "only available in dynamic mode"):
+            net.forward.inputs
+
+        with self.assertRaisesRegexp(RuntimeError,
+                                     "only available in dynamic mode"):
+            net.forward.outputs
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py
index 873d9ecb535..b0ab55758ee 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py
@@ -21,6 +21,7 @@ import numpy as np
 import textwrap
 import unittest
 
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
 from paddle.fluid.dygraph.jit import declarative
@@ -279,5 +280,33 @@ class TestEnableDeclarative(unittest.TestCase):
                     static_output.numpy(), dygraph_output.numpy(), atol=1e-4))
 
 
+class Net(fluid.dygraph.layers.Layer):
+    def __init__(self):
+        super(Net, self).__init__()
+
+    def forward(self, x):
+        return x + 1
+
+
+class TestErrorWithInitFromStaticMode(unittest.TestCase):
+    def setUp(self):
+        self.program_translator = ProgramTranslator()
+        self.x = np.random.randn(10, 32).astype('float32')
+
+    def test_raise_error(self):
+        # disable imperative
+        paddle.enable_static()
+        net = Net()
+
+        self.program_translator.enable(True)
+        with self.assertRaisesRegexp(RuntimeError,
+                                     "only available in dynamic mode"):
+            self.program_translator.get_output(net.forward, self.x)
+
+        with self.assertRaisesRegexp(RuntimeError,
+                                     "only available in dynamic mode"):
+            self.program_translator.get_program(net.forward, self.x)
+
+
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab


From 9f3a9be76ac5b1345acd3c9e5397b7fa96f2db39 Mon Sep 17 00:00:00 2001
From: tianshuo78520a <707759223@qq.com>
Date: Tue, 22 Sep 2020 15:00:30 +0800
Subject: [PATCH 169/261] update python 2.7.15 (#27435)

---
 paddle/scripts/paddle_build.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index f87925056ff..69303013d2a 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -140,18 +140,18 @@ function cmake_base() {
         if [ "$1" != "" ]; then
             echo "using python abi: $1"
             if [ "$1" == "cp27-cp27m" ]; then
-                export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
+                export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.15-ucs4/lib:}
                 export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
                 PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
             -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
-            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
+            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.15-ucs2/lib/libpython2.7.so"
                 pip install -r ${PADDLE_ROOT}/python/requirements.txt
             elif [ "$1" == "cp27-cp27mu" ]; then
-                export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
+                export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.15-ucs2/lib:}
                 export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
                 PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
             -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
-            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
+            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.15-ucs4/lib/libpython2.7.so"
                 pip install -r ${PADDLE_ROOT}/python/requirements.txt
             elif [ "$1" == "cp27-cp27m-gcc82" ]; then
                 export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.15-ucs4/lib:}
-- 
GitLab


From 905e2346acb72c9bd8c0d955473141bc5e02107e Mon Sep 17 00:00:00 2001
From: danleifeng <52735331+danleifeng@users.noreply.github.com>
Date: Tue, 22 Sep 2020 15:23:10 +0800
Subject: [PATCH 170/261] add endpoints log;test=develop (#27439)

---
 python/paddle/distributed/fleet/launch.py       | 5 ++---
 python/paddle/distributed/fleet/launch_utils.py | 8 ++++++++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
index 17fa0a0c7c3..d63c9f9184c 100644
--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -463,9 +463,8 @@ def launch():
         cuda_device_num = 0
 
     if len(has_ps_args) > 0 or cuda_device_num == 0:
-        logger.info(
-            "Run parameter-sever cpu mode. pserver arguments:{}, cuda count:{}".
-            format(has_ps_args, cuda_device_num))
+        logger.info("Run parameter-sever cpu mode. pserver arguments:{}".format(
+            has_ps_args))
         launch_ps(args)
     elif len(has_collective_args) > 0:
         logger.info("Run collective gpu mode. gpu arguments:{}, cuda count:{}".
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index 17d3b96cf44..7540cd9f4c1 100644
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -435,9 +435,17 @@ def start_local_trainers(cluster,
                             len(pod.trainers),
                             pretty_print_envs(proc_env, ("Distributed Envs",
                                                          "Value"))))
+            logger.info(
+                "details abouts PADDLE_TRAINER_ENDPOINTS can be found in {}/endpoints.log.".
+                format(log_dir))
         fn = None
         if log_dir is not None:
             os.system("mkdir -p {}".format(log_dir))
+            if os.path.exists("%s/endpoints.log" % log_dir):
+                os.system("rm -f {}/endpoints.log".format(log_dir))
+            with open("%s/endpoints.log" % log_dir, "w") as f:
+                f.write("PADDLE_TRAINER_ENDPOINTS: \n")
+                f.write("\n".join(cluster.trainers_endpoints()))
             fn = open("%s/workerlog.%d" % (log_dir, idx), "a")
             proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
         else:
-- 
GitLab


From fda54c0212fac68ec83d95725665703012afa651 Mon Sep 17 00:00:00 2001
From: Pei Yang <peiyang@baidu.com>
Date: Tue, 22 Sep 2020 16:43:50 +0800
Subject: [PATCH 171/261] errmsg refine of trt plugin (#27309)

---
 .../fluid/inference/tensorrt/plugin/trt_plugin_factory.cc  | 6 ++++--
 paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h  | 7 ++++++-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
index 3c20b6d1e72..76b0832c546 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
@@ -25,8 +25,10 @@ PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
   const char* plugin_type;
   DeserializeValue(&serial_data, &serial_length, &plugin_type);
 
-  PADDLE_ENFORCE(Has(plugin_type),
-                 "trt plugin type %s does not exists, check it.", plugin_type);
+  PADDLE_ENFORCE_EQ(
+      Has(plugin_type), true,
+      platform::errors::NotFound(
+          "trt plugin type %s does not exists, check it.", plugin_type));
   auto plugin = plugin_registry_[plugin_type](serial_data, serial_length);
   owned_plugins_.emplace_back(plugin);
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
index 18037179c7b..6fcb70c6d32 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
@@ -103,7 +103,12 @@ struct Serializer<std::vector<T>,
     DeserializeValue(buffer, buffer_size, &size);
     value->resize(size);
     size_t nbyte = value->size() * sizeof(T);
-    PADDLE_ENFORCE_GE(*buffer_size, nbyte);
+    PADDLE_ENFORCE_GE(
+        *buffer_size, nbyte,
+        platform::errors::InvalidArgument("Expect buffer size >= value size in "
+                                          "trt plugin deserialization, but got "
+                                          "buffer size = %d, value size = %d.",
+                                          *buffer_size, nbyte));
     std::memcpy(value->data(), *buffer, nbyte);
     reinterpret_cast<char const*&>(*buffer) += nbyte;
     *buffer_size -= nbyte;
-- 
GitLab


From 0a862fd356c6c7aa78d1dffbcd599ddb3febfeda Mon Sep 17 00:00:00 2001
From: wangchaochaohu <wangchao66@baidu.com>
Date: Tue, 22 Sep 2020 02:06:37 -0700
Subject: [PATCH 172/261] refine the precious of linspace Op using half way
 (#27452)

---
 paddle/fluid/operators/linspace_op.cu | 41 +++++++++++++++++----------
 paddle/fluid/operators/linspace_op.h  |  8 +++++-
 python/paddle/fluid/layers/tensor.py  |  2 +-
 3 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu
index c51e8785263..a4f06933232 100644
--- a/paddle/fluid/operators/linspace_op.cu
+++ b/paddle/fluid/operators/linspace_op.cu
@@ -23,9 +23,16 @@ namespace operators {
 using Tensor = framework::Tensor;
 
 template <typename T>
-__global__ void LinspaceKernel(T start, double step, int64_t size, T* out) {
-  CUDA_KERNEL_LOOP(index, size) {
-    out[index] = static_cast<T>(start + step * index);
+__global__ void LinspaceKernel(T start, T stop, double step, int64_t size,
+                               T* out) {
+  int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
+
+  for (; index < size; index += blockDim.x * gridDim.x) {
+    if (index < size / 2) {
+      out[index] = static_cast<T>(start + step * index);
+    } else {
+      out[index] = static_cast<T>(stop - step * (size - index - 1));
+    }
   }
 }
 
@@ -55,13 +62,15 @@ class CUDALinspaceKernel : public framework::OpKernel<T> {
     framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t);
     framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t);
 
-    framework::Tensor n;
-    framework::TensorCopy(start_t, platform::CPUPlace(), &n);
-    T start = n.data<T>()[0];
-    framework::TensorCopy(stop_t, platform::CPUPlace(), &n);
-    T stop = n.data<T>()[0];
-    framework::TensorCopy(*num_t, platform::CPUPlace(), &n);
-    int32_t num = n.data<int32_t>()[0];
+    framework::Tensor n_start;
+    framework::Tensor n_stop;
+    framework::Tensor n_num;
+    framework::TensorCopy(start_t, platform::CPUPlace(), &n_start);
+    T start = n_start.data<T>()[0];
+    framework::TensorCopy(stop_t, platform::CPUPlace(), &n_stop);
+    T stop = n_stop.data<T>()[0];
+    framework::TensorCopy(*num_t, platform::CPUPlace(), &n_num);
+    int64_t num = static_cast<int64_t>(n_num.data<int32_t>()[0]);
 
     PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument(
                                   "The num of linspace op should be larger "
@@ -72,14 +81,16 @@ class CUDALinspaceKernel : public framework::OpKernel<T> {
     T* out_data = out->mutable_data<T>(context.GetPlace());
 
     double step = 0;
-    if (num != 1) {
-      step = (static_cast<double>(stop - start)) / (num - 1);
-    }
-
     auto stream = context.cuda_device_context().stream();
     int block = 512;
     int grid = (num + block - 1) / block;
-    LinspaceKernel<T><<<grid, block, 0, stream>>>(start, step, num, out_data);
+    if (num != 1) {
+      step = (static_cast<double>(stop - start)) / (num - 1);
+      LinspaceKernel<T><<<grid, block, 0, stream>>>(start, stop, step, num,
+                                                    out_data);
+    } else {
+      LinspaceSpecialKernel<T><<<grid, block, 0, stream>>>(start, out_data);
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h
index 2c30a66ef8e..d8e0fefe175 100644
--- a/paddle/fluid/operators/linspace_op.h
+++ b/paddle/fluid/operators/linspace_op.h
@@ -56,9 +56,15 @@ class CPULinspaceKernel : public framework::OpKernel<T> {
     T* out_data = out->mutable_data<T>(context.GetPlace());
 
     if (num > 1) {
+      // step should be of double type for all types
       double step = (static_cast<double>(stop - start)) / (num - 1);
+      int half_num = num / 2;
       for (int i = 0; i < num; ++i) {
-        out_data[i] = static_cast<T>(start + step * i);
+        if (i < half_num) {
+          out_data[i] = static_cast<T>(start + step * i);
+        } else {
+          out_data[i] = static_cast<T>(stop - step * (num - i - 1));
+        }
       }
     } else {
       out_data[0] = static_cast<T>(start);
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 0ce7c098e2d..cf52f3b00fb 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -1424,7 +1424,7 @@ def linspace(start, stop, num, dtype=None, name=None):
         stop(int|float|Tensor): The input :attr:`stop` is start variable of range. It is a scalar, \
             or a Tensor of shape [1] with input data type int32, int64, float32 or float64.
         num(int|Tensor): The input :attr:`num` is given num of the sequence. It is an int scalar, \
-            or a Tensor of shape [1] with data type int32 or int64.
+            or a Tensor of shape [1] with data type int32.
         dtype(np.dtype|str, optional): The data type of output tensor, it could be
             int32, int64, float32 and float64. Default: if None, the data type is float32.
         name(str, optional): Normally there is no need for user to set this property. 
-- 
GitLab


From 827ac36faa265f7d183c59bd05390915fdceec97 Mon Sep 17 00:00:00 2001
From: pangyoki <pangyoki@126.com>
Date: Tue, 22 Sep 2020 04:50:09 -0500
Subject: [PATCH 173/261] Use dygraph mode by default (#27443)

* default open dygraph mode

* fix CI-Mac

* fix Mac-CI other unittest file

* fix CI-Py3

* fix test_communicator_geo and test_buffer_shared_memory_reuse_pass

* add enable_static to fix CI-Py3

* add enable_static to fix CI-coverage

* delete try except
---
 python/paddle/__init__.py                                     | 2 ++
 python/paddle/fluid/contrib/slim/tests/convert_model2dot.py   | 3 +++
 .../slim/tests/quant2_int8_image_classification_comparison.py | 2 ++
 .../fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py    | 2 ++
 .../slim/tests/quant_int8_image_classification_comparison.py  | 2 ++
 python/paddle/fluid/contrib/slim/tests/save_quant_model.py    | 2 ++
 python/paddle/fluid/contrib/slim/tests/test_graph.py          | 2 ++
 python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py | 2 ++
 .../contrib/slim/tests/test_imperative_qat_channelwise.py     | 2 ++
 .../slim/tests/test_post_training_quantization_mnist.py       | 2 ++
 .../slim/tests/test_post_training_quantization_mobilenetv1.py | 2 ++
 .../slim/tests/test_post_training_quantization_resnet50.py    | 3 +++
 .../fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py  | 3 +++
 .../fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py | 1 +
 .../paddle/fluid/contrib/slim/tests/test_quantization_pass.py | 2 ++
 .../fluid/contrib/slim/tests/test_quantization_scale_pass.py  | 2 ++
 .../contrib/slim/tests/test_user_defined_quantization.py      | 2 ++
 .../slim/tests/test_weight_quantization_mobilenetv1.py        | 3 +++
 python/paddle/fluid/contrib/tests/test_correlation.py         | 3 +++
 python/paddle/fluid/contrib/tests/test_fp16_utils.py          | 3 +++
 .../fluid/contrib/tests/test_image_classification_fp16.py     | 2 ++
 python/paddle/fluid/contrib/tests/test_quantize_transpiler.py | 3 +++
 python/paddle/fluid/contrib/tests/test_weight_decay_extend.py | 2 ++
 python/paddle/fluid/install_check.py                          | 4 ++++
 python/paddle/fluid/tests/book/test_fit_a_line.py             | 2 ++
 python/paddle/fluid/tests/book/test_image_classification.py   | 2 ++
 python/paddle/fluid/tests/book/test_label_semantic_roles.py   | 2 ++
 python/paddle/fluid/tests/book/test_machine_translation.py    | 2 ++
 python/paddle/fluid/tests/book/test_recognize_digits.py       | 2 ++
 python/paddle/fluid/tests/book/test_recommender_system.py     | 2 ++
 python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py    | 3 +++
 python/paddle/fluid/tests/book/test_word2vec.py               | 2 ++
 python/paddle/fluid/tests/custom_op/test_custom_op.py         | 2 ++
 python/paddle/fluid/tests/test_beam_search_decoder.py         | 2 ++
 python/paddle/fluid/tests/test_data_feeder.py                 | 3 +++
 python/paddle/fluid/tests/test_detection.py                   | 3 +++
 python/paddle/fluid/tests/test_error_clip.py                  | 1 +
 python/paddle/fluid/tests/test_if_else_op.py                  | 2 ++
 python/paddle/fluid/tests/test_python_operator_overriding.py  | 3 +++
 python/paddle/fluid/tests/unittests/c_comm_init_op.py         | 3 +++
 python/paddle/fluid/tests/unittests/check_nan_inf_base.py     | 2 ++
 .../paddle/fluid/tests/unittests/collective_allgather_api.py  | 2 ++
 .../paddle/fluid/tests/unittests/collective_allgather_op.py   | 2 ++
 .../paddle/fluid/tests/unittests/collective_allreduce_api.py  | 2 ++
 .../paddle/fluid/tests/unittests/collective_allreduce_op.py   | 2 ++
 python/paddle/fluid/tests/unittests/collective_barrier_api.py | 2 ++
 .../paddle/fluid/tests/unittests/collective_broadcast_api.py  | 2 ++
 .../paddle/fluid/tests/unittests/collective_broadcast_op.py   | 2 ++
 python/paddle/fluid/tests/unittests/collective_reduce_api.py  | 2 ++
 python/paddle/fluid/tests/unittests/collective_reduce_op.py   | 2 ++
 .../fluid/tests/unittests/collective_reduce_op_calc_stream.py | 2 ++
 .../paddle/fluid/tests/unittests/collective_reducescatter.py  | 2 ++
 .../fluid/tests/unittests/collective_reducescatter_op.py      | 2 ++
 python/paddle/fluid/tests/unittests/collective_scatter_api.py | 2 ++
 python/paddle/fluid/tests/unittests/collective_scatter_op.py  | 2 ++
 python/paddle/fluid/tests/unittests/dist_allreduce_op.py      | 2 ++
 python/paddle/fluid/tests/unittests/dist_fleet_ctr.py         | 2 ++
 python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py   | 2 ++
 python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py  | 2 ++
 python/paddle/fluid/tests/unittests/dist_mnist.py             | 2 ++
 python/paddle/fluid/tests/unittests/dist_se_resnext.py        | 2 ++
 .../fluid/tests/unittests/dygraph_to_static/test_yolov3.py    | 2 ++
 python/paddle/fluid/tests/unittests/test_allgather.py         | 3 +++
 python/paddle/fluid/tests/unittests/test_allreduce.py         | 3 +++
 python/paddle/fluid/tests/unittests/test_auto_checkpoint.py   | 1 +
 python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py  | 1 +
 python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py  | 1 +
 python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py  | 1 +
 .../fluid/tests/unittests/test_auto_checkpoint_dist_basic.py  | 1 +
 .../fluid/tests/unittests/test_auto_checkpoint_multiple.py    | 1 +
 python/paddle/fluid/tests/unittests/test_broadcast.py         | 3 +++
 .../tests/unittests/test_buffer_shared_memory_reuse_pass.py   | 1 +
 .../fluid/tests/unittests/test_collective_allgather_api.py    | 3 +++
 .../fluid/tests/unittests/test_collective_allreduce_api.py    | 3 +++
 .../fluid/tests/unittests/test_collective_barrier_api.py      | 3 +++
 .../fluid/tests/unittests/test_collective_broadcast_api.py    | 3 +++
 python/paddle/fluid/tests/unittests/test_collective_reduce.py | 3 +++
 .../fluid/tests/unittests/test_collective_reduce_api.py       | 3 +++
 .../paddle/fluid/tests/unittests/test_collective_scatter.py   | 3 +++
 .../fluid/tests/unittests/test_collective_scatter_api.py      | 3 +++
 python/paddle/fluid/tests/unittests/test_communicator_geo.py  | 3 +++
 .../fluid/tests/unittests/test_communicator_half_async.py     | 3 +++
 python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py | 3 +++
 .../tests/unittests/test_dist_fleet_a_sync_optimizer_async.py | 2 ++
 .../tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py  | 2 ++
 .../unittests/test_dist_fleet_a_sync_optimizer_auto_async.py  | 2 ++
 .../unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py    | 2 ++
 .../tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py   | 2 ++
 .../tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py  | 2 ++
 python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py    | 3 +++
 .../paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py | 3 +++
 .../fluid/tests/unittests/test_dist_fleet_heter_program.py    | 3 +++
 python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py     | 3 +++
 python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py    | 2 ++
 python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py    | 3 +++
 python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py    | 3 +++
 python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py    | 3 +++
 python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py | 3 +++
 .../fluid/tests/unittests/test_dist_mnist_backward_deps.py    | 3 +++
 .../fluid/tests/unittests/test_dist_mnist_batch_merge.py      | 3 +++
 .../paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py  | 3 +++
 .../fluid/tests/unittests/test_dist_mnist_fleet_save.py       | 3 +++
 .../paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py  | 3 +++
 .../fluid/tests/unittests/test_dist_mnist_hallreduce.py       | 3 +++
 .../fluid/tests/unittests/test_dist_mnist_multi_comm.py       | 3 +++
 python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py     | 3 +++
 .../fluid/tests/unittests/test_dist_mnist_ring_allreduce.py   | 3 +++
 .../fluid/tests/unittests/test_dist_mnist_with_program.py     | 3 +++
 python/paddle/fluid/tests/unittests/test_dist_op.py           | 2 ++
 .../paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py | 3 +++
 .../fluid/tests/unittests/test_dist_transpiler_async_decay.py | 3 +++
 .../fluid/tests/unittests/test_dist_transpiler_config.py      | 3 +++
 .../unittests/test_fleet_graph_execution_meta_optimizer.py    | 2 ++
 .../paddle/fluid/tests/unittests/test_listen_and_serv_op.py   | 2 ++
 python/paddle/fluid/tests/unittests/test_nan_inf.py           | 3 +++
 python/paddle/fluid/tests/unittests/test_reducescatter.py     | 3 +++
 python/paddle/fluid/tests/unittests/test_reducescatter_api.py | 3 +++
 python/paddle/tests/test_text.py                              | 2 ++
 tools/test_runner.py                                          | 3 +++
 119 files changed, 282 insertions(+)

diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 29e739a0edf..e749cf88b6a 100755
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -276,3 +276,5 @@ from .hapi import callbacks
 from .hapi import summary
 import paddle.text
 import paddle.vision
+
+disable_static()
diff --git a/python/paddle/fluid/contrib/slim/tests/convert_model2dot.py b/python/paddle/fluid/contrib/slim/tests/convert_model2dot.py
index 877897c0a0e..0018d81dbf2 100644
--- a/python/paddle/fluid/contrib/slim/tests/convert_model2dot.py
+++ b/python/paddle/fluid/contrib/slim/tests/convert_model2dot.py
@@ -19,6 +19,9 @@ import argparse
 import paddle.fluid as fluid
 from paddle.fluid.framework import IrGraph
 from paddle.fluid import core
+import paddle
+
+paddle.enable_static()
 
 
 def parse_args():
diff --git a/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py b/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py
index 17e0f452e98..3fba0e89218 100644
--- a/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py
+++ b/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py
@@ -27,6 +27,8 @@ from paddle.fluid.framework import IrGraph
 from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass
 from paddle.fluid import core
 
+paddle.enable_static()
+
 logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
 _logger = logging.getLogger(__name__)
 _logger.setLevel(logging.INFO)
diff --git a/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py b/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py
index a534edb7efd..12d1cfcc41d 100644
--- a/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py
+++ b/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py
@@ -25,6 +25,8 @@ from paddle.fluid.framework import IrGraph
 from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass
 from paddle.fluid import core
 
+paddle.enable_static()
+
 logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
 _logger = logging.getLogger(__name__)
 _logger.setLevel(logging.INFO)
diff --git a/python/paddle/fluid/contrib/slim/tests/quant_int8_image_classification_comparison.py b/python/paddle/fluid/contrib/slim/tests/quant_int8_image_classification_comparison.py
index 5f0a8f2d6fa..b81ef7b30ed 100644
--- a/python/paddle/fluid/contrib/slim/tests/quant_int8_image_classification_comparison.py
+++ b/python/paddle/fluid/contrib/slim/tests/quant_int8_image_classification_comparison.py
@@ -27,6 +27,8 @@ from paddle.fluid.framework import IrGraph
 from paddle.fluid.contrib.slim.quantization import QuantInt8MkldnnPass
 from paddle.fluid import core
 
+paddle.enable_static()
+
 logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
 _logger = logging.getLogger(__name__)
 _logger.setLevel(logging.INFO)
diff --git a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
index dab4b63cda4..e38148250af 100644
--- a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
+++ b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
@@ -27,6 +27,8 @@ from paddle.fluid.framework import IrGraph
 from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass
 from paddle.fluid import core
 
+paddle.enable_static()
+
 
 def parse_args():
     parser = argparse.ArgumentParser()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph.py b/python/paddle/fluid/contrib/slim/tests/test_graph.py
index 2cf897ec418..435cefd73e7 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_graph.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_graph.py
@@ -22,6 +22,8 @@ import paddle.fluid as fluid
 from paddle.fluid.framework import IrGraph
 from paddle.fluid import core
 
+paddle.enable_static()
+
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 os.environ["CPU_NUM"] = "1"
 
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
index 0d047a0cd3b..df505cf2435 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
@@ -32,6 +32,8 @@ from paddle.fluid.dygraph.nn import Pool2D
 from paddle.fluid.dygraph.nn import Linear
 from paddle.fluid.log_helper import get_logger
 
+paddle.enable_static()
+
 os.environ["CPU_NUM"] = "1"
 if core.is_compiled_with_cuda():
     fluid.set_flags({"FLAGS_cudnn_deterministic": True})
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py
index 17c613281a8..80d388ac0da 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py
@@ -32,6 +32,8 @@ from paddle.fluid.dygraph.nn import Pool2D
 from paddle.fluid.dygraph.nn import Linear
 from paddle.fluid.log_helper import get_logger
 
+paddle.enable_static()
+
 os.environ["CPU_NUM"] = "1"
 if core.is_compiled_with_cuda():
     fluid.set_flags({"FLAGS_cudnn_deterministic": True})
diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py
index 3ac1590b8aa..3ea1c84f976 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py
@@ -25,6 +25,8 @@ import paddle.fluid as fluid
 from paddle.dataset.common import download
 from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization
 
+paddle.enable_static()
+
 random.seed(0)
 np.random.seed(0)
 
diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
index 864631ec278..18389d9433b 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
@@ -26,6 +26,8 @@ import paddle.fluid as fluid
 from paddle.dataset.common import download
 from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization
 
+paddle.enable_static()
+
 random.seed(0)
 np.random.seed(0)
 
diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py
index a6c19b5e45a..12b5a2458a4 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py
@@ -15,6 +15,9 @@
 import sys
 import unittest
 from test_post_training_quantization_mobilenetv1 import TestPostTrainingQuantization
+import paddle
+
+paddle.enable_static()
 
 
 class TestPostTrainingForResnet50(TestPostTrainingQuantization):
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py
index 7b519731314..7f9209c8b3f 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py
@@ -18,6 +18,9 @@ import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.framework import IrGraph
 from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass
+import paddle
+
+paddle.enable_static()
 
 
 class TestQuant2Int8MkldnnPass(unittest.TestCase):
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py
index 3acbd897419..7ee0fd1d3e2 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py
@@ -25,6 +25,7 @@ from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
 from paddle.fluid.contrib.slim.quantization import QuantInt8MkldnnPass
 from paddle.fluid import core
 
+paddle.enable_static()
 os.environ["CPU_NUM"] = "1"
 
 
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
index dc9b83e4435..768a9ba7cfc 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
@@ -27,6 +27,8 @@ from paddle.fluid.contrib.slim.quantization import TransformForMobilePass
 from paddle.fluid.contrib.slim.quantization import AddQuantDequantPass
 from paddle.fluid import core
 
+paddle.enable_static()
+
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 os.environ["CPU_NUM"] = "1"
 
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py
index 9e8c5027ebb..b03281546a5 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py
@@ -27,6 +27,8 @@ from paddle.fluid.contrib.slim.quantization import OutScaleForInferencePass
 from paddle.fluid.contrib.slim.quantization import AddQuantDequantPass
 from paddle.fluid import core
 
+paddle.enable_static()
+
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 os.environ["CPU_NUM"] = "1"
 
diff --git a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py
index 32292c8a47b..f03d0faa398 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py
@@ -29,6 +29,8 @@ from paddle.fluid.contrib.slim.quantization import AddQuantDequantPass
 from paddle.fluid import core
 from paddle.fluid.layer_helper import LayerHelper
 
+paddle.enable_static()
+
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 os.environ["CPU_NUM"] = "1"
 
diff --git a/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py
index ff22b1b61e6..1e8fa51d635 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py
@@ -17,6 +17,9 @@ import os
 import time
 from paddle.dataset.common import download, DATA_HOME
 from paddle.fluid.contrib.slim.quantization import WeightQuantization
+import paddle
+
+paddle.enable_static()
 
 
 class TestWeightQuantization(unittest.TestCase):
diff --git a/python/paddle/fluid/contrib/tests/test_correlation.py b/python/paddle/fluid/contrib/tests/test_correlation.py
index 7fcef4dbcd1..50b091415a5 100644
--- a/python/paddle/fluid/contrib/tests/test_correlation.py
+++ b/python/paddle/fluid/contrib/tests/test_correlation.py
@@ -16,6 +16,9 @@ import unittest
 import numpy as np
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.base import to_variable
+import paddle
+
+paddle.enable_static()
 
 
 def corr(x_1,
diff --git a/python/paddle/fluid/contrib/tests/test_fp16_utils.py b/python/paddle/fluid/contrib/tests/test_fp16_utils.py
index e286bb0150e..0b51f2dcc86 100644
--- a/python/paddle/fluid/contrib/tests/test_fp16_utils.py
+++ b/python/paddle/fluid/contrib/tests/test_fp16_utils.py
@@ -16,6 +16,9 @@ import unittest
 import paddle.fluid as fluid
 from paddle.fluid import core
 from paddle.fluid.contrib.mixed_precision import fp16_utils
+import paddle
+
+paddle.enable_static()
 
 
 class AMPTest(unittest.TestCase):
diff --git a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py
index 5fb1dba40a3..1bf1a234834 100644
--- a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py
+++ b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py
@@ -25,6 +25,8 @@ import os
 import copy
 import numpy as np
 
+paddle.enable_static()
+
 
 def resnet_cifar10(input, depth=32):
     def conv_bn_layer(input,
diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
index 77fdf0087b9..342be7db3ed 100644
--- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
@@ -20,6 +20,9 @@ import paddle
 import paddle.fluid as fluid
 from paddle.fluid.contrib.quantize.quantize_transpiler import _original_var_name
 from paddle.fluid.contrib.quantize.quantize_transpiler import QuantizeTranspiler
+import paddle
+
+paddle.enable_static()
 
 
 def linear_fc(num):
diff --git a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py
index a5f08ca969a..906d83fff4f 100644
--- a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py
+++ b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py
@@ -21,6 +21,8 @@ import paddle
 import paddle.fluid as fluid
 import contextlib
 
+paddle.enable_static()
+
 
 def get_places():
     places = [fluid.CPUPlace()]
diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py
index ef469377acf..51fa1677b86 100644
--- a/python/paddle/fluid/install_check.py
+++ b/python/paddle/fluid/install_check.py
@@ -62,6 +62,8 @@ def run_check():
             # Your Paddle Fluid works well on MUTIPLE GPU or CPU.
             # Your Paddle Fluid is installed successfully! Let's start deep Learning with Paddle Fluid now
     """
+    paddle.enable_static()
+
     print("Running Verify Fluid Program ... ")
 
     device_list = []
@@ -157,3 +159,5 @@ def run_check():
         print(
             "Your Paddle Fluid is installed successfully ONLY for SINGLE GPU or CPU! "
             "\n Let's start deep Learning with Paddle Fluid now")
+
+    paddle.disable_static()
diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py
index a7d5a030599..9a2cc4ab1a1 100644
--- a/python/paddle/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/test_fit_a_line.py
@@ -23,6 +23,8 @@ import math
 import sys
 import os
 
+paddle.enable_static()
+
 
 def train(use_cuda, save_dirname, is_local):
     x = fluid.layers.data(name='x', shape=[13], dtype='float32')
diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py
index 22b74f29228..7c2d5c693a9 100644
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -24,6 +24,8 @@ import unittest
 import os
 import numpy as np
 
+paddle.enable_static()
+
 
 def resnet_cifar10(input, depth=32):
     def conv_bn_layer(input,
diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index ef14600e644..568d7518a1e 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -25,6 +25,8 @@ import paddle
 import paddle.dataset.conll05 as conll05
 import paddle.fluid as fluid
 
+paddle.enable_static()
+
 word_dict, verb_dict, label_dict = conll05.get_dict()
 word_dict_len = len(word_dict)
 label_dict_len = len(label_dict)
diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py
index 5e241aaa327..a0056ba3bab 100644
--- a/python/paddle/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/test_machine_translation.py
@@ -24,6 +24,8 @@ from paddle.fluid.executor import Executor
 import unittest
 import os
 
+paddle.enable_static()
+
 dict_size = 30000
 source_dict_dim = target_dict_dim = dict_size
 hidden_dim = 32
diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py
index 4fbb146752e..71c57b85160 100644
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -26,6 +26,8 @@ import paddle
 import paddle.fluid as fluid
 from paddle.fluid.layers.device import get_places
 
+paddle.enable_static()
+
 BATCH_SIZE = 64
 
 
diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py
index 433b5498de7..c2ab249f571 100644
--- a/python/paddle/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/fluid/tests/book/test_recommender_system.py
@@ -26,6 +26,8 @@ import paddle.fluid.nets as nets
 from paddle.fluid.executor import Executor
 from paddle.fluid.optimizer import SGDOptimizer
 
+paddle.enable_static()
+
 IS_SPARSE = True
 USE_GPU = False
 BATCH_SIZE = 256
diff --git a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
index 0d65513c122..3791e386ecf 100644
--- a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
+++ b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
@@ -25,6 +25,9 @@ import math
 import sys
 import unittest
 from paddle.fluid.executor import Executor
+import paddle
+
+paddle.enable_static()
 
 dict_size = 30000
 source_dict_dim = target_dict_dim = dict_size
diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py
index c919584554b..aae4de70aca 100644
--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -23,6 +23,8 @@ import numpy as np
 import math
 import sys
 
+paddle.enable_static()
+
 
 def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
     PASS_NUM = 100
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_op.py b/python/paddle/fluid/tests/custom_op/test_custom_op.py
index 0d02da53d66..c9f7d0b7c96 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_op.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_op.py
@@ -21,6 +21,8 @@ import contextlib
 import paddle
 import paddle.fluid as fluid
 
+paddle.enable_static()
+
 file_dir = os.path.dirname(os.path.abspath(__file__))
 fluid.load_op_library(os.path.join(file_dir, 'librelu2_op.so'))
 
diff --git a/python/paddle/fluid/tests/test_beam_search_decoder.py b/python/paddle/fluid/tests/test_beam_search_decoder.py
index fe8a9daa3be..69f3ff46b3a 100644
--- a/python/paddle/fluid/tests/test_beam_search_decoder.py
+++ b/python/paddle/fluid/tests/test_beam_search_decoder.py
@@ -29,6 +29,8 @@ from paddle.fluid.contrib.decoder.beam_search_decoder import *
 import unittest
 import os
 
+paddle.enable_static()
+
 dict_size = 30000
 source_dict_dim = target_dict_dim = dict_size
 src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
diff --git a/python/paddle/fluid/tests/test_data_feeder.py b/python/paddle/fluid/tests/test_data_feeder.py
index 16a33fd3ab3..d50c57e670b 100644
--- a/python/paddle/fluid/tests/test_data_feeder.py
+++ b/python/paddle/fluid/tests/test_data_feeder.py
@@ -16,6 +16,9 @@ from __future__ import print_function
 
 import paddle.fluid as fluid
 import unittest
+import paddle
+
+paddle.enable_static()
 
 
 class TestDataFeeder(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index 425c4e3c7e3..05b9067ec40 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -24,6 +24,9 @@ import numpy as np
 from unittests.test_imperative_base import new_program_scope
 from paddle.fluid.dygraph import base
 from paddle.fluid import core
+import paddle
+
+paddle.enable_static()
 
 
 class LayerTest(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py
index 3c977afc7c8..7859fca15f6 100644
--- a/python/paddle/fluid/tests/test_error_clip.py
+++ b/python/paddle/fluid/tests/test_error_clip.py
@@ -22,6 +22,7 @@ BATCH_SIZE = 128
 CLIP_MAX = 2e-6
 CLIP_MIN = -1e-6
 
+paddle.enable_static()
 prog = fluid.framework.Program()
 
 with fluid.program_guard(main_program=prog):
diff --git a/python/paddle/fluid/tests/test_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py
index 1c992b9d8cd..b7792e5ce27 100644
--- a/python/paddle/fluid/tests/test_if_else_op.py
+++ b/python/paddle/fluid/tests/test_if_else_op.py
@@ -28,6 +28,8 @@ from paddle.fluid.layers.control_flow import ConditionalBlock
 import unittest
 import numpy as np
 
+paddle.enable_static()
+
 
 class TestMNISTIfElseOp(unittest.TestCase):
     # FIXME: https://github.com/PaddlePaddle/Paddle/issues/12245#issuecomment-406462379
diff --git a/python/paddle/fluid/tests/test_python_operator_overriding.py b/python/paddle/fluid/tests/test_python_operator_overriding.py
index 5f92c437ec7..fd9dc961988 100644
--- a/python/paddle/fluid/tests/test_python_operator_overriding.py
+++ b/python/paddle/fluid/tests/test_python_operator_overriding.py
@@ -21,6 +21,9 @@ import numpy as np
 import paddle.fluid.layers as layers
 import paddle.fluid.framework as framework
 import paddle.fluid as fluid
+import paddle
+
+paddle.enable_static()
 
 
 class TestPythonOperatorOverride(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/c_comm_init_op.py b/python/paddle/fluid/tests/unittests/c_comm_init_op.py
index db77477cca6..ed6a75230c6 100644
--- a/python/paddle/fluid/tests/unittests/c_comm_init_op.py
+++ b/python/paddle/fluid/tests/unittests/c_comm_init_op.py
@@ -19,6 +19,9 @@ import os
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.distributed.fleet.base.private_helper_function import wait_server_ready
+import paddle
+
+paddle.enable_static()
 
 
 class TestCCommInitOp(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/check_nan_inf_base.py b/python/paddle/fluid/tests/unittests/check_nan_inf_base.py
index 8e75b3c3438..c682c795019 100644
--- a/python/paddle/fluid/tests/unittests/check_nan_inf_base.py
+++ b/python/paddle/fluid/tests/unittests/check_nan_inf_base.py
@@ -28,6 +28,8 @@ import paddle
 import paddle.fluid as fluid
 import paddle.compat as cpt
 
+paddle.enable_static()
+
 np.random.seed(0)
 
 
diff --git a/python/paddle/fluid/tests/unittests/collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective_allgather_api.py
index bdf4ca07ae9..63d7f52c11a 100644
--- a/python/paddle/fluid/tests/unittests/collective_allgather_api.py
+++ b/python/paddle/fluid/tests/unittests/collective_allgather_api.py
@@ -35,6 +35,8 @@ import paddle.fluid.layers as layers
 from functools import reduce
 from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
 
+paddle.enable_static()
+
 
 class TestCollectiveAllgatherAPI(TestCollectiveAPIRunnerBase):
     def __init__(self):
diff --git a/python/paddle/fluid/tests/unittests/collective_allgather_op.py b/python/paddle/fluid/tests/unittests/collective_allgather_op.py
index 34999654768..f77a97aa915 100644
--- a/python/paddle/fluid/tests/unittests/collective_allgather_op.py
+++ b/python/paddle/fluid/tests/unittests/collective_allgather_op.py
@@ -34,6 +34,8 @@ import paddle.fluid.layers as layers
 from functools import reduce
 from test_collective_base import TestCollectiveRunnerBase, runtime_main
 
+paddle.enable_static()
+
 
 class TestCollectiveAllGather(TestCollectiveRunnerBase):
     def __init__(self):
diff --git a/python/paddle/fluid/tests/unittests/collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective_allreduce_api.py
index aea429ae5e3..67242b274fc 100644
--- a/python/paddle/fluid/tests/unittests/collective_allreduce_api.py
+++ b/python/paddle/fluid/tests/unittests/collective_allreduce_api.py
@@ -35,6 +35,8 @@ import paddle.fluid.layers as layers
 from functools import reduce
 from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
 
+paddle.enable_static()
+
 
 class TestCollectiveAllreduceAPI(TestCollectiveAPIRunnerBase):
     def __init__(self):
diff --git a/python/paddle/fluid/tests/unittests/collective_allreduce_op.py b/python/paddle/fluid/tests/unittests/collective_allreduce_op.py
index 9aef8879cab..eef59ee3dde 100644
--- a/python/paddle/fluid/tests/unittests/collective_allreduce_op.py
+++ b/python/paddle/fluid/tests/unittests/collective_allreduce_op.py
@@ -35,6 +35,8 @@ import paddle.fluid.layers as layers
 from functools import reduce
 from test_collective_base import TestCollectiveRunnerBase, runtime_main
 
+paddle.enable_static()
+
 
 class TestCollectiveAllreduce(TestCollectiveRunnerBase):
     def __init__(self):
diff --git a/python/paddle/fluid/tests/unittests/collective_barrier_api.py b/python/paddle/fluid/tests/unittests/collective_barrier_api.py
index 09b3c27126d..dbcc70d540b 100644
--- a/python/paddle/fluid/tests/unittests/collective_barrier_api.py
+++ b/python/paddle/fluid/tests/unittests/collective_barrier_api.py
@@ -35,6 +35,8 @@ import paddle.fluid.layers as layers
 from functools import reduce
 from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
 
+paddle.enable_static()
+
 
 class TestCollectiveBarrierAPI(TestCollectiveAPIRunnerBase):
     def __init__(self):
diff --git a/python/paddle/fluid/tests/unittests/collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective_broadcast_api.py
index a879a027b50..08a3d948906 100644
--- a/python/paddle/fluid/tests/unittests/collective_broadcast_api.py
+++ b/python/paddle/fluid/tests/unittests/collective_broadcast_api.py
@@ -35,6 +35,8 @@ import paddle.fluid.layers as layers
 from functools import reduce
 from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
 
+paddle.enable_static()
+
 
 class TestCollectiveBroadcastAPI(TestCollectiveAPIRunnerBase):
     def __init__(self):
diff --git a/python/paddle/fluid/tests/unittests/collective_broadcast_op.py b/python/paddle/fluid/tests/unittests/collective_broadcast_op.py
index 18f0485f923..127f48be618 100644
--- a/python/paddle/fluid/tests/unittests/collective_broadcast_op.py
+++ b/python/paddle/fluid/tests/unittests/collective_broadcast_op.py
@@ -35,6 +35,8 @@ import paddle.fluid.layers as layers
 from functools import reduce
 from test_collective_base import TestCollectiveRunnerBase, runtime_main
 
+paddle.enable_static()
+
 
 class TestCollectiveBroadcast(TestCollectiveRunnerBase):
     def __init__(self):
diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective_reduce_api.py
index 3e89b1cb3ee..41e31146a22 100644
--- a/python/paddle/fluid/tests/unittests/collective_reduce_api.py
+++ b/python/paddle/fluid/tests/unittests/collective_reduce_api.py
@@ -35,6 +35,8 @@ import paddle.fluid.layers as layers
 from functools import reduce
 from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
 
+paddle.enable_static()
+
 
 class TestCollectiveReduceAPI(TestCollectiveAPIRunnerBase):
     def __init__(self):
diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_op.py b/python/paddle/fluid/tests/unittests/collective_reduce_op.py
index da61284344b..0448c66d132 100644
--- a/python/paddle/fluid/tests/unittests/collective_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/collective_reduce_op.py
@@ -35,6 +35,8 @@ import paddle.fluid.layers as layers
 from functools import reduce
 from test_collective_base import TestCollectiveRunnerBase, runtime_main
 
+paddle.enable_static()
+
 
 class TestCollectiveReduce(TestCollectiveRunnerBase):
     def __init__(self):
diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py b/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py
index 7e690428623..7a9e0b148d5 100644
--- a/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py
+++ b/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py
@@ -35,6 +35,8 @@ import paddle.fluid.layers as layers
 from functools import reduce
 from test_collective_base import TestCollectiveRunnerBase, runtime_main
 
+paddle.enable_static()
+
 
 class TestCollectiveReduce(TestCollectiveRunnerBase):
     def __init__(self):
diff --git a/python/paddle/fluid/tests/unittests/collective_reducescatter.py b/python/paddle/fluid/tests/unittests/collective_reducescatter.py
index 2f14277ae1e..8b989c73d4d 100644
--- a/python/paddle/fluid/tests/unittests/collective_reducescatter.py
+++ b/python/paddle/fluid/tests/unittests/collective_reducescatter.py
@@ -34,6 +34,8 @@ import paddle.fluid.layers as layers
 from functools import reduce
 from test_collective_base import TestCollectiveRunnerBase, runtime_main
 
+paddle.enable_static()
+
 
 class TestCollectiveReduceScatter(TestCollectiveRunnerBase):
     def __init__(self):
diff --git a/python/paddle/fluid/tests/unittests/collective_reducescatter_op.py b/python/paddle/fluid/tests/unittests/collective_reducescatter_op.py
index 3e286d7f43d..91712e2b50f 100644
--- a/python/paddle/fluid/tests/unittests/collective_reducescatter_op.py
+++ b/python/paddle/fluid/tests/unittests/collective_reducescatter_op.py
@@ -35,6 +35,8 @@ import paddle.fluid.layers as layers
 from functools import reduce
 from test_collective_base import TestCollectiveRunnerBase, runtime_main
 
+paddle.enable_static()
+
 
 class TestCollectiveReduceScatter(TestCollectiveRunnerBase):
     def __init__(self):
diff --git a/python/paddle/fluid/tests/unittests/collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective_scatter_api.py
index f68929ad3b3..ca36c8c83a5 100644
--- a/python/paddle/fluid/tests/unittests/collective_scatter_api.py
+++ b/python/paddle/fluid/tests/unittests/collective_scatter_api.py
@@ -35,6 +35,8 @@ import paddle.fluid.layers as layers
 from functools import reduce
 from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
 
+paddle.enable_static()
+
 
 class TestCollectiveScatterAPI(TestCollectiveAPIRunnerBase):
     def __init__(self):
diff --git a/python/paddle/fluid/tests/unittests/collective_scatter_op.py b/python/paddle/fluid/tests/unittests/collective_scatter_op.py
index efe5e17bcce..7afa4aec639 100644
--- a/python/paddle/fluid/tests/unittests/collective_scatter_op.py
+++ b/python/paddle/fluid/tests/unittests/collective_scatter_op.py
@@ -35,6 +35,8 @@ import paddle.fluid.layers as layers
 from functools import reduce
 from test_collective_base import TestCollectiveRunnerBase, runtime_main
 
+paddle.enable_static()
+
 
 class TestCollectiveScatter(TestCollectiveRunnerBase):
     def __init__(self):
diff --git a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py
index 88a3cd14c43..de52072d4a8 100644
--- a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py
+++ b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py
@@ -30,6 +30,8 @@ import signal
 from functools import reduce
 from test_dist_base import TestDistRunnerBase, runtime_main
 
+paddle.enable_static()
+
 DTYPE = "float32"
 paddle.dataset.mnist.fetch()
 
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
index 1b0ce0c03e7..8277499fcce 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
@@ -30,6 +30,8 @@ import ctr_dataset_reader
 from test_dist_fleet_base import runtime_main, FleetDistRunnerBase
 from paddle.distributed.fleet.base.util_factory import fleet_util
 
+paddle.enable_static()
+
 # Fix seed for test
 fluid.default_startup_program().random_seed = 1
 fluid.default_main_program().random_seed = 1
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
index a5633bb0450..2f938a813d8 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
@@ -31,6 +31,8 @@ from test_dist_fleet_heter_base import runtime_main, FleetDistHeterRunnerBase
 from dist_fleet_ctr import TestDistCTR2x2, fake_ctr_reader
 from paddle.distributed.fleet.base.util_factory import fleet_util
 
+paddle.enable_static()
+
 # Fix seed for test
 fluid.default_startup_program().random_seed = 1
 fluid.default_main_program().random_seed = 1
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
index 7d5ca4fc6e3..2ea69e1b676 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
@@ -34,6 +34,8 @@ from functools import reduce
 from test_dist_fleet_base import runtime_main, FleetDistRunnerBase
 from paddle.distributed.fleet.base.util_factory import fleet_util
 
+paddle.enable_static()
+
 DTYPE = "int64"
 DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/simnet.train.1000'
 DATA_MD5 = '24e49366eb0611c552667989de2f57d5'
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py
index 20e89bd46c6..f63139464e7 100644
--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist.py
@@ -31,6 +31,8 @@ from functools import reduce
 from test_dist_base import TestDistRunnerBase, runtime_main
 from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
 
+paddle.enable_static()
+
 DTYPE = "float32"
 paddle.dataset.mnist.fetch()
 
diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
index a2fd61e2387..5ba40c7c838 100644
--- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
@@ -30,6 +30,8 @@ import sys
 import signal
 from test_dist_base import TestDistRunnerBase, runtime_main
 
+paddle.enable_static()
+
 # Fix seed for test
 fluid.default_startup_program().random_seed = 1
 fluid.default_main_program().random_seed = 1
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_yolov3.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_yolov3.py
index 4453dff892f..6aa9156a0d4 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_yolov3.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_yolov3.py
@@ -17,12 +17,14 @@ import random
 import time
 import unittest
 
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph import ProgramTranslator
 from paddle.fluid.dygraph import to_variable
 
 from yolov3 import cfg, YOLOv3
 
+paddle.enable_static()
 random.seed(0)
 np.random.seed(0)
 
diff --git a/python/paddle/fluid/tests/unittests/test_allgather.py b/python/paddle/fluid/tests/unittests/test_allgather.py
index 877ae6f6e16..9bb34d3db43 100644
--- a/python/paddle/fluid/tests/unittests/test_allgather.py
+++ b/python/paddle/fluid/tests/unittests/test_allgather.py
@@ -15,9 +15,12 @@
 from __future__ import print_function
 import unittest
 import numpy as np
+import paddle
 
 from test_collective_base import TestDistBase
 
+paddle.enable_static()
+
 
 class TestAllGatherOp(TestDistBase):
     def _setup_config(self):
diff --git a/python/paddle/fluid/tests/unittests/test_allreduce.py b/python/paddle/fluid/tests/unittests/test_allreduce.py
index e0b6422a67b..660f559535c 100644
--- a/python/paddle/fluid/tests/unittests/test_allreduce.py
+++ b/python/paddle/fluid/tests/unittests/test_allreduce.py
@@ -15,9 +15,12 @@
 from __future__ import print_function
 import unittest
 import numpy as np
+import paddle
 
 from test_collective_base import TestDistBase
 
+paddle.enable_static()
+
 
 class TestAllReduceOp(TestDistBase):
     def _setup_config(self):
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
index fd009db5fd0..3f33120d1f7 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
@@ -31,6 +31,7 @@ from paddle.io import Dataset, BatchSampler, DataLoader
 
 from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger
 
+paddle.enable_static()
 logger = get_logger()
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py
index 55173325f62..fca1baf85e5 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py
@@ -32,6 +32,7 @@ from paddle.io import Dataset, BatchSampler, DataLoader
 from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger
 from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase
 
+paddle.enable_static()
 logger = get_logger()
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py
index 5d72fa01008..0c17807a689 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py
@@ -32,6 +32,7 @@ from paddle.io import Dataset, BatchSampler, DataLoader
 from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger
 from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase
 
+paddle.enable_static()
 logger = get_logger()
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py
index 5382f7e328e..ca103be59b9 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py
@@ -32,6 +32,7 @@ from paddle.io import Dataset, BatchSampler, DataLoader
 from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger
 from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase
 
+paddle.enable_static()
 logger = get_logger()
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py
index 3c78438bdf6..3eeff91ff2d 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py
@@ -32,6 +32,7 @@ from paddle.io import Dataset, BatchSampler, DataLoader
 from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger
 from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase
 
+paddle.enable_static()
 logger = get_logger()
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py
index 8c10cd0e992..f8c12f89051 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py
@@ -32,6 +32,7 @@ from paddle.io import Dataset, BatchSampler, DataLoader
 from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger
 from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase
 
+paddle.enable_static()
 logger = get_logger()
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_broadcast.py b/python/paddle/fluid/tests/unittests/test_broadcast.py
index 029e881d6f6..8b8cdb1235c 100644
--- a/python/paddle/fluid/tests/unittests/test_broadcast.py
+++ b/python/paddle/fluid/tests/unittests/test_broadcast.py
@@ -15,9 +15,12 @@
 from __future__ import print_function
 import unittest
 import numpy as np
+import paddle
 
 from test_collective_base import TestDistBase
 
+paddle.enable_static()
+
 
 class TestCBroadcastOp(TestDistBase):
     def _setup_config(self):
diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
index 43d485a0a6d..2c9168df472 100644
--- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
@@ -36,6 +36,7 @@ class InplaceTestBase(unittest.TestCase):
         self.fuse_all_optimizer_ops = False
 
     def setUp(self):
+        paddle.enable_static()
         self.initParameter()
         if self.use_cuda and fluid.core.is_compiled_with_cuda():
             self.device_count = fluid.core.get_cuda_device_count()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py
index 71777df4651..dbf77fafcc4 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py
@@ -15,9 +15,12 @@
 from __future__ import print_function
 import unittest
 import numpy as np
+import paddle
 
 from test_collective_api_base import TestDistBase
 
+paddle.enable_static()
+
 
 class TestCollectiveAllgatherAPI(TestDistBase):
     def _setup_config(self):
diff --git a/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py
index 24dd7cacff6..a405da80ada 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py
@@ -15,9 +15,12 @@
 from __future__ import print_function
 import unittest
 import numpy as np
+import paddle
 
 from test_collective_api_base import TestDistBase
 
+paddle.enable_static()
+
 
 class TestCollectiveAllreduceAPI(TestDistBase):
     def _setup_config(self):
diff --git a/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py b/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py
index ebf86f6ae14..d0a67baa61e 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py
@@ -15,9 +15,12 @@
 from __future__ import print_function
 import unittest
 import numpy as np
+import paddle
 
 from test_collective_api_base import TestDistBase
 
+paddle.enable_static()
+
 
 class TestCollectiveBarrierAPI(TestDistBase):
     def _setup_config(self):
diff --git a/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py
index b1cf4f1ac4c..702e0431157 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py
@@ -15,9 +15,12 @@
 from __future__ import print_function
 import unittest
 import numpy as np
+import paddle
 
 from test_collective_api_base import TestDistBase
 
+paddle.enable_static()
+
 
 class TestCollectiveBroadcastAPI(TestDistBase):
     def _setup_config(self):
diff --git a/python/paddle/fluid/tests/unittests/test_collective_reduce.py b/python/paddle/fluid/tests/unittests/test_collective_reduce.py
index 36837d6a227..c0627467428 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_reduce.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_reduce.py
@@ -15,9 +15,12 @@
 from __future__ import print_function
 import unittest
 import numpy as np
+import paddle
 
 from test_collective_base import TestDistBase
 
+paddle.enable_static()
+
 
 class TestCReduceOp(TestDistBase):
     def _setup_config(self):
diff --git a/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py
index bf3975f3fc1..8d28c794f02 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py
@@ -15,9 +15,12 @@
 from __future__ import print_function
 import unittest
 import numpy as np
+import paddle
 
 from test_collective_api_base import TestDistBase
 
+paddle.enable_static()
+
 
 class TestCollectiveReduceAPI(TestDistBase):
     def _setup_config(self):
diff --git a/python/paddle/fluid/tests/unittests/test_collective_scatter.py b/python/paddle/fluid/tests/unittests/test_collective_scatter.py
index 7fe3ce73359..ea34d1cab5a 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_scatter.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_scatter.py
@@ -15,9 +15,12 @@
 from __future__ import print_function
 import unittest
 import numpy as np
+import paddle
 
 from test_collective_base import TestDistBase
 
+paddle.enable_static()
+
 
 class TestCScatterOp(TestDistBase):
     def _setup_config(self):
diff --git a/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py
index cae842b3961..3a37da52b8e 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py
@@ -15,9 +15,12 @@
 from __future__ import print_function
 import unittest
 import numpy as np
+import paddle
 
 from test_collective_api_base import TestDistBase
 
+paddle.enable_static()
+
 
 class TestCollectiveScatterAPI(TestDistBase):
     def _setup_config(self):
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
index 30207340a27..d9fc9262b31 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
@@ -28,6 +28,8 @@ import paddle.fluid as fluid
 import paddle.distributed.fleet.base.role_maker as role_maker
 import paddle.distributed.fleet as fleet
 
+paddle.enable_static()
+
 
 class TestCommunicatorGeoEnd2End(unittest.TestCase):
     def net(self):
@@ -140,6 +142,7 @@ import paddle.distributed.fleet as fleet
 
 from test_communicator_geo import TestCommunicatorGeoEnd2End
 
+paddle.enable_static()
 
 class RunServer(TestCommunicatorGeoEnd2End):
     def runTest(self):
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
index 542d1874179..391588780f3 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
@@ -29,6 +29,8 @@ import paddle.fluid.incubate.fleet.base.role_maker as role_maker
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
 
+paddle.enable_static()
+
 
 class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase):
     def net(self):
@@ -120,6 +122,7 @@ from test_communicator_half_async import TestCommunicatorHalfAsyncEnd2End
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
 
+paddle.enable_static()
 
 class RunServer(TestCommunicatorHalfAsyncEnd2End):
     def runTest(self):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py b/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py
index fbeff20c63b..2adf6e41931 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py
@@ -15,6 +15,9 @@
 from __future__ import print_function
 import unittest
 from test_dist_base import TestDistBase
+import paddle
+
+paddle.enable_static()
 
 
 class TestDistMnistNCCL2(TestDistBase):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
index 9df55a6b873..a82612b0ed2 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
@@ -19,6 +19,8 @@ import unittest
 import paddle
 import paddle.distributed.fleet.base.role_maker as role_maker
 
+paddle.enable_static()
+
 
 class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
     def setUp(self):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py
index 5a5d8afc55b..5b7e0fb94c6 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py
@@ -18,6 +18,8 @@ import os
 import paddle.distributed.fleet.base.role_maker as role_maker
 import time
 
+paddle.enable_static()
+
 
 class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
     def setUp(self):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py
index 9085556c04c..3dff9d0f9d8 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py
@@ -18,6 +18,8 @@ import os
 import paddle.distributed.fleet.base.role_maker as role_maker
 import time
 
+paddle.enable_static()
+
 
 class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
     def setUp(self):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py
index 4787d048bd2..bdfa3a9a7d5 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py
@@ -18,6 +18,8 @@ import os
 import paddle.distributed.fleet.base.role_maker as role_maker
 import time
 
+paddle.enable_static()
+
 
 class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
     def setUp(self):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py
index 59ca41a11e3..db73069bf7d 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py
@@ -18,6 +18,8 @@ import os
 import paddle.distributed.fleet.base.role_maker as role_maker
 import time
 
+paddle.enable_static()
+
 
 class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
     def setUp(self):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
index e0993e022e1..b05a53c88bb 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
@@ -19,6 +19,8 @@ import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker
 import time
 
+paddle.enable_static()
+
 
 class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
     def setUp(self):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py
index 7d18e935f58..82a8f46a945 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py
@@ -22,6 +22,9 @@ from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import f
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
 from test_dist_fleet_base import TestFleetBase
 from dist_fleet_simnet_bow import train_network
+import paddle
+
+paddle.enable_static()
 
 
 class TestDistGeoCtr_2x2(TestFleetBase):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py
index 02a739c060c..b3e38a42128 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py
@@ -18,6 +18,9 @@ import os
 import unittest
 import tempfile
 from test_dist_fleet_heter_base import TestFleetHeterBase
+import paddle
+
+paddle.enable_static()
 
 
 class TestDistHeterDatasetAsync2x2(TestFleetHeterBase):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py
index 33690396612..00301f9b1c6 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py
@@ -21,6 +21,9 @@ import paddle.fluid as fluid
 import paddle.distributed.fleet.base.role_maker as role_maker
 from paddle.distributed.fleet.base.util_factory import fleet_util
 from paddle.distributed.fleet import fleet
+import paddle
+
+paddle.enable_static()
 
 
 class TestDistFleetHeterProgram(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py
index 8132add37a6..d766e6bf2af 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py
@@ -19,6 +19,9 @@ import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+import paddle
+
+paddle.enable_static()
 
 # For Net
 base_lr = 0.2
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
index e7b10be2349..218eb77d0b5 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
@@ -24,6 +24,8 @@ import paddle.fluid as fluid
 import paddle.distributed.fleet.base.role_maker as role_maker
 import paddle.distributed.fleet as fleet
 
+paddle.enable_static()
+
 # For Net
 base_lr = 0.2
 emb_lr = base_lr * 3
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py
index de4363f255b..8d101a34b68 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py
@@ -19,6 +19,9 @@ import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+import paddle
+
+paddle.enable_static()
 
 # For Net
 base_lr = 0.2
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
index dc40b2eb5c6..379bcaf684d 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
@@ -19,6 +19,9 @@ import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+import paddle
+
+paddle.enable_static()
 
 # For Net
 base_lr = 0.2
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
index 5e525bdb54d..fd069793473 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
@@ -19,6 +19,9 @@ import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+import paddle
+
+paddle.enable_static()
 
 # For Net
 base_lr = 0.2
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py
index ec34993905e..e0fa590db2a 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py
@@ -18,6 +18,9 @@ import os
 import unittest
 import tempfile
 from test_dist_fleet_base import TestFleetBase
+import paddle
+
+paddle.enable_static()
 
 
 class TestDistSimnetASync2x2(TestFleetBase):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py
index 1f6274ec164..23a2b8fd306 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py
@@ -15,6 +15,9 @@
 from __future__ import print_function
 import unittest
 from test_dist_base import TestDistBase
+import paddle
+
+paddle.enable_static()
 
 
 class TestDistMnistNCCL2BackWardDeps(TestDistBase):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py
index 24c9b9a1397..4cf2cf5f367 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py
@@ -16,6 +16,9 @@ from __future__ import print_function
 import unittest
 from test_dist_base import TestDistBase
 import os
+import paddle
+
+paddle.enable_static()
 
 flag_name = os.path.splitext(__file__)[0]
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py
index 0b9b85d5d52..9bc48ac0a1b 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py
@@ -18,6 +18,9 @@ from test_dist_base import TestDistBase
 
 import os
 import subprocess
+import paddle
+
+paddle.enable_static()
 flag_name = os.path.splitext(__file__)[0]
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleet_save.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleet_save.py
index 7dac1153562..7336794578e 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleet_save.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleet_save.py
@@ -17,6 +17,9 @@ import shutil
 import os
 import unittest
 from test_dist_base import TestDistBase
+import paddle
+
+paddle.enable_static()
 
 
 class TestDistMnistFleetSave(TestDistBase):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py
index d5ebe09adca..255fd9b2855 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py
@@ -15,6 +15,9 @@
 from __future__ import print_function
 import unittest
 from test_dist_base import TestDistBase
+import paddle
+
+paddle.enable_static()
 
 
 class TestDistMnistNCCL2FleetApi(TestDistBase):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py
index cc002582371..356c5573f95 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py
@@ -17,6 +17,9 @@ import unittest
 from test_dist_base import TestDistBase
 
 import os
+import paddle
+
+paddle.enable_static()
 flag_name = os.path.splitext(__file__)[0]
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py
index f43ccc8becb..d9e6be8609d 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py
@@ -17,6 +17,9 @@ import unittest
 from test_dist_base import TestDistBase
 
 import os
+import paddle
+
+paddle.enable_static()
 flag_name = os.path.splitext(__file__)[0]
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py
index d063f8473e0..28ef31875db 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py
@@ -15,6 +15,9 @@
 from __future__ import print_function
 import unittest
 from test_dist_base import TestDistBase
+import paddle
+
+paddle.enable_static()
 
 
 class TestDistMnistNCCL2(TestDistBase):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py
index fd15020275b..4436064dc28 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py
@@ -15,6 +15,9 @@
 from __future__ import print_function
 import unittest
 from test_dist_base import TestDistBase
+import paddle
+
+paddle.enable_static()
 
 
 class TestDistMnistNCCL2(TestDistBase):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_with_program.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_with_program.py
index 4f4941aa217..d55582fbb4d 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_with_program.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_with_program.py
@@ -15,6 +15,9 @@
 from __future__ import print_function
 import unittest
 from test_dist_base import TestDistBase
+import paddle
+
+paddle.enable_static()
 
 
 class TestDistMnistLocalSGDFleetApi(TestDistBase):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_op.py b/python/paddle/fluid/tests/unittests/test_dist_op.py
index 1f46e0e7f9c..0f71027d274 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_op.py
@@ -19,6 +19,8 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 
+paddle.enable_static()
+
 
 def dist(x, y, p):
     if p == 0.:
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py
index dbf0319d305..64217135be7 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py
@@ -18,6 +18,9 @@ from test_dist_base import TestDistBase
 import os
 
 import os
+import paddle
+
+paddle.enable_static()
 flag_name = os.path.splitext(__file__)[0]
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py
index 761d57408b9..dd5c393f49c 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py
@@ -17,6 +17,9 @@ from __future__ import print_function
 import unittest
 import gc
 import paddle.fluid as fluid
+import paddle
+
+paddle.enable_static()
 
 
 class TranspilerAsyncLRDecayTest(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler_config.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler_config.py
index c8d0d840872..e6bc99fc225 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler_config.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler_config.py
@@ -15,6 +15,9 @@
 import unittest
 import paddle.fluid as fluid
 import gc
+import paddle
+
+paddle.enable_static()
 
 gc.set_debug(gc.DEBUG_COLLECTABLE)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py
index 927c155ff11..f06f1eaefae 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py
@@ -17,6 +17,8 @@ import paddle
 import os
 from launch_function_helper import launch_func, wait, _find_free_port
 
+paddle.enable_static()
+
 
 class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase):
     def setUp(self):
diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
index 6da37fe4d29..6751c887061 100644
--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
@@ -28,6 +28,8 @@ import unittest
 from multiprocessing import Process
 from op_test import OpTest
 
+paddle.enable_static()
+
 
 def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
     remove_ps_flag(os.getpid())
diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf.py b/python/paddle/fluid/tests/unittests/test_nan_inf.py
index d4a971d25bc..dc9ea5d957a 100644
--- a/python/paddle/fluid/tests/unittests/test_nan_inf.py
+++ b/python/paddle/fluid/tests/unittests/test_nan_inf.py
@@ -19,6 +19,9 @@ import unittest
 import os
 import sys
 import subprocess
+import paddle
+
+paddle.enable_static()
 
 
 class TestNanInf(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_reducescatter.py b/python/paddle/fluid/tests/unittests/test_reducescatter.py
index 58bcc11cd89..7c355d46285 100644
--- a/python/paddle/fluid/tests/unittests/test_reducescatter.py
+++ b/python/paddle/fluid/tests/unittests/test_reducescatter.py
@@ -15,9 +15,12 @@
 from __future__ import print_function
 import unittest
 import numpy as np
+import paddle
 
 from test_collective_base import TestDistBase
 
+paddle.enable_static()
+
 
 class TestReduceScatterOp(TestDistBase):
     def _setup_config(self):
diff --git a/python/paddle/fluid/tests/unittests/test_reducescatter_api.py b/python/paddle/fluid/tests/unittests/test_reducescatter_api.py
index 5fa75cc3eff..5a494b5529e 100644
--- a/python/paddle/fluid/tests/unittests/test_reducescatter_api.py
+++ b/python/paddle/fluid/tests/unittests/test_reducescatter_api.py
@@ -16,9 +16,12 @@ from __future__ import print_function
 import unittest
 import numpy as np
 import paddle.fluid as fluid
+import paddle
 
 from test_collective_base import TestDistBase
 
+paddle.enable_static()
+
 
 class TestReduceScatterAPI(TestDistBase):
     def _setup_config(self):
diff --git a/python/paddle/tests/test_text.py b/python/paddle/tests/test_text.py
index 43968896c18..fa83b0cc6f3 100644
--- a/python/paddle/tests/test_text.py
+++ b/python/paddle/tests/test_text.py
@@ -28,6 +28,8 @@ from paddle import Model, set_device
 from paddle.static import InputSpec as Input
 from paddle.text import *
 
+paddle.enable_static()
+
 
 class ModuleApiTest(unittest.TestCase):
     @classmethod
diff --git a/tools/test_runner.py b/tools/test_runner.py
index 9b9f165e736..bad98f9b5c3 100644
--- a/tools/test_runner.py
+++ b/tools/test_runner.py
@@ -17,12 +17,14 @@ from __future__ import print_function
 import unittest
 import os
 import sys
+import paddle
 import paddle.fluid as fluid
 import importlib
 from six.moves import cStringIO
 
 
 def main():
+    paddle.enable_static()
     sys.path.append(os.getcwd())
     some_test_failed = False
     for module_name in sys.argv[1:]:
@@ -44,6 +46,7 @@ def main():
                             'failed\n',
                             buffer.getvalue(),
                             file=sys.stderr)
+    paddle.disable_static()
 
     if some_test_failed:
         exit(1)
-- 
GitLab


From a04524759ef8d316b2ca5ceb903e829e6a55203a Mon Sep 17 00:00:00 2001
From: 123malin <malin10@baidu.com>
Date: Tue, 22 Sep 2020 21:18:44 +0800
Subject: [PATCH 174/261]  Enhance Op's Error Message (#27455)

* test=develop, update error message
---
 paddle/fluid/operators/concat_op.cc           |  4 +-
 .../optimizers/decayed_adagrad_op.cc          | 78 ++++++++++---------
 .../operators/optimizers/decayed_adagrad_op.h | 22 +++---
 .../operators/optimizers/lars_momentum_op.h   |  7 +-
 4 files changed, 64 insertions(+), 47 deletions(-)

diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index 4f337c03599..7937e432d22 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/concat_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
@@ -78,7 +79,8 @@ class ConcatOp : public framework::OperatorWithKernel {
       }
     }
     if (flag == 0) {
-      PADDLE_THROW("All Inputs of Concat OP are Empty!");
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "All Inputs of Concat OP are Empty!"));
     }
 #ifdef PADDLE_WITH_MKLDNN
     if (platform::CanMKLDNNBeUsed(ctx)) {
diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
index 5c6c38da928..eb41d21e092 100644
--- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
+++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
@@ -23,46 +23,54 @@ class DecayedAdagradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of DecayedAdagradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of DecayedAdagradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Moment"),
-                   "Input(Moment) of DecayedAdagradOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("LearningRate"),
-        "Input(LearningRate) of DecayedAdagradOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Param").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Grad").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of DecayedAdagradOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
-                   "Output(MomentOut) of DecayedAdagradOp should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("Param"), "Input", "Param",
+                   "DecayedAdagradOp");
+    OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "DecayedAdagradOp");
+    OP_INOUT_CHECK(ctx->HasInput("Moment"), "Input", "Moment",
+                   "DecayedAdagradOp");
+    OP_INOUT_CHECK(ctx->HasInput("LearningRate"), "Input", "LearningRate",
+                   "DecayedAdagradOp");
+    PADDLE_ENFORCE_EQ(
+        ctx->GetInputsVarType("Param").front(),
+        framework::proto::VarType::LOD_TENSOR,
+        platform::errors::InvalidArgument(
+            "The input var's type should be LoDTensor, but the received is %s",
+            ctx->Inputs("Param").front(),
+            ctx->GetInputsVarType("Param").front()));
+    PADDLE_ENFORCE_EQ(
+        ctx->GetInputsVarType("Grad").front(),
+        framework::proto::VarType::LOD_TENSOR,
+        platform::errors::InvalidArgument(
+            "The input var's type should be LoDTensor, but the received is %s",
+            ctx->Inputs("Grad").front(),
+            ctx->GetInputsVarType("Grad").front()));
+
+    OP_INOUT_CHECK(ctx->HasOutput("ParamOut"), "Output", "ParamOut",
+                   "DecayedAdagradOp");
+    OP_INOUT_CHECK(ctx->HasOutput("MomentOut"), "Output", "MomentOut",
+                   "DecayedAdagradOp");
 
     auto lr_dims = ctx->GetInputDim("LearningRate");
     PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
-                      "Maybe the Input variable LearningRate has not "
-                      "been initialized. You may need to confirm "
-                      "if you put exe.run(startup_program) "
-                      "after optimizer.minimize function.");
+                      platform::errors::InvalidArgument(
+                          "Maybe the Input variable LearningRate has not "
+                          "been initialized. You may need to confirm "
+                          "if you put exe.run(startup_program) "
+                          "after optimizer.minimize function."));
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-                      "LearningRate should have one element");
+                      platform::errors::InvalidArgument(
+                          "LearningRate should have one element"));
     auto param_dims = ctx->GetInputDim("Param");
-    PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Grad"),
-                      "Param and Grad input of DecayedAdagradOp should have "
-                      "the same dimension.");
-    PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Moment"),
-                      "Param and Moment input of DecayedAdagradOp should have "
-                      "the same dimension.");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Grad"),
+        platform::errors::InvalidArgument(
+            "Param and Grad input of DecayedAdagradOp should have "
+            "the same dimension."));
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Moment"),
+        platform::errors::InvalidArgument(
+            "Param and Moment input of DecayedAdagradOp should have "
+            "the same dimension."));
 
     ctx->SetOutputDim("ParamOut", param_dims);
     ctx->SetOutputDim("MomentOut", param_dims);
diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h
index 279edfb015c..f264ebf8a32 100644
--- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h
+++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h
@@ -24,17 +24,19 @@ class DecayedAdagradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.InputNames("Param").front(),
-                   framework::ToTypeName(param_var->Type()));
+    PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
+                      platform::errors::InvalidArgument(
+                          "The Var(%s)'s type should be LoDTensor, "
+                          "but the received is %s",
+                          ctx.InputNames("Param").front(),
+                          framework::ToTypeName(param_var->Type())));
     const auto* grad_var = ctx.InputVar("Grad");
-    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.InputNames("Grad").front(),
-                   framework::ToTypeName(grad_var->Type()));
+    PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
+                      platform::errors::InvalidArgument(
+                          "The Var(%s)'s type should be LoDTensor, "
+                          "but the received is %s",
+                          ctx.InputNames("Grad").front(),
+                          framework::ToTypeName(grad_var->Type())));
 
     auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
     auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.h b/paddle/fluid/operators/optimizers/lars_momentum_op.h
index b579b5143dd..55775bc08fb 100755
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.h
+++ b/paddle/fluid/operators/optimizers/lars_momentum_op.h
@@ -30,7 +30,12 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> {
     auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");
     auto* grad_var = ctx.InputVar("Grad");
     // only support dense for now.
-    PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true);
+    PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
+                      platform::errors::InvalidArgument(
+                          "The Var(%s)'s type should be LoDTensor, "
+                          "but the received is %s",
+                          ctx.InputNames("Grad").front(),
+                          framework::ToTypeName(grad_var->Type())));
     auto grad = ctx.Input<framework::LoDTensor>("Grad");
 
     param_out->mutable_data<T>(ctx.GetPlace());
-- 
GitLab


From 76fb95fe769f991685818059324664da3d1d1af4 Mon Sep 17 00:00:00 2001
From: wangchaochaohu <wangchao66@baidu.com>
Date: Tue, 22 Sep 2020 09:06:10 -0700
Subject: [PATCH 175/261] avoid data transform for linspace OP (#27444)

---
 paddle/fluid/operators/linspace_op.cc | 11 +++++++++--
 python/paddle/fluid/layers/tensor.py  |  9 ++++++---
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc
index 667c6e89295..7cc07383bfa 100644
--- a/paddle/fluid/operators/linspace_op.cc
+++ b/paddle/fluid/operators/linspace_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/linspace_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
@@ -21,7 +22,7 @@ class LinspaceOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     OP_INOUT_CHECK(ctx->HasInput("Start"), "Input", "Start", "linspace");
     OP_INOUT_CHECK(ctx->HasInput("Stop"), "Input", "Stop", "linspace");
     OP_INOUT_CHECK(ctx->HasInput("Num"), "Input", "Num", "linspace");
@@ -50,11 +51,17 @@ class LinspaceOp : public framework::OperatorWithKernel {
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
+      const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
         ctx.GetPlace());
   }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string &var_name, const framework::Tensor &tensor,
+      const framework::OpKernelType &expected_kernel_type) const override {
+    return expected_kernel_type;
+  }
 };
 
 class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index cf52f3b00fb..2fba578ec07 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -1453,11 +1453,14 @@ def linspace(start, stop, num, dtype=None, name=None):
     if not isinstance(dtype, core.VarDesc.VarType):
         dtype = convert_np_dtype_to_dtype_(dtype)
     if not isinstance(start, Variable):
-        tensor_start = fill_constant([1], dtype, start)
+        with device_guard("cpu"):
+            tensor_start = fill_constant([1], dtype, start)
     if not isinstance(stop, Variable):
-        tensor_stop = fill_constant([1], dtype, stop)
+        with device_guard("cpu"):
+            tensor_stop = fill_constant([1], dtype, stop)
     if not isinstance(num, Variable):
-        tensor_num = fill_constant([1], 'int32', num)
+        with device_guard("cpu"):
+            tensor_num = fill_constant([1], 'int32', num)
     if in_dygraph_mode():
         return core.ops.linspace(tensor_start, tensor_stop, tensor_num, 'dtype',
                                  dtype)
-- 
GitLab


From 0721767ba90536ee205ca04ac35dd0c124a797c8 Mon Sep 17 00:00:00 2001
From: danleifeng <52735331+danleifeng@users.noreply.github.com>
Date: Wed, 23 Sep 2020 10:18:32 +0800
Subject: [PATCH 176/261] fix server_num bug;test=develop (#27442)

---
 python/paddle/distributed/fleet/base/role_maker.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py
index 81d5908ccd4..f66f013e4db 100644
--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -605,7 +605,8 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         """
         if not self._role_is_generated:
             self._generate_role()
-        return len(self._get_pserver_endpoints())
+        return len(self._get_pserver_endpoints(
+        )) if self._get_pserver_endpoints() is not None else 0
 
     def _node_num(self):
         """
-- 
GitLab


From 765064476b26601d1aba653823a05b21a423ef25 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 23 Sep 2020 10:23:10 +0800
Subject: [PATCH 177/261] Polish some lost invalid error message (#27445)

* polish some lost error msg

* add some math file to white list

* polish detail based reviewer commnet
---
 paddle/fluid/inference/capi/pd_predictor.cc   |  4 ++-
 paddle/fluid/inference/io.cc                  | 25 +++++++++++--------
 .../tensorrt/plugin/trt_plugin_factory.cc     |  4 +--
 .../tensorrt/plugin/trt_plugin_utils.h        | 11 ++++----
 paddle/fluid/inference/utils/singleton.h      |  4 ++-
 paddle/fluid/platform/cuda_profiler.h         |  6 ++++-
 paddle/fluid/train/demo/demo_trainer.cc       |  7 ++++--
 paddle/fluid/train/imdb_demo/demo_trainer.cc  | 14 ++++++++---
 .../train/test_train_recognize_digits.cc      |  3 ++-
 tools/enforce/count_enforce_by_file.sh        |  9 ++++++-
 10 files changed, 59 insertions(+), 28 deletions(-)

diff --git a/paddle/fluid/inference/capi/pd_predictor.cc b/paddle/fluid/inference/capi/pd_predictor.cc
index 31915496893..c1bf4c974fa 100644
--- a/paddle/fluid/inference/capi/pd_predictor.cc
+++ b/paddle/fluid/inference/capi/pd_predictor.cc
@@ -131,7 +131,9 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config,
   PADDLE_ENFORCE_EQ(
       input_names.size(), in_size,
       paddle::platform::errors::InvalidArgument(
-          "The number of input and the number of model's input must match."));
+          "The number of input and the number of model's input must match. The "
+          "number of input is %d, the number of model's input is %d.",
+          input_names.size(), in_size));
   for (int i = 0; i < in_size; ++i) {
     auto input_t = predictor->GetInputTensor(inputs[i].name);
     std::vector<int> tensor_shape;
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index c497ab384b5..84e011c6505 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -47,7 +47,9 @@ void Init(const std::vector<std::string> argv) {
 
 void ReadBinaryFile(const std::string& filename, std::string* contents) {
   std::ifstream fin(filename, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
+  PADDLE_ENFORCE_EQ(
+      fin.is_open(), true,
+      platform::errors::Unavailable("Failed to open file %s.", filename));
   fin.seekg(0, std::ios::end);
   contents->clear();
   contents->resize(fin.tellg());
@@ -133,9 +135,10 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
 
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
-  PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
-                 "model version %ld is not supported.",
-                 main_program->Version());
+  PADDLE_ENFORCE_EQ(
+      framework::IsProgramVersionSupported(main_program->Version()), true,
+      platform::errors::Unavailable("Model version %ld is not supported.",
+                                    main_program->Version()));
 
   // model_from_memory is false in separate parameters.
   LoadPersistables(executor, scope, *main_program, dirname, "",
@@ -151,9 +154,10 @@ std::unique_ptr<framework::ProgramDesc> Load(
 
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
-  PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
-                 "model version %ld is not supported.",
-                 main_program->Version());
+  PADDLE_ENFORCE_EQ(
+      framework::IsProgramVersionSupported(main_program->Version()), true,
+      platform::errors::Unavailable("Model version %ld is not supported.",
+                                    main_program->Version()));
 
   LoadPersistables(executor, scope, *main_program, "", param_filename,
                    false /* model_from_memory */);
@@ -165,9 +169,10 @@ std::unique_ptr<framework::ProgramDesc> LoadFromMemory(
     const std::string& prog_buffer, const std::string& param_buffer) {
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(prog_buffer));
-  PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
-                 "model version %ld is not supported.",
-                 main_program->Version());
+  PADDLE_ENFORCE_EQ(
+      framework::IsProgramVersionSupported(main_program->Version()), true,
+      platform::errors::Unavailable("Model version %ld is not supported.",
+                                    main_program->Version()));
 
   LoadPersistables(executor, scope, *main_program, "", param_buffer,
                    true /* model_filename */);
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
index 76b0832c546..0bf8a1691e2 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
@@ -27,8 +27,8 @@ PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
 
   PADDLE_ENFORCE_EQ(
       Has(plugin_type), true,
-      platform::errors::NotFound(
-          "trt plugin type %s does not exists, check it.", plugin_type));
+      platform::errors::NotFound("TensorRT plugin type `%s` does not exists.",
+                                 plugin_type));
   auto plugin = plugin_registry_[plugin_type](serial_data, serial_length);
   owned_plugins_.emplace_back(plugin);
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
index 6fcb70c6d32..16751c764bd 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
@@ -103,12 +103,11 @@ struct Serializer<std::vector<T>,
     DeserializeValue(buffer, buffer_size, &size);
     value->resize(size);
     size_t nbyte = value->size() * sizeof(T);
-    PADDLE_ENFORCE_GE(
-        *buffer_size, nbyte,
-        platform::errors::InvalidArgument("Expect buffer size >= value size in "
-                                          "trt plugin deserialization, but got "
-                                          "buffer size = %d, value size = %d.",
-                                          *buffer_size, nbyte));
+    PADDLE_ENFORCE_GE(*buffer_size, nbyte,
+                      platform::errors::InvalidArgument(
+                          "Insufficient data in buffer, expect contains %d "
+                          "byte, but actually only contains %d byte.",
+                          *buffer_size, nbyte));
     std::memcpy(value->data(), *buffer, nbyte);
     reinterpret_cast<char const*&>(*buffer) += nbyte;
     *buffer_size -= nbyte;
diff --git a/paddle/fluid/inference/utils/singleton.h b/paddle/fluid/inference/utils/singleton.h
index 990bef35949..6828924c300 100644
--- a/paddle/fluid/inference/utils/singleton.h
+++ b/paddle/fluid/inference/utils/singleton.h
@@ -46,7 +46,9 @@ struct Registry {
 
   template <typename ItemChild>
   void Register(const std::string& name) {
-    PADDLE_ENFORCE_EQ(items_.count(name), 0);
+    PADDLE_ENFORCE_EQ(items_.count(name), 0,
+                      platform::errors::AlreadyExists(
+                          "Item `%s` has beed registered.", name));
     items_[name] = new ItemChild;
   }
 
diff --git a/paddle/fluid/platform/cuda_profiler.h b/paddle/fluid/platform/cuda_profiler.h
index 957bdf1e698..a9382f2c8ad 100644
--- a/paddle/fluid/platform/cuda_profiler.h
+++ b/paddle/fluid/platform/cuda_profiler.h
@@ -24,7 +24,11 @@ namespace platform {
 
 void CudaProfilerInit(std::string output_file, std::string output_mode,
                       std::string config_file) {
-  PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv");
+  PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv",
+                 platform::errors::InvalidArgument(
+                     "Unsupported cuda profiler output mode, expect `kvp` or "
+                     "`csv`, but received `%s`.",
+                     output_mode));
   cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
   PADDLE_ENFORCE_CUDA_SUCCESS(
       cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode));
diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc
index 1087f567245..1ef98720f83 100644
--- a/paddle/fluid/train/demo/demo_trainer.cc
+++ b/paddle/fluid/train/demo/demo_trainer.cc
@@ -29,7 +29,9 @@ namespace train {
 
 void ReadBinaryFile(const std::string& filename, std::string* contents) {
   std::ifstream fin(filename, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
+  PADDLE_ENFORCE_EQ(
+      fin.is_open(), true,
+      platform::errors::Unavailable("Failed to open file %s.", filename));
   fin.seekg(0, std::ios::end);
   contents->clear();
   contents->resize(fin.tellg());
@@ -70,7 +72,8 @@ int main() {
     }
   }
 
-  PADDLE_ENFORCE_NE(loss_name, "", "loss not found");
+  PADDLE_ENFORCE_NE(loss_name, "",
+                    platform::errors::NotFound("Loss name is not found."));
 
   // init all parameters
   executor.Run(*startup_program, &scope, 0);
diff --git a/paddle/fluid/train/imdb_demo/demo_trainer.cc b/paddle/fluid/train/imdb_demo/demo_trainer.cc
index d45edd563f0..a08069a57ca 100644
--- a/paddle/fluid/train/imdb_demo/demo_trainer.cc
+++ b/paddle/fluid/train/imdb_demo/demo_trainer.cc
@@ -45,7 +45,9 @@ namespace train {
 
 void ReadBinaryFile(const std::string& filename, std::string* contents) {
   std::ifstream fin(filename, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
+  PADDLE_ENFORCE_EQ(
+      fin.is_open(), true,
+      platform::errors::Unavailable("Failed to open file %s.", filename));
   fin.seekg(0, std::ios::end);
   contents->clear();
   contents->resize(fin.tellg());
@@ -98,7 +100,11 @@ int main(int argc, char* argv[]) {
       file_vec.push_back(filename);
     }
   }
-  PADDLE_ENFORCE_GE(file_vec.size(), 1, "At least one file to train");
+  PADDLE_ENFORCE_GE(
+      file_vec.size(), 1,
+      platform::errors::InvalidArgument(
+          "At least one file to train, but received number of file is %d.",
+          file_vec.size()));
   paddle::framework::InitDevices(false);
   const auto cpu_place = paddle::platform::CPUPlace();
   paddle::framework::Executor executor(cpu_place);
@@ -148,7 +154,9 @@ int main(int argc, char* argv[]) {
     const std::vector<paddle::framework::DataFeed*> readers =
         dataset_ptr->GetReaders();
     PADDLE_ENFORCE_EQ(readers.size(), 1,
-                      "readers num should be equal to thread num");
+                      platform::errors::InvalidArgument(
+                          "Readers num(%d) should be equal to thread num(1).",
+                          readers.size()));
     readers[0]->SetPlace(paddle::platform::CPUPlace());
     const std::vector<std::string>& input_feed_names =
         readers[0]->GetUseSlotAlias();
diff --git a/paddle/fluid/train/test_train_recognize_digits.cc b/paddle/fluid/train/test_train_recognize_digits.cc
index 45c438e8925..e7b698e1a34 100644
--- a/paddle/fluid/train/test_train_recognize_digits.cc
+++ b/paddle/fluid/train/test_train_recognize_digits.cc
@@ -51,7 +51,8 @@ void Train() {
     }
   }
 
-  PADDLE_ENFORCE_NE(loss_name, "", "loss not found");
+  PADDLE_ENFORCE_NE(loss_name, "",
+                    platform::errors::NotFound("Loss name is not found."));
 
   // prepare data
   auto x_var = scope.Var("img");
diff --git a/tools/enforce/count_enforce_by_file.sh b/tools/enforce/count_enforce_by_file.sh
index 1858bd0fd17..c1e2903c092 100644
--- a/tools/enforce/count_enforce_by_file.sh
+++ b/tools/enforce/count_enforce_by_file.sh
@@ -57,7 +57,14 @@ FILE_WHITE_LIST="\
     random_crop_op.h \
     elementwise_op_function.cu.h \
     fused_elemwise_activation_op.cc \
-    auc_op.cu"
+    auc_op.cu \
+    unsqueeze_op.h \
+    unsqueeze_op.cc \
+    enforce.h \
+    errors_test.cc \
+    cross_entropy.cu \
+    cross_entropy.h \
+    unpooling.cu"
 
 function count_file_recursively(){
     dir_name=$1
-- 
GitLab


From 5034d181f382eb3bd6d4676bc587002fd158f77b Mon Sep 17 00:00:00 2001
From: Wilber <jiweibo@baidu.com>
Date: Wed, 23 Sep 2020 10:31:49 +0800
Subject: [PATCH 178/261] update for 2.0 inference api. (#27473)

---
 paddle/fluid/inference/api/analysis_predictor.cc |  1 +
 paddle/fluid/inference/api/api_impl.cc           |  1 +
 python/paddle/inference/__init__.py              | 16 ++++++++++++++++
 python/setup.py.in                               |  1 +
 4 files changed, 19 insertions(+)
 create mode 100644 python/paddle/inference/__init__.py

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index ac914700643..42e62011f84 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1048,6 +1048,7 @@ void AnalysisPredictor::SaveOptimModel(const std::string &dir) {
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<AnalysisConfig>(
     const AnalysisConfig &config) {
+  LOG(WARNING) << "Deprecated. Please use CreatePredictor instead.";
   return CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
       config);
 }
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index ca0a5148f06..c78cdf24dec 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -373,6 +373,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<NativeConfig>(
     const NativeConfig &config) {
+  LOG(WARNING) << "Deprecated. Please use CreatePredictor instead.";
   return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
 }
 
diff --git a/python/paddle/inference/__init__.py b/python/paddle/inference/__init__.py
new file mode 100644
index 00000000000..c388301ec34
--- /dev/null
+++ b/python/paddle/inference/__init__.py
@@ -0,0 +1,16 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..fluid.inference import Config, DataType, PlaceType, PrecisionType, Tensor, \
+    Predictor, create_predictor, get_version, get_num_bytes_of_data_type, PredictorPool
diff --git a/python/setup.py.in b/python/setup.py.in
index d85a23a5edd..467c5cb8677 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -155,6 +155,7 @@ packages=['paddle',
           'paddle.distributed.fleet.utils',
           'paddle.framework',
           'paddle.jit',
+          'paddle.inference',
           'paddle.fluid',
           'paddle.fluid.inference',
           'paddle.fluid.dygraph',
-- 
GitLab


From 906e7f921e6df8d6376902aa581e9c2f03fdc0dc Mon Sep 17 00:00:00 2001
From: Zhang Ting <zhangting_2017@163.com>
Date: Wed, 23 Sep 2020 10:34:02 +0800
Subject: [PATCH 179/261] add fuse_bn_act op (#27230)

* add fused_bn_add_relu op
---
 cmake/operators.cmake                         |   3 +-
 paddle/fluid/operators/fused/CMakeLists.txt   |   8 +-
 .../fused/fused_bn_add_activation_op.cc       | 255 +++++++++++++
 .../fused/fused_bn_add_activation_op.cu       | 338 ++++++++++++++++++
 .../fused/fused_bn_add_activation_op.h        | 106 ++++++
 python/paddle/fluid/contrib/layers/nn.py      | 191 +++++++++-
 .../contrib/mixed_precision/fp16_lists.py     |   1 +
 .../contrib/mixed_precision/fp16_utils.py     |   9 +-
 .../tests/unittests/test_fused_bn_add_act.py  | 215 +++++++++++
 9 files changed, 1120 insertions(+), 6 deletions(-)
 create mode 100644 paddle/fluid/operators/fused/fused_bn_add_activation_op.cc
 create mode 100644 paddle/fluid/operators/fused/fused_bn_add_activation_op.cu
 create mode 100644 paddle/fluid/operators/fused/fused_bn_add_activation_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index aea972ab3db..21080fbe8fd 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -127,7 +127,8 @@ function(op_library TARGET)
 "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
 "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
 "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
-"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op")
+"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
+"fused_bn_add_activation_op")
         if ("${TARGET}" STREQUAL "${manual_pybind_op}")
             set(pybind_flag 1)
         endif()
diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt
index 3fc5f3bfc6b..477a9162fe3 100644
--- a/paddle/fluid/operators/fused/CMakeLists.txt
+++ b/paddle/fluid/operators/fused/CMakeLists.txt
@@ -8,7 +8,8 @@ register_operators(EXCLUDES
     multihead_matmul_op
     fused_embedding_eltwise_layernorm_op
     fusion_group_op
-    fusion_gru_op)
+    fusion_gru_op
+    fused_bn_add_activation_op)
 
 # fusion_gru_op does not have CUDA kernel
 op_library(fusion_gru_op)
@@ -47,4 +48,9 @@ if (WITH_GPU)
         file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_group);\n")
         cc_test(test_fusion_group_op SRCS fusion_group_op_test.cc DEPS fusion_group_op)
     endif()
+    # fused_bn_add_activation
+    if (NOT ${CUDNN_VERSION} VERSION_LESS 7401)
+    op_library(fused_bn_add_activation_op)
+    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_bn_add_activation);\n")
+    endif()
 endif()
diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc
new file mode 100644
index 00000000000..5b3ed03bb64
--- /dev/null
+++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc
@@ -0,0 +1,255 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h"
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+
+void FusedBatchNormAddActOp::InferShape(
+    framework::InferShapeContext *ctx) const {
+  OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedBatchNormAddActOp");
+  OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z", "FusedBatchNormAddActOp");
+  OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale",
+                 "FusedBatchNormAddActOp");
+  OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias",
+                 "FusedBatchNormAddActOp");
+
+  // check output
+  OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "FusedBatchNormAddActOp");
+  OP_INOUT_CHECK(ctx->HasOutput("MeanOut"), "Output", "MeanOut",
+                 "FusedBatchNormAddActOp");
+  OP_INOUT_CHECK(ctx->HasOutput("VarianceOut"), "Output", "VarianceOut",
+                 "FusedBatchNormAddActOp");
+  OP_INOUT_CHECK(ctx->HasOutput("SavedMean"), "Output", "SavedMean",
+                 "FusedBatchNormAddActOp");
+  OP_INOUT_CHECK(ctx->HasOutput("SavedVariance"), "Output", "SavedVariance",
+                 "FusedBatchNormAddActOp");
+
+  const auto x_dims = ctx->GetInputDim("X");
+  const auto z_dims = ctx->GetInputDim("Z");
+  PADDLE_ENFORCE_EQ(x_dims, z_dims,
+                    platform::errors::InvalidArgument(
+                        "ShapeError: the shapes of input "
+                        "must be equal. But received: the shape "
+                        "of input X = [%s], and the shape of "
+                        "input Y = [%s]",
+                        x_dims, z_dims));
+  PADDLE_ENFORCE_GE(x_dims.size(), 2, platform::errors::InvalidArgument(
+                                          "ShapeError: the dimensions of input "
+                                          "must greater than or equal to 2."
+                                          "But received: the shape of input "
+                                          "= [%s], the dimension of input = "
+                                          "[%d]",
+                                          x_dims, x_dims.size()));
+  PADDLE_ENFORCE_LE(x_dims.size(), 5, platform::errors::InvalidArgument(
+                                          "ShapeError: the dimensions of input "
+                                          "must smaller than or equal to 5."
+                                          "But received: the shape of input "
+                                          "= [%s], the dimension of input = "
+                                          "[%d]",
+                                          x_dims, x_dims.size()));
+
+  const int64_t C = x_dims[x_dims.size() - 1];
+
+  auto scale_dim = ctx->GetInputDim("Scale");
+  auto bias_dim = ctx->GetInputDim("Bias");
+
+  PADDLE_ENFORCE_EQ(
+      scale_dim.size(), 1UL,
+      platform::errors::InvalidArgument(
+          "ShapeError: the dimension of scale must equal to 1."
+          "But received: the shape of scale is [%s], the dimension "
+          "of scale is [%d]",
+          scale_dim, scale_dim.size()));
+  PADDLE_ENFORCE_EQ(bias_dim.size(), 1UL,
+                    platform::errors::InvalidArgument(
+                        "ShapeError: the dimension of bias must equal to 1."
+                        "But received: the shape of bias is [%s],the dimension "
+                        "of bias is [%d]",
+                        bias_dim, bias_dim.size()));
+
+  bool check = true;
+  if ((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 ||
+                              framework::product(bias_dim) <= 0)) {
+    check = false;
+  }
+
+  if (check) {
+    PADDLE_ENFORCE_EQ(scale_dim[0], C,
+                      platform::errors::InvalidArgument(
+                          "ShapeError: the shape of scale must equal to [%d]"
+                          "But received: the shape of scale is [%d]",
+                          C, scale_dim[0]));
+    PADDLE_ENFORCE_EQ(bias_dim[0], C,
+                      platform::errors::InvalidArgument(
+                          "ShapeError: the shape of bias must equal to [%d]"
+                          "But received: the shape of bias is [%d]",
+                          C, bias_dim[0]));
+  }
+  ctx->SetOutputDim("Y", x_dims);
+  ctx->SetOutputDim("MeanOut", {C});
+  ctx->SetOutputDim("VarianceOut", {C});
+  ctx->SetOutputDim("SavedMean", {C});
+  ctx->SetOutputDim("SavedVariance", {C});
+  ctx->ShareLoD("X", "Y");
+}
+
+framework::OpKernelType FusedBatchNormAddActOp::GetExpectedKernelType(
+    const framework::ExecutionContext &ctx) const {
+  auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+  // By default, the type of the scale, bias, mean,
+  // and var tensors should be float when input tensor's dtype is float16.
+  auto bn_param_type = framework::proto::VarType::FP32;
+
+  PADDLE_ENFORCE_EQ(
+      bn_param_type, ctx.Input<Tensor>("Scale")->type(),
+      platform::errors::InvalidArgument("Scale input should be of float type"));
+  PADDLE_ENFORCE_EQ(
+      bn_param_type, ctx.Input<Tensor>("Bias")->type(),
+      platform::errors::InvalidArgument("Bias input should be of float type"));
+
+  framework::LibraryType library = framework::LibraryType::kPlain;
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+
+  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
+                                 library);
+}
+
+void FusedBatchNormAddActOpMaker::Make() {
+  AddInput("X", "The input tensor");
+  AddInput("Z", "The input tensor");
+  AddInput("Scale",
+           "Scale is a 1-dimensional tensor of size C "
+           "that is applied to the output");
+  AddInput("Bias",
+           "Bias is a 1-dimensional tensor of size C "
+           "that is applied to the output");
+  AddOutput("Y", "result after normalization");
+  AddOutput("MeanOut",
+            "Share memory with Mean. "
+            "Store the global mean when training");
+  AddOutput("VarianceOut",
+            "Share memory with Variance. "
+            "Store the global Variance when training");
+  AddOutput("SavedMean",
+            "Mean of the current mini batch, "
+            "will apply to output when training")
+      .AsIntermediate();
+  AddOutput("SavedVariance",
+            "Variance of the current mini batch, "
+            "will apply to output when training")
+      .AsIntermediate();
+  AddOutput("ReserveSpace",
+            "Reserve GPU space for triggering the new semi-persistent "
+            "NHWC kernel");
+  AddAttr<float>("momentum", "").SetDefault(0.9);
+  AddAttr<float>("epsilon", "")
+      .SetDefault(1e-5)
+      .AddCustomChecker([](const float &epsilon) {
+        PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true,
+                          platform::errors::InvalidArgument(
+                              "'epsilon' should be between 0.0 and 0.001."));
+      });
+  AddAttr<std::string>("act_type", "The activation type to be fused.")
+      .SetDefault("relu");
+  AddComment(R"DOC(
+Fused Batch Normalization with activation.
+
+Batch Norm has been implemented as discussed in the paper:
+https://arxiv.org/pdf/1502.03167.pdf
+Batch Norm can be used as a normalizer function for conv2d and fully_connected operations.
+Now, the required data format for FusedBatchNormAddActOp is NHWC `[batch, in_height, in_width, in_channels]`.
+
+)DOC");
+}
+
+void FusedBatchNormAddActGradOp::InferShape(
+    framework::InferShapeContext *ctx) const {
+  // check input
+  OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X",
+                 "FusedBatchNormAddActGradOp");
+  OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z",
+                 "FusedBatchNormAddActGradOp");
+  OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale",
+                 "FusedBatchNormAddActGradOp");
+  OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean",
+                 "FusedBatchNormAddActGradOp");
+  OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance",
+                 "FusedBatchNormAddActGradOp");
+  OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input",
+                 framework::GradVarName("Y"), "FusedBatchNormAddActGradOp");
+
+  // check output
+  OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
+                 framework::GradVarName("X"), "FusedBatchNormAddActGradOp");
+  OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Z")), "Output",
+                 framework::GradVarName("Z"), "FusedBatchNormAddActGradOp");
+  OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Scale")), "Output",
+                 framework::GradVarName("Scale"), "FusedBatchNormAddActGradOp");
+  OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Bias")), "Output",
+                 framework::GradVarName("Bias"), "FusedBatchNormAddActGradOp");
+
+  const auto in_dims = ctx->GetInputDim("X");
+  const int C = in_dims[in_dims.size() - 1];
+
+  ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
+  ctx->SetOutputDim(framework::GradVarName("Z"), in_dims);
+  ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
+  ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
+}
+
+framework::OpKernelType FusedBatchNormAddActGradOp::GetExpectedKernelType(
+    const framework::ExecutionContext &ctx) const {
+  const auto *var = ctx.InputVar(framework::GradVarName("Y"));
+  if (var == nullptr) {
+    PADDLE_THROW(platform::errors::NotFound(
+        "Can not find Y@GRAD in the execution context."));
+  }
+  const Tensor *t = nullptr;
+  if (var->IsType<Tensor>()) {
+    t = &var->Get<Tensor>();
+  } else if (var->IsType<LoDTensor>()) {
+    t = &var->Get<LoDTensor>();
+  }
+  if (t == nullptr) {
+    PADDLE_THROW(
+        platform::errors::NotFound("Can not get the tensor value of Y@GRAD."));
+  }
+
+  framework::LibraryType library = framework::LibraryType::kPlain;
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+
+  return framework::OpKernelType(
+      OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(), layout,
+      library);
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(
+    fused_bn_add_activation, ops::FusedBatchNormAddActOp,
+    ops::FusedBatchNormAddActOpMaker, ops::FusedBatchNormAddActOpInferVarType,
+    ops::FusedBatchNormAddActGradOpMaker<paddle::framework::OpDesc>,
+    ops::FusedBatchNormAddActGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(fused_bn_add_activation_grad,
+                  ops::FusedBatchNormAddActGradOp);
diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu
new file mode 100644
index 00000000000..7f1d297cda3
--- /dev/null
+++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu
@@ -0,0 +1,338 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cfloat>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/norm_utils.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
+#include "paddle/fluid/platform/float16.h"
+
+DECLARE_bool(cudnn_batchnorm_spatial_persistent);
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+template <typename T>
+using CudnnDataType = platform::CudnnDataType<T>;
+template <typename T>
+using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
+
+template <typename T>
+class FusedBatchNormAddActKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::PreconditionNotMet("It must use CUDAPlace."));
+    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
+    float momentum = ctx.Attr<float>("momentum");
+    std::string act_type = ctx.Attr<std::string>("act_type");
+
+    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
+      LOG(ERROR) << "Provided epsilon is smaller than "
+                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
+                 << "CUDNN_BN_MIN_EPSILON instead.";
+    }
+    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
+
+    // Get the size for each dimension.
+    // NHWC [batch_size, in_height, in_width, in_channels]
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto *z = ctx.Input<Tensor>("Z");
+    const auto &in_dims = x->dims();
+
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *bias = ctx.Input<Tensor>("Bias");
+
+    auto *mean_out = ctx.Output<Tensor>("MeanOut");
+    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
+    mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+
+    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
+    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
+    saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+
+    auto *y = ctx.Output<Tensor>("Y");
+    y->mutable_data<T>(ctx.GetPlace());
+
+    int N, C, H, W, D;
+    const DataLayout data_layout = DataLayout::kNHWC;
+    ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D);
+
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+
+    // ------------------- cudnn descriptors ---------------------
+    auto handle = dev_ctx.cudnn_handle();
+    cudnnTensorDescriptor_t data_desc_;
+    cudnnTensorDescriptor_t bn_param_desc_;
+    cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
+
+    std::vector<int> dims = {N, C, H, W, D};
+    std::vector<int> strides = {H * W * D * C, 1, W * D * C, D * C, C};
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+        data_desc_, CudnnDataType<T>::type,
+        in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data()));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
+                                                         data_desc_, mode_));
+
+    double this_factor = 1. - momentum;
+    cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION;
+    platform::ScopedActivationDescriptor scope_act_desc;
+    cudnnActivationDescriptor_t activation_desc_ =
+        scope_act_desc.descriptor<T>(act_type);
+    size_t workspace_size = 0;
+    size_t reserve_space_size = 0;
+    void *reserve_space_ptr = nullptr;
+    void *workspace_ptr = nullptr;
+    Tensor workspace_tensor;
+    // Create reserve space and workspace for batch norm.
+    // Create tensor for each batchnorm op, it will be used in the
+    // backward. Thus this tensor shouldn't be temp.
+    auto *reserve_space = ctx.Output<Tensor>("ReserveSpace");
+    PADDLE_ENFORCE_NOT_NULL(
+        reserve_space,
+        platform::errors::NotFound(
+            "The argument ReserveSpace of batch_norm op is not found."));
+
+    // --------------- cudnn batchnorm workspace ---------------
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::
+            cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
+                /*handle=*/handle,
+                /*mode=*/mode_,
+                /*bnOps=*/bnOps_,
+                /*xDesc=*/data_desc_,
+                /*zDesc=*/data_desc_,
+                /*yDesc=*/data_desc_,
+                /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
+                /*activationDesc=*/activation_desc_,
+                /*sizeInBytes=*/&workspace_size));
+
+    // -------------- cudnn batchnorm reserve space --------------
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
+            /*handle=*/handle,
+            /*mode=*/mode_,
+            /*bnOps=*/bnOps_,
+            /*activationDesc=*/activation_desc_,
+            /*xDesc=*/data_desc_,
+            /*sizeInBytes=*/&reserve_space_size));
+
+    reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(),
+                                                    reserve_space_size);
+    workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(),
+                                                  workspace_size);
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
+            handle, mode_, bnOps_, CudnnDataType<T>::kOne(),
+            CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
+            data_desc_, z->template data<T>(), data_desc_,
+            y->template data<T>(), bn_param_desc_,
+            scale->template data<BatchNormParamType<T>>(),
+            bias->template data<BatchNormParamType<T>>(), this_factor,
+            mean_out->template mutable_data<BatchNormParamType<T>>(
+                ctx.GetPlace()),
+            variance_out->template mutable_data<BatchNormParamType<T>>(
+                ctx.GetPlace()),
+            epsilon, saved_mean->template mutable_data<BatchNormParamType<T>>(
+                         ctx.GetPlace()),
+            saved_variance->template mutable_data<BatchNormParamType<T>>(
+                ctx.GetPlace()),
+            activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr,
+            reserve_space_size));
+
+    // clean when exit.
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
+  }
+};
+
+template <typename T>
+class FusedBatchNormAddActGradKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::PreconditionNotMet("It must use CUDAPlace."));
+    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
+    std::string act_type = ctx.Attr<std::string>("act_type");
+
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto *z = ctx.Input<Tensor>("Z");
+    const auto *y = ctx.Input<Tensor>("Y");
+    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *bias = ctx.Input<Tensor>("Bias");
+    const auto *reserve_space = ctx.Input<Tensor>("ReserveSpace");
+
+    const auto &in_dims = x->dims();
+
+    int N, C, H, W, D;
+    const DataLayout data_layout = DataLayout::kNHWC;
+    ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D);
+
+    // init output
+    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *d_z = ctx.Output<Tensor>(framework::GradVarName("Z"));
+    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    d_x->mutable_data<T>(ctx.GetPlace());
+    d_z->mutable_data<T>(ctx.GetPlace());
+    PADDLE_ENFORCE_EQ(
+        d_scale && d_bias, true,
+        platform::errors::PreconditionNotMet(
+            "Both the scale grad and the bias grad must not be null."));
+    d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL,
+                      platform::errors::PreconditionNotMet(
+                          "The scale only has one dimension."));
+    PADDLE_ENFORCE_EQ(
+        scale->dims()[0], C,
+        platform::errors::PreconditionNotMet(
+            "The size of scale is equal to the channel of Input(X)."));
+
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+
+    std::vector<int> dims = {N, C, H, W, D};
+    std::vector<int> strides = {H * W * C * D, 1, W * D * C, D * C, C};
+    // ------------------- cudnn descriptors ---------------------
+    cudnnTensorDescriptor_t data_desc_;
+    cudnnTensorDescriptor_t bn_param_desc_;
+    cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
+    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
+      LOG(ERROR) << "Provided epsilon is smaller than "
+                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
+                 << "CUDNN_BN_MIN_EPSILON instead.";
+    }
+    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
+        data_desc_, CudnnDataType<T>::type,
+        in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data()));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
+                                                         data_desc_, mode_));
+
+    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
+    const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
+    const auto *saved_mean_data =
+        saved_mean->template data<BatchNormParamType<T>>();
+    const auto *saved_var_data =
+        saved_var->template data<BatchNormParamType<T>>();
+
+    size_t workspace_size = 0;
+    void *workspace_ptr = nullptr;
+    Tensor workspace_tensor;
+    auto reserve_space_size = reserve_space->memory_size();
+    cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION;
+    platform::ScopedActivationDescriptor scope_act_desc;
+    cudnnActivationDescriptor_t activation_desc_ =
+        scope_act_desc.descriptor<T>(act_type);
+    // --------------- cudnn batchnorm workspace ---------------
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize(
+            /*handle=*/dev_ctx.cudnn_handle(),
+            /*mode=*/mode_,
+            /*bnOps=*/bnOps_,
+            /*xDesc=*/data_desc_,
+            /*yDesc=*/data_desc_,
+            /*dyDesc=*/data_desc_,
+            /*dzDesc=*/data_desc_,
+            /*dxDesc=*/data_desc_,
+            /*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
+            /*activationDesc=*/activation_desc_,
+            /*sizeInBytes=*/&workspace_size));
+
+    workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(),
+                                                  workspace_size);
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnBatchNormalizationBackwardEx(
+            /*handle=*/dev_ctx.cudnn_handle(),
+            /*mode=*/mode_,
+            /*bnOps=*/bnOps_,
+            /*alphaDataDiff=*/CudnnDataType<T>::kOne(),
+            /*betaDataDiff=*/CudnnDataType<T>::kZero(),
+            /*alphaParamDiff=*/CudnnDataType<T>::kOne(),
+            /*betaParamDiff=*/CudnnDataType<T>::kZero(),
+            /*xDesc=*/data_desc_,
+            /*xData=*/x->template data<T>(),
+            /*yDesc=*/data_desc_,
+            /*yData=*/y->template data<T>(),
+            /*dyDesc=*/data_desc_,
+            /*dyData=*/d_y->template data<T>(),
+            /*dzDesc=*/data_desc_,
+            /*dzData=*/d_z->template data<T>(),
+            /*dxDesc=*/data_desc_,
+            /*dxData=*/d_x->template data<T>(),
+            /*dBnScaleBiasDesc=*/bn_param_desc_,
+            /*bnScaleData=*/scale->template data<BatchNormParamType<T>>(),
+            /*bnBiasData=*/bias->template data<BatchNormParamType<T>>(),
+            /*dBnScaleData=*/d_scale->template data<BatchNormParamType<T>>(),
+            /*dBnBiasData=*/d_bias->template data<BatchNormParamType<T>>(),
+            /*epsilon=*/epsilon,
+            /*savedMean=*/saved_mean_data,
+            /*savedInvVariance=*/saved_var_data,
+            /*activationDesmc=*/activation_desc_,
+            /*workspace=*/workspace_ptr,
+            /*workSpaceSizeInBytes=*/workspace_size,
+            /*reserveSpace=*/const_cast<T *>(reserve_space->template data<T>()),
+            /*reserveSpaceSizeInBytes=*/reserve_space_size));
+
+    // clean when exit.
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#if CUDNN_VERSION >= 7401
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_CUDA_KERNEL(
+    fused_bn_add_activation,
+    ops::FusedBatchNormAddActKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(fused_bn_add_activation_grad,
+                        ops::FusedBatchNormAddActGradKernel<
+                            plat::CUDADeviceContext, plat::float16>);
+#endif
diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h
new file mode 100644
index 00000000000..5c7df96e60d
--- /dev/null
+++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h
@@ -0,0 +1,106 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/framework/grad_op_desc_maker.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/var_type_inference.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+
+class FusedBatchNormAddActOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+class FusedBatchNormAddActGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+class FusedBatchNormAddActOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override;
+};
+
+template <typename T>
+class FusedBatchNormAddActGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType(this->ForwardOpType() + "_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("Z", this->Input("Z"));
+    op->SetInput("Y", this->Output("Y"));
+    op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
+
+    op->SetInput("Scale", this->Input("Scale"));
+    op->SetInput("Bias", this->Input("Bias"));
+    op->SetInput("SavedMean", this->Output("SavedMean"));
+    op->SetInput("SavedVariance", this->Output("SavedVariance"));
+    op->SetInput("ReserveSpace", this->Output("ReserveSpace"));
+
+    op->SetAttrMap(this->Attrs());
+
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Z"), this->InputGrad("Z"));
+    op->SetOutput(framework::GradVarName("Scale"), this->InputGrad("Scale"));
+    op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
+  }
+};
+
+class FusedBatchNormAddActOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string>& GetInputOutputWithSameType()
+      const override {
+    static std::unordered_map<std::string, std::string> m{{"X", /*->*/ "Y"}};
+    return m;
+  }
+};
+
+template <typename DeviceContext, typename T>
+class FusedBatchNormAddActKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override;
+};
+
+template <typename DeviceContext, typename T>
+class FusedBatchNormAddActGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override;
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py
index 7b564b3f837..ac6493b1c29 100644
--- a/python/paddle/fluid/contrib/layers/nn.py
+++ b/python/paddle/fluid/contrib/layers/nn.py
@@ -45,6 +45,7 @@ from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer
 from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype
 
 from paddle.fluid import core
+from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.entry_attr import ProbabilityEntry, CountFilterEntry
 
 from paddle.fluid.framework import Variable, convert_np_dtype_to_dtype_
@@ -57,7 +58,7 @@ __all__ = [
     'multiclass_nms2', 'search_pyramid_hash', 'shuffle_batch', 'partial_concat',
     'sparse_embedding', 'partial_sum', 'tdm_child', 'rank_attention',
     'tdm_sampler', 'batch_fc', '_pull_box_extended_sparse', 'bilateral_slice',
-    'correlation'
+    'correlation', 'fused_bn_add_act'
 ]
 
 
@@ -1625,3 +1626,191 @@ def correlation(x,
             },
             outputs={"Output": output})
     return output
+
+
+def fused_bn_add_act(x,
+                     y,
+                     momentum=0.9,
+                     epsilon=1e-05,
+                     param_attr=None,
+                     bias_attr=None,
+                     moving_mean_name=None,
+                     moving_variance_name=None,
+                     act=None,
+                     name=None):
+    """
+    This Op performs batch norm on input x, and adds the result to input y. Then
+    it performs activation on the sum. The data format of inputs must be NHWC
+    `[batch, in_height, in_width, in_channels]`.
+
+    Args:
+        x(Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type
+            is float16.
+        y(Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type
+            is float16.
+        momentum(float|Tensor, optional): The value used for the moving_mean and
+            moving_var computation. This should be a float number or a tensor with
+            shape [1] and data type as float32. The updated formula is:
+            :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
+            :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
+            Default is 0.9.
+        epsilon(float, optional): A value added to the denominator for
+            numerical stability. Default is 1e-5.
+        param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale`
+            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
+	        will create ParamAttr as param_attr, the name of scale can be set in ParamAttr.
+	        If the Initializer of the param_attr is not set, the parameter is initialized
+	        with Xavier. Default: None.
+        bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm.
+            If it is set to None or one attribute of ParamAttr, batch_norm
+	        will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr.
+	        If the Initializer of the bias_attr is not set, the bias is initialized zero.
+	        Default: None.
+        moving_mean_name(str, optional): The name of moving_mean which store the global Mean. If it
+            is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm
+            will save global mean with the string.
+        moving_variance_name(str, optional): The name of the moving_variance which store the global Variance.
+            If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm
+            will save global variance with the string.
+        act(string, optional): Activation type, linear|relu|prelu|...
+        name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`.
+            Usually name is no need to set and None by default.
+
+    Examples:
+            .. code-block:: python
+
+            import paddle.fluid as fluid
+
+            def build_program(main_program, startup_program):
+                with fluid.program_guard(main_program, startup_program):
+                    x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
+                    y = fluid.layers.data(name="y", shape=[1], dtype='int64')
+                    conv1_1 = fluid.layers.conv2d(
+                        input=x,
+                        filter_size=3,
+                        num_filters=32,
+                        stride=1,
+                        padding=1,
+                        act=None,
+                        bias_attr=False,
+                        data_format='NHWC')
+                    conv1_2 = fluid.layers.conv2d(
+                        input=x,
+                        filter_size=3,
+                        num_filters=32,
+                        stride=1,
+                        padding=1,
+                        act=None,
+                        bias_attr=False,
+                        data_format='NHWC')
+                    bn = fluid.layers.batch_norm(
+                        input=conv1_1,
+                        act=None,
+                        data_layout='NHWC')
+                    fused_bn_add_act = fluid.contrib.layers.fused_bn_add_act(conv1_2, bn)
+                    prediction = fluid.layers.fc(input=fused_bn_add_act, size=10, act='softmax')
+                    loss = fluid.layers.cross_entropy(input=prediction, label=y)
+                    loss = fluid.layers.mean(loss)
+                    sgd = fluid.optimizer.SGD(learning_rate=0.001)
+                    sgd = fluid.contrib.mixed_precision.decorate(
+                        sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0)
+                    sgd.minimize(loss)
+
+                return x, y, loss
+
+            iters = 5
+            batch_size = 16
+            support_gpu = fluid.is_compiled_with_cuda()
+            if support_gpu:
+                main_program = fluid.Program()
+                startup_program = fluid.Program()
+                place = fluid.CUDAPlace(0)
+                x, y, loss = build_program(main_program, startup_program)
+  
+                feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
+                train_reader = paddle.batch(
+                    paddle.dataset.mnist.train(), batch_size=batch_size)
+                exe = fluid.Executor(place)
+                scope = fluid.Scope()
+                with fluid.scope_guard(scope):
+                    exe.run(startup_program)
+                    for _ in range(iters):
+                        data = next(train_reader())
+                        loss_v = exe.run(main_program, feed=feeder.feed(data), fetch_list=[loss])
+    """
+    helper = LayerHelper('fused_bn_add_act', **locals())
+
+    check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
+                             'fused_bn_add_act')
+    check_variable_and_dtype(y, 'input', ['float16', 'float32', 'float64'],
+                             'fused_bn_add_act')
+    bn_param_dtype = core.VarDesc.VarType.FP32
+
+    x_shape = x.shape
+    channel_num = x_shape[-1]
+    param_shape = [channel_num]
+
+    # create parameter
+    scale = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=param_shape,
+        dtype=bn_param_dtype,
+        default_initializer=Constant(1.0))
+    bias = helper.create_parameter(
+        attr=helper.bias_attr,
+        shape=param_shape,
+        dtype=bn_param_dtype,
+        is_bias=True)
+    mean = helper.create_parameter(
+        attr=ParamAttr(
+            name=moving_mean_name, initializer=Constant(0.0), trainable=False),
+        shape=param_shape,
+        dtype=bn_param_dtype)
+    mean.stop_gradient = True
+    variance = helper.create_parameter(
+        attr=ParamAttr(
+            name=moving_variance_name,
+            initializer=Constant(1.0),
+            trainable=False),
+        shape=param_shape,
+        dtype=bn_param_dtype)
+    variance.stop_gradient = True
+
+    # create output
+    # mean and mean_out share the same memory
+    mean_out = mean
+    # variance and variance out share the same memory
+    variance_out = variance
+    saved_mean = helper.create_variable_for_type_inference(
+        dtype=bn_param_dtype, stop_gradient=True)
+    saved_variance = helper.create_variable_for_type_inference(
+        dtype=bn_param_dtype, stop_gradient=True)
+    reserve_space = helper.create_variable_for_type_inference(
+        dtype=core.VarDesc.VarType.FP16, stop_gradient=True)
+    batch_norm_out = helper.create_variable_for_type_inference(
+        core.VarDesc.VarType.FP16)
+
+    inputs = {
+        "X": x,
+        "Z": y,
+        "Scale": scale,
+        "Bias": bias,
+    }
+    attrs = {"epsilon": epsilon, 'momentum': momentum}
+
+    outputs = {
+        "Y": batch_norm_out,
+        "MeanOut": mean_out,
+        "VarianceOut": variance_out,
+        "SavedMean": saved_mean,
+        "SavedVariance": saved_variance,
+        "ReserveSpace": reserve_space
+    }
+
+    helper.append_op(
+        type="fused_bn_add_activation",
+        inputs=inputs,
+        outputs=outputs,
+        attrs=attrs)
+
+    return batch_norm_out
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
index 1f301b7148d..a9f080c514d 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
@@ -135,6 +135,7 @@ gray_list = {
     'get_tensor_from_selected_rows',
     'sign',
     'cast',
+    'fused_bn_add_activation',
 }
 '''
 # The set of ops that don't support fp16 calculation
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
index 0b142ff33de..0ff166d8dc8 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
@@ -69,8 +69,10 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
     ]
 
     for in_name in op.input_names:
-        if src_dtype == core.VarDesc.VarType.FP32 and op.type == 'batch_norm':
-            if in_name != 'X':
+        if src_dtype == core.VarDesc.VarType.FP32 and op.type in [
+                'batch_norm', 'fused_bn_add_activation'
+        ]:
+            if in_name not in {'X', 'Z'}:
                 continue
         for in_var_name in op.input(in_name):
             in_var = block.var(in_var_name)
@@ -102,7 +104,8 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
                     op._set_attr('in_dtype', dest_dtype)
     if src_dtype == core.VarDesc.VarType.FP32 and dest_dtype == core.VarDesc.VarType.FP16:
         for out_name in op.output_names:
-            if op.type == 'batch_norm' and out_name != 'Y':
+            if op.type in ['batch_norm', 'fused_bn_add_activation'
+                           ] and out_name != 'Y':
                 continue
             for out_var_name in op.output(out_name):
                 out_var = block.var(out_var_name)
diff --git a/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py b/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py
new file mode 100644
index 00000000000..1bc305cd1f4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py
@@ -0,0 +1,215 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "Paddle core is not compiled with CUDA")
+class TestFusedBnAddActAPI(unittest.TestCase):
+    def setUp(self):
+        self.conv_param_attr1 = fluid.ParamAttr(
+            name='conv2d_1.weight',
+            initializer=fluid.initializer.Xavier(uniform=False),
+            learning_rate=0.001)
+        self.conv_param_attr2 = fluid.ParamAttr(
+            name='conv2d_2.weight',
+            initializer=fluid.initializer.Xavier(uniform=False),
+            learning_rate=0.001)
+        self.bn_param_attr1 = fluid.ParamAttr(
+            name='batch_norm_w_1',
+            initializer=fluid.initializer.Constant(value=1.0))
+        self.bn_bias_attr1 = fluid.ParamAttr(
+            name='batch_norm_b_1',
+            initializer=fluid.initializer.Constant(value=0.0))
+        self.bn_param_attr2 = fluid.ParamAttr(
+            name='batch_norm_w_2',
+            initializer=fluid.initializer.Constant(value=1.0))
+        self.bn_bias_attr2 = fluid.ParamAttr(
+            name='batch_norm_b_2',
+            initializer=fluid.initializer.Constant(value=0.0))
+        self.fc_param_attr = fluid.ParamAttr(
+            name='fc.weight',
+            initializer=fluid.initializer.Xavier(uniform=False))
+
+    def build_fused_program(self,
+                            main_program,
+                            startup_program,
+                            use_cuda,
+                            seed=1):
+        with fluid.program_guard(main_program, startup_program):
+            x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
+            y = fluid.layers.data(name="y", shape=[1], dtype='int64')
+            conv1_1 = fluid.layers.conv2d(
+                input=x,
+                filter_size=3,
+                num_filters=32,
+                stride=1,
+                padding=1,
+                act=None,
+                param_attr=self.conv_param_attr1,
+                bias_attr=False,
+                data_format='NHWC')
+            conv1_2 = fluid.layers.conv2d(
+                input=x,
+                filter_size=3,
+                num_filters=32,
+                stride=1,
+                padding=1,
+                act=None,
+                param_attr=self.conv_param_attr2,
+                bias_attr=False,
+                data_format='NHWC')
+            bn = fluid.layers.batch_norm(
+                input=conv1_1,
+                param_attr=self.bn_param_attr1,
+                bias_attr=self.bn_bias_attr1,
+                act=None,
+                data_layout='NHWC')
+            fused_bn_add_act = fluid.contrib.layers.fused_bn_add_act(
+                conv1_2,
+                bn,
+                param_attr=self.bn_param_attr2,
+                bias_attr=self.bn_bias_attr2)
+            prediction = fluid.layers.fc(input=fused_bn_add_act,
+                                         size=10,
+                                         act='softmax',
+                                         param_attr=self.fc_param_attr)
+            loss = fluid.layers.cross_entropy(input=prediction, label=y)
+            loss = fluid.layers.mean(loss)
+            sgd = fluid.optimizer.SGD(learning_rate=0.001)
+            sgd = fluid.contrib.mixed_precision.decorate(
+                sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0)
+            sgd.minimize(loss)
+
+        return x, y, loss
+
+    def build_origin_program(self,
+                             main_program,
+                             startup_program,
+                             use_cuda,
+                             seed=1):
+        with fluid.program_guard(main_program, startup_program):
+            x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
+            y = fluid.layers.data(name="y", shape=[1], dtype='int64')
+            conv1_1 = fluid.layers.conv2d(
+                input=x,
+                filter_size=3,
+                num_filters=32,
+                stride=1,
+                padding=1,
+                act=None,
+                param_attr=self.conv_param_attr1,
+                bias_attr=False,
+                data_format='NHWC')
+            conv1_2 = fluid.layers.conv2d(
+                input=x,
+                filter_size=3,
+                num_filters=32,
+                stride=1,
+                padding=1,
+                act=None,
+                param_attr=self.conv_param_attr2,
+                bias_attr=False,
+                data_format='NHWC')
+            bn1 = fluid.layers.batch_norm(
+                input=conv1_1,
+                param_attr=self.bn_param_attr1,
+                bias_attr=self.bn_bias_attr1,
+                act=None,
+                data_layout='NHWC')
+            bn2 = fluid.layers.batch_norm(
+                input=conv1_2,
+                param_attr=self.bn_param_attr2,
+                bias_attr=self.bn_bias_attr2,
+                act=None,
+                data_layout='NHWC')
+            out = bn1 + bn2
+            out = fluid.layers.relu(out)
+            prediction = fluid.layers.fc(input=out,
+                                         size=10,
+                                         act='softmax',
+                                         param_attr=self.fc_param_attr)
+            loss = fluid.layers.cross_entropy(input=prediction, label=y)
+            loss = fluid.layers.mean(loss)
+            sgd = fluid.optimizer.SGD(learning_rate=0.001)
+            sgd = fluid.contrib.mixed_precision.decorate(
+                sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0)
+            sgd.minimize(loss)
+
+        return x, y, loss
+
+    def check(self, place, use_cuda):
+        paddle.manual_seed(1)
+        paddle.framework.random._manual_program_seed(1)
+        iters = 5
+        batch_size = 16
+
+        # build_fused_program
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        x, y, loss = self.build_fused_program(main_program, startup_program,
+                                              use_cuda)
+        feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
+        train_reader = paddle.batch(
+            paddle.dataset.mnist.train(), batch_size=batch_size)
+        exe = fluid.Executor(place)
+        loss_vals_fused = []
+        scope = fluid.Scope()
+        with fluid.scope_guard(scope):
+            exe.run(startup_program)
+            for _ in range(iters):
+                data = next(train_reader())
+                loss_v = exe.run(main_program,
+                                 feed=feeder.feed(data),
+                                 fetch_list=[loss])
+                loss_vals_fused.append(loss_v[0][0])
+
+        # build_origin_program
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        x, y, loss = self.build_origin_program(main_program, startup_program,
+                                               use_cuda)
+        feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
+        train_reader = paddle.batch(
+            paddle.dataset.mnist.train(), batch_size=batch_size)
+        loss_vals = []
+        scope = fluid.Scope()
+        with fluid.scope_guard(scope):
+            exe.run(startup_program)
+            for _ in range(iters):
+                data = next(train_reader())
+                loss_v = exe.run(main_program,
+                                 feed=feeder.feed(data),
+                                 fetch_list=[loss])
+                loss_vals.append(loss_v[0][0])
+
+        # check loss
+        for i in range(iters):
+            self.assertAlmostEqual(loss_vals[i], loss_vals_fused[i], delta=1e-5)
+
+    def test_fuse_bn_add_act(self):
+        place = fluid.CUDAPlace(0)
+        self.check(place, use_cuda=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab


From 43240a1b814a49e6b02c3b1c49249a9b6f7fe2c5 Mon Sep 17 00:00:00 2001
From: cc <52520497+juncaipeng@users.noreply.github.com>
Date: Wed, 23 Sep 2020 10:42:51 +0800
Subject: [PATCH 180/261] [doc] Add example for cache and buffered (#26819)

* Add example for cache and buffered, test=develop, test=document_fix
---
 python/paddle/reader/decorator.py | 42 +++++++++++++++++++++++++++----
 tools/wlist.json                  |  2 --
 2 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py
index 91a2a78203c..8ee4d73ea84 100644
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
@@ -62,6 +62,22 @@ def cache(reader):
 
     Returns:
         generator: a decorated reader object which yields data from cached memory.
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            
+            def reader():
+                for i in range(3):
+                    yield i
+            
+            # All data is cached into memory
+            cached_reader = paddle.io.cache(reader)
+            
+            # Output: 0 1 2
+            for i in cached_reader():
+                print(i)
     """
     all_data = tuple(reader())
 
@@ -296,12 +312,28 @@ def buffered(reader, size):
     buffer. Reading from the buffered data reader will proceed as long
     as the buffer is not empty.
 
-    :param reader: the data reader to read from.
-    :type reader: callable
-    :param size: max buffer size.
-    :type size: int
+    Args:
+        reader(generator): the data reader to read from.
+        size(int): max buffer size.
+
+    Returns:
+        generator: the buffered data reader.
+    
+    Examples:
+        .. code-block:: python
 
-    :returns: the buffered data reader.
+            import paddle
+            
+            def reader():
+                for i in range(3):
+                    yield i
+            
+            # Create a buffered reader, and the buffer size is 2.
+            buffered_reader = paddle.io.buffered(reader, 2)
+            
+            # Output: 0 1 2
+            for i in buffered_reader():
+                print(i)
     """
 
     class EndSignal():
diff --git a/tools/wlist.json b/tools/wlist.json
index 20f6a9cbaed..5591f90da4b 100644
--- a/tools/wlist.json
+++ b/tools/wlist.json
@@ -105,8 +105,6 @@
         "convert_dist_to_sparse_program",
         "load_persistables_for_increment",
         "load_persistables_for_inference",
-        "cache",
-        "buffered",
         "xmap_readers",
         "Metric.reset",
         "Metric.update",
-- 
GitLab


From 41b59555387616edef6bd5ef1b9093ab92b90db1 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 23 Sep 2020 10:55:01 +0800
Subject: [PATCH 181/261] Polish no onwer ops error message (#27448)

* polish no onwer op error message

* fix unittest failed

* polish details based reviewer comment
---
 .../operators/add_position_encoding_op.cc     | 10 +++-
 paddle/fluid/operators/assign_value_op.h      |  5 +-
 paddle/fluid/operators/coalesce_tensor_op.cc  | 60 ++++++++++++-------
 .../fluid/operators/dequantize_abs_max_op.cc  |  6 +-
 paddle/fluid/operators/detection/gpc.cc       | 30 +++++++---
 .../distributed_ops/fetch_barrier_op.cc       |  4 +-
 .../distributed_ops/send_recv_util.h          | 10 ++--
 paddle/fluid/operators/gru_unit_op.h          |  8 ++-
 paddle/fluid/operators/interpolate_op.cc      | 47 ++++++++-------
 paddle/fluid/operators/merge_lod_tensor_op.cc | 46 ++++++++------
 paddle/fluid/operators/strided_memcpy.h       | 32 +++++++---
 paddle/fluid/operators/var_conv_2d_op.cc      | 38 ++++++++----
 12 files changed, 193 insertions(+), 103 deletions(-)

diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc
index 629fedba6e3..e5fcd270eb8 100644
--- a/paddle/fluid/operators/add_position_encoding_op.cc
+++ b/paddle/fluid/operators/add_position_encoding_op.cc
@@ -69,12 +69,18 @@ class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<float>("alpha", "The scale of Original Embedding.")
         .SetDefault(1.0f)
         .AddCustomChecker([](const float& alpha) {
-          PADDLE_ENFORCE(alpha >= 0.0f, "'alpha' must be above 0.0.");
+          PADDLE_ENFORCE_GE(
+              alpha, 0.0f,
+              platform::errors::InvalidArgument(
+                  "Attribute 'alpha' must be greater than or equal to 0.0."));
         });
     AddAttr<float>("beta", "The scale of Position Embedding.")
         .SetDefault(1.0f)
         .AddCustomChecker([](const float& beta) {
-          PADDLE_ENFORCE(beta >= 0.0f, "'beta' must be between 0.0.");
+          PADDLE_ENFORCE_GE(
+              beta, 0.0f,
+              platform::errors::InvalidArgument(
+                  "Attribute 'beta' must be greater than or equal to 0.0."));
         });
     AddComment(R"DOC(
     Add Position Encoding Operator.
diff --git a/paddle/fluid/operators/assign_value_op.h b/paddle/fluid/operators/assign_value_op.h
index b462c43d23a..1418d96b67b 100644
--- a/paddle/fluid/operators/assign_value_op.h
+++ b/paddle/fluid/operators/assign_value_op.h
@@ -76,7 +76,10 @@ class AssignValueKernel : public framework::OpKernel<T> {
         value_name = "int64_values";
         break;
       default:
-        PADDLE_THROW("Unsupported dtype for assign_value_op: %d", dtype);
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupported data type(code %d) for AssignValue operator, only "
+            "supports bool, int32, float32 and int64.",
+            dtype));
         break;
     }
     CopyVecotorToTensor<T>(value_name, out, ctx);
diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc
index 5b7bcde21a9..d67d90c348e 100644
--- a/paddle/fluid/operators/coalesce_tensor_op.cc
+++ b/paddle/fluid/operators/coalesce_tensor_op.cc
@@ -33,29 +33,37 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
     auto out_vars = context.MultiOutputVar("Output");
 
     PADDLE_ENFORCE_GT(in_var_names.size(), static_cast<size_t>(0),
-                      "The CoalesceTensorOp has no input.");
-    PADDLE_ENFORCE_EQ(
-        in_var_names.size(), out_var_names.size(),
-        "The number of CoalesceTensorOp's input and output is not match.");
+                      platform::errors::InvalidArgument(
+                          "The CoalesceTensor operator has no input."));
+    PADDLE_ENFORCE_EQ(in_var_names.size(), out_var_names.size(),
+                      platform::errors::InvalidArgument(
+                          "The number of CoalesceTensor operator's input and "
+                          "output is not match, "
+                          "input number is %u, output number is %u.",
+                          in_var_names.size(), out_var_names.size()));
 
     // Input & Output check: only support LoDTensor
     for (size_t i = 0; i < in_var_names.size(); ++i) {
       PADDLE_ENFORCE_NOT_NULL(
           in_vars[i],
-          "The input variable %s of CoalesceTensorOp does not exist.",
-          in_var_names[i]);
+          platform::errors::NotFound("The input variable %s of CoalesceTensor "
+                                     "operator does not exist.",
+                                     in_var_names[i]));
       PADDLE_ENFORCE_NOT_NULL(
           out_vars[i],
-          "The output variable %s of CoalesceTensorOp does not exist.",
-          out_var_names[i]);
-      PADDLE_ENFORCE_EQ(
-          in_vars[i]->IsType<framework::LoDTensor>(), true,
-          "The input variable %s of CoalesceTensorOp is not LoDTensor.",
-          in_var_names[i]);
-      PADDLE_ENFORCE_EQ(
-          out_vars[i]->IsType<framework::LoDTensor>(), true,
-          "The output variable %s of CoalesceTensorOp is not LoDTensor.",
-          in_var_names[i]);
+          platform::errors::NotFound("The output variable %s of CoalesceTensor "
+                                     "operator does not exist.",
+                                     out_var_names[i]));
+      PADDLE_ENFORCE_EQ(in_vars[i]->IsType<framework::LoDTensor>(), true,
+                        platform::errors::InvalidArgument(
+                            "The input variable %s of CoalesceTensor operator "
+                            "is not LoDTensor.",
+                            in_var_names[i]));
+      PADDLE_ENFORCE_EQ(out_vars[i]->IsType<framework::LoDTensor>(), true,
+                        platform::errors::InvalidArgument(
+                            "The output variable %s of CoalesceTensor operator "
+                            "is not LoDTensor.",
+                            in_var_names[i]));
     }
 
     auto in_tensors = context.MultiInput<framework::LoDTensor>("Input");
@@ -64,7 +72,10 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
       for (size_t i = 0; i < in_var_names.size(); ++i) {
         PADDLE_ENFORCE_EQ(
             in_var_names[i], out_var_names[i],
-            "The input and output variable of CoalesceTensorOp is different.");
+            platform::errors::InvalidArgument(
+                "The input and output variable of CoalesceTensor operator is "
+                "different, %dth input is %s, %dth output is %s.",
+                i, in_var_names[i], i, out_var_names[i]));
       }
     } else {
       // Init the output as input
@@ -134,16 +145,25 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
       const std::vector<const framework::LoDTensor *> &lod_tensors,
       const std::vector<std::string> var_names, size_t *numel,
       const size_t &size_of_dtype, const platform::Place &place) const {
-    PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size());
+    PADDLE_ENFORCE_EQ(
+        lod_tensors.size(), var_names.size(),
+        platform::errors::InvalidArgument(
+            "The number of input tensor and variable does not match, the "
+            "number of input tensor is %u, the number of input variable is %u.",
+            lod_tensors.size(), var_names.size()));
     *numel = 0;
     std::stringstream ss;
     ss << "alloc_space_for_vars: ";
     for (size_t i = 0; i < var_names.size(); ++i) {
       PADDLE_ENFORCE_EQ(lod_tensors[i]->IsInitialized(), true,
-                        "%s is not initialized.", var_names[i]);
+                        platform::errors::InvalidArgument(
+                            "Tensor `%s` is not initialized.", var_names[i]));
 
       auto size = lod_tensors[i]->numel();
-      PADDLE_ENFORCE_GT(size, 0);
+      PADDLE_ENFORCE_GT(
+          size, 0,
+          platform::errors::InvalidArgument(
+              "The number of tensor `%s`'s elements is 0.", var_names[i]));
       ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
          << ") "
          << " addres:" << lod_tensors[i]->data<void>() << ", ";
diff --git a/paddle/fluid/operators/dequantize_abs_max_op.cc b/paddle/fluid/operators/dequantize_abs_max_op.cc
index 48743f2e48c..0d4d68d9f62 100644
--- a/paddle/fluid/operators/dequantize_abs_max_op.cc
+++ b/paddle/fluid/operators/dequantize_abs_max_op.cc
@@ -45,10 +45,8 @@ class DequantizeMaxAbsOp : public framework::OperatorWithKernel {
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of DequantizeMaxAbsOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of DequantizeMaxAbsOp should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DequantizeMaxAbs");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DequantizeMaxAbs");
 
     ctx->ShareDim("X", /*->*/ "Out");
     ctx->ShareLoD("X", /*->*/ "Out");
diff --git a/paddle/fluid/operators/detection/gpc.cc b/paddle/fluid/operators/detection/gpc.cc
index b46d231d0ff..6b1b0cd8b35 100644
--- a/paddle/fluid/operators/detection/gpc.cc
+++ b/paddle/fluid/operators/detection/gpc.cc
@@ -532,7 +532,8 @@ static int count_contours(polygon_node *polygon) {
 }
 
 static void add_left(polygon_node *p, double x, double y) {
-  PADDLE_ENFORCE_NOT_NULL(p);
+  PADDLE_ENFORCE_NOT_NULL(p, paddle::platform::errors::InvalidArgument(
+                                 "Input polygon node is nullptr."));
   vertex_node *nv = NULL;
 
   /* Create a new vertex node and set its fields */
@@ -588,7 +589,8 @@ static void add_right(polygon_node *p, double x, double y) {
 }
 
 static void merge_right(polygon_node *p, polygon_node *q, polygon_node *list) {
-  PADDLE_ENFORCE_NOT_NULL(p);
+  PADDLE_ENFORCE_NOT_NULL(p, paddle::platform::errors::InvalidArgument(
+                                 "Input polygon node is nullptr."));
   polygon_node *target = NULL;
 
   /* Label contour as external */
@@ -664,7 +666,8 @@ void add_vertex(vertex_node **t, double x, double y) {
 }
 
 void gpc_vertex_create(edge_node *e, int p, int s, double x, double y) {
-  PADDLE_ENFORCE_NOT_NULL(e);
+  PADDLE_ENFORCE_NOT_NULL(e, paddle::platform::errors::InvalidArgument(
+                                 "Input edge node is nullptr."));
   add_vertex(&(e->outp[p]->v[s]), x, y);
   e->outp[p]->active++;
 }
@@ -693,7 +696,8 @@ static bbox *create_contour_bboxes(gpc_polygon *p) {
 
   gpc_malloc<bbox>(box, p->num_contours * sizeof(bbox),
                    const_cast<char *>("Bounding box creation"));
-  PADDLE_ENFORCE_NOT_NULL(box);
+  PADDLE_ENFORCE_NOT_NULL(box, paddle::platform::errors::ResourceExhausted(
+                                   "Failed to malloc box memory."));
 
   /* Construct contour bounding boxes */
   for (c = 0; c < p->num_contours; c++) {
@@ -857,7 +861,9 @@ void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) {
   /* Create an extended hole array */
   gpc_malloc<int>(extended_hole, (p->num_contours + 1) * sizeof(int),
                   const_cast<char *>("contour hole addition"));
-  PADDLE_ENFORCE_NOT_NULL(extended_hole);
+  PADDLE_ENFORCE_NOT_NULL(extended_hole,
+                          paddle::platform::errors::ResourceExhausted(
+                              "Failed to malloc extended hole memory."));
 
   /* Create an extended contour array */
   gpc_malloc<gpc_vertex_list>(extended_contour,
@@ -975,7 +981,9 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
   /* Build scanbeam table from scanbeam tree */
   gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
                      const_cast<char *>("sbt creation"));
-  PADDLE_ENFORCE_NOT_NULL(sbt);
+  PADDLE_ENFORCE_NOT_NULL(sbt, paddle::platform::errors::ResourceExhausted(
+                                   "Failed to malloc scanbeam table memory."));
+
   build_sbt(&scanbeam, sbt, sbtree);
   scanbeam = 0;
   free_sbtree(&sbtree);
@@ -1017,7 +1025,9 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
     e0 = aet;
     e1 = aet;
     /* Set up bundle fields of first edge */
-    PADDLE_ENFORCE_NOT_NULL(aet);
+    PADDLE_ENFORCE_NOT_NULL(aet, paddle::platform::errors::InvalidArgument(
+                                     "Edge node AET is nullptr."));
+
     aet->bundle[ABOVE][aet->type] = (aet->top.y != yb);
     aet->bundle[ABOVE][!aet->type] = 0;
     aet->bstate[ABOVE] = UNBUNDLED;
@@ -1612,7 +1622,8 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
   /* Build scanbeam table from scanbeam tree */
   gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
                      const_cast<char *>("sbt creation"));
-  PADDLE_ENFORCE_NOT_NULL(sbt);
+  PADDLE_ENFORCE_NOT_NULL(sbt, paddle::platform::errors::ResourceExhausted(
+                                   "Failed to malloc scanbeam table memory."));
   build_sbt(&scanbeam, sbt, sbtree);
   scanbeam = 0;
   free_sbtree(&sbtree);
@@ -1650,7 +1661,8 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
     e1 = aet;
 
     /* Set up bundle fields of first edge */
-    PADDLE_ENFORCE_NOT_NULL(aet);
+    PADDLE_ENFORCE_NOT_NULL(aet, paddle::platform::errors::InvalidArgument(
+                                     "Edge node AET is nullptr."));
     aet->bundle[ABOVE][aet->type] = (aet->top.y != yb);
     aet->bundle[ABOVE][!aet->type] = 0;
     aet->bstate[ABOVE] = UNBUNDLED;
diff --git a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
index b064265917b..c9f9daf3b3c 100644
--- a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
@@ -48,7 +48,9 @@ class FetchBarrierOp : public framework::OperatorBase {
     }
 
     for (size_t i = 0; i < rets.size(); i++) {
-      PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, "internal error in RPCClient");
+      PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U,
+                        platform::errors::Unavailable(
+                            "Internal error occurred in RPCClient."));
     }
   }
 };
diff --git a/paddle/fluid/operators/distributed_ops/send_recv_util.h b/paddle/fluid/operators/distributed_ops/send_recv_util.h
index c05a1ff1da8..7dc0596ac31 100644
--- a/paddle/fluid/operators/distributed_ops/send_recv_util.h
+++ b/paddle/fluid/operators/distributed_ops/send_recv_util.h
@@ -34,16 +34,16 @@ inline bool NeedSend(const framework::Scope& scope,
       std::string::npos)
     return false;
   auto* var = scope.FindVar(varname);
-  PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
-                          varname);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::NotFound(
+               "Can not find variable '%s' in the send side.", varname));
   if (var->IsType<framework::LoDTensor>()) {
     return var->Get<framework::LoDTensor>().IsInitialized();
   } else if (var->IsType<framework::SelectedRows>()) {
     return var->Get<framework::SelectedRows>().rows().size() > 0UL;
   } else {
-    PADDLE_THROW(
-        "Variable type in send side should be in "
-        "[LodTensor, SelectedRows]");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Variable type in send side should be LodTensor or SelectedRows."));
   }
   return false;
 }
diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h
index 712ef05d863..4865a02c529 100644
--- a/paddle/fluid/operators/gru_unit_op.h
+++ b/paddle/fluid/operators/gru_unit_op.h
@@ -47,7 +47,9 @@ class GRUUnitKernel : public framework::OpKernel<T> {
     else if (act_type == relu)
       ReluFunctor<T>()(d, x, y);
     else
-      PADDLE_THROW("unsupported activation type");
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Unsupported activation type, only supports identity, sigmoid, tanh "
+          "and relu."));
   }
 
   void Compute(const framework::ExecutionContext& context) const override {
@@ -137,7 +139,9 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     else if (act_type == relu)
       ReluGradFunctor<T>()(d, x, y, dy, dx);
     else
-      PADDLE_THROW("unsupported activation type");
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Unsupported activation type, only supports identity, sigmoid, tanh "
+          "and relu."));
   }
 
   void Compute(const framework::ExecutionContext& context) const override {
diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc
index 1e99e22e12b..e8a9ed878e9 100644
--- a/paddle/fluid/operators/interpolate_op.cc
+++ b/paddle/fluid/operators/interpolate_op.cc
@@ -104,12 +104,13 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
   auto dim_x = ctx->GetInputDim("X");
   auto interp_method = ctx->Attrs().Get<std::string>("interp_method");
 
-  PADDLE_ENFORCE(
-      "bilinear" == interp_method || "nearest" == interp_method ||
-          "bicubic" == interp_method,
-      "Interpolation method can only be \"bilinear\" or \"nearest\" when "
-      "Input(X) dimension is 4, but got method = %s .",
-      interp_method);
+  PADDLE_ENFORCE_EQ("bilinear" == interp_method || "nearest" == interp_method ||
+                        "bicubic" == interp_method,
+                    true, platform::errors::InvalidArgument(
+                              "Interpolation method can only be \"bilinear\" "
+                              "or \"nearest\" or \"bicubic\" when "
+                              "Input(X) dimension is 4, but got method is %s.",
+                              interp_method));
   const DataLayout data_layout = framework::StringToDataLayout(
       ctx->Attrs().Get<std::string>("data_layout"));
 
@@ -169,13 +170,13 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
     auto out_size_dim = ctx->GetInputDim("OutSize");
     PADDLE_ENFORCE_EQ(
         out_size_dim.size(), 1,
-        platform::errors::InvalidArgument(
-            "OutSize's dimension size must be 1, but got dimension = %d .",
-            out_size_dim.size()));
+        platform::errors::InvalidArgument("OutSize's dimension size must be 1, "
+                                          "but got dimension size is %d .",
+                                          out_size_dim.size()));
     PADDLE_ENFORCE_EQ(
         out_size_dim[0], 2,
         platform::errors::InvalidArgument(
-            "OutSize's dim[0] must be 2, but got dimention = %d .",
+            "OutSize's dimension[0] must be 2, but got dimension[0] is %d .",
             out_size_dim[0]));
     ctx->ShareLoD("X", "Out");
     return;
@@ -264,12 +265,15 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) {
 
   if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
     auto out_size_dim = ctx->GetInputDim("OutSize");
-    PADDLE_ENFORCE_EQ(out_size_dim.size(), 1,
-                      "OutSize's dimension size must be 1, but got size =%d .",
-                      out_size_dim.size());
+    PADDLE_ENFORCE_EQ(
+        out_size_dim.size(), 1,
+        platform::errors::InvalidArgument(
+            "OutSize's dimension size must be 1, but got size is %d.",
+            out_size_dim.size()));
     PADDLE_ENFORCE_EQ(out_size_dim[0], 3,
-                      "OutSize's dim[0] must be 3, but got size = %d .",
-                      out_size_dim[0]);
+                      platform::errors::InvalidArgument(
+                          "OutSize's dim[0] must be 3, but got size is %d.",
+                          out_size_dim[0]));
     ctx->ShareLoD("X", "Out");
     return;
   }
@@ -289,10 +293,8 @@ class InterpolateOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of InterpolateOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of InterpolationOp should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Interpolate");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Interpolate");
 
     auto dim_x = ctx->GetInputDim("X");  // NCHW format
     PADDLE_ENFORCE(
@@ -534,9 +536,10 @@ class InterpolateOpGrad : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InterpolateGrad");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   "Out@GRAD", "InterpolateGrad");
+
     auto dim_x = ctx->GetInputDim("X");
     if (ctx->HasOutput(framework::GradVarName("X"))) {
       ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc
index c9b852cfc05..87d914aa797 100644
--- a/paddle/fluid/operators/merge_lod_tensor_op.cc
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
@@ -44,8 +44,10 @@ class MergeLoDTensorOp : public framework::OperatorBase {
         scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
     auto level = static_cast<size_t>(Attr<int>("level"));
 
-    PADDLE_ENFORCE(in_true.numel() || in_false.numel(),
-                   "Input(InTrue) or Input(InFalse) should be initialized.");
+    PADDLE_ENFORCE_EQ(
+        in_true.numel() || in_false.numel(), true,
+        platform::errors::InvalidArgument(
+            "Input(InTrue) or Input(InFalse) should be initialized."));
 
     auto &mask_dim = mask.dims();
     std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
@@ -56,7 +58,9 @@ class MergeLoDTensorOp : public framework::OperatorBase {
       framework::TensorCopy(mask, platform::CPUPlace(), dev_ctx,
                             cpu_mask.get());
 #else
-      PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
+      PADDLE_THROW(platform::errors::PreconditionNotMet(
+          "Not supported GPU, Please recompile or reinstall paddle with CUDA "
+          "support."));
 #endif
     }
     auto *mask_data = cpu_mask->data<bool>();
@@ -109,7 +113,11 @@ class MergeLoDTensorOp : public framework::OperatorBase {
       size_t start_offset = lod_and_offset.second.first;
       size_t end_offset = lod_and_offset.second.second;
 
-      PADDLE_ENFORCE_GE(end_offset, start_offset);
+      PADDLE_ENFORCE_GE(end_offset, start_offset,
+                        platform::errors::InvalidArgument(
+                            "The end offset less than start offset, end offset "
+                            "is %d, start offset is %d.",
+                            end_offset, start_offset));
       size_t len = end_offset - start_offset;
       if (len == 0) {
         continue;
@@ -189,22 +197,24 @@ class MergeLoDTensorInferShape : public framework::InferShapeBase {
                    "merge_lod_tensor");
     auto mask_dim = context->GetInputDim("Mask");
     PADDLE_ENFORCE_EQ(mask_dim.size(), 2,
-                      "If you are using IfElse OP:"
-                      "\n\nie = fluid.layers.IfElse(cond=cond)\nwith "
-                      "ie.true_block():\n    out_1 = ie.input(x)\n\n"
-                      "Please ensure that the cond should be a 2-D tensor and "
-                      "the second dim size of cond should be 1. "
-                      "But now the cond's shape is [",
-                      *mask_dim.Get(), "].\n");
+                      platform::errors::InvalidArgument(
+                          "If you are using IfElse OP:"
+                          "\n\nie = fluid.layers.IfElse(cond=cond)\nwith "
+                          "ie.true_block():\n    out_1 = ie.input(x)\n\n"
+                          "Please ensure that the cond is a 2-D tensor and "
+                          "the second dim size of cond is 1. "
+                          "But now the cond's shape is [%s].\n",
+                          mask_dim));
     if (context->IsRuntime() || mask_dim[1] > 0) {
       PADDLE_ENFORCE_EQ(mask_dim[1], 1,
-                        "If you are using IfElse OP:"
-                        "\n\nie = fluid.layers.IfElse(cond=cond)\nwith "
-                        "ie.true_block():\n    out_1 = ie.input(x)\n\n"
-                        "Please ensure that the cond should be a 2-D tensor "
-                        "and the second dim size of cond should be 1. "
-                        "But now the cond's shape is [",
-                        *mask_dim.Get(), "].\n");
+                        platform::errors::InvalidArgument(
+                            "If you are using IfElse OP:"
+                            "\n\nie = fluid.layers.IfElse(cond=cond)\nwith "
+                            "ie.true_block():\n    out_1 = ie.input(x)\n\n"
+                            "Please ensure that the cond is a 2-D tensor "
+                            "and the second dim size of cond is 1. "
+                            "But now the cond's shape is [%s].\n",
+                            mask_dim));
     }
 
     context->SetOutputDim("Out", context->GetInputDim("InTrue"));
diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h
index f20bada8ab2..142b00b4de6 100644
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
@@ -60,20 +60,33 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
   auto place = ctx.GetPlace();
 
   PADDLE_ENFORCE_EQ(src_stride_numel.size(), dst_stride_numel.size(),
-                    "src and dst tensor should have the same dims size.");
+                    platform::errors::InvalidArgument(
+                        "Source and destination tensor should have the same "
+                        "dimension size, but source tensor dimension size is "
+                        "%u, destination tensor size is %u.",
+                        src_stride_numel.size(), dst_stride_numel.size()));
 
   for (int64_t i = 0; i < axis; ++i) {
     if (i < axis) {
-      PADDLE_ENFORCE_EQ(src_stride_numel[i] / src_stride_numel[axis],
-                        dst_stride_numel[i] / dst_stride_numel[axis],
-                        "src and dst should have the same elements "
-                        "except the specified axis.");
+      PADDLE_ENFORCE_EQ(
+          src_stride_numel[i] / src_stride_numel[axis],
+          dst_stride_numel[i] / dst_stride_numel[axis],
+          platform::errors::InvalidArgument(
+              "Source and destination tensor should have the same number of "
+              "elements except the specified axis, but the source elements "
+              "number is %d, destination elements number is %d.",
+              src_stride_numel[i] / src_stride_numel[axis],
+              dst_stride_numel[i] / dst_stride_numel[axis]));
     } else if (i == axis) {
       continue;
     } else {
-      PADDLE_ENFORCE_EQ(src_stride_numel[i], dst_stride_numel[i],
-                        "src and dst should have the same elements "
-                        "except the specified axis.");
+      PADDLE_ENFORCE_EQ(
+          src_stride_numel[i], dst_stride_numel[i],
+          platform::errors::InvalidArgument(
+              "Source and destination tensor should have the same number of "
+              "elements except the specified axis, but the source elements "
+              "number is %d, destination elements number is %d.",
+              src_stride_numel[i], dst_stride_numel[i]));
     }
   }
 
@@ -90,7 +103,8 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
       memory::Copy(gpu_place, dst + i * dst_after, gpu_place,
                    src + i * src_after, sizeof(T) * size, cuda_ctx.stream());
 #else
-      PADDLE_THROW("Paddle is not compiled with GPU");
+      PADDLE_THROW(platform::errors::PreconditionNotMet(
+          "Paddle is not compiled with GPU."));
 #endif
     }
   }
diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc
index f8a29a52d7a..db8b2c30501 100644
--- a/paddle/fluid/operators/var_conv_2d_op.cc
+++ b/paddle/fluid/operators/var_conv_2d_op.cc
@@ -78,21 +78,35 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const {
       platform::errors::NotFound("Col(Output) of VarConv2dOP is not found."));
 
   auto x_dims = ctx->GetInputDim("X");
-  PADDLE_ENFORCE_EQ(x_dims.size(), 2,
-                    "The rank of X(Input) can't be less than 2.");
+  PADDLE_ENFORCE_EQ(
+      x_dims.size(), 2,
+      platform::errors::InvalidArgument(
+          "The rank of X(Input) can't be less than 2, but received rank is %u.",
+          x_dims.size()));
 
   auto w_dims = ctx->GetInputDim("W");
 
-  PADDLE_ENFORCE_EQ(w_dims.size(), 2, "W should be 2-D tensor");
+  PADDLE_ENFORCE_EQ(
+      w_dims.size(), 2,
+      platform::errors::InvalidArgument(
+          "Input W should be a 2-D tensor, but its actual dimension is %u.",
+          w_dims.size()));
   int output_channel = ctx->Attrs().Get<int>("OutputChannel");
   int input_channel = ctx->Attrs().Get<int>("InputChannel");
   int kernel_h = ctx->Attrs().Get<int>("KernelH");
   int kernel_w = ctx->Attrs().Get<int>("KernelW");
-  PADDLE_ENFORCE_EQ(w_dims[0], output_channel,
-                    "W dim[0] should be equal to OutputChannel");
+  PADDLE_ENFORCE_EQ(
+      w_dims[0], output_channel,
+      platform::errors::InvalidArgument(
+          "Input W's dimension[0] should be equal to OutputChannel, the "
+          "dimension[0] is %d, OutputChannel is %d.",
+          w_dims[0], output_channel));
   PADDLE_ENFORCE_EQ(
       w_dims[1], input_channel * kernel_h * kernel_w,
-      "W dim[1] should be equal to InputChannel * StrideH * StrideW");
+      platform::errors::InvalidArgument(
+          "Input W's dimension[1] should be equal to InputChannel * StrideH * "
+          "StrideW, the dimension[1] is %d, expected value is %d.",
+          w_dims[1], input_channel * kernel_h * kernel_w));
 
   if (ctx->IsRuntime()) {
     framework::Variable* x_var =
@@ -103,10 +117,14 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const {
         platform::errors::InvalidArgument("The Input(X) Tensor of VarConv2dOP "
                                           "does not contain LoD information."));
 
-    PADDLE_ENFORCE_GE(x_lod.size(), 1, "The Input(X)'s lod info is corrupted.");
-    PADDLE_ENFORCE_EQ(
-        x_dims[0], static_cast<int64_t>(x_lod[0].back()),
-        "The Input(X)'s lod info mismatches the actual tensor shape.");
+    PADDLE_ENFORCE_GE(x_lod.size(), 1,
+                      platform::errors::InvalidArgument(
+                          "The Input(X)'s lod info is corrupted."));
+    PADDLE_ENFORCE_EQ(x_dims[0], static_cast<int64_t>(x_lod[0].back()),
+                      platform::errors::InvalidArgument(
+                          "The Input(X)'s lod info mismatches the actual "
+                          "tensor shape, input lod is %s, tensor shape is %s.",
+                          x_lod, x_dims));
 
     framework::Variable* row_var =
         BOOST_GET(framework::Variable*, ctx->GetInputVarPtrs("ROW")[0]);
-- 
GitLab


From 292b24aa6de374619e4ada5e5f17b602b1def0f8 Mon Sep 17 00:00:00 2001
From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Date: Wed, 23 Sep 2020 11:06:51 +0800
Subject: [PATCH 182/261] fix bug MD of compile, And add MD/STATIC/OPENBLAS
 inference lib check on windows (#27051)

---
 CMakeLists.txt                                | 23 ++++-
 cmake/external/cryptopp.cmake                 | 17 +---
 cmake/flags.cmake                             | 31 -------
 cmake/inference_lib.cmake                     | 27 +++---
 paddle/fluid/inference/CMakeLists.txt         |  5 +-
 .../inference/api/demo_ci/CMakeLists.txt      |  4 +-
 paddle/fluid/inference/api/demo_ci/run.sh     | 91 ++++++++++---------
 .../inference/api/paddle_infer_declare.h      |  4 -
 paddle/scripts/paddle_build.bat               | 19 +++-
 9 files changed, 105 insertions(+), 116 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb796103350..b1554fba5e1 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -63,8 +63,29 @@ if(WIN32)
         set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
         set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
         set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+        foreach(flag_var
+            CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+            CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
+            CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+            CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
+            if(${flag_var} MATCHES "/MD")
+                string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+            endif()
+        endforeach(flag_var)
     endif()
-    
+
+    # windows build turn off warnings.
+    foreach(flag_var
+        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
+        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
+        string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
+    endforeach(flag_var)
+    foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
+        set(${flag_var} "${${flag_var}} /w")
+    endforeach(flag_var)
+
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP")
     message(STATUS "Using parallel compiling (/MP)")
diff --git a/cmake/external/cryptopp.cmake b/cmake/external/cryptopp.cmake
index af5dd0e2c9b..351ef1c7c7a 100644
--- a/cmake/external/cryptopp.cmake
+++ b/cmake/external/cryptopp.cmake
@@ -22,23 +22,8 @@ SET(CRYPTOPP_TAG        CRYPTOPP_8_2_0)
 
 IF(WIN32)
   SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/cryptopp-static.lib" CACHE FILEPATH "cryptopp library." FORCE)
-  SET(CRYPTOPP_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
-  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
-  set(CompilerFlags
-        CMAKE_CXX_FLAGS
-        CMAKE_CXX_FLAGS_DEBUG
-        CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_C_FLAGS
-        CMAKE_C_FLAGS_DEBUG
-        CMAKE_C_FLAGS_RELEASE
-        )
-  foreach(CompilerFlag ${CompilerFlags})
-    string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
-  endforeach()
 ELSE(WIN32)
   SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/libcryptopp.a" CACHE FILEPATH "cryptopp library." FORCE)
-  SET(CRYPTOPP_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
 ENDIF(WIN32)
 
 set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS}
@@ -48,7 +33,7 @@ set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS}
                         -DCMAKE_INSTALL_LIBDIR=${CRYPTOPP_INSTALL_DIR}/lib
                         -DCMAKE_INSTALL_PREFIX=${CRYPTOPP_INSTALL_DIR}
                         -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                        -DCMAKE_CXX_FLAGS=${CRYPTOPP_CMAKE_CXX_FLAGS}
+                        -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
                         -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
                         -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                         -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 415e07c7542..ed0bf8396b3 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -90,20 +90,6 @@ macro(safe_set_nvflag flag_name)
     endif()
 endmacro()
 
-macro(safe_set_static_flag) # set c_flags and cxx_flags to static or shared
-    if (BUILD_SHARED_LIBS) 
-        return() # if build shared libs, the flags keep same with '/MD'
-    endif(BUILD_SHARED_LIBS)
-    foreach(flag_var
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
-        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-      if(${flag_var} MATCHES "/MD")
-        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-      endif(${flag_var} MATCHES "/MD")
-    endforeach(flag_var)
-endmacro()
 
 CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
 if(NOT UINT64_MAX_EXISTS)
@@ -229,20 +215,3 @@ endforeach()
 
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}")
 
-
-if(WIN32)
-    # windows build turn off warnings.
-    if(MSVC_STATIC_CRT)
-        safe_set_static_flag()
-    endif()
-    foreach(flag_var
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
-        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-        string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
-    endforeach(flag_var)
-    foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
-        set(${flag_var} "${${flag_var}} /w")
-    endforeach(flag_var)
-endif()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index e3c2409f103..f19f0eb43d3 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -24,7 +24,7 @@ set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_d
 # so the generation of static lib is temporarily turned off.
 if(WIN32)
     #todo: remove the option 
-    option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static."   OFF)
+    option(WITH_STATIC_LIB "Compile demo with static/shared library, default use dynamic."   OFF)
     if(NOT PYTHON_EXECUTABLE)
         FIND_PACKAGE(PythonInterp REQUIRED)
     endif()
@@ -165,25 +165,22 @@ copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR})
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 if(WIN32)
     if(WITH_STATIC_LIB)
-        set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.lib)
+        set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.lib
+                             ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.*)
     else()
         set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.dll
-                            ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.lib)
+                             ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.lib)
     endif()
+    copy(inference_lib_dist
+            SRCS  ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
+            DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib
+            ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
 else(WIN32)
     set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*)
-endif(WIN32)
-
-if(WIN32 AND NOT WITH_STATIC_LIB)
-        copy(inference_lib_dist
-                SRCS  ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
-                DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib
-                      ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
-else()
-        copy(inference_lib_dist
+    copy(inference_lib_dist
                 SRCS  ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
                 DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
-endif()
+endif(WIN32)
 
 copy(inference_lib_dist
         SRCS  ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
@@ -211,12 +208,12 @@ add_custom_target(fluid_lib_dist ALL DEPENDS ${fluid_lib_deps})
 
 set(dst_dir "${PADDLE_INSTALL_DIR}/paddle/fluid")
 set(module "inference")
-if(WIN32 AND NOT WITH_STATIC_LIB)
+if(WIN32)
         copy(fluid_lib_dist
                 SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
                 DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
                 )
-else()
+        else()
         copy(fluid_lib_dist
                 SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
                 DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} 
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 9dc96fdfe86..cf6fcb7b643 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -44,10 +44,11 @@ add_subdirectory(api)
 set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor
      zero_copy_tensor reset_tensor_array 
         analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg})
-if(WIN32)
+# TODO(xingzhaolong, jiweibo): remove this and create_static_lib(paddle_fluid) on windows GPU
+if(WIN32 AND WITH_GPU)
   cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_API}) 
 else()
- create_static_lib(paddle_fluid ${fluid_modules} ${STATIC_INFERENCE_API}) 
+  create_static_lib(paddle_fluid ${fluid_modules} ${STATIC_INFERENCE_API}) 
 endif()
 
 if(NOT APPLE AND NOT WIN32)
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index 08a1a542819..6a3760e1f74 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -51,8 +51,8 @@ if (WIN32)
     set(CMAKE_C_FLAGS_RELEASE  "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
     set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
     set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+    safe_set_static_flag()
     if (WITH_STATIC_LIB)
-      safe_set_static_flag()
       add_definitions(-DSTATIC_LIB)
     endif()
   endif()
@@ -136,7 +136,7 @@ else()
   set(DEPS ${DEPS}
       ${MATH_LIB} ${MKLDNN_LIB}
       glog gflags_static libprotobuf  xxhash ${EXTERNAL_LIB})
-  set(DEPS ${DEPS} libcmt shlwapi.lib)
+  set(DEPS ${DEPS} shlwapi.lib)
 endif(NOT WIN32)
 
 if(WITH_GPU)
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 6b7fb0f619a..a3e7bec398a 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -6,7 +6,7 @@ TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
 DATA_DIR=$4 # dataset
 TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, default to /usr/local/TensorRT/include
 TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib
-
+MSVC_STATIC_CRT=$7
 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir
 
 cd `dirname $0`
@@ -66,43 +66,54 @@ mkdir -p build
 cd build
 rm -rf *
 
-if [ $(echo `uname` | grep "Win") != "" ]; then
-  # -----simple_on_word2vec on windows-----
-  cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
-    -DWITH_MKL=$TURN_ON_MKL \
-    -DDEMO_NAME=simple_on_word2vec \
-    -DWITH_GPU=$TEST_GPU_CPU \
-    -DWITH_STATIC_LIB=OFF
-  msbuild  /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
-  Release/simple_on_word2vec.exe \
-      --dirname=$DATA_DIR/word2vec/word2vec.inference.model \
-      --use_gpu=False
-  if [ $? -ne 0 ]; then
-    echo "simple_on_word2vec demo runs fail."
-    exit 1
-  fi
-
-  # -----vis_demo on windows-----
-  rm -rf *
-  cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
-    -DWITH_MKL=$TURN_ON_MKL \
-    -DDEMO_NAME=vis_demo \
-    -DWITH_GPU=$TEST_GPU_CPU \
-    -DWITH_STATIC_LIB=OFF
-  msbuild  /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
-  for vis_demo_name in $vis_demo_list; do
-    Release/vis_demo.exe \
-      --modeldir=$DATA_DIR/$vis_demo_name/model \
-      --data=$DATA_DIR/$vis_demo_name/data.txt \
-      --refer=$DATA_DIR/$vis_demo_name/result.txt \
-      --use_gpu=False
-    if [ $? -ne 0 ]; then
-      echo "vis demo $vis_demo_name runs fail."
-      exit 1
+for WITH_STATIC_LIB in ON OFF; do
+  if [ $(echo `uname` | grep "Win") != "" ]; then
+    # TODO(xingzhaolong, jiweibo): remove this if windows GPU library is ready.
+    if [ $TEST_GPU_CPU == ON] && [ $WITH_STATIC_LIB ==ON ]; then
+      return 0
     fi
-  done
-else
-  for WITH_STATIC_LIB in ON OFF; do
+    
+    # -----simple_on_word2vec on windows-----
+    cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
+      -DWITH_MKL=$TURN_ON_MKL \
+      -DDEMO_NAME=simple_on_word2vec \
+      -DWITH_GPU=$TEST_GPU_CPU \
+      -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
+      -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT
+    msbuild  /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
+    for use_gpu in $use_gpu_list; do
+      Release/simple_on_word2vec.exe \
+        --dirname=$DATA_DIR/word2vec/word2vec.inference.model \
+        --use_gpu=$use_gpu
+      if [ $? -ne 0 ]; then
+        echo "simple_on_word2vec demo runs fail."
+        exit 1
+      fi
+    done
+
+    # -----vis_demo on windows-----
+    rm -rf *
+    cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
+      -DWITH_MKL=$TURN_ON_MKL \
+      -DDEMO_NAME=vis_demo \
+      -DWITH_GPU=$TEST_GPU_CPU \
+      -DWITH_STATIC_LIB=$WITH_STATIC_LIB \
+      -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT
+    msbuild  /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
+    for use_gpu in $use_gpu_list; do
+      for vis_demo_name in $vis_demo_list; do
+        Release/vis_demo.exe \
+          --modeldir=$DATA_DIR/$vis_demo_name/model \
+          --data=$DATA_DIR/$vis_demo_name/data.txt \
+          --refer=$DATA_DIR/$vis_demo_name/result.txt \
+          --use_gpu=$use_gpu
+        if [ $? -ne 0 ]; then
+          echo "vis demo $vis_demo_name runs fail."
+          exit 1
+        fi
+      done
+    done
+  else
     # -----simple_on_word2vec on linux/mac-----
     rm -rf *
     cmake .. -DPADDLE_LIB=${inference_install_dir} \
@@ -123,7 +134,6 @@ else
         fi
       done
     fi
-
     # ---------vis_demo on linux/mac---------
     rm -rf *
     cmake .. -DPADDLE_LIB=${inference_install_dir} \
@@ -145,7 +155,6 @@ else
         fi
       done
     done
-
     # --------tensorrt mobilenet on linux/mac------
     if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then
       rm -rf *
@@ -167,6 +176,6 @@ else
         exit 1
       fi
     fi
-  done
-fi
+  fi
+done
 set +x
diff --git a/paddle/fluid/inference/api/paddle_infer_declare.h b/paddle/fluid/inference/api/paddle_infer_declare.h
index 39c9653f16c..e8525f440fe 100644
--- a/paddle/fluid/inference/api/paddle_infer_declare.h
+++ b/paddle/fluid/inference/api/paddle_infer_declare.h
@@ -17,11 +17,7 @@
 #if defined(_WIN32)
 #ifndef PD_INFER_DECL
 #ifdef PADDLE_DLL_INFERENCE
-#ifndef PADDLE_ON_INFERENCE
-#define PD_INFER_DECL
-#else
 #define PD_INFER_DECL __declspec(dllexport)
-#endif  // PADDLE_ON_INFERENCE
 #else
 #define PD_INFER_DECL __declspec(dllimport)
 #endif  // PADDLE_DLL_INFERENCE
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 60e4496bc54..524c086c079 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -26,6 +26,7 @@ wmic process where name="op_function_generator.exe" call terminate  2>NUL
 rem ------initialize common variable------
 if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0"
 if not defined BRANCH set BRANCH=develop
+if not defined TENSORRT_ROOT set TENSORRT_ROOT="C:/TensorRT-5.1.5.0"
 if not defined WITH_MKL set WITH_MKL=ON
 if not defined WITH_GPU set WITH_GPU=OFF
 if not defined WITH_AVX set WITH_AVX=ON
@@ -33,9 +34,11 @@ if not defined WITH_TESTING set WITH_TESTING=ON
 if not defined WITH_PYTHON set WITH_PYTHON=ON
 if not defined ON_INFER set ON_INFER=ON
 if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON
+if not defined WITH_STATIC_LIB set WITH_STATIC_LIB=ON
 if not defined WITH_CACHE set WITH_CACHE=ON
 if not defined WITH_TPCACHE set WITH_TPCACHE=ON
 
+
 rem -------set cache build work directory-----------
 if "%WITH_CACHE%"=="OFF" (
     rmdir build /s/q
@@ -99,6 +102,7 @@ set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
 :: set maximum cache size to 20G
 clcache.exe -M 21474836480
 
+
 rem ------set cache third_party------
 set cache_dir=%work_dir:Paddle=cache%
 dir %cache_dir%
@@ -138,6 +142,7 @@ exit /b 1
 :CASE_wincheck_mkl
 set WITH_MKL=ON
 set WITH_GPU=OFF
+set MSVC_STATIC_CRT=ON
 call :cmake || goto cmake_error
 call :build || goto build_error
 call :test_whl_pacakage || goto test_whl_pacakage_error
@@ -149,11 +154,13 @@ goto:success
 :CASE_wincheck_openblas
 set WITH_MKL=OFF
 set WITH_GPU=ON
+set MSVC_STATIC_CRT=OFF
 rem Temporarily turn off WITH_INFERENCE_API_TEST on GPU due to compile hang
 set WITH_INFERENCE_API_TEST=OFF
 call :cmake || goto cmake_error
 call :build || goto build_error
 call :test_whl_pacakage || goto test_whl_pacakage_error
+:: call :test_inference || goto test_inference_error
 goto:success
 
 rem "Other configurations are added here"
@@ -172,12 +179,14 @@ set start=%start:~4,10%
 echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
 -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^
 -DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
--DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR%
+-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^
+-DTENSORRT_ROOT=%TENSORRT_ROOT% -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT%
 
 cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
 -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^
 -DON_INFER=%ON_INFER%  -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
--DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR%
+-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^
+-DTENSORRT_ROOT=%TENSORRT_ROOT% -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT%
 goto:eof
 
 :cmake_error
@@ -282,7 +291,9 @@ dir %THIRD_PARTY_PATH:/=\%\install\mklml\lib
 dir %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin
 dir %THIRD_PARTY_PATH:/=\%\install\warpctc\bin
 
-set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;%THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH%
+set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;^
+%THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;^
+%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH%
 ctest.exe --output-on-failure -C Release -j 8 --repeat until-pass:4 after-timeout:4
 goto:eof
 
@@ -305,7 +316,7 @@ set end=%end:~4,10%
 call :timestamp "%start%" "%end%" "TestCases Total"
 
 cd %work_dir%\paddle\fluid\inference\api\demo_ci
-%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo
+%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %TENSORRT_ROOT%/include %TENSORRT_ROOT%/lib %MSVC_STATIC_CRT%
 goto:eof
 
 :test_inference_error
-- 
GitLab


From d7b7dcd10e6cdb00f237c6a9ef8f1d562733043b Mon Sep 17 00:00:00 2001
From: Qi Li <qili93@qq.com>
Date: Wed, 23 Sep 2020 12:48:42 +0800
Subject: [PATCH 183/261] fix cmake dependencies of test_recognize_digits,
 test=develop (#27475)

---
 paddle/fluid/train/CMakeLists.txt             | 2 +-
 python/paddle/fluid/tests/book/CMakeLists.txt | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt
index 235d92ac4f9..d587081fbac 100644
--- a/paddle/fluid/train/CMakeLists.txt
+++ b/paddle/fluid/train/CMakeLists.txt
@@ -26,7 +26,7 @@ function(train_test TARGET_NAME)
                     ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/)
         endif()
         set_tests_properties(test_train_${TARGET_NAME}${arg}
-                PROPERTIES DEPENDS test_${TARGET_NAME})
+                PROPERTIES FIXTURES_REQUIRED test_${TARGET_NAME}_infer_model)
         if(NOT WIN32 AND NOT APPLE)
             set_tests_properties(test_train_${TARGET_NAME}${arg}
                     PROPERTIES TIMEOUT 150)
diff --git a/python/paddle/fluid/tests/book/CMakeLists.txt b/python/paddle/fluid/tests/book/CMakeLists.txt
index 673c965b662..96321aae566 100644
--- a/python/paddle/fluid/tests/book/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/CMakeLists.txt
@@ -4,4 +4,5 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 # default test
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
+    set_tests_properties(${src} PROPERTIES FIXTURES_SETUP ${src}_infer_model)
 endforeach()
-- 
GitLab


From bc5f0246a807728593c889d7924c921e88ffe643 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Wed, 23 Sep 2020 14:26:37 +0800
Subject: [PATCH 184/261] large scale kv speedup (#26510)

* rename communicator meet->BatchesCounter

* fix parame recv for sparse

* geo sparse init from pserver

* optimize init from pserver

* add large scale optimizer fuse(SGD/ADAM)

* rectification init_worker and exe.run startup program
---
 .../operators/distributed/communicator.cc     |  94 ++++++----
 .../operators/distributed/communicator.h      |  10 +-
 .../operators/distributed/parameter_recv.cc   |  86 ++++++++-
 .../lookup_sparse_table_fuse_adam_op.cc       | 153 ++++++++++++++++
 .../lookup_sparse_table_fuse_adam_op.h        | 142 +++++++++++++++
 .../lookup_sparse_table_fuse_sgd_op.cc        | 120 ++++++++++++
 .../lookup_sparse_table_fuse_sgd_op.h         | 105 +++++++++++
 .../operators/distributed_ops/recv_op.cc      |  11 +-
 .../fleet/runtime/parameter_server_runtime.py |   6 +-
 .../distribute_transpiler/__init__.py         |   4 +-
 .../fleet/parameter_server/ir/pserver_pass.py |  87 ++++++++-
 .../fleet/parameter_server/ir/trainer_pass.py |  16 --
 .../incubate/fleet/tests/fleet_deep_ctr.py    |   2 +-
 .../fluid/tests/unittests/dist_fleet_ctr.py   |   6 +-
 .../tests/unittests/dist_fleet_ctr_ps_gpu.py  |   5 +-
 .../tests/unittests/dist_fleet_heter_ctr.py   |   5 +-
 .../tests/unittests/dist_fleet_simnet_bow.py  |   2 +-
 .../dist_fleet_sparse_embedding_ctr.py        |   3 +-
 .../unittests/test_communicator_async.py      |   5 +-
 .../tests/unittests/test_communicator_geo.py  |   2 +-
 .../unittests/test_communicator_half_async.py |   2 +-
 .../tests/unittests/test_communicator_sync.py |   6 +-
 .../test_dist_fleet_a_sync_optimizer_async.py |  30 +--
 .../test_dist_fleet_a_sync_optimizer_sync.py  |  15 +-
 .../tests/unittests/test_dist_fleet_ps4.py    |  20 +-
 .../tests/unittests/test_dist_fleet_ps5.py    |   3 +-
 .../tests/unittests/test_dist_fleet_ps6.py    | 168 +++++++++++++++++
 .../test_dist_lookup_sparse_table_fuse_ops.py | 171 ++++++++++++++++++
 28 files changed, 1137 insertions(+), 142 deletions(-)
 create mode 100644 paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.cc
 create mode 100644 paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h
 create mode 100644 paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.cc
 create mode 100644 paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py

diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc
index b2cc9390fa2..a0ac82a6f4a 100644
--- a/paddle/fluid/operators/distributed/communicator.cc
+++ b/paddle/fluid/operators/distributed/communicator.cc
@@ -74,8 +74,12 @@ void AsyncCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx,
   } else {
     recv_threadpool_.reset(new ::ThreadPool(thread_pool_size_));
   }
+
+  InitParams();
 }
 
+void AsyncCommunicator::InitParams() { RecvNoBarrier(); }
+
 AsyncCommunicator::~AsyncCommunicator() {
   running_ = false;
   if (main_thread_) main_thread_->join();
@@ -157,16 +161,18 @@ void AsyncCommunicator::MainThread() {
   }
 
   while (running_) {
-    int meet = Meet();
-
-    VLOG(1) << "async_meet: " << meet;
-
-    SendGlobalStep(meet);
-    SendByCommunicator(meet);
-    BarrierSend();
-    RecvByCommunicator();
-    BarrierRecv();
-    BarrierWeakUp();
+    int batches = BatchesCounter();
+
+    if (batches > 0) {
+      SendGlobalStep(batches);
+      SendByCommunicator(batches);
+      BarrierSend();
+      RecvByCommunicator();
+      BarrierRecv();
+      BarrierWeakUp();
+    } else {
+      VLOG(1) << "get nothing from sending queue, will skip send/recv";
+    }
   }
   VLOG(1) << "communicator stopped, send thread exit";
 }
@@ -187,7 +193,7 @@ void AsyncCommunicator::RecvNoBarrier() {
       auto &var_name = iter.first;
       VLOG(4) << "recv var " << var_name;
       auto recv_functor = distributed::ParameterRecv<float>();
-      recv_functor(iter.second, *recv_scope_, false);
+      recv_functor(iter.second, *recv_scope_);
     };
     task_futures.emplace_back(recv_threadpool_->enqueue(std::move(recv_task)));
   }
@@ -197,7 +203,7 @@ void AsyncCommunicator::RecvNoBarrier() {
   }
 }
 
-int AsyncCommunicator::Meet() {
+int AsyncCommunicator::BatchesCounter() {
   auto &step_queue = send_varname_to_queue_.at(STEP_COUNTER);
 
   size_t merged_var_num = 0;
@@ -316,7 +322,7 @@ void HalfAsyncCommunicator::Clean() {
   }
 }
 
-int HalfAsyncCommunicator::Meet() {
+int HalfAsyncCommunicator::BatchesCounter() {
   while (running_) {
     if (barrier_counter_.load() >= barrier_trigger_.load() &&
         barrier_trigger_.load() != 0) {
@@ -443,7 +449,7 @@ void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx,
   old_scope_.reset(new Scope());
   pserver_scope_.reset(new Scope());
 
-  Init();
+  InitParams();
 }
 
 void GeoCommunicator::Send(const std::vector<std::string> &var_names,
@@ -626,9 +632,7 @@ void GeoCommunicator::RecvByCommunicator() {
       if (recv_ctx.is_sparse) {
         RecvSparse(var_name);
       } else {
-        VLOG(1) << "recv dense " << var_name << " begin";
         RecvDense(var_name);
-        VLOG(1) << "recv dense " << var_name << " done";
       }
     };
     tasks.emplace_back(send_threadpool_->enqueue(std::move(recv_task)));
@@ -696,7 +700,7 @@ void GeoCommunicator::RecvDense(const std::string &varname) {
 
   auto &ctx = recv_varname_to_ctx_.at(varname);
   auto recv = distributed::ParameterRecv<float>();
-  recv(ctx, *pserver_scope_, true);
+  recv(ctx, *pserver_scope_);
 
   PADDLE_ENFORCE_EQ(
       var_psrever->IsInitialized(), true,
@@ -721,7 +725,7 @@ void GeoCommunicator::RecvDense(const std::string &varname) {
              t_timestamp->data<float>());
 }
 
-void GeoCommunicator::Init() {
+void GeoCommunicator::InitParams() {
   std::vector<std::future<void>> tasks;
   tasks.reserve(recv_varname_to_ctx_.size());
 
@@ -744,12 +748,17 @@ void GeoCommunicator::Init() {
 }
 
 void GeoCommunicator::InitDense(const std::string varname) {
-  auto *var = old_scope_->Var(varname);
-  var->GetMutable<framework::LoDTensor>();
-
   auto &ctx = recv_varname_to_ctx_.at(varname);
   auto recv = distributed::ParameterRecv<float>();
-  recv(ctx, *old_scope_);
+  recv(ctx, *recv_scope_);
+
+  auto *global_var = recv_scope_->FindVar(varname);
+  global_var->GetMutable<framework::LoDTensor>();
+
+  auto *old_var = old_scope_->Var(varname);
+  old_var->GetMutable<framework::LoDTensor>();
+
+  framework::CopyVariable(*global_var, old_var);
   VLOG(1) << "init dense variable " << varname << " done";
 }
 
@@ -781,22 +790,41 @@ void GeoCommunicator::InitSparse() {
 
   LargeScaleKV::Init(metas);
 
-  for (size_t i = 0; i < metas.size(); i++) {
-    auto &varname = metas[i].name;
-    auto &dict = dicts[i];
+  for (auto &meta : metas) {
+    auto &ctx = recv_varname_to_ctx_.at(meta.name);
+    auto recv = distributed::ParameterRecv<float>();
 
-    std::vector<int64_t> ids;
-    ids.reserve(dict);
+    auto *global_var = recv_scope_->FindVar(meta.name);
+    auto global_value = global_var->Get<framework::LoDTensor>();
+    auto rows = global_value.dims()[0];
+    auto dim1 = global_value.dims()[1];
 
-    for (auto j = 0; j < dict; ++j) {
-      ids.push_back(j);
-    }
+    recv(ctx, *recv_scope_);
+    VLOG(1) << "recv " << meta.name << " with global scope for init";
+
+    auto n_rows = global_var->Get<framework::LoDTensor>().dims()[0];
+
+    PADDLE_ENFORCE_EQ(
+        rows, n_rows,
+        platform::errors::InvalidArgument(
+            "global var: %s origin dim must equal recved rows", meta.name));
+
+    std::vector<int64_t> ids(rows);
+    std::iota(ids.begin(), ids.end(), 0);
 
     auto *ins = distributed::LargeScaleKV::GetInstance();
-    ins->Get(varname)->Init(ids);
+    std::vector<std::vector<std::vector<float> *>> values;
+
+    ins->Get(meta.name)->Init(ids);
+    ins->Get(meta.name)->Get(ids, {"Param"}, &values);
 
-    VLOG(3) << "GeoCommunicator init sparse " << varname << " with size "
-            << ids.size();
+    auto blas = math::GetBlas<platform::CPUDeviceContext, float>(
+        paddle::platform::CPUDeviceContext());
+
+    for (auto &id : ids) {
+      blas.VCOPY(dim1, global_value.data<float>() + id * dim1,
+                 values[id][0]->data());
+    }
   }
 
   VLOG(3) << "init sparse variable done";
diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h
index 2f6da150d1e..4a9a9eb1701 100644
--- a/paddle/fluid/operators/distributed/communicator.h
+++ b/paddle/fluid/operators/distributed/communicator.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <deque>
 #include <map>
 #include <memory>
+#include <numeric>
 #include <set>
 #include <string>
 #include <unordered_map>
@@ -29,6 +30,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/operators/distributed/communicator_common.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/large_scale_kv.h"
@@ -279,6 +281,8 @@ class AsyncCommunicator : public Communicator {
                 const RpcCtxMap &recv_varname_to_ctx,
                 Scope *recv_scope) override;
 
+  void InitParams();
+
   void MainThread();
 
   void Send(const std::vector<std::string> &var_names,
@@ -293,7 +297,7 @@ class AsyncCommunicator : public Communicator {
 
   virtual void RecvNoBarrier();
 
-  virtual int Meet();
+  virtual int BatchesCounter();
 
   virtual void BarrierSend() {}
 
@@ -350,7 +354,7 @@ class HalfAsyncCommunicator : public AsyncCommunicator {
 
   void BarrierTriggerReset(int initial_val) override;
 
-  int Meet();
+  int BatchesCounter();
 
   void BarrierWeakUp();
 
@@ -435,7 +439,7 @@ class GeoCommunicator : public AsyncCommunicator {
 
   void RecvDense(const std::string &varname);
 
-  void Init();
+  void InitParams();
 
   void InitSparse();
 
diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc
index 5409ec54987..3b8479c91b0 100644
--- a/paddle/fluid/operators/distributed/parameter_recv.cc
+++ b/paddle/fluid/operators/distributed/parameter_recv.cc
@@ -41,8 +41,67 @@ using SelectedRows = framework::SelectedRows;
 using DDim = framework::DDim;
 
 template <typename T>
-void RecvSelectedRows(const CommContext &rpc_ctx,
-                      const framework::Scope &scope) {
+void RecvSparseLodTensor(const CommContext &rpc_ctx,
+                         const framework::Scope &scope) {
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto cpu_place = platform::CPUPlace();
+  auto &cpu_ctx = *pool.Get(cpu_place);
+
+  distributed::RPCClient *rpc_client =
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id);
+
+  std::unique_ptr<framework::Scope> local_scope = scope.NewTmpScope();
+  std::vector<const float *> tensors;
+  std::vector<distributed::VarHandlePtr> rets;
+  for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) {
+    auto &recv_var_name = rpc_ctx.splited_varnames[i];
+    auto *local_var = local_scope->Var(recv_var_name);
+    VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i];
+    // sparse param in recv_scope is LoDTensor
+    rets.push_back(rpc_client->AsyncGetVarNoBarrier(
+        rpc_ctx.epmap[i], cpu_ctx, *local_scope.get(), recv_var_name,
+        recv_var_name));
+
+    const auto *value = local_var->Get<framework::LoDTensor>().data<float>();
+    tensors.push_back(value);
+  }
+
+  for (size_t i = 0; i < rets.size(); i++) {
+    PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout(
+                                               "internal error in RPCClient"));
+  }
+
+  auto *merged_var = scope.FindVar(rpc_ctx.var_name);
+
+  if (merged_var == nullptr || !merged_var->IsInitialized()) {
+    PADDLE_THROW(
+        platform::errors::InvalidArgument("%s must initialized at first."));
+  }
+  auto dims1 = merged_var->Get<framework::LoDTensor>().dims()[1];
+  int64_t height = 0;
+  for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) {
+    auto *splited_var = local_scope->FindVar(rpc_ctx.splited_varnames[i]);
+    height += splited_var->Get<framework::LoDTensor>().dims()[0];
+  }
+
+  PADDLE_ENFORCE_EQ(merged_var->Get<framework::LoDTensor>().dims()[0], height,
+                    "recved var must has same dims with local var");
+
+  auto *merged_t = merged_var->GetMutable<framework::LoDTensor>();
+  auto *merged_d = merged_t->mutable_data<float>(cpu_place);
+
+  auto pserver_num = rpc_ctx.splited_varnames.size();
+  for (int x = 0; x < height; ++x) {
+    auto id = x % pserver_num;
+    auto idx = x / pserver_num;
+    std::memcpy(merged_d + x * dims1, tensors[id] + idx * dims1,
+                sizeof(float) * dims1);
+  }
+}
+
+template <typename T>
+void RecvGeoSparseRecords(const CommContext &rpc_ctx,
+                          const framework::Scope &scope) {
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto cpu_place = platform::CPUPlace();
   auto &cpu_ctx = *pool.Get(cpu_place);
@@ -84,9 +143,14 @@ void RecvSelectedRows(const CommContext &rpc_ctx,
     ids_num += recv_t.rows().size();
     width = recv_t.value().dims()[1];
 
-    std::transform(recv_t.rows().begin(), recv_t.rows().end(),
-                   std::back_inserter(all_ids),
-                   [&](int64_t id) { return id * pserver_num + i; });
+    if (rpc_ctx.is_distributed) {
+      std::copy(recv_t.rows().begin(), recv_t.rows().end(),
+                std::back_inserter(all_ids));
+    } else {
+      std::transform(recv_t.rows().begin(), recv_t.rows().end(),
+                     std::back_inserter(all_ids),
+                     [&](int64_t id) { return id * pserver_num + i; });
+    }
   }
 
   auto *var = scope.FindVar(rpc_ctx.var_name);
@@ -146,7 +210,8 @@ void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) {
 
 template <typename T>
 void ParameterRecv<T>::operator()(const CommContext &rpc_ctx,
-                                  const framework::Scope &scope, bool barrier) {
+                                  const framework::Scope &scope,
+                                  bool geo_records) {
   VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name;
 
   PADDLE_ENFORCE_GE(rpc_ctx.origin_varnames.size(), 1,
@@ -154,18 +219,21 @@ void ParameterRecv<T>::operator()(const CommContext &rpc_ctx,
                         "origin_varnames.size() >= 1 is permitted"));
 
   if (rpc_ctx.is_sparse) {
-    RecvSelectedRows<T>(rpc_ctx, scope);
+    if (geo_records) {
+      RecvGeoSparseRecords<T>(rpc_ctx, scope);
+    } else {
+      RecvSparseLodTensor<T>(rpc_ctx, scope);
+    }
   } else {
     RecvLodTensor<T>(rpc_ctx, scope);
   }
 
   VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name;
 }
-
 template <typename T>
 void ParameterRecv<T>::operator()(const CommContext &rpc_ctx,
                                   const framework::Scope &scope) {
-  this->operator()(rpc_ctx, scope, true);
+  this->operator()(rpc_ctx, scope, false);
 }
 
 template struct ParameterRecv<float>;
diff --git a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.cc b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.cc
new file mode 100644
index 00000000000..e53ce8cc67c
--- /dev/null
+++ b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.cc
@@ -0,0 +1,153 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h"
+
+#include <string>
+namespace paddle {
+namespace operators {
+
+class LargeScaleFuseAdamOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of LargeScaleFuseAdamOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("LearningRate"),
+        "Input(LearningRate) of LargeScaleFuseAdamOp should not be null.");
+
+    auto lr_dims = ctx->GetInputDim("LearningRate");
+
+    PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
+                      "Maybe the Input variable LearningRate has not "
+                      "been initialized. You may need to confirm "
+                      "if you put exe.run(startup_program) "
+                      "after optimizer.minimize function.");
+
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "Learning rate should have 1 element");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Grad");
+    return framework::OpKernelType(data_type, ctx.device_context());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string &var_name, const framework::Tensor &tensor,
+      const framework::OpKernelType &expected_kernel_type) const {
+    if (var_name == "LearningRate") {
+      return framework::OpKernelType(tensor.type(), tensor.place(),
+                                     tensor.layout());
+    }
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+};
+
+class LargeScaleFuseAdamOpInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto in_var_type = ctx->GetInputType("Grad");
+    PADDLE_ENFORCE_EQ(in_var_type == framework::proto::VarType::SELECTED_ROWS ||
+                          in_var_type == framework::proto::VarType::LOD_TENSOR,
+                      true, platform::errors::InvalidArgument(
+                                "The input Var's type should be LoDtensor or "
+                                "SelectedRows, but the received type is %s",
+                                in_var_type));
+  }
+};
+
+class LargeScaleFuseAdamOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Grad",
+             "(SelectedRows) Ids's type should be SelectedRows"
+             "THe ids to be looked up in W.");
+
+    AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
+    AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
+    AddInput("LearningRate", "(Tensor) Learning rate of SGD");
+    AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator");
+    AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator");
+
+    AddAttr<float>("beta1",
+                   "(float, default 0.9) "
+                   "Exponential decay rate for the "
+                   "first moment estimates.")
+        .SetDefault(0.9f);
+
+    AddAttr<float>("beta2",
+                   "(float, default 0.999) "
+                   "exponential decay rate for the "
+                   "second moment estimates.")
+        .SetDefault(0.999f);
+
+    AddAttr<float>("epsilon",
+                   "(float, default 1.0e-8) "
+                   "Constant for numerical stability")
+        .SetDefault(1.0e-8f);
+
+    AddAttr<bool>("is_entry",
+                  "(bool)"
+                  "sparse table need entry");
+
+    AddAttr<std::string>("tablename",
+                         "(string)"
+                         "sparse table name");
+
+    AddAttr<std::vector<std::string>>("value_names",
+                                      "(strings)"
+                                      "sparse table name");
+
+    AddComment(R"DOC(
+Adam Optimizer.
+
+This implements the Adam optimizer from Section 2 of the Adam
+paper : https://arxiv.org/abs/1412.6980.
+Adam is a first-order gradient-based optimization method based on
+adaptive estimates of lower-order moments.
+
+Adam updates:
+
+$$
+moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\
+moment\_2_\out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\
+learning\_rate = learning\_rate *
+                  \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\
+param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
+$$
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(
+    lookup_sparse_table_fuse_adam, ops::LargeScaleFuseAdamOp,
+    ops::LargeScaleFuseAdamOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    ops::LargeScaleFuseAdamOpInferVarType);
+
+REGISTER_OP_CPU_KERNEL(
+    lookup_sparse_table_fuse_adam,
+    ops::LargeScaleFuseAdamOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h
new file mode 100644
index 00000000000..89b8d54a463
--- /dev/null
+++ b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h
@@ -0,0 +1,142 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <math.h>  // for sqrt in CPU and CUDA
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/distributed/large_scale_kv.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class LargeScaleFuseAdamOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override;
+};
+
+template <typename T>
+class LargeScaleFuseAdamOpKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    using paddle::framework::LoDTensor;
+
+    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+    const auto *grad_var = ctx.InputVar("Grad");
+
+    PADDLE_ENFORCE(
+        grad_var->IsType<framework::SelectedRows>(),
+        platform::errors::InvalidArgument(
+            "in large scale optimize, gradient should only be SelectedRows"));
+
+    const auto &grad = grad_var->Get<framework::SelectedRows>();
+
+    // for distributed training, a sparse var may be empty,
+    // just skip updating.
+    if (grad.rows().size() == 0) {
+      return;
+    }
+
+    framework::SelectedRows tmp_grad_merge;
+    const framework::SelectedRows *grad_merge_ptr;
+    math::scatter::MergeAdd<platform::CPUDeviceContext, T> merge_func;
+    merge_func(ctx.template device_context<platform::CPUDeviceContext>(), grad,
+               &tmp_grad_merge, true);
+    grad_merge_ptr = &tmp_grad_merge;
+
+    std::vector<int64_t> in_rows;
+    in_rows.reserve(grad_merge_ptr->rows().size());
+    std::copy(grad_merge_ptr->rows().begin(), grad_merge_ptr->rows().end(),
+              std::back_inserter(in_rows));
+
+    const auto *lr = learning_rate->data<T>();
+    auto grad_v = grad_merge_ptr->value();
+    auto grad_width = grad_v.dims()[1];
+
+    //    auto is_entry = context.Attr<bool>("is_entry");
+    auto tablename = ctx.Attr<std::string>("tablename");
+    auto value_names = ctx.Attr<std::vector<std::string>>("value_names");
+
+    auto *beta1_pow = ctx.Input<LoDTensor>("Beta1Pow");
+    auto *beta2_pow = ctx.Input<LoDTensor>("Beta2Pow");
+    auto *beta1_pow_out = ctx.Output<LoDTensor>("Beta1PowOut");
+    auto *beta2_pow_out = ctx.Output<LoDTensor>("Beta2PowOut");
+    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
+    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
+    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
+
+    PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1,
+                      platform::errors::InvalidArgument(
+                          "beta1 pow output size should be 1, but received "
+                          "value is:%d.",
+                          beta1_pow_out->numel()));
+
+    PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1,
+                      platform::errors::InvalidArgument(
+                          "beta2 pow output size should be 1, but received "
+                          "value is:%d.",
+                          beta2_pow_out->numel()));
+
+    // update beta1 and beta2
+    beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
+        beta1 * beta1_pow->data<T>()[0];
+    beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
+        beta2 * beta2_pow->data<T>()[0];
+
+    std::vector<std::vector<std::vector<float> *>> values;
+    std::vector<int64_t> dims;
+
+    auto *ins = distributed::LargeScaleKV::GetInstance();
+    auto *table = ins->Get(tablename);
+    table->Get(in_rows, value_names, &values);
+    table->Dims({"Param"}, &dims);
+
+    PADDLE_ENFORCE_EQ(dims[0], grad_width,
+                      platform::errors::InvalidArgument(
+                          "param_row should have the same size with grad_row"));
+
+    T lr_ = lr[0];
+    T beta1_pow_ = beta1_pow->data<T>()[0];
+    T beta2_pow_ = beta2_pow->data<T>()[0];
+
+    lr_ *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_);
+
+    for (size_t i = 0; i < in_rows.size(); i++) {
+      auto &params = values[i][0];
+      auto &moment_1 = values[i][1];
+      auto &moment_2 = values[i][2];
+
+      auto *p_data = params->data();
+      auto *m1_data = moment_1->data();
+      auto *m2_data = moment_2->data();
+
+      for (int x = 0; x < grad_width; ++x) {
+        auto g = grad_v.data<T>()[grad_width * i + x];
+        m1_data[x] = beta1 * m1_data[x] + (1 - beta1) * g;
+        m2_data[x] = beta2 * m2_data[x] + (1 - beta2) * g * g;
+        p_data[x] -= lr_ * (m1_data[x] / (sqrt(m2_data[x]) + epsilon));
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.cc b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.cc
new file mode 100644
index 00000000000..010658b5280
--- /dev/null
+++ b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.cc
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.h"
+
+#include <string>
+namespace paddle {
+namespace operators {
+
+class LargeScaleFuseSGDOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of LargeScaleFuseSGDOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("LearningRate"),
+        "Input(LearningRate) of LargeScaleFuseSGDOp should not be null.");
+
+    auto lr_dims = ctx->GetInputDim("LearningRate");
+
+    PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
+                      "Maybe the Input variable LearningRate has not "
+                      "been initialized. You may need to confirm "
+                      "if you put exe.run(startup_program) "
+                      "after optimizer.minimize function.");
+
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "Learning rate should have 1 element");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Grad");
+    return framework::OpKernelType(data_type, ctx.device_context());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string &var_name, const framework::Tensor &tensor,
+      const framework::OpKernelType &expected_kernel_type) const {
+    if (var_name == "LearningRate") {
+      return framework::OpKernelType(tensor.type(), tensor.place(),
+                                     tensor.layout());
+    }
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+};
+
+class LargeScaleFuseSGDOpInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto in_var_type = ctx->GetInputType("Grad");
+    PADDLE_ENFORCE_EQ(in_var_type == framework::proto::VarType::SELECTED_ROWS ||
+                          in_var_type == framework::proto::VarType::LOD_TENSOR,
+                      true, platform::errors::InvalidArgument(
+                                "The input Var's type should be LoDtensor or "
+                                "SelectedRows, but the received type is %s",
+                                in_var_type));
+  }
+};
+
+class LargeScaleFuseSGDOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Grad",
+             "(SelectedRows) Ids's type should be SelectedRows"
+             "THe ids to be looked up in W.");
+    AddInput("LearningRate", "(Tensor) Learning rate of SGD");
+    AddAttr<bool>("is_entry",
+                  "(bool)"
+                  "sparse table need entry");
+
+    AddAttr<std::string>("tablename",
+                         "(string)"
+                         "sparse table name");
+
+    AddAttr<std::vector<std::string>>("value_names",
+                                      "(strings)"
+                                      "sparse table name");
+
+    AddComment(R"DOC(
+
+LargeScaleFuseSGD operator
+
+This operator implements one step of the stochastic gradient descent algorithm.
+
+$$param\_out = param - learning\_rate * grad$$
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(
+    lookup_sparse_table_fuse_sgd, ops::LargeScaleFuseSGDOp,
+    ops::LargeScaleFuseSGDOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    ops::LargeScaleFuseSGDOpInferVarType);
+
+REGISTER_OP_CPU_KERNEL(
+    lookup_sparse_table_fuse_sgd,
+    ops::LargeScaleFuseSGDOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.h b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.h
new file mode 100644
index 00000000000..5d4bf1015fa
--- /dev/null
+++ b/paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.h
@@ -0,0 +1,105 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/distributed/large_scale_kv.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class LargeScaleFuseSGDOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override;
+};
+
+template <typename T>
+class LargeScaleFuseSGDOpKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+
+    const auto *grad_var = ctx.InputVar("Grad");
+
+    PADDLE_ENFORCE(
+        grad_var->IsType<framework::SelectedRows>(),
+        platform::errors::InvalidArgument(
+            "in large scale optimize, gradient should only be SelectedRows"));
+
+    const auto &grad = grad_var->Get<framework::SelectedRows>();
+
+    // for distributed training, a sparse var may be empty,
+    // just skip updating.
+    if (grad.rows().size() == 0) {
+      return;
+    }
+
+    framework::SelectedRows tmp_grad_merge;
+    const framework::SelectedRows *grad_merge_ptr;
+    math::scatter::MergeAdd<platform::CPUDeviceContext, T> merge_func;
+    merge_func(ctx.template device_context<platform::CPUDeviceContext>(), grad,
+               &tmp_grad_merge, true);
+    grad_merge_ptr = &tmp_grad_merge;
+
+    std::vector<int64_t> in_rows;
+    in_rows.reserve(grad_merge_ptr->rows().size());
+    std::copy(grad_merge_ptr->rows().begin(), grad_merge_ptr->rows().end(),
+              std::back_inserter(in_rows));
+
+    const auto *lr = learning_rate->data<T>();
+    auto grad_v = grad_merge_ptr->value();
+    auto grad_width = grad_v.dims()[1];
+
+    //    auto is_entry = context.Attr<bool>("is_entry");
+    auto tablename = ctx.Attr<std::string>("tablename");
+    auto value_names = ctx.Attr<std::vector<std::string>>("value_names");
+
+    std::vector<std::vector<std::vector<float> *>> values;
+    std::vector<int64_t> dims;
+
+    auto *ins = distributed::LargeScaleKV::GetInstance();
+    auto *table = ins->Get(tablename);
+    table->Get(in_rows, value_names, &values);
+    table->Dims({"Param"}, &dims);
+
+    PADDLE_ENFORCE_EQ(dims[0], grad_width,
+                      platform::errors::InvalidArgument(
+                          "param_row should have the same size with grad_row"));
+
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+
+    std::vector<T> grads;
+    framework::TensorToVector(grad_v, ctx.device_context(), &grads);
+
+    blas.SCAL(grads.size(), lr[0], grads.data());
+
+    for (int x = 0; x < static_cast<int>(in_rows.size()); ++x) {
+      auto &params = values[x][0];
+      blas.VSUB(grad_width, params->data(), grads.data() + grad_width * x,
+                params->data());
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc
index 15b36baeada..2547ba3acb1 100644
--- a/paddle/fluid/operators/distributed_ops/recv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/recv_op.cc
@@ -37,12 +37,6 @@ class RecvOp : public framework::OperatorBase {
 
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
-    int do_not_run = Attr<int>("do_not_run");
-    if (do_not_run) {
-      VLOG(3) << "recv do not run!";
-      return;
-    }
-
     std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
     std::vector<std::string> varnames =
         Attr<std::vector<std::string>>("varnames");
@@ -63,11 +57,10 @@ class RecvOp : public framework::OperatorBase {
     if (recv_varnames.size() > 0) {
       auto *communicator = distributed::Communicator::GetInstance();
 
-      if (communicator == nullptr) {
+      if (communicator != nullptr) {
         PADDLE_THROW(platform::errors::InvalidArgument(
-            "need run fleet.init_worker first"));
+            "execute startup program must before fleet.init_worker"));
       }
-      communicator->RecvNoBarrier();
     } else {
       std::vector<distributed::VarHandlePtr> rets;
       if (with_barrier) {
diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
index ae5c53b8a37..6dd4661f000 100644
--- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
+++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
@@ -220,12 +220,12 @@ class ParameterServerRuntime(RuntimeBase):
         else:
             model_dirname = None
 
-        if self.role_maker._is_heter_worker():
-            self._init_worker()
-
         executor = self._get_executor()
         executor.run(fluid.default_startup_program())
 
+        if self.role_maker._is_heter_worker():
+            self._init_worker()
+
         if self.role_maker._is_heter_worker():
             return
 
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
index 236cb458be4..e556a98ed75 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
@@ -191,12 +191,14 @@ class FleetTranspiler(Fleet):
         self._communicator = Communicator(
             trainer_config.mode, kwargs,
             trainer_config.get_communicator_flags())
+
         self._communicator.init_with_ctx(send_ctx, recv_ctx)
 
         if not self._communicator.is_running():
             self._communicator.start()
         else:
-            warnings.warn("communicator has been initialized, skip")
+            raise ValueError(
+                "Communicator can only be inited once, please check")
 
     def init_worker(self):
         """
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py
index 05deff10a2e..a60c4e149f5 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py
@@ -624,6 +624,7 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False):
         value_dims = []
         grad = None
         opt_idx = -1
+        fuse = False
 
         for op in block.ops:
             opt_idx += 1
@@ -631,6 +632,9 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False):
             if op.type not in opt_value_map.keys():
                 continue
 
+            if op.type in ["sgd", "adam"]:
+                fuse = True
+
             grad = main_program.global_block().vars[op.input("Grad")[0]]
 
             for value in opt_value_map[op.type]:
@@ -644,7 +648,67 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False):
 
             if value_names:
                 break
-        return grad, opt_idx, value_names, value_dims, acture_names
+        return grad, opt_idx, value_names, value_dims, acture_names, fuse
+
+    def add_fuse_large_scale_op(block, global_block, table_name, value_names,
+                                acture_names, grad, is_entry, opt_idx):
+
+        op = block.ops[opt_idx]
+
+        if op.type == "sgd":
+            grad = main_program.global_block().vars[op.input("Grad")[0]]
+            lr = main_program.global_block().vars[op.input("LearningRate")[0]]
+
+            block._insert_op(
+                opt_idx,
+                type="lookup_sparse_table_fuse_sgd",
+                inputs={"Grad": grad,
+                        "LearningRate": lr},
+                attrs={
+                    "is_entry": is_entry,
+                    "tablename": table_name,
+                    "value_names": value_names
+                })
+
+        elif op.type == "adam":
+            grad = main_program.global_block().vars[op.input("Grad")[0]]
+            lr = main_program.global_block().vars[op.input("LearningRate")[0]]
+            beta1_pow = main_program.global_block().vars[op.input("Beta1Pow")[
+                0]]
+            beta2_pow = main_program.global_block().vars[op.input("Beta2Pow")[
+                0]]
+            beta1_pow_o = main_program.global_block().vars[op.output(
+                "Beta1PowOut")[0]]
+            beta2_pow_o = main_program.global_block().vars[op.output(
+                "Beta2PowOut")[0]]
+
+            beta1 = op.attr('beta1')
+            beta2 = op.attr('beta2')
+            epsilon = op.attr('epsilon')
+
+            block._insert_op(
+                opt_idx,
+                type="lookup_sparse_table_fuse_adam",
+                inputs={
+                    "Grad": grad,
+                    "LearningRate": lr,
+                    "Beta1Pow": beta1_pow,
+                    "Beta2Pow": beta2_pow
+                },
+                outputs={
+                    "Beta1PowOut": beta1_pow_o,
+                    "Beta2PowOut": beta2_pow_o
+                },
+                attrs={
+                    "beta1": beta1,
+                    "beta2": beta2,
+                    "epsilon": epsilon,
+                    "is_entry": is_entry,
+                    "tablename": table_name,
+                    "value_names": value_names
+                })
+        else:
+            raise ValueError("only support sgd/adam optimizer now")
 
     def add_large_scale_op(block, global_block, table_name, value_names,
                            acture_names, grad, is_entry, opt_idx):
@@ -711,24 +775,35 @@ def large_scale_sparse_pass(program, main_program, config, is_startup=False):
         for param, blockid in param_blockid_map.items():
             opt_block = program.block(blockid)
 
-            grad, opt_idx, value_names, value_dims, acture_names = \
+            grad, opt_idx, value_names, value_dims, acture_names, fuse = \
                 get_optimizer_values(opt_block)
 
             entry_attr = get_entry_attr(param)
             is_entry = False if entry_attr == "none" else True
-            add_large_scale_op(opt_block,
-                               program.global_block(), param, value_names,
-                               acture_names, grad, is_entry, opt_idx)
 
+            if fuse:
+                add_fuse_large_scale_op(opt_block,
+                                        program.global_block(), param,
+                                        value_names, acture_names, grad,
+                                        is_entry, opt_idx)
+            else:
+                add_large_scale_op(opt_block,
+                                   program.global_block(), param, value_names,
+                                   acture_names, grad, is_entry, opt_idx)
     else:
         large_scale_kv_metas = []
         for param, blockid in param_blockid_map.items():
             opt_block = main_program.block(blockid)
-            grad, _, value_names, value_dims, acture_names = \
+
+            grad, opt_idx, value_names, value_dims, acture_names, fuse = \
                 get_optimizer_values(opt_block)
 
             entry_attr = get_entry_attr(param)
 
+            if fuse:
+                # remove origin optimzier op
+                opt_block._remove_op(opt_idx)
+
             # training/infer
             mode = "0"
             names_str = ",".join(value_names)
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
index 4543af9820e..3f826da3ae2 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
@@ -227,22 +227,6 @@ def init_from_server_pass(program, config):
     fetch_barrier_out = program.global_block().create_var(
         name=framework.generate_control_dev_var_name())
 
-    recv_ctx = config.get_communicator_recv_context(recv_type=1)
-    recv_varnames = []
-
-    for name, ctxs in recv_ctx.items():
-        recv_varnames.extend(ctxs.origin_varnames())
-
-    program.global_block().append_op(
-        type="recv",
-        inputs={"X": []},
-        outputs={"Out": []},
-        attrs={
-            "recv_varnames": recv_varnames,
-            "trainer_id": config.get_role_id(),
-            RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-        })
-
     program.global_block().append_op(
         type="fetch_barrier",
         inputs={},
diff --git a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py
index 60378aa9827..06a90b78fd2 100644
--- a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py
+++ b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py
@@ -164,8 +164,8 @@ def train(args):
     elif fleet.is_worker():
         logger.info("run trainer")
 
-        fleet.init_worker()
         exe.run(fleet.startup_program)
+        fleet.init_worker()
 
         thread_num = 2
         filelist = []
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
index 8277499fcce..5721445c414 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
@@ -163,8 +163,10 @@ class TestDistCTR2x2(FleetDistRunnerBase):
         """
 
         exe = fluid.Executor(fluid.CPUPlace())
-        fleet.init_worker()
+
         exe.run(fluid.default_startup_program())
+        fleet.init_worker()
+
         batch_size = 4
         train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
         self.reader.decorate_sample_list_generator(train_reader)
@@ -202,8 +204,8 @@ class TestDistCTR2x2(FleetDistRunnerBase):
 
         exe = fluid.Executor(fluid.CPUPlace())
 
-        fleet.init_worker()
         exe.run(fluid.default_startup_program())
+        fleet.init_worker()
 
         thread_num = 2
         batch_size = 128
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
index 0e3c8099277..3852b225234 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
@@ -60,8 +60,9 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2):
         device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
         place = fluid.CUDAPlace(device_id)
         exe = fluid.Executor(place)
-        fleet.init_worker()
+
         exe.run(fleet.startup_program)
+        fleet.init_worker()
 
         batch_size = 4
         train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
@@ -104,8 +105,8 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2):
         place = fluid.CUDAPlace(device_id)
         exe = fluid.Executor(place)
 
-        fleet.init_worker()
         exe.run(fleet.startup_program)
+        fleet.init_worker()
 
         thread_num = 2
         batch_size = 128
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
index 2f938a813d8..470fb98d799 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
@@ -152,8 +152,9 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
         """
 
         exe = fluid.Executor(fluid.CPUPlace())
-        fleet.init_worker()
         exe.run(fluid.default_startup_program())
+        fleet.init_worker()
+
         batch_size = 4
         train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
         self.reader.decorate_sample_list_generator(train_reader)
@@ -176,8 +177,8 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
 
         exe = fluid.Executor(fluid.CPUPlace())
 
-        fleet.init_worker()
         exe.run(fluid.default_startup_program())
+        fleet.init_worker()
 
         thread_num = int(os.getenv("CPU_NUM", 2))
         batch_size = 128
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
index 2ea69e1b676..ff848488739 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
@@ -222,8 +222,8 @@ class TestDistSimnetBow2x2(FleetDistRunnerBase):
         """
 
         exe = fluid.Executor(fluid.CPUPlace())
-        fleet.init_worker()
         exe.run(fluid.default_startup_program())
+        fleet.init_worker()
         batch_size = 4
         # reader
         train_reader = paddle.batch(fake_simnet_reader(), batch_size=batch_size)
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
index 77697896b4d..81530573a60 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
@@ -151,8 +151,9 @@ class TestDistCTR2x2(FleetDistRunnerBase):
         """
 
         exe = fluid.Executor(fluid.CPUPlace())
-        fleet.init_worker()
+
         exe.run(fluid.default_startup_program())
+        fleet.init_worker()
 
         batch_size = 4
 
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_async.py b/python/paddle/fluid/tests/unittests/test_communicator_async.py
index d032d6d75b5..a86b80b2cf9 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_async.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_async.py
@@ -30,11 +30,10 @@ from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distribu
 
 class TestCommunicator(unittest.TestCase):
     def net(self):
-        x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-        y_predict = fluid.layers.fc(input=x, size=1, act=None)
+        x = fluid.layers.data(name='x', shape=[1], dtype='float32')
         y = fluid.layers.data(name='y', shape=[1], dtype='float32')
 
-        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        cost = fluid.layers.square_error_cost(input=x, label=y)
         avg_cost = fluid.layers.mean(cost)
         return avg_cost
 
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
index d9fc9262b31..5916000fba7 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
@@ -83,8 +83,8 @@ class TestCommunicatorGeoEnd2End(unittest.TestCase):
         optimizer = fleet.distributed_optimizer(optimizer, strategy)
         optimizer.minimize(avg_cost)
 
-        fleet.init_worker()
         exe.run(fluid.default_startup_program())
+        fleet.init_worker()
 
         train_reader = paddle.batch(self.fake_reader(), batch_size=24)
         feeder = fluid.DataFeeder(place=place, feed_list=[x, z, y])
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
index 391588780f3..b0f55f2939d 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
@@ -71,8 +71,8 @@ class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase):
         optimizer = fleet.distributed_optimizer(optimizer, strategy)
         optimizer.minimize(avg_cost)
 
-        fleet.init_worker()
         exe.run(fleet.startup_program)
+        fleet.init_worker()
 
         train_reader = paddle.batch(self.fake_reader(), batch_size=24)
         feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_sync.py b/python/paddle/fluid/tests/unittests/test_communicator_sync.py
index c0044d9d620..95b209b1460 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_sync.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_sync.py
@@ -27,11 +27,9 @@ import paddle.distributed.fleet as fleet
 
 class TestCommunicator(unittest.TestCase):
     def net(self):
-        x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-        y_predict = fluid.layers.fc(input=x, size=1, act=None)
+        x = fluid.layers.data(name='x', shape=[1], dtype='float32')
         y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-
-        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        cost = fluid.layers.square_error_cost(input=x, label=y)
         avg_cost = fluid.layers.mean(cost)
         return avg_cost
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
index a82612b0ed2..7f55e956a94 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
@@ -44,16 +44,11 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
         paddle.fluid.framework.switch_startup_program(startup_program)
 
         fleet.init(role_maker.PaddleCloudRoleMaker())
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
 
-        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
+        x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
+        y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
+        avg_cost = paddle.fluid.layers.mean(cost)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = True
@@ -71,7 +66,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
                 sends += 1
             if op.type == "sgd":
                 sgds += 1
-        self.assertEqual(sends, 7)
+        self.assertEqual(sends, 1)
         self.assertEqual(sgds, 0)
 
         fleet.init_worker()
@@ -89,16 +84,11 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
         paddle.fluid.framework.switch_startup_program(startup_program)
 
         fleet.init(role_maker.PaddleCloudRoleMaker())
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
-
-        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
+        y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
+        avg_cost = paddle.fluid.layers.mean(cost)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = True
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
index b05a53c88bb..db3f2afb366 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
@@ -36,16 +36,11 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
 
     def test_gradient_merge_optimizer(self):
         fleet.init(role_maker.PaddleCloudRoleMaker())
-        input_x = paddle.fluid.layers.data(
-            name="x", shape=[32], dtype='float32')
-        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
 
-        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
-        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
-        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
-        cost = paddle.fluid.layers.cross_entropy(
-            input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
+        x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
+        y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
+        avg_cost = paddle.fluid.layers.mean(cost)
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = False
@@ -63,7 +58,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
                 sends += 1
             if op.type == "sgd":
                 sgds += 1
-        self.assertEqual(sends, 6)
+        self.assertEqual(sends, 0)
         self.assertEqual(sgds, 0)
 
         fleet.init_worker()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
index 379bcaf684d..6fe52ba9fe6 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
@@ -70,15 +70,13 @@ class TestPSPassWithBow(unittest.TestCase):
         q = fluid.layers.data(
             name="query_ids", shape=[1], dtype="int64", lod_level=1)
         # embedding
-        q_emb = fluid.layers.embedding(
+        q_emb = fluid.contrib.layers.sparse_embedding(
             input=q,
-            is_distributed=is_distributed,
             size=[dict_dim, emb_dim],
             param_attr=fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(value=0.01),
                 name="__emb__",
-                learning_rate=emb_lr),
-            is_sparse=is_sparse)
+                learning_rate=emb_lr))
         q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim])
         # vsum
         q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
@@ -97,15 +95,13 @@ class TestPSPassWithBow(unittest.TestCase):
         pt = fluid.layers.data(
             name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
         # embedding
-        pt_emb = fluid.layers.embedding(
+        pt_emb = fluid.contrib.layers.sparse_embedding(
             input=pt,
-            is_distributed=is_distributed,
             size=[dict_dim, emb_dim],
             param_attr=fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(value=0.01),
                 name="__emb__",
-                learning_rate=emb_lr),
-            is_sparse=is_sparse)
+                learning_rate=emb_lr))
         pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim])
         # vsum
         pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
@@ -123,15 +119,13 @@ class TestPSPassWithBow(unittest.TestCase):
         nt = fluid.layers.data(
             name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
         # embedding
-        nt_emb = fluid.layers.embedding(
+        nt_emb = fluid.contrib.layers.sparse_embedding(
             input=nt,
-            is_distributed=is_distributed,
             size=[dict_dim, emb_dim],
             param_attr=fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(value=0.01),
                 name="__emb__",
-                learning_rate=emb_lr),
-            is_sparse=is_sparse)
+                learning_rate=emb_lr))
         nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim])
         # vsum
         nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
@@ -167,7 +161,7 @@ class TestPSPassWithBow(unittest.TestCase):
 
         fleet.init(role)
         loss, acc, _ = self.net()
-        optimizer = fluid.optimizer.SGD(base_lr)
+        optimizer = fluid.optimizer.Adam(base_lr)
         strategy = StrategyFactory.create_async_strategy()
         optimizer = fleet.distributed_optimizer(optimizer, strategy)
         optimizer.minimize(loss)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
index fd069793473..c570c4d8cd0 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
@@ -168,12 +168,13 @@ class TestPSPassWithBow(unittest.TestCase):
         fleet.init(role)
         loss, acc, _ = self.net()
 
-        optimizer = fluid.optimizer.SGD(
+        optimizer = fluid.optimizer.Adagrad(
             learning_rate=fluid.layers.exponential_decay(
                 learning_rate=base_lr,
                 decay_steps=500,
                 decay_rate=0.969,
                 staircase=True))
+
         strategy = StrategyFactory.create_async_strategy()
         optimizer = fleet.distributed_optimizer(optimizer, strategy)
         optimizer.minimize(loss)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
new file mode 100644
index 00000000000..d5b1284e3ce
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
@@ -0,0 +1,168 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
+
+# For Net
+base_lr = 0.2
+emb_lr = base_lr * 3
+dict_dim = 1500
+emb_dim = 128
+hid_dim = 128
+margin = 0.1
+sample_rate = 1
+batch_size = 4
+
+
+class TestPSPassWithBow(unittest.TestCase):
+    def net(self):
+        def get_acc(cos_q_nt, cos_q_pt, batch_size):
+            cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
+            cond = fluid.layers.cast(cond, dtype='float64')
+            cond_3 = fluid.layers.reduce_sum(cond)
+            acc = fluid.layers.elementwise_div(
+                cond_3,
+                fluid.layers.fill_constant(
+                    shape=[1], value=batch_size * 1.0, dtype='float64'),
+                name="simnet_acc")
+            return acc
+
+        def get_loss(cos_q_pt, cos_q_nt):
+            loss_op1 = fluid.layers.elementwise_sub(
+                fluid.layers.fill_constant_batch_size_like(
+                    input=cos_q_pt,
+                    shape=[-1, 1],
+                    value=margin,
+                    dtype='float32'),
+                cos_q_pt)
+            loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
+            loss_op3 = fluid.layers.elementwise_max(
+                fluid.layers.fill_constant_batch_size_like(
+                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
+                loss_op2)
+            avg_cost = fluid.layers.mean(loss_op3)
+            return avg_cost
+
+        is_distributed = False
+        is_sparse = True
+
+        # query
+        q = fluid.layers.data(
+            name="query_ids", shape=[1], dtype="int64", lod_level=1)
+        # embedding
+        q_emb = fluid.contrib.layers.sparse_embedding(
+            input=q,
+            size=[dict_dim, emb_dim],
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__emb__",
+                learning_rate=emb_lr))
+        q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim])
+        # vsum
+        q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
+        q_ss = fluid.layers.softsign(q_sum)
+        # fc layer after conv
+        q_fc = fluid.layers.fc(
+            input=q_ss,
+            size=hid_dim,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__q_fc__",
+                learning_rate=base_lr))
+        # label data
+        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+        # pt
+        pt = fluid.layers.data(
+            name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
+        # embedding
+        pt_emb = fluid.contrib.layers.sparse_embedding(
+            input=pt,
+            size=[dict_dim, emb_dim],
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__emb__",
+                learning_rate=emb_lr))
+        pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim])
+        # vsum
+        pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
+        pt_ss = fluid.layers.softsign(pt_sum)
+        # fc layer
+        pt_fc = fluid.layers.fc(
+            input=pt_ss,
+            size=hid_dim,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__fc__",
+                learning_rate=base_lr),
+            bias_attr=fluid.ParamAttr(name="__fc_b__"))
+        # nt
+        nt = fluid.layers.data(
+            name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
+        # embedding
+        nt_emb = fluid.contrib.layers.sparse_embedding(
+            input=nt,
+            size=[dict_dim, emb_dim],
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__emb__",
+                learning_rate=emb_lr))
+        nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim])
+        # vsum
+        nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
+        nt_ss = fluid.layers.softsign(nt_sum)
+        # fc layer
+        nt_fc = fluid.layers.fc(
+            input=nt_ss,
+            size=hid_dim,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__fc__",
+                learning_rate=base_lr),
+            bias_attr=fluid.ParamAttr(name="__fc_b__"))
+        cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc)
+        cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc)
+        # loss
+        avg_cost = get_loss(cos_q_pt, cos_q_nt)
+        # acc
+        acc = get_acc(cos_q_nt, cos_q_pt, batch_size)
+        return [avg_cost, acc, cos_q_pt]
+
+    def test(self):
+        endpoints = [
+            "127.0.0.1:36004", "127.0.0.1:36005", "127.0.0.1:36006",
+            "127.0.0.1:36007"
+        ]
+
+        role = role_maker.UserDefinedRoleMaker(
+            current_id=0,
+            role=role_maker.Role.SERVER,
+            worker_num=2,
+            server_endpoints=endpoints)
+
+        fleet.init(role)
+        loss, acc, _ = self.net()
+        optimizer = fluid.optimizer.Adagrad(base_lr)
+        strategy = StrategyFactory.create_async_strategy()
+        optimizer = fleet.distributed_optimizer(optimizer, strategy)
+        optimizer.minimize(loss)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py b/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py
new file mode 100644
index 00000000000..bca91c536ba
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py
@@ -0,0 +1,171 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+
+
+class TestLookupTableFuseOp(unittest.TestCase):
+    def test_fuse(self):
+        places = [core.CPUPlace()]
+        # currently only support CPU
+        for place in places:
+            self.check_with_place(place)
+
+    def check_with_place(self, place):
+        scope = fluid.global_scope()
+        scope.var("LearningRate").get_tensor().set([0.01], place)
+        scope.var("Ids").get_tensor().set([i for i in range(100)], place)
+
+        init_program = fluid.Program()
+
+        lr = init_program.global_block().create_var(
+            name="LearningRate",
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.LOD_TENSOR,
+            shape=[1],
+            dtype="float32")
+
+        ids = init_program.global_block().create_var(
+            name="Ids",
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.LOD_TENSOR,
+            shape=[100],
+            dtype="int64")
+
+        output = init_program.global_block().create_var(
+            name="output",
+            type=fluid.core.VarDesc.VarType.LOD_TENSOR,
+            shape=[100, 8],
+            dtype="float32")
+
+        metas = []
+        metas.append(
+            "embedding_1.block0:Param,Moment1,Moment2:8,8,8:0:embedding_1@GRAD.block0:embedding_1.block0,embedding_1_moment1_0,embedding_1_moment2_0,kSparseIDs@embedding_1.block0:uniform_random&0&-0.5&0.5,fill_constant&0.0,fill_constant&0.0:none"
+        )
+        metas.append(
+            "embedding_2.block0:Param:8:0:embedding_2@GRAD.block0:embedding_2.block0,kSparseIDs@embedding_2.block0:uniform_random&0&-0.5&0.5:none"
+        )
+
+        init_program.global_block().append_op(
+            type="lookup_sparse_table_init",
+            inputs=None,
+            outputs=None,
+            attrs={"large_scale_metas": metas})
+
+        init_program.global_block().append_op(
+            type="lookup_sparse_table_read",
+            inputs={"Ids": ids},
+            outputs={"Out": output},
+            attrs={
+                "tablename": "embedding_1.block0",
+                "init": True,
+                "value_names": ["Param"],
+            })
+
+        init_program.global_block().append_op(
+            type="lookup_sparse_table_read",
+            inputs={"Ids": ids},
+            outputs={"Out": output},
+            attrs={
+                "tablename": "embedding_2.block0",
+                "init": True,
+                "value_names": ["Param"],
+            })
+
+        executor = fluid.Executor(place)
+        executor.run(init_program)
+
+        training_program = fluid.Program()
+
+        scope.var('Beta1Pow').get_tensor().set(
+            np.array([0]).astype("float32"), place)
+        scope.var('Beta2Pow').get_tensor().set(
+            np.array([0]).astype("float32"), place)
+
+        rows = [0, 1, 2, 3, 4, 5, 6]
+        row_numel = 8
+        w_selected_rows = scope.var('Grad').get_selected_rows()
+        w_selected_rows.set_height(len(rows))
+        w_selected_rows.set_rows(rows)
+        w_array = np.ones((len(rows), row_numel)).astype("float32")
+        for i in range(len(rows)):
+            w_array[i] *= i
+        w_tensor = w_selected_rows.get_tensor()
+        w_tensor.set(w_array, place)
+
+        lr = training_program.global_block().create_var(
+            name="LearningRate",
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.LOD_TENSOR,
+            shape=[1],
+            dtype="float32")
+
+        grads = training_program.global_block().create_var(
+            name="Grad",
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.SELECTED_ROWS,
+            shape=[100, 8],
+            dtype="float32")
+
+        beta1 = training_program.global_block().create_var(
+            name="Beta1Pow",
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.LOD_TENSOR,
+            shape=[1],
+            dtype="float32")
+
+        beta2 = training_program.global_block().create_var(
+            name="Beta2Pow",
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.LOD_TENSOR,
+            shape=[1],
+            dtype="float32")
+
+        training_program.global_block().append_op(
+            type="lookup_sparse_table_fuse_adam",
+            inputs={
+                "Grad": grads,
+                "LearningRate": lr,
+                "Beta1Pow": beta1,
+                "Beta2Pow": beta2,
+            },
+            outputs={"Beta1PowOut": beta1,
+                     "Beta2PowOut": beta2},
+            attrs={
+                "is_entry": False,
+                "tablename": "embedding_1.block0",
+                "value_names": ["Param", "Moment1", "Moment2"],
+            })
+
+        training_program.global_block().append_op(
+            type="lookup_sparse_table_fuse_sgd",
+            inputs={"Grad": grads,
+                    "LearningRate": lr},
+            attrs={
+                "is_entry": False,
+                "tablename": "embedding_2.block0",
+                "value_names": ["Param"],
+            })
+
+        executor.run(training_program)
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab


From 5508c7874492de3e95a2925d214225b6c8558747 Mon Sep 17 00:00:00 2001
From: LutaoChu <30695251+LutaoChu@users.noreply.github.com>
Date: Wed, 23 Sep 2020 17:11:01 +0800
Subject: [PATCH 185/261] Fix bug: The calculation result of Diag_v2 Op under
 large size input is wrong (#27447)

The calculation result of Diag_v2 Op under large size input is wrong
---
 paddle/fluid/operators/diag_v2_op.cu          | 44 ++++++++-------
 .../paddle/fluid/tests/unittests/test_diag.py | 55 ++++++++++++++++++-
 2 files changed, 77 insertions(+), 22 deletions(-)

diff --git a/paddle/fluid/operators/diag_v2_op.cu b/paddle/fluid/operators/diag_v2_op.cu
index 4386cc6b818..12ea31945f8 100644
--- a/paddle/fluid/operators/diag_v2_op.cu
+++ b/paddle/fluid/operators/diag_v2_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+#include <tuple>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/diag_v2_op.h"
 
@@ -58,6 +59,17 @@ class DiagV2CUDAKernel : public framework::OpKernel<T> {
     auto out_dims = out->dims();
     auto& dev_ctx = context.template device_context<DeviceContext>();
 
+    auto GetBlockGridSize = [&dev_ctx](int64_t size) {
+      const int64_t block_size =
+          std::min(size, static_cast<int64_t>(dev_ctx.GetMaxThreadsPerBlock()));
+      int64_t max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+      const int64_t max_blocks = std::max(((max_threads - 1) / block_size + 1),
+                                          static_cast<int64_t>(1));
+      const int64_t grid_size =
+          std::min(max_blocks, (size + block_size - 1) / block_size);
+      return std::tuple<int64_t, int64_t>{block_size, grid_size};
+    };
+
     if (x_dims.size() == 1) {
       float padding_value = context.Attr<float>("padding_value");
       math::SetConstant<DeviceContext, T> set_padding_value;
@@ -67,26 +79,23 @@ class DiagV2CUDAKernel : public framework::OpKernel<T> {
       auto size = (offset > 0) ? x_length + offset : x_length - offset;
       const int& x_stride = ComputeStride(0, x_dims);
       if (size > 0) {
-        const int block_num = std::min(static_cast<int>(size),
-                                       dev_ctx.GetMaxPhysicalThreadCount());
-        int size_ = static_cast<int>(size);
-        int block_num_ = static_cast<int>(block_num);
-        const int grid_num =
-            std::min(1024, (size_ + block_num_ - 1) / block_num_);
         const auto& out_stride_0 = ComputeStride(0, out_dims);
         const auto& out_stride_1 = ComputeStride(1, out_dims);
         auto start =
             (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0);
 
-        PasteDiagonalKernel<T><<<grid_num, block_num, 0, dev_ctx.stream()>>>(
-            out_data, x_data, start, x_length, out_stride_0 + out_stride_1,
-            x_stride);
+        std::tuple<int64_t, int64_t> block_grid_size = GetBlockGridSize(size);
+
+        PasteDiagonalKernel<
+            T><<<std::get<1>(block_grid_size), std::get<0>(block_grid_size), 0,
+                 dev_ctx.stream()>>>(out_data, x_data, start, x_length,
+                                     out_stride_0 + out_stride_1, x_stride);
       }
     } else {
       const int& x_stride_0 = ComputeStride(0, x_dims);
       const int& x_stride_1 = ComputeStride(1, x_dims);
 
-      int size;
+      int64_t size;
       if (offset > 0) {
         size = std::min(x_dims[0], x_dims[1] - offset);
       } else {
@@ -94,18 +103,15 @@ class DiagV2CUDAKernel : public framework::OpKernel<T> {
       }
 
       if (size > 0) {
-        const int block_num = std::min(static_cast<int>(size),
-                                       dev_ctx.GetMaxPhysicalThreadCount());
-        int size_ = static_cast<int>(size);
-        int block_num_ = static_cast<int>(block_num);
-        const int grid_num =
-            std::min(1024, (size_ + block_num_ - 1) / block_num_);
         auto start = (offset >= 0 ? offset * x_stride_1 : -offset * x_stride_0);
         const auto& out_stride_0 = ComputeStride(0, out_dims);
 
-        ExtractDiagonalKernel<T><<<grid_num, block_num, 0, dev_ctx.stream()>>>(
-            out_data, x_data, start, size, x_stride_0 + x_stride_1,
-            out_stride_0);
+        std::tuple<int64_t, int64_t> block_grid_size = GetBlockGridSize(size);
+
+        ExtractDiagonalKernel<
+            T><<<std::get<1>(block_grid_size), std::get<0>(block_grid_size), 0,
+                 dev_ctx.stream()>>>(out_data, x_data, start, size,
+                                     x_stride_0 + x_stride_1, out_stride_0);
       }
     }
   }
diff --git a/python/paddle/fluid/tests/unittests/test_diag.py b/python/paddle/fluid/tests/unittests/test_diag.py
index 780d57b5331..ddf1240e4ef 100644
--- a/python/paddle/fluid/tests/unittests/test_diag.py
+++ b/python/paddle/fluid/tests/unittests/test_diag.py
@@ -119,6 +119,16 @@ class TestDiagV2API(unittest.TestCase):
             (n, n)) + np.diag(self.input_np3, self.offset) - np.diag(
                 self.padding_value * np.ones(n))
 
+        self.input_np4 = np.random.random(size=(2000, 2000)).astype(np.float32)
+        self.expected6 = np.diag(self.input_np4)
+        self.expected7 = np.diag(self.input_np4, k=1)
+        self.expected8 = np.diag(self.input_np4, k=-1)
+
+        self.input_np5 = np.random.random(size=(2000)).astype(np.float32)
+        self.expected9 = np.diag(self.input_np5)
+        self.expected10 = np.diag(self.input_np5, k=1)
+        self.expected11 = np.diag(self.input_np5, k=-1)
+
     def run_imperative(self):
         x = paddle.to_tensor(self.input_np)
         y = paddle.diag(x)
@@ -141,10 +151,32 @@ class TestDiagV2API(unittest.TestCase):
         y = paddle.diag(x, padding_value=-8)
         self.assertTrue(np.allclose(y.numpy(), self.expected5))
 
+        x = paddle.to_tensor(self.input_np4)
+        y = paddle.diag(x)
+        self.assertTrue(np.allclose(y.numpy(), self.expected6))
+
+        y = paddle.diag(x, offset=1)
+        self.assertTrue(np.allclose(y.numpy(), self.expected7))
+
+        y = paddle.diag(x, offset=-1)
+        self.assertTrue(np.allclose(y.numpy(), self.expected8))
+
+        x = paddle.to_tensor(self.input_np5)
+        y = paddle.diag(x)
+        self.assertTrue(np.allclose(y.numpy(), self.expected9))
+
+        y = paddle.diag(x, offset=1)
+        self.assertTrue(np.allclose(y.numpy(), self.expected10))
+
+        y = paddle.diag(x, offset=-1)
+        self.assertTrue(np.allclose(y.numpy(), self.expected11))
+
     def run_static(self, use_gpu=False):
         x = paddle.data(name='input', shape=[10, 10], dtype='float32')
         x2 = paddle.data(name='input2', shape=[100], dtype='float64')
         x3 = paddle.data(name='input3', shape=[100], dtype='int64')
+        x4 = paddle.data(name='input4', shape=[2000, 2000], dtype='float32')
+        x5 = paddle.data(name='input5', shape=[2000], dtype='float32')
         result0 = paddle.diag(x)
         result1 = paddle.diag(x, offset=1)
         result2 = paddle.diag(x, offset=-1)
@@ -152,17 +184,28 @@ class TestDiagV2API(unittest.TestCase):
         result4 = paddle.diag(x2, padding_value=8)
         result5 = paddle.diag(x3, padding_value=8.0)
         result6 = paddle.diag(x3, padding_value=-8)
+        result7 = paddle.diag(x4)
+        result8 = paddle.diag(x4, offset=1)
+        result9 = paddle.diag(x4, offset=-1)
+        result10 = paddle.diag(x5)
+        result11 = paddle.diag(x5, offset=1)
+        result12 = paddle.diag(x5, offset=-1)
 
         place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
-        res0, res1, res2, res4, res5, res6 = exe.run(
+        res0, res1, res2, res4, res5, res6, res7, res8, res9, res10, res11, res12 = exe.run(
             feed={
                 "input": self.input_np,
                 "input2": self.input_np2,
-                'input3': self.input_np3
+                'input3': self.input_np3,
+                'input4': self.input_np4,
+                'input5': self.input_np5
             },
-            fetch_list=[result0, result1, result2, result4, result5, result6])
+            fetch_list=[
+                result0, result1, result2, result4, result5, result6, result7,
+                result8, result9, result10, result11, result12
+            ])
 
         self.assertTrue(np.allclose(res0, self.expected0))
         self.assertTrue(np.allclose(res1, self.expected1))
@@ -171,6 +214,12 @@ class TestDiagV2API(unittest.TestCase):
         self.assertTrue(np.allclose(res4, self.expected3))
         self.assertTrue(np.allclose(res5, self.expected4))
         self.assertTrue(np.allclose(res6, self.expected5))
+        self.assertTrue(np.allclose(res7, self.expected6))
+        self.assertTrue(np.allclose(res8, self.expected7))
+        self.assertTrue(np.allclose(res9, self.expected8))
+        self.assertTrue(np.allclose(res10, self.expected9))
+        self.assertTrue(np.allclose(res11, self.expected10))
+        self.assertTrue(np.allclose(res12, self.expected11))
 
     def test_cpu(self):
         paddle.disable_static(place=paddle.fluid.CPUPlace())
-- 
GitLab


From 1e1ae5c54d31f6167c644d769d09e188495f0816 Mon Sep 17 00:00:00 2001
From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Date: Wed, 23 Sep 2020 19:06:50 +0800
Subject: [PATCH 186/261] Make the Bind Method of Tensor more automatic
 (#27270)

* Makes the Bind Method more intelligent

* Makes the Bind Method more intelligent

* fix unittest

* fix unittest

* fix conflict
---
 paddle/fluid/pybind/imperative.cc             |  62 +++++------
 python/paddle/fluid/dygraph/math_op_patch.py  | 101 +++++-------------
 python/paddle/fluid/layers/math_op_patch.py   |  32 ++----
 .../tests/unittests/rnn/test_rnn_cells.py     |  14 +--
 .../tests/unittests/rnn/test_rnn_nets.py      |  26 ++---
 .../unittests/test_math_op_patch_var_base.py  |  88 ++++++++++++---
 .../fluid/tests/unittests/test_minimum_op.py  |   8 +-
 .../fluid/tests/unittests/test_mse_loss.py    |   9 +-
 .../fluid/tests/unittests/test_nll_loss.py    |  12 +--
 .../unittests/test_nn_margin_rank_loss.py     |  18 ++--
 .../tests/unittests/test_nn_sigmoid_op.py     |   4 +-
 .../fluid/tests/unittests/test_numel_op.py    |   4 +-
 .../fluid/tests/unittests/test_ones_like.py   |   2 +-
 .../tests/unittests/test_pairwise_distance.py |   4 +-
 .../fluid/tests/unittests/test_sort_op.py     |   4 +-
 .../fluid/tests/unittests/test_tile_op.py     |   6 +-
 .../tests/unittests/test_transformer_api.py   |  76 ++++++-------
 .../fluid/tests/unittests/test_warpctc_op.py  |  16 +--
 python/paddle/tensor/manipulation.py          |   2 +-
 19 files changed, 229 insertions(+), 259 deletions(-)

diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 489dd198876..da9900e2b27 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -649,61 +649,47 @@ void BindImperative(py::module *m_ptr) {
              return self.NewVarBase(tensor.place(), false);
            },
            py::return_value_policy::copy, R"DOC(
-        **Notes**:
-            **This API is ONLY available in Dygraph mode**
 
-        Returns a new Variable, detached from the current graph.
-
-        Returns:
-             ( :ref:`api_guide_Variable_en` | dtype is same as current Variable): The detached Variable.
+        Returns a new Tensor, detached from the current graph.
 
+        Returns: The detached Tensor.
 
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
-                from paddle.fluid.dygraph.base import to_variable
-                from paddle.fluid.dygraph import Linear
-                import numpy as np
-
-                data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32')
-                with fluid.dygraph.guard():
-                    linear = Linear(32, 64)
-                    data = to_variable(data)
-                    x = linear(data)
-                    y = x.detach()
+                import paddle
+                paddle.disable_static()
 
+                linear = Linear(32, 64)
+                data = paddle.uniform(shape=[30, 10, 32], -1, 1)
+                x = linear(data)
+                y = x.detach()
        )DOC")
       .def("clear_gradient", &imperative::VarBase::ClearGradient, R"DOC(
 
-        **Notes**:
-        **1. This API is ONLY available in Dygraph mode**
-
-        **2. Use it only Variable has gradient, normally we use this for Parameters since other temporal Variable will be deleted by Python's GC**
+        Only for Tensor that has gradient, normally we use this for Parameters since other temporary Tensor doesen't has gradient.
 
-        Clear  (set to ``0`` ) the Gradient of Current Variable
+        The Gradient of current Tensor will be set to ``0`` .
 
         Returns:  None
 
         Examples:
              .. code-block:: python
 
-                import paddle.fluid as fluid
-                import numpy as np
-
-                x = np.ones([2, 2], np.float32)
-                with fluid.dygraph.guard():
-                    inputs2 = []
-                    for _ in range(10):
-                         tmp = fluid.dygraph.base.to_variable(x)
-                         tmp.stop_gradient=False
-                         inputs2.append(tmp)
-                    ret2 = fluid.layers.sums(inputs2)
-                    loss2 = fluid.layers.reduce_sum(ret2)
-                    loss2.backward()
-                    print(loss2.gradient())
-                    loss2.clear_gradient()
-                    print("After clear {}".format(loss2.gradient()))
+                import paddle
+                paddle.disable_static()
+
+                inputs = []
+                for _ in range(10):
+                    tmp = paddle.ones([2, 2])
+                    tmp.stop_gradient=False
+                    inputs.append(tmp)
+                ret = paddle.sums(inputs2)
+                loss = paddle.reduce_sum(ret)
+                loss.backward()
+                print("Before clear_gradient {}".format(loss.grad))
+                loss.clear_gradient()
+                print("After clear_gradient {}".format(loss.grad))
       )DOC")
       .def("_run_backward",
            [](imperative::VarBase &self, const imperative::Tracer &tracer,
diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py
index 3aa7b9dfc26..68206f62860 100644
--- a/python/paddle/fluid/dygraph/math_op_patch.py
+++ b/python/paddle/fluid/dygraph/math_op_patch.py
@@ -17,8 +17,7 @@ from __future__ import print_function
 from .. import core
 from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator
 from ..layers.layer_function_generator import OpProtoHolder
-from ..layers import common_methods
-from . import to_variable, no_grad
+from . import no_grad
 
 import numpy as np
 import six
@@ -53,47 +52,25 @@ def monkey_patch_math_varbase():
 
     def astype(self, dtype):
         """
-        **Notes**:
-            **The variable must be a** :ref:`api_fluid_Tensor`
 
-        Cast a variable to a specified data type.
+        Cast a Tensor to a specified data type.
 
         Args:
-
-            self(Variable): The source variable
-
-            dtype: The target data type
+            dtype: The target data type.
 
         Returns:
-            Variable: Variable with new dtype
+            Tensor: a new Tensor with target dtype
 
         Examples:
-            In Static Graph Mode:
-
-            .. code-block:: python
-
-                import paddle.fluid as fluid
-
-                startup_prog = fluid.Program()
-                main_prog = fluid.Program()
-                with fluid.program_guard(startup_prog, main_prog):
-                    original_variable = fluid.data(name = "new_variable", shape=[2,2], dtype='float32')
-                    new_variable = original_variable.astype('int64')
-                    print("new var's dtype is: {}".format(new_variable.dtype))
-
-            In Dygraph Mode:
-
             .. code-block:: python
 
-                import paddle.fluid as fluid
+                import paddle
                 import numpy as np
 
-                x = np.ones([2, 2], np.float32)
-                with fluid.dygraph.guard():
-                    original_variable = fluid.dygraph.to_variable(x)
-                    print("original var's dtype is: {}, numpy dtype is {}".format(original_variable.dtype, original_variable.numpy().dtype))
-                    new_variable = original_variable.astype('int64')
-                    print("new var's dtype is: {}, numpy dtype is {}".format(new_variable.dtype, new_variable.numpy().dtype))
+                original_tensor = paddle.ones([2, 2])
+                print("original tensor's dtype is: {}".format(original_tensor.dtype))
+                new_tensor = original_tensor.astype('float32')
+                print("new tensor's dtype is: {}".format(new_tensor.dtype))
 
         """
         if not isinstance(dtype, core.VarDesc.VarType):
@@ -147,6 +124,10 @@ def monkey_patch_math_varbase():
     def _ndim_(var):
         return len(var.shape)
 
+    @property
+    def _size_(var):
+        return np.prod(var.shape)
+
     def _scalar_add_(var, value):
         return _scalar_elementwise_op_(var, 1.0, value)
 
@@ -208,7 +189,6 @@ def monkey_patch_math_varbase():
         __impl__.__doc__ = """
         {0}
         Args:
-            self(Tensor): left hand Tensor
             other_var(Tensor|float|int): right hand Tensor
 
         Returns:
@@ -217,23 +197,7 @@ def monkey_patch_math_varbase():
         __impl__.__name__ = method_name
         return __impl__
 
-    # Todo(zhouwei): implement dygraph template to adapt to any function, receive('op_type', 'arg_template')
-    #  Such as _method_creator_('addmm', 'x, y, alpha=1.0, beta=1.0, name=None'). It can reduce call time.
-    def _method_creator_(op_type, arg_template=None):
-        def __impl__(self):
-            op = getattr(core.ops, op_type)
-            return op(self)
-
-        __impl__.__doc__ = """
-
-        See paddle.{}""".format(op_type)
-        __impl__.__name__ = op_type
-
-        return __impl__
-
     varbase_methods = [
-        # Type1: From custom fun or lambda
-        ##   b=-a
         ('__neg__', _neg_),
         ('__float__', _float_),
         ('__long__', _long_),
@@ -244,8 +208,7 @@ def monkey_patch_math_varbase():
         ('dim', lambda x: len(x.shape)),
         ('ndimension', lambda x: len(x.shape)),
         ('ndim', _ndim_),
-        ('size', lambda x: x.shape),
-        # Type2: From Template that create core.ops automatically. It's recommended.
+        ('size', _size_),
         ('__add__',
          _binary_creator_('__add__', 'elementwise_add', False, _scalar_add_)),
         ##  a+b == b+a. Do not need to reverse explicitly
@@ -283,31 +246,7 @@ def monkey_patch_math_varbase():
         ('__le__', _binary_creator_('__le__', 'less_equal', False, None)),
         ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None)),
         ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None)),
-        ('__array_ufunc__', None),
-        ('sigmoid', _method_creator_('sigmoid', 'name=None')),
-        ('log_sigmoid', _method_creator_('logsigmoid', 'name=None')),
-        ('exp', _method_creator_('exp', 'name=None')),
-        ('tanh', _method_creator_('tanh', 'name=None')),
-        ('atan', _method_creator_('atan', 'name=None')),
-        ('tanh_shrink', _method_creator_('tanh_shrink', 'name=None')),
-        ('sqrt', _method_creator_('sqrt', 'name=None')),
-        ('rsqrt', _method_creator_('rsqrt', 'name=None')),
-        ('abs', _method_creator_('abs', 'name=None')),
-        ('ceil', _method_creator_('ceil', 'name=None')),
-        ('floor', _method_creator_('floor', 'name=None')),
-        ('cos', _method_creator_('cos', 'name=None')),
-        ('acos', _method_creator_('acos', 'name=None')),
-        ('asin', _method_creator_('asin', 'name=None')),
-        ('sin', _method_creator_('sin', 'name=None')),
-        ('sinh', _method_creator_('sinh', 'name=None')),
-        ('cosh', _method_creator_('cosh', 'name=None')),
-        ('round', _method_creator_('round', 'name=None')),
-        ('reciprocal', _method_creator_('reciprocal', 'name=None')),
-        ('square', _method_creator_('square', 'name=None')),
-        ('softplus', _method_creator_('softplus', 'name=None')),
-        ('softsign', _method_creator_('softsign', 'name=None')),
-        # Type3: Form module 'paddle.tensor' defaultly.
-        #   It's not a goodway, because it will increase call time.
+        ('__array_ufunc__', None)
     ]
 
     global _already_patch_varbase
@@ -318,7 +257,15 @@ def monkey_patch_math_varbase():
             setattr(core.VarBase, method_name, method_impl)
     else:
         import paddle.tensor
-        for method_name in common_methods:
+        # Tensor method from module paddle.tensor
+        tensor_methods = paddle.tensor.linalg.__all__ + \
+                         paddle.tensor.math.__all__ + \
+                         paddle.tensor.logic.__all__ + \
+                         paddle.tensor.manipulation.__all__ + \
+                         paddle.tensor.search.__all__ + \
+                         paddle.tensor.stat.__all__ + \
+                         paddle.tensor.attribute.__all__
+        for method_name in tensor_methods:
             if hasattr(core.VarBase, method_name): continue
             method_impl = getattr(paddle.tensor, method_name, None)
             if method_impl: setattr(core.VarBase, method_name, method_impl)
diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py
index 4595f0cf939..92b58a7e2ee 100644
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
@@ -54,29 +54,6 @@ EXPRESSION_MAP = {
     "__ge__": "A >= B"
 }
 
-# method for Tensor from paddle.tensor
-# edit it when paddle.tensor has new method about Tensor operation
-common_methods = [
-    'exp', 'tanh', 'atan', 'sqrt', 'rsqrt', 'abs', 'ceil', 'floor', 'cos',
-    'acos', 'asin', 'sin', 'sinh', 'cosh', 'round', 'reciprocal', 'square',
-    'rank', 'matmul', 'dot', 'norm', 'transpose', 'dist', 't', 'cross',
-    'cholesky', 'bmm', 'histogram', 'equal', 'greater_equal', 'greater_than',
-    'is_empty', 'isfinite', 'less_equal', 'less_than', 'logical_and',
-    'logical_not', 'logical_or', 'logical_xor', 'not_equal', 'reduce_all',
-    'reduce_any', 'allclose', 'equal_all', 'cast', 'expand', 'expand_as',
-    'tile', 'flatten', 'gather', 'gather_nd', 'reshape', 'reverse', 'scatter',
-    'scatter_nd_add', 'scatter_nd', 'shard_index', 'slice', 'split', 'squeeze',
-    'strided_slice', 'unique', 'unique_with_counts', 'unsqueeze', 'flip',
-    'unbind', 'roll', 'cumsum', 'increment', 'log', 'pow', 'reciprocal',
-    'round', 'rsqrt', 'scale', 'sign', 'stanh', 'sum', 'reduce_prod', 'max',
-    'min', 'mm', 'div', 'multiply', 'add', 'logsumexp', 'log1p', 'erf',
-    'addcmul', 'addmm', 'clamp', 'trace', 'kron', 'argmax', 'argmin', 'argsort',
-    'has_inf', 'has_nan', 'topk', 'index_select', 'nonzero', 'sort',
-    'index_sample', 'mean', 'std', 'var', 'elementwise_add', 'elementwise_div',
-    'elementwise_floordiv', 'elementwise_mod', 'elementwise_pow',
-    'elementwise_sub'
-]
-
 _already_patch_variable = False
 
 
@@ -372,7 +349,14 @@ def monkey_patch_variable():
             setattr(Variable, method_name, method_impl)
     else:
         import paddle.tensor
-        for method_name in common_methods:
+        variabel_methods = paddle.tensor.linalg.__all__ + \
+                           paddle.tensor.math.__all__ + \
+                           paddle.tensor.logic.__all__ + \
+                           paddle.tensor.manipulation.__all__ + \
+                           paddle.tensor.search.__all__ + \
+                           paddle.tensor.stat.__all__ + \
+                           paddle.tensor.attribute.__all__
+        for method_name in variabel_methods:
             if hasattr(Variable, method_name): continue
             method_impl = getattr(paddle.tensor, method_name, None)
             if method_impl: setattr(Variable, method_name, method_impl)
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py
index 8d2677229a0..ab1127afa58 100644
--- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py
@@ -47,7 +47,7 @@ class TestSimpleRNNCell(unittest.TestCase):
         prev_h = np.random.randn(4, 32)
 
         y1, h1 = rnn1(x, prev_h)
-        y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h))
+        y2, h2 = rnn2(paddle.to_tensor(x), paddle.to_tensor(prev_h))
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
     def test_with_zero_state(self):
@@ -57,7 +57,7 @@ class TestSimpleRNNCell(unittest.TestCase):
         x = np.random.randn(4, 16)
 
         y1, h1 = rnn1(x)
-        y2, h2 = rnn2(paddle.to_variable(x))
+        y2, h2 = rnn2(paddle.to_tensor(x))
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
     def runTest(self):
@@ -90,7 +90,7 @@ class TestGRUCell(unittest.TestCase):
         prev_h = np.random.randn(4, 32)
 
         y1, h1 = rnn1(x, prev_h)
-        y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h))
+        y2, h2 = rnn2(paddle.to_tensor(x), paddle.to_tensor(prev_h))
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
     def test_with_zero_state(self):
@@ -100,7 +100,7 @@ class TestGRUCell(unittest.TestCase):
         x = np.random.randn(4, 16)
 
         y1, h1 = rnn1(x)
-        y2, h2 = rnn2(paddle.to_variable(x))
+        y2, h2 = rnn2(paddle.to_tensor(x))
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
     def runTest(self):
@@ -134,8 +134,8 @@ class TestLSTMCell(unittest.TestCase):
 
         y1, (h1, c1) = rnn1(x, (prev_h, prev_c))
         y2, (h2, c2) = rnn2(
-            paddle.to_variable(x),
-            (paddle.to_variable(prev_h), paddle.to_variable(prev_c)))
+            paddle.to_tensor(x),
+            (paddle.to_tensor(prev_h), paddle.to_tensor(prev_c)))
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
 
@@ -146,7 +146,7 @@ class TestLSTMCell(unittest.TestCase):
         x = np.random.randn(4, 16)
 
         y1, (h1, c1) = rnn1(x)
-        y2, (h2, c2) = rnn2(paddle.to_variable(x))
+        y2, (h2, c2) = rnn2(paddle.to_tensor(x))
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
 
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
index ef297b3bb62..7c03b51837e 100644
--- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
@@ -53,7 +53,7 @@ class TestSimpleRNN(unittest.TestCase):
         prev_h = np.random.randn(2 * self.num_directions, 4, 32)
 
         y1, h1 = rnn1(x, prev_h)
-        y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h))
+        y2, h2 = rnn2(paddle.to_tensor(x), paddle.to_tensor(prev_h))
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
@@ -66,7 +66,7 @@ class TestSimpleRNN(unittest.TestCase):
             x = np.transpose(x, [1, 0, 2])
 
         y1, h1 = rnn1(x)
-        y2, h2 = rnn2(paddle.to_variable(x))
+        y2, h2 = rnn2(paddle.to_tensor(x))
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
@@ -81,11 +81,11 @@ class TestSimpleRNN(unittest.TestCase):
 
         y1, h1 = rnn1(x, sequence_length=sequence_length)
 
-        seq_len = paddle.to_variable(sequence_length)
+        seq_len = paddle.to_tensor(sequence_length)
         mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
         if self.time_major:
             mask = paddle.transpose(mask, [1, 0])
-        y2, h2 = rnn2(paddle.to_variable(x), sequence_length=seq_len)
+        y2, h2 = rnn2(paddle.to_tensor(x), sequence_length=seq_len)
         y2 = paddle.multiply(y2, mask, axis=0)
 
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
@@ -133,7 +133,7 @@ class TestGRU(unittest.TestCase):
         prev_h = np.random.randn(2 * self.num_directions, 4, 32)
 
         y1, h1 = rnn1(x, prev_h)
-        y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h))
+        y2, h2 = rnn2(paddle.to_tensor(x), paddle.to_tensor(prev_h))
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
@@ -146,7 +146,7 @@ class TestGRU(unittest.TestCase):
             x = np.transpose(x, [1, 0, 2])
 
         y1, h1 = rnn1(x)
-        y2, h2 = rnn2(paddle.to_variable(x))
+        y2, h2 = rnn2(paddle.to_tensor(x))
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
 
@@ -161,11 +161,11 @@ class TestGRU(unittest.TestCase):
 
         y1, h1 = rnn1(x, sequence_length=sequence_length)
 
-        seq_len = paddle.to_variable(sequence_length)
+        seq_len = paddle.to_tensor(sequence_length)
         mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
         if self.time_major:
             mask = paddle.transpose(mask, [1, 0])
-        y2, h2 = rnn2(paddle.to_variable(x), sequence_length=seq_len)
+        y2, h2 = rnn2(paddle.to_tensor(x), sequence_length=seq_len)
         y2 = paddle.multiply(y2, mask, axis=0)
 
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
@@ -209,8 +209,8 @@ class TestLSTM(unittest.TestCase):
 
         y1, (h1, c1) = rnn1(x, (prev_h, prev_c))
         y2, (h2, c2) = rnn2(
-            paddle.to_variable(x),
-            (paddle.to_variable(prev_h), paddle.to_variable(prev_c)))
+            paddle.to_tensor(x),
+            (paddle.to_tensor(prev_h), paddle.to_tensor(prev_c)))
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
@@ -224,7 +224,7 @@ class TestLSTM(unittest.TestCase):
             x = np.transpose(x, [1, 0, 2])
 
         y1, (h1, c1) = rnn1(x)
-        y2, (h2, c2) = rnn2(paddle.to_variable(x))
+        y2, (h2, c2) = rnn2(paddle.to_tensor(x))
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5)
@@ -240,11 +240,11 @@ class TestLSTM(unittest.TestCase):
 
         y1, (h1, c1) = rnn1(x, sequence_length=sequence_length)
 
-        seq_len = paddle.to_variable(sequence_length)
+        seq_len = paddle.to_tensor(sequence_length)
         mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
         if self.time_major:
             mask = paddle.transpose(mask, [1, 0])
-        y2, (h2, c2) = rnn2(paddle.to_variable(x), sequence_length=seq_len)
+        y2, (h2, c2) = rnn2(paddle.to_tensor(x), sequence_length=seq_len)
         y2 = paddle.multiply(y2, mask, axis=0)
 
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
index a70862f4019..5df04ddfc3d 100644
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
@@ -19,6 +19,7 @@ import paddle
 import paddle.fluid as fluid
 import numpy as np
 import six
+import inspect
 
 
 class TestMathOpPatchesVarBase(unittest.TestCase):
@@ -302,21 +303,13 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
         self.assertEqual(x.dim(), 2)
         self.assertEqual(x.ndimension(), 2)
         self.assertEqual(x.ndim, 2)
-        self.assertEqual(x.size(), [2, 3])
-        self.assertTrue(
-            np.array_equal(x.sigmoid().numpy(), fluid.layers.sigmoid(x).numpy(
-            )))
-        self.assertTrue(
-            np.array_equal(x.log_sigmoid().numpy(),
-                           fluid.layers.logsigmoid(x).numpy()))
+        self.assertEqual(x.size, 6)
+        self.assertEqual(x.numel(), 6)
         self.assertTrue(np.array_equal(x.exp().numpy(), paddle.exp(x).numpy()))
         self.assertTrue(
             np.array_equal(x.tanh().numpy(), paddle.tanh(x).numpy()))
         self.assertTrue(
             np.array_equal(x.atan().numpy(), paddle.atan(x).numpy()))
-        self.assertTrue(
-            np.array_equal(x.tanh_shrink().numpy(),
-                           fluid.layers.tanh_shrink(x).numpy()))
         self.assertTrue(np.array_equal(x.abs().numpy(), paddle.abs(x).numpy()))
         m = x.abs()
         self.assertTrue(
@@ -344,12 +337,6 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
             )))
         self.assertTrue(
             np.array_equal(x.square().numpy(), paddle.square(x).numpy()))
-        self.assertTrue(
-            np.array_equal(x.softplus().numpy(),
-                           fluid.layers.softplus(x).numpy()))
-        self.assertTrue(
-            np.array_equal(x.softsign().numpy(),
-                           fluid.layers.softsign(x).numpy()))
         self.assertTrue(
             np.array_equal(x.rank().numpy(), paddle.rank(x).numpy()))
         self.assertTrue(
@@ -422,6 +409,8 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
         self.assertTrue(np.array_equal(x.reciprocal(), paddle.reciprocal(x)))
 
         # 2. Binary operation
+        self.assertTrue(
+            np.array_equal(x.divide(y).numpy(), paddle.divide(x, y).numpy()))
         self.assertTrue(
             np.array_equal(
                 x.matmul(y, True, False).numpy(),
@@ -501,6 +490,73 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
         self.assertTrue(
             np.array_equal(
                 x.logical_and(y).numpy(), paddle.logical_and(x, y).numpy()))
+        a = paddle.to_tensor([[1, 2], [3, 4]])
+        b = paddle.to_tensor([[4, 3], [2, 1]])
+        self.assertTrue(
+            np.array_equal(
+                x.where(a, b).numpy(), paddle.where(x, a, b).numpy()))
+
+        self.assertTrue(inspect.ismethod(a.dot))
+        self.assertTrue(inspect.ismethod(a.elementwise_add))
+        self.assertTrue(inspect.ismethod(a.elementwise_div))
+        self.assertTrue(inspect.ismethod(a.elementwise_floordiv))
+        self.assertTrue(inspect.ismethod(a.elementwise_mod))
+        self.assertTrue(inspect.ismethod(a.elementwise_sub))
+        self.assertTrue(inspect.ismethod(a.logsumexp))
+        self.assertTrue(inspect.ismethod(a.multiplex))
+        self.assertTrue(inspect.ismethod(a.prod))
+        self.assertTrue(inspect.ismethod(a.reduce_max))
+        self.assertTrue(inspect.ismethod(a.reduce_min))
+        self.assertTrue(inspect.ismethod(a.reduce_prod))
+        self.assertTrue(inspect.ismethod(a.reduce_sum))
+        self.assertTrue(inspect.ismethod(a.scale))
+        self.assertTrue(inspect.ismethod(a.stanh))
+        self.assertTrue(inspect.ismethod(a.sums))
+        self.assertTrue(inspect.ismethod(a.elementwise_sum))
+        self.assertTrue(inspect.ismethod(a.max))
+        self.assertTrue(inspect.ismethod(a.maximum))
+        self.assertTrue(inspect.ismethod(a.min))
+        self.assertTrue(inspect.ismethod(a.minimum))
+        self.assertTrue(inspect.ismethod(a.floor_divide))
+        self.assertTrue(inspect.ismethod(a.remainder))
+        self.assertTrue(inspect.ismethod(a.floor_mod))
+        self.assertTrue(inspect.ismethod(a.multiply))
+        self.assertTrue(inspect.ismethod(a.logsumexp))
+        self.assertTrue(inspect.ismethod(a.inverse))
+        self.assertTrue(inspect.ismethod(a.log1p))
+        self.assertTrue(inspect.ismethod(a.erf))
+        self.assertTrue(inspect.ismethod(a.addcmul))
+        self.assertTrue(inspect.ismethod(a.addmm))
+        self.assertTrue(inspect.ismethod(a.clip))
+        self.assertTrue(inspect.ismethod(a.trace))
+        self.assertTrue(inspect.ismethod(a.kron))
+        self.assertTrue(inspect.ismethod(a.isinf))
+        self.assertTrue(inspect.ismethod(a.isnan))
+        self.assertTrue(inspect.ismethod(a.concat))
+        self.assertTrue(inspect.ismethod(a.broadcast_to))
+        self.assertTrue(inspect.ismethod(a.scatter_nd_add))
+        self.assertTrue(inspect.ismethod(a.scatter_nd))
+        self.assertTrue(inspect.ismethod(a.shard_index))
+        self.assertTrue(inspect.ismethod(a.chunk))
+        self.assertTrue(inspect.ismethod(a.stack))
+        self.assertTrue(inspect.ismethod(a.strided_slice))
+        self.assertTrue(inspect.ismethod(a.unsqueeze))
+        self.assertTrue(inspect.ismethod(a.unstack))
+        self.assertTrue(inspect.ismethod(a.argmax))
+        self.assertTrue(inspect.ismethod(a.argmin))
+        self.assertTrue(inspect.ismethod(a.argsort))
+        self.assertTrue(inspect.ismethod(a.has_inf))
+        self.assertTrue(inspect.ismethod(a.has_nan))
+        self.assertTrue(inspect.ismethod(a.masked_select))
+        self.assertTrue(inspect.ismethod(a.topk))
+        self.assertTrue(inspect.ismethod(a.index_select))
+        self.assertTrue(inspect.ismethod(a.nonzero))
+        self.assertTrue(inspect.ismethod(a.sort))
+        self.assertTrue(inspect.ismethod(a.index_sample))
+        self.assertTrue(inspect.ismethod(a.mean))
+        self.assertTrue(inspect.ismethod(a.reduce_mean))
+        self.assertTrue(inspect.ismethod(a.std))
+        self.assertTrue(inspect.ismethod(a.numel))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_minimum_op.py b/python/paddle/fluid/tests/unittests/test_minimum_op.py
index 4c08b7386ca..a0673c82c5b 100644
--- a/python/paddle/fluid/tests/unittests/test_minimum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_minimum_op.py
@@ -61,8 +61,8 @@ class ApiMinimumTest(unittest.TestCase):
     def test_dynamic_api(self):
         paddle.disable_static()
         np_x = np.array([10, 10]).astype('float64')
-        x = paddle.to_variable(self.input_x)
-        y = paddle.to_variable(self.input_y)
+        x = paddle.to_tensor(self.input_x)
+        y = paddle.to_tensor(self.input_y)
         z = paddle.minimum(x, y)
         np_z = z.numpy()
         z_expected = np.array(np.minimum(self.input_x, self.input_y))
@@ -73,8 +73,8 @@ class ApiMinimumTest(unittest.TestCase):
         np_x = np.random.rand(5, 4, 3, 2).astype("float64")
         np_y = np.random.rand(4, 3).astype("float64")
 
-        x = paddle.to_variable(self.input_x)
-        y = paddle.to_variable(self.input_y)
+        x = paddle.to_tensor(self.input_x)
+        y = paddle.to_tensor(self.input_y)
         result_1 = paddle.minimum(x, y, axis=1)
         result_2 = paddle.minimum(x, y, axis=-2)
         self.assertEqual((result_1.numpy() == result_2.numpy()).all(), True)
diff --git a/python/paddle/fluid/tests/unittests/test_mse_loss.py b/python/paddle/fluid/tests/unittests/test_mse_loss.py
index 753d96c4411..e327307e955 100644
--- a/python/paddle/fluid/tests/unittests/test_mse_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_mse_loss.py
@@ -205,8 +205,7 @@ class TestNNFunctionalMseLoss(unittest.TestCase):
 
             paddle.disable_static()
             dy_ret = paddle.nn.functional.mse_loss(
-                paddle.to_variable(input_np),
-                paddle.to_variable(target_np), 'mean')
+                paddle.to_tensor(input_np), paddle.to_tensor(target_np), 'mean')
             dy_result = dy_ret.numpy()
 
             sub = input_np - target_np
@@ -240,8 +239,7 @@ class TestNNFunctionalMseLoss(unittest.TestCase):
 
             paddle.disable_static()
             dy_ret = paddle.nn.functional.mse_loss(
-                paddle.to_variable(input_np),
-                paddle.to_variable(target_np), 'sum')
+                paddle.to_tensor(input_np), paddle.to_tensor(target_np), 'sum')
             dy_result = dy_ret.numpy()
 
             sub = input_np - target_np
@@ -275,8 +273,7 @@ class TestNNFunctionalMseLoss(unittest.TestCase):
 
             paddle.disable_static()
             dy_ret = paddle.nn.functional.mse_loss(
-                paddle.to_variable(input_np),
-                paddle.to_variable(target_np), 'none')
+                paddle.to_tensor(input_np), paddle.to_tensor(target_np), 'none')
             dy_result = dy_ret.numpy()
 
             sub = input_np - target_np
diff --git a/python/paddle/fluid/tests/unittests/test_nll_loss.py b/python/paddle/fluid/tests/unittests/test_nll_loss.py
index e7154193bea..c07bf949af3 100644
--- a/python/paddle/fluid/tests/unittests/test_nll_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_nll_loss.py
@@ -909,8 +909,8 @@ class TestNLLLossInvalidArgs(unittest.TestCase):
             with fluid.dygraph.guard():
                 x_np = np.random.random(size=(5, )).astype(np.float64)
                 label_np = np.random.randint(0, 10, size=(5, )).astype(np.int64)
-                x = paddle.to_variable(x_np)
-                label = paddle.to_variable(label_np)
+                x = paddle.to_tensor(x_np)
+                label = paddle.to_tensor(label_np)
                 nll_loss = paddle.nn.loss.NLLLoss()
                 res = nll_loss(x, label)
 
@@ -933,8 +933,8 @@ class TestNLLLossInvalidArgs(unittest.TestCase):
             with fluid.dygraph.guard():
                 x_np = np.random.random(size=(5, 3)).astype(np.float64)
                 label_np = np.random.randint(0, 3, size=(5, )).astype(np.int64)
-                x = paddle.to_variable(x_np)
-                label = paddle.to_variable(label_np)
+                x = paddle.to_tensor(x_np)
+                label = paddle.to_tensor(label_np)
                 nll_loss = paddle.nn.loss.NLLLoss(reduction='')
                 res = nll_loss(x, label)
 
@@ -957,8 +957,8 @@ class TestNLLLossInvalidArgs(unittest.TestCase):
             with fluid.dygraph.guard():
                 x_np = np.random.random(size=(5, 3)).astype(np.float64)
                 label_np = np.random.randint(0, 3, size=(5, )).astype(np.int64)
-                x = paddle.to_variable(x_np)
-                label = paddle.to_variable(label_np)
+                x = paddle.to_tensor(x_np)
+                label = paddle.to_tensor(label_np)
                 res = paddle.nn.functional.nll_loss(x, label, reduction='')
 
         self.assertRaises(
diff --git a/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py b/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py
index 0ebe769fb9b..8ee3b2ac203 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_margin_rank_loss.py
@@ -101,9 +101,9 @@ def create_test_case(margin, reduction):
 
         def run_dynamic_functional_api(self, place):
             paddle.disable_static(place)
-            x = paddle.to_variable(self.x_data)
-            y = paddle.to_variable(self.y_data)
-            label = paddle.to_variable(self.label_data)
+            x = paddle.to_tensor(self.x_data)
+            y = paddle.to_tensor(self.y_data)
+            label = paddle.to_tensor(self.label_data)
 
             result = paddle.nn.functional.margin_ranking_loss(x, y, label,
                                                               margin, reduction)
@@ -117,9 +117,9 @@ def create_test_case(margin, reduction):
 
         def run_dynamic_api(self, place):
             paddle.disable_static(place)
-            x = paddle.to_variable(self.x_data)
-            y = paddle.to_variable(self.y_data)
-            label = paddle.to_variable(self.label_data)
+            x = paddle.to_tensor(self.x_data)
+            y = paddle.to_tensor(self.y_data)
+            label = paddle.to_tensor(self.label_data)
             margin_rank_loss = paddle.nn.loss.MarginRankingLoss(
                 margin=margin, reduction=reduction)
             result = margin_rank_loss(x, y, label)
@@ -134,9 +134,9 @@ def create_test_case(margin, reduction):
         def run_dynamic_broadcast_api(self, place):
             paddle.disable_static(place)
             label_data = np.random.choice([-1, 1], size=[10]).astype("float64")
-            x = paddle.to_variable(self.x_data)
-            y = paddle.to_variable(self.y_data)
-            label = paddle.to_variable(label_data)
+            x = paddle.to_tensor(self.x_data)
+            y = paddle.to_tensor(self.y_data)
+            label = paddle.to_tensor(label_data)
             margin_rank_loss = paddle.nn.loss.MarginRankingLoss(
                 margin=margin, reduction=reduction)
             result = margin_rank_loss(x, y, label)
diff --git a/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py b/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py
index d52a1f5d5b1..90132a0923d 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py
@@ -56,7 +56,7 @@ class TestNNSigmoidAPI(unittest.TestCase):
 
     def check_dynamic_api(self, place):
         paddle.disable_static(place)
-        x = paddle.to_variable(self.x)
+        x = paddle.to_tensor(self.x)
         mysigmoid = nn.Sigmoid()
         y = mysigmoid(x)
         self.assertTrue(np.allclose(y.numpy(), self.y))
@@ -94,7 +94,7 @@ class TestNNFunctionalSigmoidAPI(unittest.TestCase):
 
     def check_dynamic_api(self):
         paddle.disable_static()
-        x = paddle.to_variable(self.x)
+        x = paddle.to_tensor(self.x)
         y = functional.sigmoid(x)
         self.assertTrue(np.allclose(y.numpy(), self.y))
 
diff --git a/python/paddle/fluid/tests/unittests/test_numel_op.py b/python/paddle/fluid/tests/unittests/test_numel_op.py
index 8512bc99e74..800706e5965 100644
--- a/python/paddle/fluid/tests/unittests/test_numel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_numel_op.py
@@ -76,8 +76,8 @@ class TestNumelOoAPI(unittest.TestCase):
         paddle.disable_static(paddle.CPUPlace())
         input_1 = np.random.random([2, 1, 4, 5]).astype("int32")
         input_2 = np.random.random([1, 4, 5]).astype("int32")
-        x_1 = paddle.to_variable(input_1)
-        x_2 = paddle.to_variable(input_2)
+        x_1 = paddle.to_tensor(input_1)
+        x_2 = paddle.to_tensor(input_2)
         out_1 = paddle.numel(x_1)
         out_2 = paddle.numel(x_2)
         assert (np.array_equal(out_1.numpy().item(0), np.size(input_1)))
diff --git a/python/paddle/fluid/tests/unittests/test_ones_like.py b/python/paddle/fluid/tests/unittests/test_ones_like.py
index c1e6a337771..bb0d6f07bdb 100644
--- a/python/paddle/fluid/tests/unittests/test_ones_like.py
+++ b/python/paddle/fluid/tests/unittests/test_ones_like.py
@@ -63,7 +63,7 @@ class TestOnesLikeImpeartive(unittest.TestCase):
         place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
         ) else fluid.CPUPlace()
         paddle.disable_static(place)
-        x = paddle.to_variable(np.ones(shape))
+        x = paddle.to_tensor(np.ones(shape))
         for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]:
             out = ones_like(x, dtype)
             self.assertEqual((out.numpy() == np.ones(shape, dtype)).all(), True)
diff --git a/python/paddle/fluid/tests/unittests/test_pairwise_distance.py b/python/paddle/fluid/tests/unittests/test_pairwise_distance.py
index baf0efa6ec2..cf138e67726 100644
--- a/python/paddle/fluid/tests/unittests/test_pairwise_distance.py
+++ b/python/paddle/fluid/tests/unittests/test_pairwise_distance.py
@@ -48,8 +48,8 @@ def test_static(x_np, y_np, p=2.0, epsilon=1e-6, keepdim=False):
 
 def test_dygraph(x_np, y_np, p=2.0, epsilon=1e-6, keepdim=False):
     paddle.disable_static()
-    x = paddle.to_variable(x_np)
-    y = paddle.to_variable(y_np)
+    x = paddle.to_tensor(x_np)
+    y = paddle.to_tensor(y_np)
     dist = paddle.nn.layer.distance.PairwiseDistance(
         p=p, epsilon=epsilon, keepdim=keepdim)
     distance = dist(x, y)
diff --git a/python/paddle/fluid/tests/unittests/test_sort_op.py b/python/paddle/fluid/tests/unittests/test_sort_op.py
index 015b72fd1c5..366e0c7a3fa 100644
--- a/python/paddle/fluid/tests/unittests/test_sort_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sort_op.py
@@ -72,14 +72,14 @@ class TestSortDygraph(unittest.TestCase):
 
     def test_api_0(self):
         paddle.disable_static(self.place)
-        var_x = paddle.to_variable(self.input_data)
+        var_x = paddle.to_tensor(self.input_data)
         out = paddle.sort(var_x)
         self.assertEqual((np.sort(self.input_data) == out.numpy()).all(), True)
         paddle.enable_static()
 
     def test_api_1(self):
         paddle.disable_static(self.place)
-        var_x = paddle.to_variable(self.input_data)
+        var_x = paddle.to_tensor(self.input_data)
         out = paddle.sort(var_x, axis=-1)
         self.assertEqual(
             (np.sort(
diff --git a/python/paddle/fluid/tests/unittests/test_tile_op.py b/python/paddle/fluid/tests/unittests/test_tile_op.py
index 5aaf3199344..b0f065a26a0 100644
--- a/python/paddle/fluid/tests/unittests/test_tile_op.py
+++ b/python/paddle/fluid/tests/unittests/test_tile_op.py
@@ -230,13 +230,13 @@ class TestTileAPI(unittest.TestCase):
     def test_api(self):
         with fluid.dygraph.guard():
             np_x = np.random.random([12, 14]).astype("float32")
-            x = paddle.to_variable(np_x)
+            x = paddle.to_tensor(np_x)
 
             positive_2 = np.array([2]).astype("int32")
-            positive_2 = paddle.to_variable(positive_2)
+            positive_2 = paddle.to_tensor(positive_2)
 
             repeat_times = np.array([2, 3]).astype("int32")
-            repeat_times = paddle.to_variable(repeat_times)
+            repeat_times = paddle.to_tensor(repeat_times)
 
             out_1 = paddle.tile(x, repeat_times=[2, 3])
             out_2 = paddle.tile(x, repeat_times=[positive_2, 3])
diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py
index bd76edc9d8c..7c7a71a3be1 100644
--- a/python/paddle/fluid/tests/unittests/test_transformer_api.py
+++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py
@@ -234,23 +234,23 @@ class TestTransformer(unittest.TestCase):
                 if cache_dict:
                     if 'k' and 'v' in cache_dict:
                         cache_obj = multi_head_attn.Cache(
-                            paddle.to_variable(cache_dict['k']),
-                            paddle.to_variable(cache_dict['v']))
+                            paddle.to_tensor(cache_dict['k']),
+                            paddle.to_tensor(cache_dict['v']))
                     elif 'static_k' and 'static_v' in cache_dict:
                         cache_obj = multi_head_attn.StaticCache(
-                            paddle.to_variable(cache_dict['static_k']),
-                            paddle.to_variable(cache_dict['static_v']))
+                            paddle.to_tensor(cache_dict['static_k']),
+                            paddle.to_tensor(cache_dict['static_v']))
                 if attn_mask is not None:
                     attn_output = multi_head_attn(
-                        paddle.to_variable(query),
-                        paddle.to_variable(key),
-                        paddle.to_variable(value),
-                        paddle.to_variable(attn_mask), cache_obj)
+                        paddle.to_tensor(query),
+                        paddle.to_tensor(key),
+                        paddle.to_tensor(value),
+                        paddle.to_tensor(attn_mask), cache_obj)
                 else:
                     attn_output = multi_head_attn(
-                        paddle.to_variable(query),
-                        paddle.to_variable(key),
-                        paddle.to_variable(value), attn_mask, cache_obj)
+                        paddle.to_tensor(query),
+                        paddle.to_tensor(key),
+                        paddle.to_tensor(value), attn_mask, cache_obj)
                 attn_output = attn_output[0] if cache_dict else attn_output
 
                 # implementation by numpy
@@ -296,16 +296,16 @@ class TestTransformer(unittest.TestCase):
                 attn_dropout, act_dropout)
 
             encoder_output = encoder_layer(
-                paddle.to_variable(src),
-                paddle.to_variable(src_mask))  # paddle.to_variable(src_mask))
+                paddle.to_tensor(src),
+                paddle.to_tensor(src_mask))  # paddle.to_tensor(src_mask))
             # 4.numpy:
             # paddle self attention
             self_attn = MultiHeadAttention(
                 d_model, n_head, dropout=attn_dropout)
             attn_output = self_attn(
-                paddle.to_variable(src),
-                paddle.to_variable(src),
-                paddle.to_variable(src), paddle.to_variable(src_mask)).numpy()
+                paddle.to_tensor(src),
+                paddle.to_tensor(src),
+                paddle.to_tensor(src), paddle.to_tensor(src_mask)).numpy()
 
             src = attn_output + residual
             src_norm = layer_norm(src, d_model, encoder_layer.norm1)
@@ -348,13 +348,13 @@ class TestTransformer(unittest.TestCase):
                 cache_objs = None
                 if cache:
                     cache_objs = decoder_layer.gen_cache(
-                        paddle.to_variable(memory))
+                        paddle.to_tensor(memory))
 
                 decoder_output = decoder_layer(
-                    paddle.to_variable(tgt),
-                    paddle.to_variable(memory),
-                    paddle.to_variable(tgt_mask),
-                    paddle.to_variable(memory_mask), cache_objs)
+                    paddle.to_tensor(tgt),
+                    paddle.to_tensor(memory),
+                    paddle.to_tensor(tgt_mask),
+                    paddle.to_tensor(memory_mask), cache_objs)
 
                 decoder_output = decoder_output[0].numpy(
                 ) if cache else decoder_output.numpy()
@@ -365,10 +365,10 @@ class TestTransformer(unittest.TestCase):
                 self_attn_cache = cache_objs[
                     0] if cache_objs is not None else None
                 tgt = self_attn(
-                    paddle.to_variable(tgt),
-                    paddle.to_variable(tgt),
-                    paddle.to_variable(tgt),
-                    paddle.to_variable(tgt_mask), self_attn_cache)
+                    paddle.to_tensor(tgt),
+                    paddle.to_tensor(tgt),
+                    paddle.to_tensor(tgt),
+                    paddle.to_tensor(tgt_mask), self_attn_cache)
 
                 tgt = tgt[0].numpy() if cache else tgt.numpy()
 
@@ -380,10 +380,10 @@ class TestTransformer(unittest.TestCase):
                 cross_attn_cache = cache_objs[
                     1] if cache_objs is not None else None
                 tgt = cross_attn(
-                    paddle.to_variable(tgt_norm),
-                    paddle.to_variable(memory),
-                    paddle.to_variable(memory),
-                    paddle.to_variable(memory_mask), cross_attn_cache)
+                    paddle.to_tensor(tgt_norm),
+                    paddle.to_tensor(memory),
+                    paddle.to_tensor(memory),
+                    paddle.to_tensor(memory_mask), cross_attn_cache)
                 tgt = tgt[0].numpy() if cache else tgt.numpy()
 
                 # postprocess
@@ -416,7 +416,7 @@ class TestTransformer(unittest.TestCase):
             encoder = TransformerEncoder(encoder_layer, num_layers)
             # src, src_mask
             enc_output = encoder(
-                paddle.to_variable(src), paddle.to_variable(src_mask))
+                paddle.to_tensor(src), paddle.to_tensor(src_mask))
 
     def test_decoder(self):
         batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
@@ -438,9 +438,9 @@ class TestTransformer(unittest.TestCase):
             decoder = TransformerDecoder(decoder_layer, num_layers)
 
             output = decoder(
-                paddle.to_variable(tgt),
-                paddle.to_variable(memory),
-                paddle.to_variable(tgt_mask), paddle.to_variable(memory_mask))
+                paddle.to_tensor(tgt),
+                paddle.to_tensor(memory),
+                paddle.to_tensor(tgt_mask), paddle.to_tensor(memory_mask))
 
     def test_transformer(self):
         batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
@@ -453,24 +453,24 @@ class TestTransformer(unittest.TestCase):
                 n_head,
                 dim_feedforward=dim_feedforward,
                 dropout=dropout)
-            src = paddle.to_variable(
+            src = paddle.to_tensor(
                 np.random.rand(batch_size, source_length, d_model).astype(
                     "float32"))
-            tgt = paddle.to_variable(
+            tgt = paddle.to_tensor(
                 np.random.rand(batch_size, target_length, d_model).astype(
                     "float32"))
             src_mask = np.zeros((batch_size, n_head, source_length,
                                  source_length)).astype("float32")
             src_mask[0][0][0][0] = -np.inf
-            src_mask = paddle.to_variable(src_mask)
+            src_mask = paddle.to_tensor(src_mask)
             tgt_mask = np.zeros((batch_size, n_head, target_length,
                                  target_length)).astype("float32")
             tgt_mask[0][0][0][0] = -1e9
             memory_mask = np.zeros((batch_size, n_head, target_length,
                                     source_length)).astype("float32")
             memory_mask[0][0][0][0] = -1e9
-            tgt_mask, memory_mask = paddle.to_variable(
-                tgt_mask), paddle.to_variable(memory_mask)
+            tgt_mask, memory_mask = paddle.to_tensor(
+                tgt_mask), paddle.to_tensor(memory_mask)
             trans_output = transformer(src, tgt, src_mask, tgt_mask,
                                        memory_mask)
 
diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
index 6bc42f0712a..c4155e0d826 100644
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
@@ -424,10 +424,10 @@ class TestCTCLossAPICase(unittest.TestCase):
         loss_np = ctc.forward()
 
         paddle.disable_static()
-        softmax = paddle.to_variable(logits)
-        labels = paddle.to_variable(labels)
-        logits_length = paddle.to_variable(self.logits_length)
-        labels_length = paddle.to_variable(self.labels_length)
+        softmax = paddle.to_tensor(logits)
+        labels = paddle.to_tensor(labels)
+        logits_length = paddle.to_tensor(self.logits_length)
+        labels_length = paddle.to_tensor(self.labels_length)
         loss_pd_mean = F.ctc_loss(
             softmax,
             labels,
@@ -477,10 +477,10 @@ class TestCTCLossAPICase(unittest.TestCase):
         loss_np = ctc.forward()
 
         paddle.disable_static()
-        softmax = paddle.to_variable(logits)
-        labels = paddle.to_variable(labels)
-        logits_length = paddle.to_variable(self.logits_length)
-        labels_length = paddle.to_variable(self.labels_length)
+        softmax = paddle.to_tensor(logits)
+        labels = paddle.to_tensor(labels)
+        logits_length = paddle.to_tensor(self.logits_length)
+        labels_length = paddle.to_tensor(self.labels_length)
 
         loss_pd = paddle.nn.CTCLoss(self.blank, 'none')(
             softmax, labels, logits_length, labels_length)
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 9de407841fb..dc6a04a4723 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -53,7 +53,7 @@ __all__ = [
     'shard_index',
     'slice',
     'split',
-    'chunk'
+    'chunk',
     'squeeze',
     'stack',
     'strided_slice',
-- 
GitLab


From c17f9cf25fd42ab868983a85c03d8c9a2b4a007d Mon Sep 17 00:00:00 2001
From: Shang Zhizhou <shangzhizhou@baidu.com>
Date: Wed, 23 Sep 2020 19:09:28 +0800
Subject: [PATCH 187/261] [bug fix]:Memory increases after adapting the cudnn
 version to cudnn8 (#27436)

* [bug fix]:Memory increases after adapting the cudnn version to 8

* [bug fix]cudnnGetConvolutionForwardAlgorithm not defined
---
 paddle/fluid/operators/conv_cudnn_helper.h    | 30 ++++++++++++++++++-
 .../fluid/operators/fused/conv_fusion_op.cu   | 10 ++++++-
 paddle/fluid/platform/dynload/cudnn.cc        |  8 +++++
 paddle/fluid/platform/dynload/cudnn.h         |  1 +
 4 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h
index 25b45f281a7..fac8e242510 100644
--- a/paddle/fluid/operators/conv_cudnn_helper.h
+++ b/paddle/fluid/operators/conv_cudnn_helper.h
@@ -162,7 +162,20 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
       workspace_size = GetWorkspaceSize(args, algo);
 
       if (workspace_size > workspace_size_limit) {
+#if CUDNN_VERSION >= 8000
         workspace_size_limit = workspace_size;
+#else
+        VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
+                   "the workspace size request("
+                << workspace_size << ") exceeds the limit("
+                << workspace_size_limit << ")";
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+                args.handle, args.idesc.desc(), args.wdesc.desc(),
+                args.cdesc.desc(), args.odesc.desc(),
+                CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+                workspace_size_limit, &algo));
+#endif
       }
 #else
       PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -291,8 +304,23 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
 #endif
       workspace_size = GetWorkspaceSize(args, algo);
       if (workspace_size > workspace_size_limit) {
-        workspace_size_limit = workspace_size;
         has_got_workspace_size = false;
+#if CUDNN_VERSION >= 8000
+        // There is no cudnnGetConvolutionBackwardDataAlgorithm in CUDNN 8
+        // version.
+        workspace_size_limit = workspace_size;
+#else
+        VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
+                   "the workspace size request("
+                << workspace_size << ") exceeds the limit("
+                << workspace_size_limit << ")";
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+                args.handle, args.wdesc.desc(), args.odesc.desc(),
+                args.cdesc.desc(), args.idesc.desc(),
+                CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+                workspace_size_limit, &algo));
+#endif
       }
 #else
       PADDLE_ENFORCE_CUDA_SUCCESS(
diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu
index b22f28fbbe3..49fded886a0 100644
--- a/paddle/fluid/operators/fused/conv_fusion_op.cu
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cu
@@ -204,6 +204,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     auto x_dims = framework::vectorize(transformed_input.dims());
     auto f_dims = framework::vectorize(filter->dims());
     if (!exhaustive_search) {
+#if CUDNN_VERSION >= 8000
       int perf_count;
       int best_algo_idx = 0;
       size_t tmp_size = 0;
@@ -215,13 +216,20 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
               cudnn_output_desc, kNUM_CUDNN_FWD_ALGS, &perf_count,
               perf_results.get()));
       algo = (perf_results.get())[best_algo_idx].algo;
-      VLOG(3) << "cuDNN forward algo " << algo;
       PADDLE_ENFORCE_CUDA_SUCCESS(
           platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
               handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
               cudnn_output_desc, algo, &workspace_size_in_bytes));
       if (workspace_size_in_bytes > workspace_size_limit)
         workspace_size_limit = workspace_size_in_bytes;
+#else
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+              handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+              cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &algo));
+      VLOG(3) << "cuDNN forward algo " << algo;
+#endif
     } else {
       std::function<cudnnConvolutionFwdAlgo_t()> search_func =
           [&]() -> cudnnConvolutionFwdAlgo_t {
diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc
index 1166dc5e4ad..4c59fe5e9ba 100644
--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
@@ -30,6 +30,10 @@ CUDNN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP);
 CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP);
 #endif
 
+#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8
+CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(DEFINE_WRAP);
+#endif
+
 #ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R4
 CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
 #endif
@@ -54,6 +58,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP);
 CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
 #endif
 
+#ifdef CUDNN_DNN_ROUTINE_EACH_R8
+CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP);
+#endif
+
 bool HasCUDNN() {
   std::call_once(cudnn_dso_flag,
                  []() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index fba41417648..dd0a2e19685 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -134,6 +134,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #define CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(__macro) \
   __macro(cudnnGetConvolutionBackwardFilterAlgorithm);   \
   __macro(cudnnGetConvolutionForwardAlgorithm);          \
+  __macro(cudnnGetConvolutionBackwardDataAlgorithm);     \
   __macro(cudnnSetRNNDescriptor);
 CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
-- 
GitLab


From 19a58b3d5d48e07f3d4859a8817c8f9f740ad4cf Mon Sep 17 00:00:00 2001
From: YUNSHEN XIE <1084314248@qq.com>
Date: Wed, 23 Sep 2020 19:12:04 +0800
Subject: [PATCH 188/261] disable ut test_vision_models and
 test_pretrained_model,test=document_fix (#27502)

---
 python/paddle/tests/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/paddle/tests/CMakeLists.txt b/python/paddle/tests/CMakeLists.txt
index e1bc65a5d15..6fb73b08c11 100644
--- a/python/paddle/tests/CMakeLists.txt
+++ b/python/paddle/tests/CMakeLists.txt
@@ -8,6 +8,10 @@ foreach(TEST_OP ${DIST_TEST_OPS})
     list(REMOVE_ITEM TEST_OPS ${TEST_OP})
 endforeach()
 
+# disable test_pretrained_model and test_vision_models
+list(REMOVE_ITEM TEST_OPS test_pretrained_model)
+list(REMOVE_ITEM TEST_OPS test_vision_models)
+
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
-- 
GitLab


From bb84f0e64612ad4b6899c61aab2d3e97b1177b27 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 23 Sep 2020 20:12:46 +0800
Subject: [PATCH 189/261] Add new paddle.save/load APIs (#27331)

* init commit of new save/load

* fix failed unittests

* fix save_load_v2 unittest failed

* fix failed unittest & polish doc

* add tests for coverage

* add more tests & move static apis

* fix example code error

* polish emample code

* fix detail example code problem
---
 python/paddle/fluid/dygraph/checkpoint.py     |  18 +-
 python/paddle/fluid/dygraph/jit.py            |  27 +-
 python/paddle/fluid/dygraph/layers.py         |   6 +-
 python/paddle/fluid/dygraph/parallel.py       |   6 +-
 python/paddle/fluid/optimizer.py              |  20 +-
 .../unittests/test_imperative_save_load.py    |  18 +-
 .../unittests/test_imperative_save_load_v2.py |  29 +-
 .../test_load_state_dict_from_old_format.py   |  41 ++-
 .../tests/unittests/test_paddle_save_load.py  | 148 +++++++++
 python/paddle/framework/__init__.py           |   4 +-
 python/paddle/framework/io.py                 | 291 ++++++++++++++++++
 python/paddle/io/__init__.py                  |   8 -
 python/paddle/static/__init__.py              |  13 +-
 python/paddle/tensor/__init__.py              |   2 -
 python/paddle/tensor/io.py                    |  19 --
 15 files changed, 539 insertions(+), 111 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_paddle_save_load.py
 create mode 100644 python/paddle/framework/io.py
 delete mode 100644 python/paddle/tensor/io.py

diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py
index 93cb0bafc84..f4ea4d670e6 100644
--- a/python/paddle/fluid/dygraph/checkpoint.py
+++ b/python/paddle/fluid/dygraph/checkpoint.py
@@ -145,7 +145,7 @@ def load_dygraph(model_path, config=None):
 
     .. note::
         Due to some historical reasons, if you load ``state_dict`` from the saved 
-        result of `paddle.io.save_inference_model`, the structured variable name 
+        result of `paddle.static.save_inference_model`, the structured variable name 
         will cannot be restored. You need to set the argument `use_structured_name=False` 
         when using `Layer.set_state_dict` later.
 
@@ -164,24 +164,24 @@ def load_dygraph(model_path, config=None):
         .. code-block:: python
 
             import paddle
-            
+            import paddle.fluid as fluid
+
             paddle.disable_static()
 
-            emb = paddle.nn.Embedding([10, 10])
+            emb = paddle.nn.Embedding(10, 10)
 
             state_dict = emb.state_dict()
-            paddle.save(state_dict, "paddle_dy")
+            fluid.save_dygraph(state_dict, "paddle_dy")
 
-            scheduler = paddle.optimizer.lr_scheduler.NoamLR(
+            scheduler = paddle.optimizer.lr_scheduler.NoamLR(	
                 d_model=0.01, warmup_steps=100, verbose=True)
             adam = paddle.optimizer.Adam(
                 learning_rate=scheduler,
                 parameters=emb.parameters())
             state_dict = adam.state_dict()
-            paddle.save(state_dict, "paddle_dy")
-
-            para_state_dict, opti_state_dict = paddle.load("paddle_dy")
+            fluid.save_dygraph(state_dict, "paddle_dy")
 
+            para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy")
     '''
     # deal with argument `model_path`
     model_prefix = model_path
@@ -275,7 +275,7 @@ def load_dygraph(model_path, config=None):
             # If users save all parameters as one file, the [ variable.name -> variable ]
             # mapping info will lost, so users need to give variable list, but users build 
             # variable list in dygraph mode is difficult, we recommend users to use
-            # paddle.io.load_program_state in this case
+            # paddle.static.load_program_state in this case
 
             # Try to load all the files in the directory in VarBase format, 
             # the file name is used as the name of VarBase
diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py
index 10819e4b320..d0e3d23b04b 100644
--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
@@ -231,9 +231,7 @@ def declarative(function=None, input_spec=None):
 class SaveLoadConfig(object):
     """
     The additional configuration options may be used in function 
-    :ref:`api_imperative_jit_save` that save :ref:`api_imperative_TranslatedLayer` 
-    or used in function :ref:`api_imperative_jit_load` that 
-    load :ref:`api_imperative_TranslatedLayer` .
+    ``paddle.jit.save/load`` and ``paddle.load`` .
     
     Examples:
         1. Using ``SaveLoadConfig`` when saving model
@@ -319,7 +317,7 @@ class SaveLoadConfig(object):
     @property
     def output_spec(self):
         """
-        Selects the output targets of the saved model ( :ref:`api_imperative_TranslatedLayer` ).
+        Selects the output targets of the saved model ( ``paddle.jit.TranslatedLayer`` ).
         By default, all return variables of original Layer's forward function
         are kept as the output of the saved TranslatedLayer.
 
@@ -531,11 +529,14 @@ class SaveLoadConfig(object):
     def separate_params(self):
         """
         Configure whether to save the Layer parameters as separete files.
-        (In order to be compatible with the behavior of :ref:`api_fluid_io_save_inference_model` )
+        (In order to be compatible with the behavior of ``paddle.static.save_inference_model`` )
 
         If True, each parameter will be saved to a file separately, the file name is the parameter name,
         and the SaveLoadConfig.params_filename configuration will not take effect. Default False.
 
+        .. note::
+            Only used for ``paddle.jit.save`` .
+
         Examples:
             .. code-block:: python
 
@@ -569,7 +570,7 @@ class SaveLoadConfig(object):
                     adam.clear_grad()
 
                 model_path = "simplenet.example.model.separate_params"
-                config = paddle.jit.SaveLoadConfig()
+                config = paddle.SaveLoadConfig()
                 config.separate_params = True
 
                 # saving with configs.separate_params
@@ -599,12 +600,12 @@ class SaveLoadConfig(object):
     def keep_name_table(self):
         """
         Configures whether keep ``structured_name -> parameter_name`` dict in loaded state dict.
-        This dict is the debugging information saved when call `paddle.save`. 
+        This dict is the debugging information saved when call ``paddle.save`` . 
         It is generally only used for debugging and does not affect the actual training or inference. 
-        By default, it will not be retained in `paddle.load` result. Default: False.
+        By default, it will not be retained in ``paddle.load`` result. Default: False.
         
         .. note::
-            Only used for ``paddle.load``.
+            Only used for ``paddle.load`` .
 
         Examples:
             .. code-block:: python
@@ -616,11 +617,11 @@ class SaveLoadConfig(object):
                 linear = paddle.nn.Linear(5, 1)
 
                 state_dict = linear.state_dict()
-                paddle.save(state_dict, "paddle_dy")
+                paddle.save(state_dict, "paddle_dy.pdparams")
 
-                configs = paddle.SaveLoadConfig()
-                configs.keep_name_table = True
-                para_state_dict, _ = paddle.load("paddle_dy", configs)
+                config = paddle.SaveLoadConfig()
+                config.keep_name_table = True
+                para_state_dict = paddle.load("paddle_dy.pdparams", config)
 
                 print(para_state_dict)
                 # the name_table is 'StructuredToParameterName@@'
diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
index 9c79deaab73..88e24e7e1ea 100644
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -970,12 +970,12 @@ class Layer(core.Layer):
                 
                 paddle.disable_static()
                 
-                emb = paddle.nn.Embedding([10, 10])
+                emb = paddle.nn.Embedding(10, 10)
 
                 state_dict = emb.state_dict()
-                paddle.save(state_dict, "paddle_dy")
+                paddle.save(state_dict, "paddle_dy.pdparams")
                 
-                para_state_dict, _ = paddle.load("paddle_dy")
+                para_state_dict = paddle.load("paddle_dy.pdparams")
 
                 emb.set_state_dict(para_state_dict)
 
diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py
index 472022bced7..de761cad529 100644
--- a/python/paddle/fluid/dygraph/parallel.py
+++ b/python/paddle/fluid/dygraph/parallel.py
@@ -610,13 +610,13 @@ class DataParallel(layers.Layer):
 
                 paddle.disable_static()
 
-                emb = paddle.nn.Embedding([10, 10])
+                emb = paddle.nn.Embedding(10, 10)
                 emb = fluid.dygraph.DataParallel(emb, strategy)
 
                 state_dict = emb.state_dict()
-                paddle.save(state_dict, "paddle_dy")
+                paddle.save(state_dict, "paddle_dy.pdparams")
 
-                para_state_dict, _ = paddle.load("paddle_dy")
+                para_state_dict = paddle.load("paddle_dy.pdparams")
 
                 emb.set_state_dict(para_state_dict)
 
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 1e7915ed781..0dd1694c86c 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -182,23 +182,25 @@ class Optimizer(object):
         Examples:
             .. code-block:: python
 
-                import paddle   
+                import paddle
+                import paddle.fluid as fluid
 
                 paddle.disable_static()
 
-                emb = paddle.nn.Embedding([10, 10])
+                emb = paddle.nn.Embedding(10, 10)
 
                 state_dict = emb.state_dict()
-                paddle.save(state_dict, "paddle_dy")
+                fluid.save_dygraph(state_dict, "paddle_dy")
 
-                adam = paddle.optimizer.Adam(learning_rate=fluid.layers.noam_decay( 100, 10000), 
-                                                parameter_list=emb.parameters())
+                scheduler = paddle.optimizer.lr_scheduler.NoamLR(	
+                    d_model=0.01, warmup_steps=100, verbose=True)
+                adam = paddle.optimizer.Adam(
+                    learning_rate=scheduler,
+                    parameters=emb.parameters())
                 state_dict = adam.state_dict()
+                fluid.save_dygraph(state_dict, "paddle_dy")
 
-                para_state_dict, opti_state_dict = paddle.load("paddle_dy")
-
-                adam.set_state_dict(opti_state_dict)
-
+                para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy")
         '''
         from paddle.optimizer.lr_scheduler import _LRScheduler
         if isinstance(self._learning_rate, _LRScheduler):
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
index 22e19efcb58..bee53fd10f5 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
@@ -292,7 +292,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
                 np_t = v.numpy()
                 self.model_base[k] = np_t
 
-            paddle.save(self.state_dict, "./test_dy")
+            fluid.save_dygraph(self.state_dict, "./test_dy")
 
     def testLoadAndSetVarBase(self):
         seed = 90
@@ -373,7 +373,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
             if isinstance(adam._learning_rate, LearningRateDecay):
                 adam._learning_rate.step_num = 0
 
-            para_state_dict, opti_state_dict = paddle.load("./test_dy")
+            para_state_dict, opti_state_dict = fluid.load_dygraph("./test_dy")
             adam.set_state_dict(opti_state_dict)
 
             opti_dict = adam.state_dict()
@@ -898,31 +898,31 @@ class TestDygraphPtbRnn(unittest.TestCase):
         with fluid.dygraph.guard():
             emb = fluid.dygraph.Embedding([10, 10])
             state_dict = emb.state_dict()
-            paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy'))
+            fluid.save_dygraph(state_dict, os.path.join('saved_dy', 'emb_dy'))
 
-            para_state_dict, opti_state_dict = paddle.load(
+            para_state_dict, opti_state_dict = fluid.load_dygraph(
                 os.path.join('saved_dy', 'emb_dy'))
 
             self.assertTrue(opti_state_dict == None)
 
-            para_state_dict, opti_state_dict = paddle.load(
+            para_state_dict, opti_state_dict = fluid.load_dygraph(
                 os.path.join('saved_dy', 'emb_dy.pdparams'))
 
-            para_state_dict, opti_state_dict = paddle.load(
+            para_state_dict, opti_state_dict = fluid.load_dygraph(
                 os.path.join('saved_dy', 'emb_dy.pdopt'))
 
     def test_load_compatible_with_keep_name_table(self):
         with fluid.dygraph.guard():
             emb = fluid.dygraph.Embedding([10, 10])
             state_dict = emb.state_dict()
-            paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy'))
+            fluid.save_dygraph(state_dict, os.path.join('saved_dy', 'emb_dy'))
 
-            para_state_dict, opti_state_dict = paddle.load(
+            para_state_dict, opti_state_dict = fluid.load_dygraph(
                 os.path.join('saved_dy', 'emb_dy'), True)
             self.assertTrue(para_state_dict != None)
             self.assertTrue(opti_state_dict == None)
 
-            para_state_dict, opti_state_dict = paddle.load(
+            para_state_dict, opti_state_dict = fluid.load_dygraph(
                 os.path.join('saved_dy', 'emb_dy'), keep_name_table=True)
             self.assertTrue(para_state_dict != None)
             self.assertTrue(opti_state_dict == None)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py
index 3eb413a6266..5b7998198ef 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py
@@ -285,7 +285,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
                 else:
                     self.base_opti[k] = v
 
-            fluid.save_dygraph(self.opti_dict, "./test_dy_v2")
+            paddle.save(self.opti_dict, "./test_dy_v2.pdopt")
 
             self.state_dict = ptb_model.state_dict()
 
@@ -294,7 +294,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
                 np_t = v.numpy()
                 self.model_base[k] = np_t
 
-            paddle.save(self.state_dict, "./test_dy_v2")
+            paddle.save(self.state_dict, "./test_dy_v2.pdparams")
 
     def testLoadAndSetVarBase(self):
         self.setUp()
@@ -374,7 +374,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
 
                     self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
 
-            para_state_dict, opti_state_dict = paddle.load("./test_dy_v2")
+            para_state_dict = paddle.load("./test_dy_v2.pdparams")
+            opti_state_dict = paddle.load("./test_dy_v2.pdopt")
             adam.set_state_dict(opti_state_dict)
 
             opti_dict = adam.state_dict()
@@ -905,26 +906,19 @@ class TestDygraphPtbRnn(unittest.TestCase):
         with fluid.dygraph.guard():
             emb = fluid.dygraph.Embedding([10, 10])
             state_dict = emb.state_dict()
-            paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy'))
+            paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy.pdparams'))
 
-            para_state_dict, opti_state_dict = paddle.load(
-                os.path.join('saved_dy', 'emb_dy'))
-
-            self.assertTrue(opti_state_dict == None)
-
-            para_state_dict, opti_state_dict = paddle.load(
+            para_state_dict = paddle.load(
                 os.path.join('saved_dy', 'emb_dy.pdparams'))
 
-            para_state_dict, opti_state_dict = paddle.load(
-                os.path.join('saved_dy', 'emb_dy.pdopt'))
-
     def test_no_state_in_input_dict(self):
         with fluid.dygraph.guard():
             emb = fluid.dygraph.Embedding([10, 10])
             state_dict = emb.state_dict()
-            paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy'))
+            paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy.pdparams'))
 
-            para_state_dict, _ = paddle.load(os.path.join('saved_dy', 'emb_dy'))
+            para_state_dict = paddle.load(
+                os.path.join('saved_dy', 'emb_dy.pdparams'))
             para_state_dict.pop('weight')
 
             emb.set_state_dict(para_state_dict)
@@ -933,9 +927,10 @@ class TestDygraphPtbRnn(unittest.TestCase):
         with fluid.dygraph.guard():
             emb = fluid.dygraph.Embedding([10, 10])
             state_dict = emb.state_dict()
-            paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy'))
+            paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy.pdparams'))
 
-            para_state_dict, _ = paddle.load(os.path.join('saved_dy', 'emb_dy'))
+            para_state_dict = paddle.load(
+                os.path.join('saved_dy', 'emb_dy.pdparams'))
             para_state_dict['weight'] = np.expand_dims(
                 para_state_dict['weight'], axis=-1)
 
diff --git a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py
index a1a9b3f444f..fdc1e6b52ab 100644
--- a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py
+++ b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py
@@ -124,52 +124,67 @@ class TestLoadStateDictFromSaveInferenceModel(unittest.TestCase):
         self.params_filename = None
         orig_param_dict = self.train_and_save_model()
 
-        load_param_dict, _ = paddle.load(self.save_dirname)
+        load_param_dict, _ = fluid.load_dygraph(self.save_dirname)
         self.check_load_state_dict(orig_param_dict, load_param_dict)
 
+        new_load_param_dict = paddle.load(self.save_dirname)
+        self.check_load_state_dict(orig_param_dict, new_load_param_dict)
+
     def test_load_with_model_filename(self):
         self.save_dirname = "static_mnist.load_state_dict.model_filename"
         self.model_filename = "static_mnist.model"
         self.params_filename = None
         orig_param_dict = self.train_and_save_model()
 
-        configs = paddle.SaveLoadConfig()
-        configs.separate_params = True
-        configs.model_filename = self.model_filename
-        load_param_dict, _ = paddle.load(self.save_dirname, configs)
+        config = paddle.SaveLoadConfig()
+        config.separate_params = True
+        config.model_filename = self.model_filename
+        load_param_dict, _ = fluid.load_dygraph(self.save_dirname, config)
         self.check_load_state_dict(orig_param_dict, load_param_dict)
 
+        new_load_param_dict = paddle.load(self.save_dirname, config)
+        self.check_load_state_dict(orig_param_dict, new_load_param_dict)
+
     def test_load_with_param_filename(self):
         self.save_dirname = "static_mnist.load_state_dict.param_filename"
         self.model_filename = None
         self.params_filename = "static_mnist.params"
         orig_param_dict = self.train_and_save_model()
 
-        configs = paddle.SaveLoadConfig()
-        configs.params_filename = self.params_filename
-        load_param_dict, _ = paddle.load(self.save_dirname, configs)
+        config = paddle.SaveLoadConfig()
+        config.params_filename = self.params_filename
+        load_param_dict, _ = fluid.load_dygraph(self.save_dirname, config)
         self.check_load_state_dict(orig_param_dict, load_param_dict)
 
+        new_load_param_dict = paddle.load(self.save_dirname, config)
+        self.check_load_state_dict(orig_param_dict, new_load_param_dict)
+
     def test_load_with_model_and_param_filename(self):
         self.save_dirname = "static_mnist.load_state_dict.model_and_param_filename"
         self.model_filename = "static_mnist.model"
         self.params_filename = "static_mnist.params"
         orig_param_dict = self.train_and_save_model()
 
-        configs = paddle.SaveLoadConfig()
-        configs.params_filename = self.params_filename
-        configs.model_filename = self.model_filename
-        load_param_dict, _ = paddle.load(self.save_dirname, configs)
+        config = paddle.SaveLoadConfig()
+        config.params_filename = self.params_filename
+        config.model_filename = self.model_filename
+        load_param_dict, _ = fluid.load_dygraph(self.save_dirname, config)
         self.check_load_state_dict(orig_param_dict, load_param_dict)
 
+        new_load_param_dict = paddle.load(self.save_dirname, config)
+        self.check_load_state_dict(orig_param_dict, new_load_param_dict)
+
     def test_load_state_dict_from_save_params(self):
         self.save_dirname = "static_mnist.load_state_dict.save_params"
         self.params_filename = None
         orig_param_dict = self.train_and_save_model(True)
 
-        load_param_dict, _ = paddle.load(self.save_dirname)
+        load_param_dict, _ = fluid.load_dygraph(self.save_dirname)
         self.check_load_state_dict(orig_param_dict, load_param_dict)
 
+        new_load_param_dict = paddle.load(self.save_dirname)
+        self.check_load_state_dict(orig_param_dict, new_load_param_dict)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py
new file mode 100644
index 00000000000..74d44d0f8b6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py
@@ -0,0 +1,148 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.optimizer as opt
+
+BATCH_SIZE = 16
+BATCH_NUM = 4
+EPOCH_NUM = 4
+SEED = 10
+
+IMAGE_SIZE = 784
+CLASS_NUM = 10
+
+
+# define a random dataset
+class RandomDataset(paddle.io.Dataset):
+    def __init__(self, num_samples):
+        self.num_samples = num_samples
+
+    def __getitem__(self, idx):
+        np.random.seed(SEED)
+        image = np.random.random([IMAGE_SIZE]).astype('float32')
+        label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
+        return image, label
+
+    def __len__(self):
+        return self.num_samples
+
+
+class LinearNet(nn.Layer):
+    def __init__(self):
+        super(LinearNet, self).__init__()
+        self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)
+
+    def forward(self, x):
+        return self._linear(x)
+
+
+def train(layer, loader, loss_fn, opt):
+    for epoch_id in range(EPOCH_NUM):
+        for batch_id, (image, label) in enumerate(loader()):
+            out = layer(image)
+            loss = loss_fn(out, label)
+            loss.backward()
+            opt.step()
+            opt.clear_grad()
+
+
+class TestSaveLoad(unittest.TestCase):
+    def setUp(self):
+        # enable dygraph mode
+        self.place = paddle.CPUPlace()
+        paddle.disable_static(self.place)
+
+        # config seed
+        paddle.manual_seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
+
+    def build_and_train_model(self):
+        # create network
+        layer = LinearNet()
+        loss_fn = nn.CrossEntropyLoss()
+
+        adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters())
+
+        # create data loader
+        dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
+        loader = paddle.io.DataLoader(
+            dataset,
+            places=self.place,
+            batch_size=BATCH_SIZE,
+            shuffle=True,
+            drop_last=True,
+            num_workers=2)
+
+        # train
+        train(layer, loader, loss_fn, adam)
+
+        return layer, adam
+
+    def check_load_state_dict(self, orig_dict, load_dict):
+        for var_name, value in orig_dict.items():
+            self.assertTrue(np.array_equal(value.numpy(), load_dict[var_name]))
+
+    def test_save_load(self):
+        layer, opt = self.build_and_train_model()
+
+        # save
+        layer_save_path = "linear.pdparams"
+        opt_save_path = "linear.pdopt"
+        layer_state_dict = layer.state_dict()
+        opt_state_dict = opt.state_dict()
+
+        paddle.save(layer_state_dict, layer_save_path)
+        paddle.save(opt_state_dict, opt_save_path)
+
+        # load
+        load_layer_state_dict = paddle.load(layer_save_path)
+        load_opt_state_dict = paddle.load(opt_save_path)
+
+        self.check_load_state_dict(layer_state_dict, load_layer_state_dict)
+        self.check_load_state_dict(opt_state_dict, load_opt_state_dict)
+
+        # test save load in static mode
+        paddle.enable_static()
+        static_save_path = "static_mode_test/linear.pdparams"
+        paddle.save(layer_state_dict, static_save_path)
+        load_static_state_dict = paddle.load(static_save_path)
+        self.check_load_state_dict(layer_state_dict, load_static_state_dict)
+
+        # error test cases, some tests relay base test above
+        # 1. test save obj not dict error
+        test_list = [1, 2, 3]
+        with self.assertRaises(NotImplementedError):
+            paddle.save(test_list, "not_dict_error_path")
+
+        # 2. test save path format error
+        with self.assertRaises(ValueError):
+            paddle.save(layer_state_dict, "linear.model/")
+
+        # 3. test load path not exist error
+        with self.assertRaises(ValueError):
+            paddle.load("linear.params")
+
+        # 4. test load old save path error
+        with self.assertRaises(ValueError):
+            paddle.load("linear")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py
index f33e4e0fca8..2ce442add2e 100644
--- a/python/paddle/framework/__init__.py
+++ b/python/paddle/framework/__init__.py
@@ -48,8 +48,8 @@ from paddle.fluid import core  #DEFINE_ALIAS
 from ..fluid.dygraph.base import no_grad  #DEFINE_ALIAS
 from ..fluid.dygraph.base import to_variable  #DEFINE_ALIAS
 from ..fluid.dygraph.base import grad  #DEFINE_ALIAS
-from ..fluid.dygraph.checkpoint import load_dygraph as load  #DEFINE_ALIAS
-from ..fluid.dygraph.checkpoint import save_dygraph as save  #DEFINE_ALIAS
+from .io import save
+from .io import load
 from ..fluid.dygraph.jit import SaveLoadConfig  #DEFINE_ALIAS
 from ..fluid.dygraph.parallel import DataParallel  #DEFINE_ALIAS
 
diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py
new file mode 100644
index 00000000000..7175f310144
--- /dev/null
+++ b/python/paddle/framework/io.py
@@ -0,0 +1,291 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import collections
+import pickle
+import six
+import warnings
+
+import paddle
+
+# deprecated module import
+from paddle import fluid
+from paddle.fluid import core
+from paddle.fluid.framework import Variable, _varbase_creator, _dygraph_tracer
+from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers, EXTRA_VAR_INFO_FILENAME
+
+__all__ = [
+    'save',
+    'load',
+]
+
+
+def _build_saved_state_dict(state_dict):
+    save_dict = {}
+    name_table = {}
+    for key, value in state_dict.items():
+        if isinstance(value, (Variable, core.VarBase)):
+            save_dict[key] = value.numpy()
+            name_table[key] = value.name
+        else:
+            save_dict[key] = value
+    save_dict["StructuredToParameterName@@"] = name_table
+
+    return save_dict
+
+
+def _load_state_dict_from_save_inference_model(model_path, config):
+    # 1. load program desc & construct _ProgramHolder
+    programs = _construct_program_holders(model_path, config.model_filename)
+
+    # 2. load layer parameters & buffers
+    with fluid.dygraph.guard():
+        persistable_var_dict = _construct_params_and_buffers(
+            model_path,
+            programs,
+            config.separate_params,
+            config.params_filename,
+            append_suffix=False)
+
+        # 3. construct state_dict
+        load_param_dict = dict()
+        for var_name in persistable_var_dict:
+            load_param_dict[var_name] = persistable_var_dict[var_name].numpy()
+
+        # if __variables.info__ exists, we can recover structured_name
+        var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME)
+        if os.path.exists(var_info_path):
+            with open(var_info_path, 'rb') as f:
+                extra_var_info = pickle.load(f)
+            structured_para_dict = dict()
+            for var_name in load_param_dict:
+                structured_name = extra_var_info[var_name].get(
+                    'structured_name', None)
+                assert structured_name is not None, "Cannot find saved variable (%s)'s structured name in saved model." % var_name
+                structured_para_dict[structured_name] = load_param_dict[
+                    var_name]
+            load_param_dict = structured_para_dict
+
+    return load_param_dict
+
+
+def _load_state_dict_from_save_params(model_path):
+    # Try to load all the files in the directory in VarBase format, 
+    # the file name is used as the name of VarBase
+    load_var_list = []
+
+    # 1. load file names
+    var_name_list = []
+    for root, _, files in os.walk(model_path):
+        for filename in files:
+            file_path = os.path.join(root, filename)
+            tmp_var_name = os.path.relpath(file_path, model_path)
+            var_name = tmp_var_name.replace("\\", "/")
+            var_name_list.append(var_name)
+
+    # 2. create and load VarBase
+    with fluid.dygraph.guard():
+        for name in var_name_list:
+            new_var = _varbase_creator(name=name, persistable=True)
+            _dygraph_tracer().trace_op(
+                type='load',
+                inputs={},
+                outputs={'Out': new_var},
+                attrs={'file_path': os.path.join(model_path, name)})
+            load_var_list.append(new_var)
+
+    # 3. construct state_dict
+    load_param_dict = dict()
+    for var in load_var_list:
+        load_param_dict[var.name] = var.numpy()
+
+    return load_param_dict
+
+
+def save(obj, path):
+    '''
+    Save an object to the specified path.
+    
+    .. note::
+        Now only supports save ``state_dict`` of Layer or Optimizer.
+    
+    Args:
+        obj(Object) : The object to be saved.
+        path(str) : The path of the object to be saved. 
+          If saved in the current directory, the input path string will be used as the file name. 
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            paddle.disable_static()
+
+            emb = paddle.nn.Embedding(10, 10)
+            layer_state_dict = emb.state_dict()
+            paddle.save(layer_state_dict, "emb.pdparams")
+
+            scheduler = paddle.optimizer.lr_scheduler.NoamLR(	
+                d_model=0.01, warmup_steps=100, verbose=True)
+            adam = paddle.optimizer.Adam(
+                learning_rate=scheduler,
+                parameters=emb.parameters())
+            opt_state_dict = adam.state_dict()
+            paddle.save(opt_state_dict, "adam.pdopt")
+    '''
+
+    # 1. input check
+    if not isinstance(obj, dict):
+        raise NotImplementedError(
+            "Now only supports save state_dict of Layer or Optimizer, "
+            "expect dict, but received %s." % type(obj))
+
+    if len(obj) == 0:
+        warnings.warn("The input state dict is empty, no need to save.")
+
+    filename = os.path.basename(path)
+    if filename == "":
+        raise ValueError("The input path MUST be format of dirname/filename "
+                         "[dirname\\filename in Windows system], but received "
+                         "filename is empty string.")
+
+    # 2. save object
+    dirname = os.path.dirname(path)
+    if dirname and not os.path.exists(dirname):
+        os.makedirs(dirname)
+
+    # TODO(chenweihang): supports save other object
+    saved_obj = _build_saved_state_dict(obj)
+
+    with open(path, 'wb') as f:
+        pickle.dump(saved_obj, f, protocol=2)
+
+
+def load(path, config=None):
+    '''
+    Load an object can be used in paddle from specified path.
+
+    .. note::
+        Now only supports load ``state_dict`` of Layer or Optimizer.
+
+    .. note::
+        ``paddle.load`` supports loading ``state_dict`` from the result of several 
+        paddle1.x save APIs in static mode, but due to some historical reasons, 
+        if you load ``state_dict`` from the saved result of 
+        ``paddle.static.save_inference_model/paddle.fluid.io.save_params/paddle.fluid.io.save_persistables`` , 
+        the structured variable name will cannot be restored. You need to set the argument 
+        ``use_structured_name=False`` when using ``Layer.set_state_dict`` later.
+
+    Args:
+        path(str) : The path to load the target object. Generally, the path is the target 
+            file path, when compatible with loading the saved results of 
+            ``paddle.jit.save/paddle.static.save_inference_model`` , the path is a directory. 
+        config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig`
+            object that specifies additional configuration options, these options 
+            are for compatibility with ``paddle.jit.save/paddle.static.save_inference_model`` 
+            formats. Default None.
+
+    Returns:
+        Object(Object): a target object can be used in paddle
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            
+            paddle.disable_static()
+
+            emb = paddle.nn.Embedding(10, 10)
+            layer_state_dict = emb.state_dict()
+            paddle.save(layer_state_dict, "emb.pdparams")
+
+            scheduler = paddle.optimizer.lr_scheduler.NoamLR(	
+                d_model=0.01, warmup_steps=100, verbose=True)
+            adam = paddle.optimizer.Adam(
+                learning_rate=scheduler,
+                parameters=emb.parameters())
+            opt_state_dict = adam.state_dict()
+            paddle.save(opt_state_dict, "adam.pdopt")
+
+            load_layer_state_dict = paddle.load("emb.pdparams")
+            load_opt_state_dict = paddle.load("adam.pdopt")
+    '''
+    # 1. input check
+    if not os.path.exists(path):
+        error_msg = "The path `%s` does not exist."
+        # if current path is a prefix, and the path.pdparams or path.pdopt
+        # is exist, users may want use `paddle.load` load the result of 
+        # `fluid.save_dygraph`, we raise error here for users
+        params_file_path = path + ".pdparams"
+        opti_file_path = path + ".pdopt"
+        if os.path.exists(params_file_path) or os.path.exists(opti_file_path):
+            error_msg += " If you want to load the results saved by `fluid.save_dygraph`, " \
+                "please specify the full file name, not just the file name prefix. For " \
+                "example, it should be written as `paddle.load('model.pdparams')` instead of " \
+                "`paddle.load('model')`."
+        raise ValueError(error_msg % path)
+
+    if config is None:
+        config = paddle.SaveLoadConfig()
+
+    # 2. load target
+    load_result = None
+    if os.path.isfile(path):
+        # we think path is file means this file is created by paddle.save
+        with open(path, 'rb') as f:
+            load_result = pickle.load(f) if six.PY2 else pickle.load(
+                f, encoding='latin1')
+
+        if not config.keep_name_table and "StructuredToParameterName@@" in load_result:
+            del load_result["StructuredToParameterName@@"]
+    elif os.path.isdir(path):
+        # we think path is directory means compatible with loading 
+        # store results of static mode related save APIs
+
+        # check whether model file exists
+        if config.model_filename is None:
+            model_filename = '__model__'
+        else:
+            model_filename = config.model_filename
+        model_file_path = os.path.join(path, model_filename)
+
+        if os.path.exists(model_file_path):
+            # Load state dict by `jit.save/io.save_inference_model` save format
+            # NOTE(chenweihang): [ Compatibility of save_inference_model save format ]
+            # The model saved by `save_inference_model` does not completely correspond to 
+            # the information required by the `state_dict` under the dygraph. 
+            # `save_inference_model` not save structured name, we need to remind 
+            # the user to configure the `use_structured_name` argument when `set_state_dict`
+            # NOTE(chenweihang): `jit.save` doesn't save optimizer state 
+            load_result = _load_state_dict_from_save_inference_model(path,
+                                                                     config)
+        else:
+            # load state dict by `io.save_params/persistables` save format
+            # TODO(chenweihang): [ Now only supports loading parameters seperately ]
+            # If users save all parameters as one file, the [ variable.name -> variable ]
+            # mapping info will lost, so users need to give variable list, but users build 
+            # variable list in dygraph mode is difficult, we recommend users to use
+            # paddle.static.load_program_state in this case
+            load_result = _load_state_dict_from_save_params(path)
+    else:
+        raise ValueError(
+            "Unsupported path format, now only supports file or directory.")
+
+    return load_result
diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py
index 6f0b0f3c9c1..92dd819b3cd 100644
--- a/python/paddle/io/__init__.py
+++ b/python/paddle/io/__init__.py
@@ -25,16 +25,8 @@ __all__ = [
     'Sampler',
     'SequenceSampler',
     'RandomSampler',
-    'load',
-    'save',
-    'load_program_state',
-    'set_program_state',
-    'load_inference_model',
-    'save_inference_model',
 ]
 
 from ..fluid.io import DataLoader
 from ..fluid.dataloader import Dataset, IterableDataset, BatchSampler, get_worker_info, \
         TensorDataset, Sampler, SequenceSampler, RandomSampler, DistributedBatchSampler
-from ..fluid.io import load, save, load_program_state, set_program_state, \
-        load_inference_model, save_inference_model, batch
diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py
index 42a28a4f04e..e0a9bc6eec3 100644
--- a/python/paddle/static/__init__.py
+++ b/python/paddle/static/__init__.py
@@ -17,8 +17,9 @@ __all__ = [
     'append_backward', 'gradients', 'Executor', 'global_scope', 'scope_guard',
     'BuildStrategy', 'CompiledProgram', 'Print', 'py_func', 'ExecutionStrategy',
     'name_scope', 'ParallelExecutor', 'program_guard', 'WeightNormParamAttr',
-    'default_main_program', 'default_startup_program', 'Program', 'save',
-    'load', 'data', 'InputSpec'
+    'default_main_program', 'default_startup_program', 'Program', 'data',
+    'InputSpec', 'save', 'load', 'save_inference_model', 'load_inference_model',
+    'load_program_state', 'set_program_state'
 ]
 
 from . import nn
@@ -41,5 +42,9 @@ from ..fluid.layers.control_flow import Print  #DEFINE_ALIAS
 from ..fluid.layers.nn import py_func  #DEFINE_ALIAS
 from ..fluid.parallel_executor import ParallelExecutor  #DEFINE_ALIAS
 from ..fluid.param_attr import WeightNormParamAttr  #DEFINE_ALIAS
-from ..tensor.io import save  #DEFINE_ALIAS
-from ..tensor.io import load  #DEFINE_ALIAS
+from ..fluid.io import save  #DEFINE_ALIAS
+from ..fluid.io import load  #DEFINE_ALIAS
+from ..fluid.io import save_inference_model  #DEFINE_ALIAS
+from ..fluid.io import load_inference_model  #DEFINE_ALIAS
+from ..fluid.io import load_program_state  #DEFINE_ALIAS
+from ..fluid.io import set_program_state  #DEFINE_ALIAS
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index cec989fba8b..b6bab16c968 100755
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -42,8 +42,6 @@ from .creation import tril  #DEFINE_ALIAS
 from .creation import meshgrid  #DEFINE_ALIAS
 from .creation import empty  #DEFINE_ALIAS
 from .creation import empty_like  #DEFINE_ALIAS
-from .io import save  #DEFINE_ALIAS
-from .io import load  #DEFINE_ALIAS
 from .linalg import matmul  #DEFINE_ALIAS
 from .linalg import dot  #DEFINE_ALIAS
 # from .linalg import einsum        #DEFINE_ALIAS
diff --git a/python/paddle/tensor/io.py b/python/paddle/tensor/io.py
deleted file mode 100644
index 66e956e8e4b..00000000000
--- a/python/paddle/tensor/io.py
+++ /dev/null
@@ -1,19 +0,0 @@
-#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# TODO: define functions to save & load a tensor  
-from ..fluid import save  #DEFINE_ALIAS
-from ..fluid.io import load  #DEFINE_ALIAS
-
-__all__ = ['save', 'load']
-- 
GitLab


From 66951ab2eaf345422e91ba448fba7755834d3b38 Mon Sep 17 00:00:00 2001
From: YUNSHEN XIE <1084314248@qq.com>
Date: Wed, 23 Sep 2020 20:39:24 +0800
Subject: [PATCH 190/261] modified timeout value for 4 ut (#27462)

---
 paddle/fluid/inference/tests/api/CMakeLists.txt       | 7 +++++++
 python/paddle/fluid/contrib/slim/tests/CMakeLists.txt | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 146d5932577..28211d0ce08 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -198,6 +198,9 @@ inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc
 if(NOT WIN32 AND NOT APPLE)
     set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS "RUN_TYPE=NIGHTLY")
 endif()
+if (WIN32)
+    set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 200)
+endif()
 
 # text_classification
 set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
@@ -258,6 +261,10 @@ set(RESNET50_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50")
 download_data(${RESNET50_MODEL_DIR} "resnet50_model.tar.gz")
 inference_analysis_api_test_with_fake_data_run(test_analyzer_resnet50 ${IMG_CLASS_TEST_APP}
 	${RESNET50_MODEL_DIR} true)
+if (WIN32)
+    set_tests_properties(test_analyzer_resnet50 PROPERTIES TIMEOUT 200)
+endif()
+
 
 # mobilenet with depthwise_conv op
 set(MOBILENET_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv")
diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
index 6ac005060e0..dd4bea06572 100644
--- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
@@ -270,7 +270,7 @@ foreach(src ${TEST_OPS})
 endforeach()
 
 # setting timeout value for old unittests
-if(NOT WIN32 AND NOT APPLE)
+if(NOT WIN32)
     set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 250 LABELS "RUN_TYPE=NIGHTLY")
 	set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 200 LABELS "RUN_TYPE=NIGHTLY")
 endif()
-- 
GitLab


From c0caf0e45fac2bbbae184a702a4fe0313a9c3c69 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Wed, 23 Sep 2020 20:44:15 +0800
Subject: [PATCH 191/261] fix ut for static graph (#27506)

---
 python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py     | 3 +++
 .../tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py  | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
index d5b1284e3ce..c09f22f3fc5 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
@@ -15,11 +15,14 @@
 from __future__ import print_function
 
 import unittest
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
 
+paddle.enable_static()
+
 # For Net
 base_lr = 0.2
 emb_lr = base_lr * 3
diff --git a/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py b/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py
index bca91c536ba..ee099e48eff 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_lookup_sparse_table_fuse_ops.py
@@ -20,6 +20,9 @@ import numpy as np
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 
+import paddle
+paddle.enable_static()
+
 
 class TestLookupTableFuseOp(unittest.TestCase):
     def test_fuse(self):
-- 
GitLab


From 4a9d21de4987ced5aaf58a318ac598abff853b48 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Thu, 24 Sep 2020 10:11:06 +0800
Subject: [PATCH 192/261] Add GPU Kernels of Segment Ops,  support, sum, max,
 min, mean

Add GPU Kernels of Segment Ops,  support, sum, max, min, mean
---
 .../fluid/operators/math/segment_pooling.cu   | 365 ++++++++++++++++++
 paddle/fluid/operators/segment_pool_op.cu     |  28 ++
 paddle/fluid/operators/segment_pool_op.h      |  40 ++
 paddle/fluid/platform/cuda_primitives.h       | 107 +++++
 4 files changed, 540 insertions(+)
 create mode 100644 paddle/fluid/operators/math/segment_pooling.cu
 create mode 100644 paddle/fluid/operators/segment_pool_op.cu

diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu
new file mode 100644
index 00000000000..bb2b6db100b
--- /dev/null
+++ b/paddle/fluid/operators/math/segment_pooling.cu
@@ -0,0 +1,365 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
+#include "paddle/fluid/operators/gather.cu.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/segment_pooling.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_launch_param_config.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T, typename Index, int DimTileSize>
+__global__ void SegmentMeanCustomKernel(
+    const Index* segment_ids, const T* input, T* output, T* summed_ids,
+    const Index input_length_size, const Index inner_dim_size,
+    const Index output_length_size, const Index total_stripe_count) {
+  CUDA_KERNEL_LOOP(stripe_index, total_stripe_count) {
+    const Index segment_offset = stripe_index % inner_dim_size;
+    const Index dim_index_base =
+        stripe_index / inner_dim_size * Index(DimTileSize);
+    const Index actual_height =
+        min(Index(DimTileSize), input_length_size - dim_index_base);
+
+    Index first_segment_id = segment_ids[dim_index_base];
+    Index last_segment_id = -1;
+    if (dim_index_base > 0) {
+      last_segment_id = segment_ids[dim_index_base - 1];
+    }
+    if (segment_offset == 0) {
+      T sum = T(0);
+      for (Index j = 0; j < actual_height; j++) {
+        Index current_segment_id = segment_ids[dim_index_base + j];
+        // Note(ZHUI): following check may cause
+        // cudaErrorLaunchOutOfResources.
+        // PADDLE_ENFORCE(current_segment_id >= last_segment_id,
+        //               "the segment ids should be sorted, but got "
+        //               "segment_ids[%d]:%d > segment_ids[%d]:%d.",
+        //               dim_index_base + j - 1, dim_index_base + j,
+        //               last_segment_id, current_segment_id);
+
+        if (j > 0 && current_segment_id > last_segment_id) {
+          if (last_segment_id == first_segment_id) {
+            platform::CudaAtomicAdd(summed_ids + last_segment_id, sum);
+          } else {
+            *(summed_ids + last_segment_id) = sum;
+          }
+          sum = T(0);
+        }
+        sum += T(1);
+        last_segment_id = current_segment_id;
+      }
+      platform::CudaAtomicAdd(summed_ids + last_segment_id, sum);
+    }
+    // ensure last_segment_id is the largest
+    last_segment_id = output_length_size;
+    __syncthreads();
+    T sum = T(0);
+    for (Index j = 0; j < actual_height; j++) {
+      Index current_segment_id = segment_ids[dim_index_base + j];
+      if (current_segment_id > last_segment_id) {
+        const Index output_index =
+            last_segment_id * inner_dim_size + segment_offset;
+        if (last_segment_id == first_segment_id) {
+          platform::CudaAtomicAdd(output + output_index,
+                                  sum / *(summed_ids + last_segment_id));
+        } else {
+          *(output + output_index) = sum / *(summed_ids + last_segment_id);
+        }
+        sum = T(0);
+      }
+      sum += input[(dim_index_base + j) * inner_dim_size + segment_offset];
+      last_segment_id = current_segment_id;
+    }
+    const Index output_index =
+        last_segment_id * inner_dim_size + segment_offset;
+    platform::CudaAtomicAdd(output + output_index,
+                            sum / *(summed_ids + last_segment_id));
+  }
+}
+
+template <typename T, typename Index, typename Helper, typename Pool>
+__global__ void SegmentOpsKernel(const Index* segment_ids, const T* input,
+                                 T* output, Helper h, Pool pool) {
+  CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) {
+    Index segment_offset, dim_index_base, actual_height;
+    Index inner_dim_size = h.inner_dim_size;
+    h.calculate(stripe_index, segment_offset, dim_index_base, actual_height);
+
+    T minmax = pool.initial();
+    Index first_segment_id = segment_ids[dim_index_base];
+    // -1 is for the start value when interval_id = 0
+    Index last_segment_id = -1;
+    if (dim_index_base > 0) {
+      last_segment_id = segment_ids[dim_index_base - 1];
+    }
+
+    for (Index j = 0; j < actual_height; j++) {
+      Index current_segment_id = segment_ids[dim_index_base + j];
+      // ensure the segment_ids is sorted.
+      PADDLE_ENFORCE(current_segment_id >= last_segment_id,
+                     "The segment ids should be sorted, but got "
+                     "segment_ids[%d]:%d > segment_ids[%d]:%d.",
+                     dim_index_base + j - 1, dim_index_base + j,
+                     last_segment_id, current_segment_id);
+
+      if (current_segment_id > last_segment_id) {
+        // reset the interval value which do not have corresponding ids.
+        for (Index interval_id = last_segment_id + 1;
+             interval_id < current_segment_id; ++interval_id) {
+          *(output + interval_id * inner_dim_size + segment_offset) = 0;
+        }
+        // don't update result when j=0
+        if (j > 0) {
+          const Index output_index =
+              last_segment_id * inner_dim_size + segment_offset;
+          if (last_segment_id == first_segment_id) {
+            pool.atomic(output + output_index, minmax);
+          } else {
+            *(output + output_index) = minmax;
+          }
+          minmax = pool.initial();
+        }
+      }
+      pool.compute(
+          input[(dim_index_base + j) * inner_dim_size + segment_offset],
+          &minmax);
+      last_segment_id = current_segment_id;
+    }
+    const Index output_index =
+        last_segment_id * inner_dim_size + segment_offset;
+    pool.atomic(output + output_index, minmax);
+  }
+}
+
+template <typename T, typename Index, typename Helper>
+__global__ void SegmentIndexGradKernel(const Index* segment_ids, const T* input,
+                                       const T* output, const T* out_grad,
+                                       T* in_grad, Helper h) {
+  CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) {
+    Index segment_offset, dim_index_base, actual_height;
+    h.calculate(stripe_index, segment_offset, dim_index_base, actual_height);
+
+    for (Index j = 0; j < actual_height; j++) {
+      Index current_segment_id = segment_ids[dim_index_base + j];
+      Index input_index =
+          (dim_index_base + j) * h.inner_dim_size + segment_offset;
+      Index output_index =
+          current_segment_id * h.inner_dim_size + segment_offset;
+      if (input[input_index] == output[output_index]) {
+        in_grad[input_index] = out_grad[output_index];
+      }
+    }
+  }
+}
+
+template <class T>
+class MaxPool {
+ public:
+  DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
+  DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; }
+  DEVICE inline T atomic(T* address, const T val) {
+    return platform::CudaAtomicMax(address, val);
+  }
+};
+
+template <class T>
+class MinPool {
+ public:
+  DEVICE inline T initial() { return static_cast<T>(FLT_MAX); }
+  DEVICE inline void compute(const T& x, T* y) { *y = *y < x ? *y : x; }
+  DEVICE inline T atomic(T* address, const T val) {
+    return platform::CudaAtomicMin(address, val);
+  }
+};
+
+template <class T>
+class SumPool {
+ public:
+  DEVICE inline T initial() { return static_cast<T>(0); }
+  DEVICE inline void compute(const T& x, T* y) { *y = *y + x; }
+  DEVICE inline T atomic(T* address, const T val) {
+    return platform::CudaAtomicAdd(address, val);
+  }
+};
+
+template <class T>
+class ArrangeHelper {
+ public:
+  const T input_total_size;
+  const T input_length_size;
+  const T output_length_size;
+  T inner_dim_size;
+  T total_stripe_count;
+  const T DimTileSize = 8;
+
+  ArrangeHelper(T a, T b, T c)
+      : input_total_size(a), input_length_size(b), output_length_size(c) {
+    T input_outer_dim_num_stripe =
+        (input_length_size + DimTileSize - 1) / DimTileSize;
+    inner_dim_size = input_total_size / input_length_size;
+    total_stripe_count = inner_dim_size * input_outer_dim_num_stripe;
+  }
+
+  DEVICE inline void calculate(T stripe_index, T& segment_offset,
+                               T& dim_index_base, T& actual_height) {
+    segment_offset = stripe_index % inner_dim_size;
+    dim_index_base = stripe_index / inner_dim_size * DimTileSize;
+    actual_height = min(DimTileSize, input_length_size - dim_index_base);
+  }
+};
+
+template <typename T, typename Index>
+void SegmentPoolCUDAGradFunctor(const platform::CUDADeviceContext& ctx,
+                                const framework::Tensor& input,
+                                const framework::Tensor& segment_ids,
+                                const framework::Tensor& output,
+                                const framework::Tensor& out_grad,
+                                framework::Tensor* in_grad,
+                                const std::string pooltype = "SUM") {
+  auto h = ArrangeHelper<Index>(input.numel(), segment_ids.dims()[0],
+                                output.dims()[0]);
+  auto config = platform::GetGpuLaunchConfig1D(ctx, h.total_stripe_count);
+  if (pooltype == "MAX" || pooltype == "MIN") {
+    SegmentIndexGradKernel<T, Index, ArrangeHelper<Index>><<<
+        config.block_per_grid.x, config.thread_per_block.x, 0, ctx.stream()>>>(
+        segment_ids.data<Index>(), input.data<T>(), output.data<T>(),
+        out_grad.data<T>(), in_grad->data<T>(), h);
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Unsupported segment pooling grad operation, Only MAX, MIN "
+        "available, but got %s.",
+        pooltype));
+  }
+}
+
+template <typename T>
+__global__ void SimpleDiv(T* x, const T* y, const int len, const int dim) {
+  for (int i = blockIdx.x; i < len; i += gridDim.x) {
+    __shared__ T y_i;
+    auto base = i * dim;
+    if (threadIdx.x == 0) {
+      y_i = y[i];
+    }
+    __syncthreads();
+    for (int j = threadIdx.x; j < dim; j += blockDim.x) {
+      x[base + j] /= y_i;
+    }
+  }
+}
+
+template <typename T, typename IndexT>
+class SegmentPoolFunctor<platform::CUDADeviceContext, T, IndexT> {
+ public:
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& input,
+                  const framework::Tensor& segment_ids,
+                  framework::Tensor* output,
+                  framework::Tensor* summed_ids = nullptr,
+                  const std::string pooltype = "SUM") {
+    auto h = ArrangeHelper<IndexT>(input.numel(), segment_ids.dims()[0],
+                                   output->dims()[0]);
+    auto config = platform::GetGpuLaunchConfig1D(ctx, h.total_stripe_count);
+    if (pooltype == "MEAN") {
+      SegmentMeanCustomKernel<
+          T, IndexT, IndexT(8)><<<config.block_per_grid.x,
+                                  config.thread_per_block.x, 0, ctx.stream()>>>(
+          segment_ids.data<IndexT>(), input.data<T>(), output->data<T>(),
+          summed_ids->data<T>(), h.input_length_size, h.inner_dim_size,
+          h.output_length_size, h.total_stripe_count);
+    } else if (pooltype == "SUM") {
+      SumPool<T> pool;
+      SegmentOpsKernel<
+          T, IndexT, ArrangeHelper<IndexT>,
+          SumPool<T>><<<config.block_per_grid.x, config.thread_per_block.x, 0,
+                        ctx.stream()>>>(segment_ids.data<IndexT>(),
+                                        input.data<T>(), output->data<T>(), h,
+                                        pool);
+    } else if (pooltype == "MAX") {
+      MaxPool<T> pool;
+      SegmentOpsKernel<
+          T, IndexT, ArrangeHelper<IndexT>,
+          MaxPool<T>><<<config.block_per_grid.x, config.thread_per_block.x, 0,
+                        ctx.stream()>>>(segment_ids.data<IndexT>(),
+                                        input.data<T>(), output->data<T>(), h,
+                                        pool);
+    } else if (pooltype == "MIN") {
+      MinPool<T> pool;
+      SegmentOpsKernel<
+          T, IndexT, ArrangeHelper<IndexT>,
+          MinPool<T>><<<config.block_per_grid.x, config.thread_per_block.x, 0,
+                        ctx.stream()>>>(segment_ids.data<IndexT>(),
+                                        input.data<T>(), output->data<T>(), h,
+                                        pool);
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Unsupported segment pooling operation, Only MEAN, SUM, MAX, MIN "
+          "available, but got %s.",
+          pooltype));
+    }
+  }
+};
+
+template <typename T, typename IndexT>
+class SegmentPoolGradFunctor<platform::CUDADeviceContext, T, IndexT> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& out_grad,
+                  const framework::Tensor& segments, framework::Tensor* in_grad,
+                  const framework::Tensor* summed_ids = nullptr,
+                  const std::string pooltype = "SUM") {
+    if (pooltype == "MAX" || pooltype == "MIN") {
+      SegmentPoolCUDAGradFunctor<T, IndexT>(context, input, segments, output,
+                                            out_grad, in_grad, pooltype);
+    } else if (pooltype == "MEAN") {
+      framework::Tensor mean_grad;
+      mean_grad.mutable_data<T>(input.dims(), context.GetPlace());
+      framework::TensorCopy(out_grad, context.GetPlace(), context, &mean_grad);
+      int len = output.dims()[0];
+      int dim = output.numel() / len;
+      auto config = platform::GetGpuLaunchConfig1D(context, len);
+      SimpleDiv<T><<<config.block_per_grid.x, config.thread_per_block.x, 0,
+                     context.stream()>>>(mean_grad.data<T>(),
+                                         summed_ids->data<T>(), len, dim);
+      GPUGather<T, IndexT>(context, mean_grad, segments, in_grad);
+    } else if (pooltype == "SUM") {
+      GPUGather<T, IndexT>(context, out_grad, segments, in_grad);
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Unsupported segment pooling operation, Only MEAN, SUM, MAX, MIN "
+          "available, but got %s.",
+          pooltype));
+    }
+  }
+};
+
+using CUDA = paddle::platform::CUDADeviceContext;
+template class SegmentPoolFunctor<CUDA, float, int>;
+template class SegmentPoolFunctor<CUDA, float, int64_t>;
+template class SegmentPoolFunctor<CUDA, double, int>;
+template class SegmentPoolFunctor<CUDA, double, int64_t>;
+template class SegmentPoolGradFunctor<CUDA, float, int>;
+template class SegmentPoolGradFunctor<CUDA, float, int64_t>;
+template class SegmentPoolGradFunctor<CUDA, double, int>;
+template class SegmentPoolGradFunctor<CUDA, double, int64_t>;
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu
new file mode 100644
index 00000000000..dc92d7fcc3a
--- /dev/null
+++ b/paddle/fluid/operators/segment_pool_op.cu
@@ -0,0 +1,28 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/gather.cu.h"
+#include "paddle/fluid/operators/segment_pool_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_launch_param_config.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    segment_pool,
+    ops::SegmentPoolKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SegmentPoolKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    segment_pool_grad,
+    ops::SegmentPoolGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SegmentPoolGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h
index a505946b9f5..23b0c31608d 100644
--- a/paddle/fluid/operators/segment_pool_op.h
+++ b/paddle/fluid/operators/segment_pool_op.h
@@ -63,6 +63,46 @@ void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) {
     auto& dev_ctx = context.template device_context<DeviceContext>();
     set_zero(dev_ctx, output, static_cast<T>(0));
   }
+#ifdef PADDLE_WITH_CUDA
+  if (!cpu_place) {
+    Tensor length;
+    length.mutable_data<IndexT>(framework::make_ddim({1}),
+                                platform::CPUPlace());
+    IndexT* length_data = length.data<IndexT>();
+    const IndexT* segment_ids = segment->data<IndexT>();
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT),
+                   cudaMemcpyDeviceToHost));
+
+    IndexT length_host = length_data[0];
+    length_host++;
+    PADDLE_ENFORCE_GT(
+        length_host, 0,
+        platform::errors::InvalidArgument(
+            "Segment ids must be >= 0, but got last id %d", length_data[0]));
+    auto dims = input->dims();
+    dims[0] = static_cast<int64_t>(length_host);
+    output->Resize({dims});
+    output->mutable_data<T>(context.GetPlace());
+    T init_value = 0;
+    if (pooltype == "MAX") {
+      init_value = static_cast<T>(-FLT_MAX);
+    } else if (pooltype == "MIN") {
+      init_value = static_cast<T>(FLT_MAX);
+    }
+    math::SetConstant<DeviceContext, T> setconst;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    setconst(dev_ctx, output, static_cast<T>(init_value));
+    // the gpu kernel of mean pool record the counts of segment_ids
+    if (pooltype == "MEAN") {
+      summed_ids = context.Output<Tensor>("SummedIds");
+      summed_ids->Resize({dims[0], 1});
+      summed_ids->mutable_data<T>(context.GetPlace());
+      setconst(dev_ctx, summed_ids, static_cast<T>(1e-12));
+    }
+  }
+#endif
 
   SegmentPoolFunctor<DeviceContext, T, IndexT> pool;
 
diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h
index 67ea64833d3..f7c77071b12 100644
--- a/paddle/fluid/platform/cuda_primitives.h
+++ b/paddle/fluid/platform/cuda_primitives.h
@@ -128,5 +128,112 @@ CUDA_ATOMIC_WRAPPER(Add, float16) {
 }
 
 #endif
+
+// For atomicMax
+USE_CUDA_ATOMIC(Max, int);
+USE_CUDA_ATOMIC(Max, unsigned int);
+// CUDA API uses unsigned long long int, we cannot use uint64_t here.
+// It because unsigned long long int is not necessarily uint64_t
+USE_CUDA_ATOMIC(Max, unsigned long long int);  // NOLINT
+
+CUDA_ATOMIC_WRAPPER(Max, int64_t) {
+  // Here, we check long long int must be int64_t.
+  static_assert(sizeof(int64_t) == sizeof(long long int),  // NOLINT
+                "long long should be int64");
+  return CudaAtomicMax(
+      reinterpret_cast<unsigned long long int *>(address),  // NOLINT
+      static_cast<unsigned long long int>(val));            // NOLINT
+}
+
+CUDA_ATOMIC_WRAPPER(Max, float) {
+  if (*address >= val) {
+    return;
+  }
+
+  int *const address_as_i = (int *)address;
+  int old = *address_as_i, assumed;
+
+  do {
+    assumed = old;
+    if (__int_as_float(assumed) >= val) {
+      break;
+    }
+
+    old = atomicCAS(address_as_i, assumed, __float_as_int(val));
+  } while (assumed != old);
+}
+
+CUDA_ATOMIC_WRAPPER(Max, double) {
+  if (*address >= val) {
+    return;
+  }
+
+  unsigned long long int *const address_as_ull =
+      (unsigned long long int *)address;
+  unsigned long long int old = *address_as_ull, assumed;
+
+  do {
+    assumed = old;
+    if (__longlong_as_double(assumed) >= val) {
+      break;
+    }
+
+    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
+  } while (assumed != old);
+}
+
+// For atomicMin
+USE_CUDA_ATOMIC(Min, int);
+USE_CUDA_ATOMIC(Min, unsigned int);
+// CUDA API uses unsigned long long int, we cannot use uint64_t here.
+// It because unsigned long long int is not necessarily uint64_t
+USE_CUDA_ATOMIC(Min, unsigned long long int);  // NOLINT
+
+CUDA_ATOMIC_WRAPPER(Min, int64_t) {
+  // Here, we check long long int must be int64_t.
+  static_assert(sizeof(int64_t) == sizeof(long long int),  // NOLINT
+                "long long should be int64");
+  return CudaAtomicMin(
+      reinterpret_cast<unsigned long long int *>(address),  // NOLINT
+      static_cast<unsigned long long int>(val));            // NOLINT
+}
+
+CUDA_ATOMIC_WRAPPER(Min, float) {
+  if (*address <= val) {
+    return;
+  }
+
+  int *const address_as_i = (int *)address;
+  int old = *address_as_i, assumed;
+
+  do {
+    assumed = old;
+    if (__int_as_float(assumed) <= val) {
+      break;
+    }
+
+    old = atomicCAS(address_as_i, assumed, __float_as_int(val));
+  } while (assumed != old);
+}
+
+CUDA_ATOMIC_WRAPPER(Min, double) {
+  if (*address <= val) {
+    return;
+  }
+
+  unsigned long long int *const address_as_ull =
+      (unsigned long long int *)address;
+  unsigned long long int old = *address_as_ull, assumed;
+
+  do {
+    assumed = old;
+    if (__longlong_as_double(assumed) <= val) {
+      break;
+    }
+
+    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
+  } while (assumed != old);
+}
+
 }  // namespace platform
 }  // namespace paddle
-- 
GitLab


From 5c8fdb59265e7e22a4bd52629e0038180d494ff5 Mon Sep 17 00:00:00 2001
From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Date: Thu, 24 Sep 2020 10:21:44 +0800
Subject: [PATCH 193/261] Fix GradientClipByGlobalNorm dtype bug (#27437)

* fix dtype of gradientclipbyglobalnorm

* fix dtype bug of GradientClipbyGlobalnorm
---
 python/paddle/fluid/clip.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 7b301ac19d1..04e4906868e 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -590,7 +590,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
         global_norm_var = layers.reduce_sum(global_norm_var)
         global_norm_var = layers.sqrt(global_norm_var)
         max_global_norm = layers.fill_constant(
-            shape=[1], dtype='float32', value=self.clip_norm)
+            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)
         clip_var = layers.elementwise_div(
             x=max_global_norm,
             y=layers.elementwise_max(
@@ -635,7 +635,9 @@ class GradientClipByGlobalNorm(GradientClipBase):
                 global_norm_var = layers.sums(sum_square_list)
                 global_norm_var = layers.sqrt(x=global_norm_var)
                 max_global_norm = layers.fill_constant(
-                    shape=[1], dtype="float32", value=self.clip_norm)
+                    shape=[1],
+                    dtype=global_norm_var.dtype,
+                    value=self.clip_norm)
                 scale_var = layers.elementwise_div(
                     x=max_global_norm,
                     y=layers.elementwise_max(
@@ -663,7 +665,7 @@ class GradientClipByGlobalNorm(GradientClipBase):
             context[self.group_name] = []
             context[self.group_name + "_clip_value"] = self.clip_norm
             context[self.group_name + "_clip"] = layers.fill_constant(
-                shape=[1], dtype="float32", value=self.clip_norm)
+                shape=[1], dtype=grad.dtype, value=self.clip_norm)
         else:
             if not self.clip_norm == context[self.group_name + "_clip_value"]:
                 raise ValueError(
-- 
GitLab


From dc713116e01898986e02f6f30f8279d343bfd957 Mon Sep 17 00:00:00 2001
From: wangchaochaohu <wangchao66@baidu.com>
Date: Wed, 23 Sep 2020 19:32:38 -0700
Subject: [PATCH 194/261] refine the error message for bath size like OP 
 (#27446)

* refine the error message for bath size like
---
 paddle/fluid/operators/batch_size_like.h | 40 ++++++++++++++++++------
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h
index d2cf3804930..f24a3c316a0 100644
--- a/paddle/fluid/operators/batch_size_like.h
+++ b/paddle/fluid/operators/batch_size_like.h
@@ -26,25 +26,47 @@ class BatchSizeLikeOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of %s should not be null.", Type());
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of %s should not be null.", Type());
+    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", Type());
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", Type());
 
     auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    PADDLE_ENFORCE_GT(shape.size(), 0);
+    PADDLE_ENFORCE_GT(shape.size(), 0,
+                      platform::errors::InvalidArgument(
+                          "Shape size must be larger than 0, but received: %s.",
+                          shape.size()));
     std::vector<int64_t> shape_int64(shape.size(), 0);
     std::transform(shape.begin(), shape.end(), shape_int64.begin(),
                    [](int a) { return static_cast<int64_t>(a); });
     auto output_dim = framework::make_ddim(shape_int64);
 
     int input_dim_idx = ctx->Attrs().Get<int>("input_dim_idx");
-    PADDLE_ENFORCE_GE(input_dim_idx, 0);
-    PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx);
+    int input_dim_size = static_cast<int>(ctx->GetInputDim("Input").size());
+    PADDLE_ENFORCE_GE(input_dim_idx, 0,
+                      platform::errors::InvalidArgument(
+                          "Input dimension index must be larger "
+                          "equal than 0, but received: %s.",
+                          input_dim_idx));
+    PADDLE_ENFORCE_GT(input_dim_size, input_dim_idx,
+                      platform::errors::InvalidArgument(
+                          "Input dimension size must be larger than "
+                          "input dimension index, but received input "
+                          "dimension size: %s, input dimension index: %s.",
+                          input_dim_size, input_dim_idx));
 
     int output_dim_idx = ctx->Attrs().Get<int>("output_dim_idx");
-    PADDLE_ENFORCE_GE(output_dim_idx, 0);
-    PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), output_dim_idx);
+    int output_dim_size = static_cast<int>(shape.size());
+    PADDLE_ENFORCE_GE(output_dim_idx, 0,
+                      platform::errors::InvalidArgument(
+                          "Output dimension index must be larger "
+                          "equal than 0, but received: %s.",
+                          output_dim_idx));
+    PADDLE_ENFORCE_GT(
+        output_dim_size, output_dim_idx,
+        platform::errors::InvalidArgument(
+            "Output dimension size must be larger than output dimension index, "
+            "but received output dimension size: %s, output dimension index: "
+            "%s.",
+            output_dim_size, output_dim_idx));
 
     output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx];
     ctx->SetOutputDim("Out", output_dim);
-- 
GitLab


From fc9d80bc9eb996310d9e3a4b1b7227c030e5b05a Mon Sep 17 00:00:00 2001
From: Aurelius84 <zhangliujie@baidu.com>
Date: Thu, 24 Sep 2020 12:02:16 +0800
Subject: [PATCH 195/261] [Dy2Stat]rename StaticLayer into StaticFunction
 (#27487)

* rename StaticLayer

* rename
---
 .../dygraph/dygraph_to_static/convert_call_func.py   | 10 +++++-----
 .../dygraph/dygraph_to_static/program_translator.py  | 10 +++++-----
 python/paddle/fluid/dygraph/jit.py                   | 12 ++++++------
 .../unittests/dygraph_to_static/test_declarative.py  |  8 ++++----
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py
index c837c8eb123..908587c0d9c 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py
@@ -29,7 +29,7 @@ import six
 
 from paddle.fluid.dygraph.dygraph_to_static.convert_operators import convert_len
 from paddle.fluid.dygraph.dygraph_to_static.logging_utils import TranslatorLogger
-from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticLayer
+from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticFunction
 from paddle.fluid.dygraph.dygraph_to_static.program_translator import convert_to_static
 from paddle.fluid.dygraph.dygraph_to_static.program_translator import unwrap_decorators
 from paddle.fluid.dygraph.layers import Layer
@@ -143,14 +143,14 @@ def convert_call(func):
             #      def foo(x):
             #          return x
             #
-            # `foo` will be converted into a wrapper class, suppose as `StaticLayer`.
-            # And `foo.__globals__['foo']` will still return this `StaticLayer` instead of
-            # `foo` function. So `isinstance(fn, StaticLayer)` is added here. 
+            # `foo` will be converted into a wrapper class, suppose as `StaticFunction`.
+            # And `foo.__globals__['foo']` will still return this `StaticFunction` instead of
+            # `foo` function. So `isinstance(fn, StaticFunction)` is added here. 
             global_functions = set()
             for fn in func.__globals__.values():
                 if inspect.isfunction(fn):
                     global_functions.add(fn)
-                elif isinstance(fn, StaticLayer):
+                elif isinstance(fn, StaticFunction):
                     _, fn = unwrap_decorators(fn)
                     global_functions.add(fn)
 
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
index 3b3b9bbe96f..ddf44d805d1 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
@@ -205,7 +205,7 @@ def unwrap_decorators(func):
     decorators = []
     cur = func
     while True:
-        if isinstance(cur, StaticLayer):
+        if isinstance(cur, StaticFunction):
             decorators.append(cur)
             # Note: if `cur` is a method, keep it as bound method of class.
             instance = cur._class_instance
@@ -218,7 +218,7 @@ def unwrap_decorators(func):
     return decorators, cur
 
 
-class StaticLayer(object):
+class StaticFunction(object):
     """
     Wrapper class to Manage program conversion of decorated function.
 
@@ -226,7 +226,7 @@ class StaticLayer(object):
 
     def __init__(self, function, input_spec=None):
         """
-        Initializes a `StaticLayer`.
+        Initializes a `StaticFunction`.
 
         Args:
             function(callable): A function or method that will be converted into static program.
@@ -268,12 +268,12 @@ class StaticLayer(object):
         
         In above case, `net(x, y)` will call `net.forward(x, y)` firstly that is a bound method
         of `Net` instance. After decorated by `@paddle.jit.to_static`, it will firstly to call `__get__`
-        to parse the class instance correctly instead of the `StaticLayer` instance.
+        to parse the class instance correctly instead of the `StaticFunction` instance.
         """
         if instance not in self._descriptor_cache:
             if instance is None:
                 return self
-            # Note(Aurelius84): To construct new instance of StaticLayer when we
+            # Note(Aurelius84): To construct new instance of StaticFunction when we
             # first encouter the bound function of layer and cache it.
             new_static_layer = self._clone()
             new_static_layer._class_instance = instance
diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py
index d0e3d23b04b..194ebafb08e 100644
--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
@@ -28,7 +28,7 @@ from paddle.fluid.data_feeder import check_type
 from paddle.fluid.dygraph.base import program_desc_tracing_guard, switch_to_static_graph
 from paddle.fluid.dygraph.dygraph_to_static import logging_utils
 from paddle.fluid.dygraph.dygraph_to_static.logging_utils import set_code_level, set_verbosity
-from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, StaticLayer, unwrap_decorators
+from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, StaticFunction, unwrap_decorators
 from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME, TranslatedLayer
 from paddle.fluid.dygraph.layers import Layer
 from paddle.fluid.executor import Executor, scope_guard
@@ -141,7 +141,7 @@ def copy_decorator_attrs(original_func, decorated_obj):
 
     Args:
         original_func(callable): the original decorated function.
-        decorated_obj(StaticLayer): the target decorated StaticLayer object.
+        decorated_obj(StaticFunction): the target decorated StaticFunction object.
     """
     decorator_name = "declarative"
 
@@ -198,7 +198,7 @@ def declarative(function=None, input_spec=None):
 
     def decorated(python_func):
         """
-        Decorates a python function into a StaticLayer object.
+        Decorates a python function into a StaticFunction object.
         """
         # Step 1. unwrap the function if it is already decorated.
         _, python_func = unwrap_decorators(python_func)
@@ -206,7 +206,7 @@ def declarative(function=None, input_spec=None):
         # Step 2. copy some attributes from original python function.
         static_layer = copy_decorator_attrs(
             original_func=python_func,
-            decorated_obj=StaticLayer(
+            decorated_obj=StaticFunction(
                 function=python_func, input_spec=input_spec))
 
         return static_layer
@@ -214,7 +214,7 @@ def declarative(function=None, input_spec=None):
     # for usage: `declarative(foo, ...)`
     if function is not None:
         if isinstance(function, Layer):
-            if isinstance(function.forward, StaticLayer):
+            if isinstance(function.forward, StaticFunction):
                 class_name = function.__class__.__name__
                 logging_utils.warn(
                     "`{}.forward` has already been decorated somewhere. It will be redecorated to replace previous one.".
@@ -868,7 +868,7 @@ def save(layer, model_path, input_spec=None, config=None):
 
     # 2. get program from Layer
     # TODO(chenweihang): add support for other method, not only forward
-    if isinstance(layer.forward, StaticLayer):
+    if isinstance(layer.forward, StaticFunction):
         concrete_program = layer.forward.concrete_program
     else:
         # transform in jit.save, if input_spec is incomplete, declarative will throw error
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
index 450ef7557bc..095eda2a5cb 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
@@ -19,7 +19,7 @@ import paddle
 import paddle.fluid as fluid
 from paddle.static import InputSpec
 from paddle.fluid.dygraph import to_variable, declarative, ProgramTranslator, Layer, jit
-from paddle.fluid.dygraph.dygraph_to_static.program_translator import ConcreteProgram, StaticLayer
+from paddle.fluid.dygraph.dygraph_to_static.program_translator import ConcreteProgram, StaticFunction
 
 from test_basic_api_transformation import dyfunc_to_variable
 
@@ -81,14 +81,14 @@ class SimpleNet(Layer):
         return z
 
 
-class TestStaticLayerInstance(unittest.TestCase):
+class TestStaticFunctionInstance(unittest.TestCase):
     def test_instance_same_class(self):
         with fluid.dygraph.guard(fluid.CPUPlace()):
             net_1 = SimpleNet()
             net_2 = SimpleNet()
 
-            self.assertTrue(isinstance(net_1.forward, StaticLayer))
-            self.assertTrue(isinstance(net_2.forward, StaticLayer))
+            self.assertTrue(isinstance(net_1.forward, StaticFunction))
+            self.assertTrue(isinstance(net_2.forward, StaticFunction))
             self.assertNotEqual(net_1.forward, net_2.forward)
 
             # convert layer into static progam of net_1
-- 
GitLab


From 29f1560d8fbb1e516dfac5c609e6e869196475a5 Mon Sep 17 00:00:00 2001
From: chalsliu <45041955+chalsliu@users.noreply.github.com>
Date: Thu, 24 Sep 2020 12:49:33 +0800
Subject: [PATCH 196/261] Disable ut quickly.

---
 paddle/scripts/paddle_build.sh     |  5 ++++
 tools/check_file_diff_approvals.sh |  2 +-
 tools/is_ut_disabled.py            | 40 ++++++++++++++++++++++++++++++
 3 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 tools/is_ut_disabled.py

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 69303013d2a..ac6531a2cc5 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -988,6 +988,11 @@ set +x
                 fi
                 read testcase <<< $(echo "$line"|grep -oEi "\w+$")
 
+                if python $PADDLE_ROOT/tools/is_ut_disabled.py $testcase; then
+                    echo $testcase" is disabled."
+                    continue
+                fi
+
                 if [[ "$is_nightly" != "" ]] && [ ${NIGHTLY_MODE:-OFF} == "OFF" ]; then
                     echo $testcase" will only run at night."
                     continue
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index 84254cc89bb..16e61d7c77a 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -286,7 +286,7 @@ fi
 # Get the list of PR authors with unresolved unit test issues
 pip install PyGithub
 # For getting PR related data
-wget https://paddle-ci.gz.bcebos.com/blk/block.txt --no-check-certificate
+wget https://sys-p0.bj.bcebos.com/blk/block.txt --no-check-certificate
 wget https://sys-p0.bj.bcebos.com/bk-ci/bk.txt --no-check-certificate
 HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true`
 if [ "${HASUTFIXED}" != "" ]; then
diff --git a/tools/is_ut_disabled.py b/tools/is_ut_disabled.py
new file mode 100644
index 00000000000..a21fe39e71e
--- /dev/null
+++ b/tools/is_ut_disabled.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Check whether ut is disabled. """
+
+import os
+import sys
+
+
+def check_ut():
+    """ Get disabled unit tests. """
+    disable_ut_file = 'disable_ut'
+    cmd = 'wget -q --no-check-certificate https://sys-p0.bj.bcebos.com/prec/{}'.format(
+        disable_ut_file)
+    os.system(cmd)
+    with open(disable_ut_file) as utfile:
+        for u in utfile:
+            if u.rstrip('\r\n') == sys.argv[1]:
+                exit(0)
+    exit(1)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 2:
+        exit(1)
+    try:
+        check_ut()
+    except Exception as e:
+        print(e)
+        exit(1)
-- 
GitLab


From 78a27a2b0d7ad7b6676dc34ae305faf3ee5b0482 Mon Sep 17 00:00:00 2001
From: LielinJiang <50691816+LielinJiang@users.noreply.github.com>
Date: Thu, 24 Sep 2020 12:54:53 +0800
Subject: [PATCH 197/261] Reproduce summary api (#27367)

* reproduce summary api
---
 python/paddle/hapi/model.py         |   5 +-
 python/paddle/hapi/model_summary.py | 219 ++++++++++++++++++++--------
 python/paddle/tests/test_model.py   |  15 +-
 3 files changed, 173 insertions(+), 66 deletions(-)

diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py
index d41852c9d7f..53928ebed1b 100644
--- a/python/paddle/hapi/model.py
+++ b/python/paddle/hapi/model.py
@@ -1813,7 +1813,7 @@ class Model(object):
             return logs, outputs
         return logs
 
-    def summary(self, input_size=None, batch_size=None, dtype=None):
+    def summary(self, input_size=None, dtype=None):
         """Prints a string summary of the network.
 
         Args:
@@ -1822,7 +1822,6 @@ class Model(object):
                     one input, input_size can be tuple or InputSpec. if model have multiple 
                     input, input_size must be a list which contain every input's shape. 
                     Default: None.
-            batch_size (int, optional): batch size of input tensor, Default: None.
             dtypes (str, optional): if dtypes is None, 'float32' will be used, Default: None.
 
         Returns:
@@ -1859,7 +1858,7 @@ class Model(object):
             _input_size = input_size
         else:
             _input_size = self._inputs
-        return summary(self.network, _input_size, batch_size, dtype)
+        return summary(self.network, _input_size, dtype)
 
     def _verify_spec(self, specs, is_input=False):
         out_specs = []
diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py
index d388ba62f2a..3ead3fc295c 100644
--- a/python/paddle/hapi/model_summary.py
+++ b/python/paddle/hapi/model_summary.py
@@ -25,7 +25,7 @@ from collections import OrderedDict
 __all__ = ['summary']
 
 
-def summary(net, input_size, batch_size=None, dtypes=None):
+def summary(net, input_size, dtypes=None):
     """Prints a string summary of the network.
 
     Args:
@@ -33,8 +33,8 @@ def summary(net, input_size, batch_size=None, dtypes=None):
         input_size (tuple|InputSpec|list[tuple|InputSpec]): size of input tensor. if model only 
                     have one input, input_size can be tuple or InputSpec. if model
                     have multiple input, input_size must be a list which contain 
-                    every input's shape.
-        batch_size (int, optional): batch size of input tensor, Default: None.
+                    every input's shape. Note that input_size only dim of
+                    batch_size can be None or -1.
         dtypes (str, optional): if dtypes is None, 'float32' will be used, Default: None.
 
     Returns:
@@ -77,14 +77,12 @@ def summary(net, input_size, batch_size=None, dtypes=None):
 
             lenet = LeNet()
 
-            params_info = paddle.summary(lenet, (1, 28, 28))
+            params_info = paddle.summary(lenet, (1, 1, 28, 28))
             print(params_info)
 
     """
     if isinstance(input_size, InputSpec):
-        _input_size = tuple(input_size.shape[1:])
-        if batch_size is None:
-            batch_size = input_size.shape[0]
+        _input_size = tuple(input_size.shape)
     elif isinstance(input_size, list):
         _input_size = []
         for item in input_size:
@@ -96,9 +94,7 @@ def summary(net, input_size, batch_size=None, dtypes=None):
                                   type(item))
 
             if isinstance(item, InputSpec):
-                _input_size.append(tuple(item.shape[1:]))
-                if batch_size is None:
-                    batch_size = item.shape[0]
+                _input_size.append(tuple(item.shape))
             else:
                 _input_size.append(item)
     elif isinstance(input_size, int):
@@ -106,28 +102,88 @@ def summary(net, input_size, batch_size=None, dtypes=None):
     else:
         _input_size = input_size
 
-    if batch_size is None:
-        batch_size = -1
-
     if not paddle.in_dynamic_mode():
         warnings.warn(
             "Your model was created in static mode, this may not get correct summary information!"
         )
 
-    result, params_info = summary_string(net, _input_size, batch_size, dtypes)
+    def _is_shape(shape):
+        for item in shape:
+            if isinstance(item, (list, tuple)):
+                return False
+        return True
+
+    def _check_shape(shape):
+        num_unknown = 0
+        new_shape = []
+        for i in range(len(shape)):
+            item = shape[i]
+            if item is None or item == -1:
+                num_unknown += 1
+                if num_unknown > 1:
+                    raise ValueError(
+                        'Option input_size only the dim of batch_size can be None or -1.'
+                    )
+                item = 1
+            elif isinstance(item, numbers.Number):
+                if item <= 0:
+                    raise ValueError(
+                        "Expected element in input size greater than zero, but got {}".
+                        format(item))
+            new_shape.append(item)
+        return tuple(new_shape)
+
+    def _check_input(input_size):
+        if isinstance(input_size, (list, tuple)) and _is_shape(input_size):
+            return _check_shape(input_size)
+        else:
+            return [_check_input(i) for i in input_size]
+
+    _input_size = _check_input(_input_size)
+    result, params_info = summary_string(net, _input_size, dtypes)
     print(result)
 
     return params_info
 
 
-def summary_string(model, input_size, batch_size=-1, dtypes=None):
-    if dtypes == None:
-        dtypes = ['float32'] * len(input_size)
+def summary_string(model, input_size, dtypes=None):
+    def _all_is_numper(items):
+        for item in items:
+            if not isinstance(item, numbers.Number):
+                return False
+        return True
+
+    def _build_dtypes(input_size, dtype):
+        if dtype is None:
+            dtype = 'float32'
+
+        if isinstance(input_size, (list, tuple)) and _all_is_numper(input_size):
+            return [dtype]
+        else:
+            return [_build_dtypes(i, dtype) for i in input_size]
+
+    if not isinstance(dtypes, (list, tuple)):
+        dtypes = _build_dtypes(input_size, dtypes)
+
+    batch_size = 1
 
     summary_str = ''
 
     depth = len(list(model.sublayers()))
 
+    def _get_shape_from_tensor(x):
+        if isinstance(x, (paddle.fluid.Variable, paddle.fluid.core.VarBase)):
+            return list(x.shape)
+        elif isinstance(x, (list, tuple)):
+            return [_get_shape_from_tensor(xx) for xx in x]
+
+    def _get_output_shape(output):
+        if isinstance(output, (list, tuple)):
+            output_shape = [_get_output_shape(o) for o in output]
+        else:
+            output_shape = list(output.shape)
+        return output_shape
+
     def register_hook(layer):
         def hook(layer, input, output):
             class_name = str(layer.__class__).split(".")[-1].split("'")[0]
@@ -139,14 +195,18 @@ def summary_string(model, input_size, batch_size=-1, dtypes=None):
 
             m_key = "%s-%i" % (class_name, layer_idx + 1)
             summary[m_key] = OrderedDict()
-            summary[m_key]["input_shape"] = list(input[0].shape)
-            summary[m_key]["input_shape"][0] = batch_size
-            if isinstance(output, (list, tuple)):
-                summary[m_key]["output_shape"] = [[-1] + list(o.shape)[1:]
-                                                  for o in output]
-            else:
-                summary[m_key]["output_shape"] = list(output.shape)
-                summary[m_key]["output_shape"][0] = batch_size
+
+            try:
+                summary[m_key]["input_shape"] = _get_shape_from_tensor(input)
+            except:
+                warnings.warn('Get layer {} input shape failed!')
+                summary[m_key]["input_shape"] = []
+
+            try:
+                summary[m_key]["output_shape"] = _get_output_shape(output)
+            except:
+                warnings.warn('Get layer {} output shape failed!')
+                summary[m_key]["output_shape"]
 
             params = 0
 
@@ -175,29 +235,22 @@ def summary_string(model, input_size, batch_size=-1, dtypes=None):
 
             hooks.append(layer.register_forward_post_hook(hook))
 
-    def _check_input_size(input_sizes):
-        for input_size in input_sizes:
-            for item in input_size:
-                if not isinstance(item, numbers.Number):
-                    raise TypeError(
-                        "Expected item in input size be a number, but got {}".
-                        format(type(item)))
-
-                if item <= 0:
-                    raise ValueError(
-                        "Expected item in input size greater than zero, but got {}".
-                        format(item))
-
     if isinstance(input_size, tuple):
         input_size = [input_size]
 
-    _check_input_size(input_size)
+    def build_input(input_size, dtypes):
+        if isinstance(input_size, (list, tuple)) and _all_is_numper(input_size):
+            if isinstance(dtypes, (list, tuple)):
+                dtype = dtypes[0]
+            else:
+                dtype = dtypes
+            return paddle.rand(list(input_size), dtype)
+        else:
+            return [
+                build_input(i, dtype) for i, dtype in zip(input_size, dtypes)
+            ]
 
-    x = [
-        paddle.rand(
-            [2] + list(in_size), dtype=dtype)
-        for in_size, dtype in zip(input_size, dtypes)
-    ]
+    x = build_input(input_size, dtypes)
 
     # create properties
     summary = OrderedDict()
@@ -213,22 +266,65 @@ def summary_string(model, input_size, batch_size=-1, dtypes=None):
     for h in hooks:
         h.remove()
 
-    table_width = 80
-    summary_str += "-" * table_width + "\n"
-    line_new = "{:>15} {:>20} {:>20} {:>15}".format(
-        "Layer (type)", "Input Shape", "Output Shape", "Param #")
+    def _get_str_length(summary):
+        head_length = {
+            'layer_width': 15,
+            'input_shape_width': 20,
+            'output_shape_width': 20,
+            'params_width': 15,
+            'table_width': 75
+        }
+
+        for layer in summary:
+            if head_length['output_shape_width'] < len(
+                    str(summary[layer]["output_shape"])):
+                head_length['output_shape_width'] = len(
+                    str(summary[layer]["output_shape"]))
+            if head_length['input_shape_width'] < len(
+                    str(summary[layer]["input_shape"])):
+                head_length['input_shape_width'] = len(
+                    str(summary[layer]["input_shape"]))
+            if head_length['layer_width'] < len(str(layer)):
+                head_length['layer_width'] = len(str(layer))
+            if head_length['params_width'] < len(
+                    str(summary[layer]["nb_params"])):
+                head_length['params_width'] = len(
+                    str(summary[layer]["nb_params"]))
+
+        _temp_width = 0
+        for k, v in head_length.items():
+            if k != 'table_width':
+                _temp_width += v
+
+        if head_length['table_width'] < _temp_width + 5:
+            head_length['table_width'] = _temp_width + 5
+
+        return head_length
+
+    table_width = _get_str_length(summary)
+
+    summary_str += "-" * table_width['table_width'] + "\n"
+    line_new = "{:^{}} {:^{}} {:^{}} {:^{}}".format(
+        "Layer (type)", table_width['layer_width'], "Input Shape",
+        table_width['input_shape_width'], "Output Shape",
+        table_width['output_shape_width'], "Param #",
+        table_width['params_width'])
     summary_str += line_new + "\n"
-    summary_str += "=" * table_width + "\n"
+    summary_str += "=" * table_width['table_width'] + "\n"
     total_params = 0
     total_output = 0
     trainable_params = 0
+    max_length = 0
     for layer in summary:
         # input_shape, output_shape, trainable, nb_params
-        line_new = "{:>15} {:>20} {:>20} {:>15}".format(
-            layer,
+        line_new = "{:^{}} {:^{}} {:^{}} {:^{}}".format(
+            layer, table_width['layer_width'],
             str(summary[layer]["input_shape"]),
+            table_width['input_shape_width'],
             str(summary[layer]["output_shape"]),
-            "{0:,}".format(summary[layer]["nb_params"]), )
+            table_width['output_shape_width'],
+            "{0:,}".format(summary[layer]["nb_params"]),
+            table_width['params_width'])
         total_params += summary[layer]["nb_params"]
 
         try:
@@ -242,25 +338,32 @@ def summary_string(model, input_size, batch_size=-1, dtypes=None):
                 trainable_params += summary[layer]["nb_params"]
         summary_str += line_new + "\n"
 
-    # assume 4 bytes/number (float on cuda).
-    total_input_size = abs(
-        np.prod(sum(input_size, ())) * batch_size * 4. / (1024**2.))
+    def _get_input_size(input_size, size):
+        if isinstance(input_size, (list, tuple)) and _all_is_numper(input_size):
+            size = abs(np.prod(input_size) * 4. / (1024**2.))
+        else:
+            size = sum([_get_input_size(i, size) for i in input_size])
+        return size
+
+    total_input_size = _get_input_size(input_size, 0)
+
     total_output_size = abs(2. * total_output * 4. /
                             (1024**2.))  # x2 for gradients
     total_params_size = abs(total_params * 4. / (1024**2.))
     total_size = total_params_size + total_output_size + total_input_size
 
-    summary_str += "=" * table_width + "\n"
+    summary_str += "=" * table_width['table_width'] + "\n"
     summary_str += "Total params: {0:,}".format(total_params) + "\n"
     summary_str += "Trainable params: {0:,}".format(trainable_params) + "\n"
     summary_str += "Non-trainable params: {0:,}".format(total_params -
                                                         trainable_params) + "\n"
-    summary_str += "-" * table_width + "\n"
+    summary_str += "-" * table_width['table_width'] + "\n"
     summary_str += "Input size (MB): %0.2f" % total_input_size + "\n"
     summary_str += "Forward/backward pass size (MB): %0.2f" % total_output_size + "\n"
     summary_str += "Params size (MB): %0.2f" % total_params_size + "\n"
     summary_str += "Estimated Total Size (MB): %0.2f" % total_size + "\n"
-    summary_str += "-" * table_width + "\n"
+    summary_str += "-" * table_width['table_width'] + "\n"
+
     # return summary
     return summary_str, {
         'total_params': total_params,
diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py
index 62cc39c1f7b..c89cbbbfbda 100644
--- a/python/paddle/tests/test_model.py
+++ b/python/paddle/tests/test_model.py
@@ -494,17 +494,22 @@ class TestModelFunction(unittest.TestCase):
 
             model.summary(input_size=(20))
             model.summary(input_size=[(20)])
-            model.summary(input_size=(20), batch_size=2)
+            model.summary(input_size=(20), dtype='float32')
 
     def test_summary_nlp(self):
         paddle.enable_static()
-        nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3)
-        paddle.summary(nlp_net, (1, 2))
+        nlp_net = paddle.nn.GRU(input_size=2,
+                                hidden_size=3,
+                                num_layers=3,
+                                direction="bidirectional")
+        paddle.summary(nlp_net, (1, 1, 2))
+        rnn = paddle.nn.LSTM(16, 32, 2)
+        paddle.summary(rnn, [(-1, 23, 16), ((2, None, 32), (2, -1, 32))])
 
     def test_summary_error(self):
         with self.assertRaises(TypeError):
             nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3)
-            paddle.summary(nlp_net, (1, '2'))
+            paddle.summary(nlp_net, (1, 1, '2'))
 
         with self.assertRaises(ValueError):
             nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3)
@@ -512,7 +517,7 @@ class TestModelFunction(unittest.TestCase):
 
         paddle.disable_static()
         nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3)
-        paddle.summary(nlp_net, (1, 2))
+        paddle.summary(nlp_net, (1, 1, 2))
 
     def test_export_deploy_model(self):
         for dynamic in [True, False]:
-- 
GitLab


From f2c97b6da519138826a3be730f1468c1a5c69389 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 24 Sep 2020 13:14:27 +0800
Subject: [PATCH 198/261] replace dataset with fake data (#27519)

---
 .../contrib/tests/test_weight_decay_extend.py | 32 +++++++++++++++----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py
index 906d83fff4f..6000a44ceb6 100644
--- a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py
+++ b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 
+import six
 import unittest
 from functools import partial
 import numpy as np
@@ -24,6 +25,24 @@ import contextlib
 paddle.enable_static()
 
 
+def fake_imdb_reader(word_dict_size,
+                     sample_num,
+                     lower_seq_len=100,
+                     upper_seq_len=200,
+                     class_dim=2):
+    def __reader__():
+        for _ in six.moves.range(sample_num):
+            length = np.random.random_integers(
+                low=lower_seq_len, high=upper_seq_len, size=[1])[0]
+            ids = np.random.random_integers(
+                low=0, high=word_dict_size - 1, size=[length]).astype('int64')
+            label = np.random.random_integers(
+                low=0, high=class_dim - 1, size=[1]).astype('int64')[0]
+            yield ids, label
+
+    return __reader__
+
+
 def get_places():
     places = [fluid.CPUPlace()]
     if fluid.core.is_compiled_with_cuda():
@@ -68,10 +87,11 @@ def bow_net(data,
 
 class TestWeightDecay(unittest.TestCase):
     def setUp(self):
-        self.word_dict = paddle.dataset.imdb.word_dict()
-        reader = paddle.batch(
-            paddle.dataset.imdb.train(self.word_dict), batch_size=2)()
-        self.train_data = [next(reader) for _ in range(5)]
+        self.word_dict_len = 5147
+        batch_size = 2
+        reader = fake_imdb_reader(self.word_dict_len, batch_size * 100)
+        reader = paddle.batch(reader, batch_size=batch_size)()
+        self.train_data = [next(reader) for _ in range(3)]
         self.learning_rate = .5
 
     def run_program(self, place, feed_list):
@@ -103,7 +123,7 @@ class TestWeightDecay(unittest.TestCase):
             data = fluid.layers.data(
                 name="words", shape=[1], dtype="int64", lod_level=1)
             label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-            avg_cost = model(data, label, len(self.word_dict))
+            avg_cost = model(data, label, self.word_dict_len)
             AdamW = fluid.contrib.extend_with_decoupled_weight_decay(
                 fluid.optimizer.Adam)
 
@@ -127,7 +147,7 @@ class TestWeightDecay(unittest.TestCase):
                 name="words", shape=[1], dtype="int64", lod_level=1)
             label = fluid.layers.data(name="label", shape=[1], dtype="int64")
 
-            avg_cost = model(data, label, len(self.word_dict))
+            avg_cost = model(data, label, self.word_dict_len)
 
             param_list = [(var, var * self.learning_rate)
                           for var in main_prog.block(0).all_parameters()]
-- 
GitLab


From df43905f1295f76f08d98aab5b6cc6875b4597f0 Mon Sep 17 00:00:00 2001
From: wanghuancoder <wanghuan29@baidu.com>
Date: Thu, 24 Sep 2020 13:27:13 +0800
Subject: [PATCH 199/261] use iwyu clean include (#27267)

* use iwyu clean include, test=develop, test=win

* compilation error, test=develop

* fix compilation error2, test=develop

* fix compilation error3, test=develop

* fix compilation error4, test=develop

* fix compilation error5, test=develop

* fix compilation error6, test=develop

* fix compilation error7, test=develop

* fix compilation error8, test=develop

* fix compilation error8, test=develop

* fix compilation error10, test=develop

* fix compilation error11, test=develop
---
 paddle/fluid/framework/attribute.cc           |  2 --
 paddle/fluid/framework/block_desc.h           |  2 ++
 paddle/fluid/framework/c/c_api.cc             | 10 -------
 paddle/fluid/framework/c/c_api.h              |  9 ++++++
 paddle/fluid/framework/channel.h              |  2 +-
 .../fluid/framework/copy_same_tensor_test.cc  |  2 +-
 .../fluid/framework/data_device_transform.h   |  2 ++
 paddle/fluid/framework/data_feed.h            | 10 +++++++
 paddle/fluid/framework/data_feed_factory.cc   |  4 +--
 paddle/fluid/framework/data_feed_factory.h    |  3 ++
 .../fluid/framework/data_layout_transform.cc  |  2 +-
 .../fluid/framework/data_layout_transform.h   |  8 ++++++
 .../framework/data_layout_transform_test.cc   |  1 -
 paddle/fluid/framework/data_transform.cc      |  7 ++++-
 paddle/fluid/framework/data_transform.h       |  4 +++
 paddle/fluid/framework/data_type.cc           |  1 -
 paddle/fluid/framework/data_type.h            | 11 ++++++--
 paddle/fluid/framework/data_type_test.cc      |  8 ++++++
 paddle/fluid/framework/data_type_transform.h  |  4 +++
 paddle/fluid/framework/dataset_factory.cc     |  3 --
 paddle/fluid/framework/dataset_factory.h      |  1 +
 paddle/fluid/framework/ddim.h                 |  1 +
 paddle/fluid/framework/ddim_test.cc           |  1 -
 .../framework/details/all_reduce_op_handle.h  | 11 ++++++++
 .../framework/details/broadcast_op_handle.h   | 14 ++++++++++
 .../details/broadcast_op_handle_test.h        |  4 ++-
 .../fluid/framework/details/build_strategy.h  | 12 ++++++++
 .../details/computation_op_handle.cc          |  2 ++
 .../framework/details/computation_op_handle.h | 12 ++++++++
 .../details/eager_deletion_op_handle.h        | 11 ++++++++
 .../details/exception_holder_test.cc          |  3 +-
 .../details/fetch_async_op_handle.cc          |  7 ++++-
 .../framework/details/fetch_async_op_handle.h | 12 ++++++++
 .../details/fetch_barrier_op_handle.cc        |  2 ++
 .../details/fetch_barrier_op_handle.h         | 11 ++++++++
 .../fluid/framework/details/fetch_op_handle.h | 11 ++++++++
 .../details/fused_all_reduce_op_handle.h      | 12 ++++++++
 .../details/fused_broadcast_op_handle.h       | 11 ++++++++
 .../details/fused_broadcast_op_handle_test.cc |  9 ++++++
 .../framework/details/gather_op_handle.h      |  8 ++++++
 .../details/gather_op_handle_test.cc          |  4 +--
 .../framework/details/multi_devices_helper.h  | 11 ++++++--
 .../framework/details/nan_inf_utils_detail.h  |  6 ++++
 .../fluid/framework/details/op_handle_base.h  | 12 ++++++++
 .../framework/details/reduce_op_handle.h      | 15 ++++++++++
 .../fluid/framework/details/rpc_op_handle.h   | 10 +++++++
 .../details/scale_loss_grad_op_handle.cc      |  8 ++++++
 .../details/scale_loss_grad_op_handle.h       | 12 ++++++++
 .../details/scope_buffered_monitor.h          |  2 ++
 .../details/share_tensor_buffer_functor.cc    | 11 ++++++++
 .../details/share_tensor_buffer_functor.h     |  9 ++++++
 .../details/share_tensor_buffer_op_handle.cc  | 10 +++++++
 .../details/share_tensor_buffer_op_handle.h   | 12 ++++++++
 .../details/sparse_all_reduce_op_handle.h     | 11 ++++++++
 paddle/fluid/framework/details/var_handle.h   |  8 ++++++
 .../framework/details/variable_visitor.cc     |  8 ++++++
 .../framework/details/variable_visitor.h      |  7 +++++
 paddle/fluid/framework/device_worker.cc       |  4 ++-
 paddle/fluid/framework/device_worker.h        | 12 ++++++++
 .../fluid/framework/device_worker_factory.cc  |  2 ++
 .../fluid/framework/device_worker_factory.h   |  3 ++
 paddle/fluid/framework/device_worker_test.cc  |  3 +-
 paddle/fluid/framework/dlpack_tensor.cc       | 11 ++++++--
 paddle/fluid/framework/dlpack_tensor.h        |  3 ++
 paddle/fluid/framework/dlpack_tensor_test.cc  |  6 ++++
 paddle/fluid/framework/downpour_worker.cc     | 10 +++++--
 paddle/fluid/framework/downpour_worker_opt.cc | 10 +++----
 paddle/fluid/framework/eigen.h                |  2 ++
 paddle/fluid/framework/executor.h             |  6 ++++
 paddle/fluid/framework/executor_gc_helper.cc  | 13 +++++----
 paddle/fluid/framework/executor_gc_helper.h   |  5 ++++
 paddle/fluid/framework/feed_fetch_method.cc   |  9 +++---
 paddle/fluid/framework/feed_fetch_method.h    |  4 +++
 paddle/fluid/framework/fleet/fleet_wrapper.cc |  6 ----
 paddle/fluid/framework/fleet/fleet_wrapper.h  |  6 ++++
 paddle/fluid/framework/fleet/gloo_wrapper.cc  |  3 --
 paddle/fluid/framework/fleet/heter_wrapper.cc | 16 +++--------
 paddle/fluid/framework/fleet/nccl_wrapper.cc  |  3 --
 paddle/fluid/framework/fleet/nccl_wrapper.h   |  7 +++++
 paddle/fluid/framework/garbage_collector.h    |  7 +++++
 paddle/fluid/framework/generator.h            |  1 -
 paddle/fluid/framework/hetercpu_worker.cc     |  7 -----
 paddle/fluid/framework/heterxpu_trainer.cc    | 12 +-------
 paddle/fluid/framework/inlined_vector.h       |  1 +
 paddle/fluid/framework/inlined_vector_test.cc |  4 +--
 .../fluid/framework/io/crypto/cipher_utils.cc |  2 --
 .../fluid/framework/io/crypto/cipher_utils.h  |  1 +
 paddle/fluid/framework/io/fs.h                |  1 +
 paddle/fluid/framework/io/shell.h             |  1 +
 .../framework/ir/attention_lstm_fuse_pass.h   |  2 ++
 .../ir/conv_affine_channel_fuse_pass.cc       | 13 +++++++--
 .../ir/conv_affine_channel_fuse_pass.h        |  3 ++
 .../fluid/framework/ir/conv_bn_fuse_pass.cc   | 11 ++++++--
 paddle/fluid/framework/ir/conv_bn_fuse_pass.h |  3 ++
 .../ir/conv_elementwise_add2_act_fuse_pass.h  |  2 ++
 .../ir/conv_elementwise_add_act_fuse_pass.cc  |  1 +
 .../ir/conv_elementwise_add_act_fuse_pass.h   |  2 ++
 .../ir/conv_elementwise_add_fuse_pass.cc      |  1 +
 .../ir/conv_elementwise_add_fuse_pass.h       |  2 ++
 .../fluid/framework/ir/cudnn_placement_pass.h |  1 +
 .../ir/delete_quant_dequant_op_pass.cc        |  4 +--
 .../ir/delete_quant_dequant_op_pass.h         |  3 ++
 .../embedding_eltwise_layernorm_fuse_pass.cc  |  2 +-
 .../embedding_eltwise_layernorm_fuse_pass.h   |  9 ++++++
 .../ir/embedding_fc_lstm_fuse_pass.h          |  2 ++
 .../ir/fc_elementwise_layernorm_fuse_pass.cc  |  2 ++
 .../ir/fc_elementwise_layernorm_fuse_pass.h   |  2 ++
 paddle/fluid/framework/ir/fc_fuse_pass.cc     |  4 +--
 paddle/fluid/framework/ir/fc_fuse_pass.h      |  2 ++
 paddle/fluid/framework/ir/fc_gru_fuse_pass.cc |  3 ++
 paddle/fluid/framework/ir/fc_gru_fuse_pass.h  |  3 ++
 .../fluid/framework/ir/fc_lstm_fuse_pass.cc   |  3 ++
 paddle/fluid/framework/ir/fc_lstm_fuse_pass.h |  2 ++
 paddle/fluid/framework/ir/fuse_bn_act_pass.cc | 11 ++++++--
 paddle/fluid/framework/ir/fuse_bn_act_pass.h  |  4 +++
 .../framework/ir/fuse_elewise_add_act_pass.h  |  4 +++
 .../fuse_adam_op_pass.cc                      | 13 +++++----
 .../fuse_momentum_op_pass.cc                  | 11 +++++---
 .../fuse_optimizer_op_pass.h                  | 11 ++++++++
 .../fuse_sgd_op_pass.cc                       | 12 +++++---
 paddle/fluid/framework/ir/fuse_pass_base.cc   |  8 ++++++
 paddle/fluid/framework/ir/fuse_pass_base.h    | 10 +++++++
 .../ir/fuse_relu_depthwise_conv_pass.h        |  3 ++
 .../ir/fusion_group/code_generator.h          |  3 ++
 .../ir/fusion_group/code_generator_tester.cc  | 12 +++++---
 .../fusion_group/elementwise_group_detector.h |  9 ++++++
 .../ir/fusion_group/fusion_group_pass.h       |  6 ++++
 .../framework/ir/fusion_group/operation.h     |  1 +
 paddle/fluid/framework/ir/graph.h             |  7 +++++
 paddle/fluid/framework/ir/graph_helper.h      |  2 ++
 .../fluid/framework/ir/graph_helper_test.cc   |  1 -
 .../framework/ir/graph_pattern_detector.h     | 10 +++++++
 .../ir/graph_pattern_detector_tester.cc       |  6 ++--
 .../framework/ir/graph_to_program_pass.h      |  2 ++
 .../ir/graph_to_program_pass_test.cc          |  3 +-
 paddle/fluid/framework/ir/graph_traits.cc     |  5 ++--
 paddle/fluid/framework/ir/graph_traits.h      |  3 ++
 paddle/fluid/framework/ir/graph_viz_pass.h    |  2 ++
 .../ir/identity_scale_op_clean_pass.cc        |  2 ++
 .../ir/identity_scale_op_clean_pass.h         |  2 ++
 paddle/fluid/framework/ir/is_test_pass.cc     |  2 ++
 paddle/fluid/framework/ir/is_test_pass.h      |  2 ++
 .../framework/ir/lock_free_optimize_pass.h    |  1 +
 ...uffer_shared_cross_op_memory_reuse_pass.cc | 15 +++++-----
 .../buffer_shared_inplace_op_pass.cc          |  5 +++-
 .../memory_optimize_pass/memory_reuse_pass.cc | 13 ++++++---
 .../memory_optimize_pass/memory_reuse_pass.h  | 13 +++++++++
 .../ir/memory_optimize_pass/op_graph_view.cc  | 10 +++++--
 .../ir/memory_optimize_pass/op_graph_view.h   |  9 ++++++
 .../reference_count_pass_helper.cc            |  2 --
 .../reference_count_pass_helper.h             |  3 ++
 .../conv_activation_mkldnn_fuse_pass.cc       |  9 +++++-
 .../mkldnn/conv_activation_mkldnn_fuse_pass.h |  3 ++
 .../ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc   |  1 -
 .../ir/mkldnn/conv_bias_mkldnn_fuse_pass.h    |  4 +++
 .../conv_concat_relu_mkldnn_fuse_pass.cc      |  2 ++
 .../conv_concat_relu_mkldnn_fuse_pass.h       |  3 ++
 .../conv_elementwise_add_mkldnn_fuse_pass.h   |  7 +++++
 .../framework/ir/mkldnn/cpu_quantize_pass.cc  |  5 ++--
 .../framework/ir/mkldnn/cpu_quantize_pass.h   |  4 +++
 .../ir/mkldnn/cpu_quantize_placement_pass.cc  |  3 +-
 .../ir/mkldnn/cpu_quantize_placement_pass.h   |  3 ++
 .../ir/mkldnn/cpu_quantize_squash_pass.cc     |  5 +++-
 .../ir/mkldnn/cpu_quantize_squash_pass.h      |  3 ++
 .../ir/mkldnn/depthwise_conv_mkldnn_pass.cc   |  3 ++
 .../ir/mkldnn/depthwise_conv_mkldnn_pass.h    |  2 ++
 .../framework/ir/mkldnn/fc_mkldnn_pass.cc     | 15 ++++++----
 .../framework/ir/mkldnn/fc_mkldnn_pass.h      |  3 ++
 .../matmul_transpose_reshape_fuse_pass.h      |  5 +++-
 .../ir/mkldnn/mkldnn_placement_pass.h         |  1 +
 ...eshape_transpose_matmul_mkldnn_fuse_pass.h |  3 ++
 .../ir/mkldnn/scale_matmul_fuse_pass.cc       |  4 +++
 .../ir/mkldnn/scale_matmul_fuse_pass.h        |  2 ++
 .../framework/ir/multi_batch_merge_pass.h     |  2 ++
 .../multi_devices_graph_pass.h                | 15 ++++++++++
 .../sequential_execution_pass.cc              | 11 ++++----
 .../set_reader_device_info_utils.h            |  8 ++++++
 .../ir/multihead_matmul_fuse_pass.cc          |  4 +--
 .../framework/ir/multihead_matmul_fuse_pass.h |  9 ++++++
 paddle/fluid/framework/ir/node.cc             |  1 -
 paddle/fluid/framework/ir/node.h              |  8 ++++++
 paddle/fluid/framework/ir/pass.cc             | 12 +++++---
 paddle/fluid/framework/ir/pass.h              |  2 ++
 paddle/fluid/framework/ir/pass_builder.cc     |  2 ++
 paddle/fluid/framework/ir/pass_builder.h      |  3 ++
 paddle/fluid/framework/ir/pass_test.cc        |  8 ++++--
 .../fluid/framework/ir/placement_pass_base.h  |  3 ++
 .../ir/repeated_fc_relu_fuse_pass.cc          |  3 +-
 .../framework/ir/repeated_fc_relu_fuse_pass.h |  3 ++
 .../framework/ir/runtime_context_cache_pass.h |  3 ++
 .../framework/ir/seq_concat_fc_fuse_pass.cc   |  3 --
 .../framework/ir/seq_concat_fc_fuse_pass.h    |  2 ++
 .../ir/seqconv_eltadd_relu_fuse_pass.cc       |  3 ++
 .../ir/seqconv_eltadd_relu_fuse_pass.h        |  3 ++
 .../framework/ir/seqpool_concat_fuse_pass.cc  |  9 +++++-
 .../framework/ir/seqpool_concat_fuse_pass.h   |  3 ++
 .../ir/seqpool_cvm_concat_fuse_pass.cc        |  5 ++--
 .../ir/seqpool_cvm_concat_fuse_pass.h         |  3 ++
 .../ir/shuffle_channel_detect_pass.cc         |  3 --
 .../ir/shuffle_channel_detect_pass.h          |  1 +
 .../ir/simplify_with_basic_ops_pass.cc        |  3 +-
 .../ir/simplify_with_basic_ops_pass.h         |  4 +++
 .../framework/ir/skip_layernorm_fuse_pass.cc  |  3 +-
 .../framework/ir/skip_layernorm_fuse_pass.h   |  2 ++
 .../framework/ir/squared_mat_sub_fuse_pass.h  |  3 ++
 .../fluid/framework/ir/subgraph_detector.cc   |  7 ++---
 paddle/fluid/framework/ir/subgraph_detector.h |  4 +++
 .../framework/ir/sync_batch_norm_pass.cc      |  5 ++--
 .../ir/transpose_flatten_concat_fuse_pass.cc  |  1 -
 paddle/fluid/framework/lod_rank_table.h       |  1 +
 paddle/fluid/framework/lod_tensor.cc          | 15 ++++------
 paddle/fluid/framework/lod_tensor.h           | 10 +++++++
 paddle/fluid/framework/lod_tensor_test.cc     |  3 --
 paddle/fluid/framework/mixed_vector.h         |  4 +--
 paddle/fluid/framework/mixed_vector_test.cc   |  4 +--
 paddle/fluid/framework/naive_executor.h       |  6 ++++
 .../framework/no_need_buffer_vars_inference.h |  1 +
 paddle/fluid/framework/op_call_stack.h        |  7 +++++
 paddle/fluid/framework/op_compatible_info.h   |  5 ++++
 .../framework/op_compatible_info_test.cc      |  7 +++--
 paddle/fluid/framework/op_desc.h              |  2 ++
 paddle/fluid/framework/op_info.cc             |  2 --
 paddle/fluid/framework/op_info.h              |  5 ++++
 paddle/fluid/framework/op_kernel_type.h       |  1 +
 paddle/fluid/framework/op_kernel_type_test.cc |  2 +-
 paddle/fluid/framework/op_proto_maker_test.cc |  6 ++++
 paddle/fluid/framework/op_registry.cc         |  2 --
 paddle/fluid/framework/op_registry.h          |  6 ++++
 .../framework/op_version_registry_test.cc     |  1 -
 paddle/fluid/framework/operator.h             | 11 +++++++-
 paddle/fluid/framework/program_desc.h         |  1 +
 paddle/fluid/framework/program_desc_test.cc   |  4 ++-
 paddle/fluid/framework/prune.h                |  1 +
 paddle/fluid/framework/prune_test.cc          |  4 ---
 paddle/fluid/framework/pull_dense_worker.cc   |  6 +++-
 paddle/fluid/framework/rw_lock_test.cc        |  1 -
 paddle/fluid/framework/save_load_util.h       |  2 ++
 paddle/fluid/framework/save_load_util_test.cc |  3 --
 paddle/fluid/framework/scope.h                |  6 ++++
 paddle/fluid/framework/scope_pool.cc          |  4 +--
 paddle/fluid/framework/scope_pool.h           |  3 ++
 paddle/fluid/framework/scope_test.cc          |  8 +++++-
 paddle/fluid/framework/selected_rows.cc       |  6 ++++
 paddle/fluid/framework/selected_rows.h        |  9 ++++++
 paddle/fluid/framework/shape_inference.cc     |  5 ----
 paddle/fluid/framework/tensor.cc              |  9 +++++-
 paddle/fluid/framework/tensor.h               |  9 ++++++
 paddle/fluid/framework/tensor_test.cc         |  8 +++++-
 paddle/fluid/framework/tensor_util.cc         |  2 +-
 paddle/fluid/framework/tensor_util.h          |  3 ++
 paddle/fluid/framework/tensor_util_test.cc    |  2 +-
 paddle/fluid/framework/threadpool.cc          |  2 --
 paddle/fluid/framework/threadpool.h           |  1 +
 paddle/fluid/framework/trainer.h              | 12 +++++++-
 paddle/fluid/framework/trainer_factory.cc     |  4 +--
 paddle/fluid/framework/trainer_factory.h      |  3 ++
 paddle/fluid/framework/transfer_scope_cache.h |  4 +++
 paddle/fluid/framework/tuple.h                |  1 +
 paddle/fluid/framework/tuple_test.cc          |  5 +---
 paddle/fluid/framework/unroll_array_ops.h     |  1 +
 .../fluid/framework/unroll_array_ops_test.cc  |  3 +-
 paddle/fluid/framework/unused_var_check.cc    |  4 +--
 paddle/fluid/framework/unused_var_check.h     |  1 -
 paddle/fluid/framework/var_desc.cc            |  2 --
 paddle/fluid/framework/var_desc.h             |  1 +
 paddle/fluid/framework/var_type_inference.h   |  3 +-
 .../framework/var_type_inference_test.cc      |  4 +++
 paddle/fluid/framework/var_type_traits.h      |  7 +++--
 paddle/fluid/framework/variable_helper.h      |  2 ++
 paddle/fluid/framework/variable_test.cc       |  6 +---
 paddle/fluid/framework/version.cc             |  2 +-
 paddle/fluid/imperative/all_reduce.h          |  8 ++++++
 paddle/fluid/imperative/amp_auto_cast.cc      |  7 ++---
 paddle/fluid/imperative/amp_auto_cast.h       |  2 ++
 paddle/fluid/imperative/data_loader.cc        |  4 ---
 paddle/fluid/imperative/jit/op_desc_meta.cc   |  1 -
 paddle/fluid/imperative/jit/op_desc_meta.h    |  1 +
 .../imperative/jit/program_desc_tracer.cc     |  7 ++++-
 .../imperative/jit/program_desc_tracer.h      |  7 +++++
 paddle/fluid/imperative/layer.h               |  9 ++++++
 paddle/fluid/imperative/prepared_operator.h   | 11 ++++++++
 paddle/fluid/imperative/profiler.cc           |  3 +-
 .../imperative/tests/nccl_context_test.cc     |  2 +-
 .../analysis/ir_passes/subgraph_util.cc       |  9 +++++-
 .../analysis/ir_passes/subgraph_util.h        |  6 ++++
 .../ir_passes/tensorrt_subgraph_pass.h        | 10 +++++++
 .../passes/adjust_cudnn_workspace_size_pass.h |  2 ++
 .../passes/inference_op_replace_pass.h        |  2 ++
 .../analysis/passes/ir_graph_clean_pass.h     |  3 ++
 .../passes/ir_graph_to_program_pass.h         |  3 ++
 .../analysis/passes/memory_optimize_pass.cc   | 21 +++++++-------
 .../analysis/passes/memory_optimize_pass.h    |  9 ++++++
 paddle/fluid/inference/api/analysis_config.cc |  4 +--
 paddle/fluid/inference/api/api_impl.h         |  8 ++++++
 .../api/details/reset_tensor_array.cc         |  6 ++++
 .../api/details/reset_tensor_array.h          |  8 ++++++
 paddle/fluid/inference/api/helper.h           |  1 +
 .../inference/api/mkldnn_quantizer_config.cc  |  2 ++
 .../fluid/inference/api/paddle_pass_builder.h |  1 +
 .../tensorrt/convert/activation_op.cc         | 21 +++++++++++++-
 .../tensorrt/convert/batch_norm_op.cc         | 13 ++++++++-
 .../inference/tensorrt/convert/concat_op.cc   |  9 ++++++
 .../inference/tensorrt/convert/conv2d_op.cc   |  9 ++++++
 .../inference/tensorrt/convert/dropout_op.cc  |  9 ++++++
 .../tensorrt/convert/emb_eltwise_layernorm.cc |  9 ++++++
 .../fluid/inference/tensorrt/convert/fc_op.cc |  9 ++++++
 .../inference/tensorrt/convert/gelu_op.cc     | 12 ++++++++
 .../tensorrt/convert/hard_sigmoid_op.cc       |  9 ++++++
 .../tensorrt/convert/hard_swish_op.cc         | 12 ++++++++
 .../tensorrt/convert/instance_norm_op.cc      | 12 ++++++++
 .../tensorrt/convert/leaky_relu_op.cc         | 12 ++++++++
 .../inference/tensorrt/convert/mul_op.cc      |  9 ++++++
 .../inference/tensorrt/convert/pad_op.cc      |  9 ++++++
 .../inference/tensorrt/convert/pool2d_op.cc   |  9 ++++++
 .../inference/tensorrt/convert/scale_op.cc    |  9 ++++++
 .../tensorrt/convert/shuffle_channel_op.cc    |  9 ++++++
 .../inference/tensorrt/convert/softmax_op.cc  |  9 ++++++
 .../inference/tensorrt/convert/swish_op.cc    | 12 ++++++++
 paddle/fluid/inference/tensorrt/engine.cc     |  7 +++--
 paddle/fluid/inference/tensorrt/engine.h      | 12 ++++++++
 paddle/fluid/inference/tensorrt/op_teller.cc  |  6 ++++
 paddle/fluid/inference/tensorrt/op_teller.h   |  7 +++++
 .../tensorrt/plugin/trt_plugin_factory.h      |  1 +
 .../fluid/inference/tensorrt/test_engine.cc   |  3 --
 .../inference/tensorrt/trt_int8_calibrator.h  |  1 +
 paddle/fluid/inference/utils/benchmark.cc     |  2 +-
 paddle/fluid/inference/utils/io_utils.h       |  5 ++++
 .../memory/allocation/aligned_allocator.h     |  1 +
 paddle/fluid/memory/allocation/allocator.h    |  1 +
 .../allocator_facade_abs_flags_test.cc        |  4 +--
 .../allocator_facade_frac_flags_test.cc       |  4 +--
 .../memory/allocation/allocator_strategy.cc   |  2 +-
 .../auto_growth_best_fit_allocator.h          |  1 +
 .../auto_growth_best_fit_allocator_test.cc    |  2 ++
 .../memory/allocation/best_fit_allocator.cc   |  6 +---
 .../memory/allocation/best_fit_allocator.h    |  7 +++++
 .../memory/allocation/buffered_allocator.cc   |  3 --
 .../memory/allocation/buffered_allocator.h    |  1 +
 .../fluid/memory/allocation/cpu_allocator.cc  |  2 --
 .../memory/allocation/locked_allocator.h      |  1 +
 .../fluid/memory/allocation/mmap_allocator.cc |  5 ----
 .../memory/allocation/mmap_allocator_test.cc  |  2 --
 .../allocation/naive_best_fit_allocator.h     |  2 ++
 .../memory/allocation/pinned_allocator.cc     |  3 --
 .../allocation/thread_local_allocator.h       |  1 +
 paddle/fluid/memory/detail/buddy_allocator.cc |  2 --
 .../memory/detail/buddy_allocator_test.cc     |  3 --
 .../memory/detail/system_allocator_test.cc    |  1 -
 paddle/fluid/memory/malloc.h                  |  2 ++
 .../fluid/operators/activation_cudnn_op.cu.cc |  9 ++++--
 .../fluid/operators/array_to_lod_tensor_op.cc | 15 +++++++---
 paddle/fluid/operators/assert_op.cc           | 16 ++++++++++-
 paddle/fluid/operators/assign_op.cc           | 16 ++++++++++-
 paddle/fluid/operators/assign_op.h            |  7 +++++
 paddle/fluid/operators/assign_op_test.cc      |  2 --
 paddle/fluid/operators/assign_value_op.cc     | 17 ++++++++++-
 paddle/fluid/operators/assign_value_op.h      |  1 +
 .../fluid/operators/beam_search_decode_op.cc  | 14 +++++++++-
 .../collective/c_allreduce_max_op.cc          | 15 ++++++++++
 .../collective/c_allreduce_max_op.cu.cc       |  7 +++++
 .../collective/c_allreduce_min_op.cc          | 15 ++++++++++
 .../collective/c_allreduce_min_op.cu.cc       |  7 +++++
 .../collective/c_allreduce_prod_op.cc         | 15 ++++++++++
 .../collective/c_allreduce_prod_op.cu.cc      |  7 +++++
 .../collective/c_allreduce_sum_op.cc          | 13 +++++++++
 .../collective/c_allreduce_sum_op.cu.cc       |  7 +++++
 .../operators/collective/c_comm_init_op.cc    | 14 ++++------
 .../operators/collective/c_gen_nccl_id_op.cc  | 24 ++++++++--------
 .../operators/collective/c_reduce_max_op.cc   | 15 ++++++++++
 .../collective/c_reduce_max_op.cu.cc          |  7 +++++
 .../operators/collective/c_reduce_min_op.cc   | 15 ++++++++++
 .../collective/c_reduce_min_op.cu.cc          |  7 +++++
 .../operators/collective/c_reduce_prod_op.cc  | 15 ++++++++++
 .../collective/c_reduce_prod_op.cu.cc         |  7 +++++
 .../operators/collective/c_reduce_sum_op.cc   | 15 ++++++++++
 .../collective/c_reduce_sum_op.cu.cc          |  7 +++++
 .../collective/c_sync_calc_stream_op.cc       | 14 ++++------
 .../collective/c_sync_comm_stream_op.cc       | 13 ++++-----
 .../operators/common_infer_shape_functions.cc |  7 ++++-
 .../operators/common_infer_shape_functions.h  |  6 ++++
 .../controlflow/conditional_block_infer_op.cc | 12 ++++++++
 .../controlflow/conditional_block_op.h        |  1 +
 .../conditional_block_op_helper.cc            |  9 +++++-
 .../controlflow/conditional_block_op_helper.h |  7 +++++
 .../controlflow/conditional_block_op_test.cc  |  6 +---
 paddle/fluid/operators/controlflow/feed_op.cc | 13 ++++++++-
 .../operators/controlflow/get_places_op.cc    | 14 +++++++++-
 .../fluid/operators/controlflow/op_variant.h  |  7 +++++
 .../controlflow/recurrent_op_helper.cc        | 11 ++++----
 .../controlflow/recurrent_op_helper.h         |  6 ++++
 .../controlflow/tensor_array_read_write_op.cc | 11 +++++++-
 .../fluid/operators/controlflow/while_op.cc   | 11 ++++++--
 .../operators/controlflow/while_op_helper.h   |  8 ++++++
 paddle/fluid/operators/cudnn_lstm_op.cu.cc    |  9 ++++--
 paddle/fluid/operators/delete_var_op.cc       | 13 +++++++++
 .../fluid/operators/dequantize_abs_max_op.cc  | 17 ++++++++++-
 .../fluid/operators/dequantize_abs_max_op.h   |  8 ++++++
 paddle/fluid/operators/dequantize_log_op.cc   | 18 ++++++++++--
 paddle/fluid/operators/dequantize_log_op.h    |  7 +++++
 .../async_sparse_param_update_recorder.h      |  3 +-
 ...async_sparse_param_update_recorder_test.cc |  2 --
 .../distributed/collective_client.cc          |  6 ++--
 .../operators/distributed/collective_client.h | 13 ++++++++-
 .../distributed/collective_server.cc          |  8 +-----
 .../operators/distributed/collective_server.h |  8 +++++-
 .../distributed/collective_server_test.cc     | 16 ++++-------
 .../operators/distributed/communicator.h      |  3 +-
 .../distributed/communicator_test.cc          |  4 ---
 .../grpc/grpc_bytebuffer_stream.cc            |  4 +++
 .../distributed/grpc/grpc_bytebuffer_stream.h |  4 +++
 .../operators/distributed/grpc/grpc_client.h  | 13 ++++++++-
 .../operators/distributed/grpc/grpc_serde.cc  | 21 +++++++++-----
 .../operators/distributed/grpc/grpc_serde.h   | 14 +++++++++-
 .../operators/distributed/grpc/grpc_server.cc | 14 ++++++++++
 .../operators/distributed/grpc/grpc_server.h  |  4 +++
 .../grpc/grpc_variable_response.cc            | 19 +++++++++++--
 .../distributed/grpc/grpc_variable_response.h | 19 ++++++++++---
 .../distributed/heart_beat_monitor.cc         |  2 +-
 .../distributed/heart_beat_monitor.h          |  7 ++---
 .../distributed/heart_beat_monitor_test.cc    |  3 --
 .../operators/distributed/large_scale_kv.h    |  1 -
 .../distributed/parameter_prefetch.cc         | 22 +++++++--------
 .../distributed/parameter_prefetch.h          |  7 +++++
 .../operators/distributed/parameter_recv.cc   | 21 +++++++-------
 .../operators/distributed/parameter_send.cc   | 27 ++++++++++--------
 .../distributed/request_handler_impl.h        |  8 +++++-
 .../fluid/operators/distributed/rpc_client.h  | 11 +++++++-
 .../fluid/operators/distributed/rpc_server.cc | 14 ++++++++--
 .../fluid/operators/distributed/rpc_server.h  | 12 ++++++++
 .../operators/distributed/sendrecvop_utils.cc | 19 +++++++------
 .../operators/distributed/sendrecvop_utils.h  | 16 +++++++++++
 .../operators/distributed/varhandle_test.cc   |  5 ----
 .../operators/distributed/variable_response.h | 24 +++++++++++++---
 .../distributed_ops/checkpoint_notify_op.cc   | 20 ++++++++-----
 .../distributed_ops/fetch_barrier_op.cc       | 24 +++++++++++-----
 .../distributed_ops/fl_listen_and_serv_op.h   | 16 +++++++++++
 .../distributed_ops/gen_nccl_id_op.cc         | 16 +++++++----
 .../distributed_ops/listen_and_serv_op.h      | 16 +++++++++++
 .../operators/distributed_ops/prefetch_op.cc  | 23 +++++++++++----
 .../operators/distributed_ops/recv_op.cc      | 26 +++++++++++------
 .../distributed_ops/ref_by_trainer_id_op.cc   | 13 +++++++++
 .../distributed_ops/ref_by_trainer_id_op.h    |  1 +
 .../distributed_ops/send_barrier_op.cc        | 23 ++++++++++-----
 .../operators/distributed_ops/send_op.cc      | 28 +++++++++++--------
 .../elementwise/elementwise_add_op.cc         | 13 +++++++++
 .../elementwise/elementwise_add_op.h          |  1 +
 .../elementwise/elementwise_floordiv_op.cc    | 17 +++++++++++
 .../elementwise/elementwise_max_op.cc         | 16 ++++++++++-
 .../elementwise/elementwise_min_op.cc         | 16 ++++++++++-
 .../elementwise/elementwise_min_op.h          |  1 +
 .../elementwise/elementwise_mod_op.cc         | 17 +++++++++++
 .../elementwise/elementwise_pow_op.cc         | 16 ++++++++++-
 .../elementwise/elementwise_pow_op.h          |  1 +
 .../elementwise/elementwise_sub_op.cc         | 16 ++++++++++-
 .../mkldnn/elementwise_add_mkldnn_op.cc       | 10 +++++++
 .../test_elementwise_add_grad_grad.cc         | 12 +-------
 .../test_elementwise_add_op_inplace.cc        |  5 +---
 paddle/fluid/operators/enqueue_op.cc          | 15 ++++++++--
 paddle/fluid/operators/eye_op.h               |  2 +-
 .../fusion_transpose_flatten_concat_op.cu.cc  |  7 ++++-
 paddle/fluid/operators/gather_test.cc         |  3 --
 .../operators/grid_sampler_cudnn_op.cu.cc     |  6 ++++
 paddle/fluid/operators/gru_op.cu.cc           |  7 +++++
 paddle/fluid/operators/hash_op.cc             | 16 +++++++++++
 paddle/fluid/operators/hash_op.h              |  1 +
 paddle/fluid/operators/increment_op.cc        | 16 ++++++++++-
 paddle/fluid/operators/isfinite_op.cc         | 18 +++++++++++-
 paddle/fluid/operators/isfinite_op.h          |  7 +++++
 paddle/fluid/operators/isfinite_v2_op.cc      | 25 +++++++++++++++--
 paddle/fluid/operators/isfinite_v2_op.h       |  7 +++++
 paddle/fluid/operators/jit/gen/act.h          |  1 +
 paddle/fluid/operators/jit/gen/blas.h         |  1 +
 paddle/fluid/operators/jit/gen/embseqpool.cc  |  3 +-
 paddle/fluid/operators/jit/gen/embseqpool.h   |  1 +
 paddle/fluid/operators/jit/gen/gru.cc         |  2 ++
 paddle/fluid/operators/jit/gen/gru.h          |  1 +
 paddle/fluid/operators/jit/gen/hopv.h         |  1 +
 paddle/fluid/operators/jit/gen/lstm.cc        |  2 ++
 paddle/fluid/operators/jit/gen/lstm.h         |  1 +
 paddle/fluid/operators/jit/gen/matmul.cc      |  3 +-
 paddle/fluid/operators/jit/gen/matmul.h       |  1 +
 paddle/fluid/operators/jit/gen/seqpool.h      |  1 +
 paddle/fluid/operators/jit/gen/sgd.cc         |  3 +-
 paddle/fluid/operators/jit/gen/sgd.h          |  1 +
 paddle/fluid/operators/jit/gen/vbroadcast.h   |  1 +
 paddle/fluid/operators/jit/gen_base.h         |  1 +
 paddle/fluid/operators/jit/helper.h           |  3 ++
 paddle/fluid/operators/jit/kernel_pool.cc     |  3 --
 paddle/fluid/operators/jit/kernel_pool.h      |  3 ++
 .../jit/more/intrinsic/crf_decoding.h         |  1 +
 .../operators/jit/more/intrinsic/layer_norm.h |  1 +
 paddle/fluid/operators/jit/more/mix/mix.h     |  1 +
 paddle/fluid/operators/jit/more/mkl/mkl.h     |  1 +
 paddle/fluid/operators/jit/refer/refer.h      |  1 +
 paddle/fluid/operators/label_smooth_op.cc     | 16 ++++++++++-
 paddle/fluid/operators/layer_norm_op.h        |  9 ++++++
 paddle/fluid/operators/lod_array_length_op.cc | 14 +++++++++-
 paddle/fluid/operators/lod_rank_table_op.cc   | 15 +++++++++-
 .../fluid/operators/lod_tensor_to_array_op.cc | 15 ++++++----
 paddle/fluid/operators/math/beam_search.cc    | 12 ++++++--
 .../fluid/operators/math/beam_search_test.cc  |  2 +-
 paddle/fluid/operators/math/blas.cc           |  2 +-
 paddle/fluid/operators/math/blas.h            |  7 +++++
 paddle/fluid/operators/math/blas_impl.h       |  6 ++--
 .../fluid/operators/math/concat_and_split.cc  | 12 +++++++-
 paddle/fluid/operators/math/concat_test.cc    |  2 +-
 .../fluid/operators/math/context_project.cc   |  6 ++++
 paddle/fluid/operators/math/context_project.h |  2 ++
 .../fluid/operators/math/cos_sim_functor.cc   |  6 ++++
 paddle/fluid/operators/math/cos_sim_functor.h |  2 ++
 paddle/fluid/operators/math/cpu_vec.h         |  1 +
 paddle/fluid/operators/math/cpu_vec_test.cc   |  4 +--
 paddle/fluid/operators/math/cross_entropy.cc  |  6 ++++
 paddle/fluid/operators/math/gru_compute.cc    |  7 +++++
 paddle/fluid/operators/math/im2col.cc         |  8 +++++-
 paddle/fluid/operators/math/lstm_compute.cc   |  7 +++++
 .../fluid/operators/math/matrix_bit_code.cc   |  2 --
 .../fluid/operators/math/matrix_inverse.cu.cc | 11 ++++++--
 paddle/fluid/operators/math/pooling.cc        |  3 --
 paddle/fluid/operators/math/sample_prob.cc    |  6 ++++
 paddle/fluid/operators/math/sample_prob.h     |  7 +++++
 .../math/selected_rows_functor_test.cu.cc     |  1 -
 paddle/fluid/operators/math/sequence2batch.cc |  6 ++++
 .../fluid/operators/math/sequence_padding.cc  | 10 +++++++
 .../operators/math/sequence_padding_test.cc   |  2 +-
 paddle/fluid/operators/math/sequence_scale.cc |  6 ++++
 paddle/fluid/operators/math/sequence_scale.h  |  7 +++++
 paddle/fluid/operators/math/vol2col.cc        |  7 ++++-
 paddle/fluid/operators/math/vol2col_test.cc   |  3 +-
 paddle/fluid/operators/matmul_v2_op.h         |  6 ++--
 paddle/fluid/operators/max_sequence_len_op.cc | 14 +++++++++-
 paddle/fluid/operators/merge_lod_tensor_op.cc | 13 ++++++++-
 .../operators/mkldnn/activation_mkldnn_op.cc  |  9 ++++++
 .../operators/mkldnn/batch_norm_mkldnn_op.cc  | 10 ++++++-
 .../fluid/operators/mkldnn/conv_mkldnn_op.cc  |  8 ++++--
 paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 15 +++++++---
 .../fluid/operators/mkldnn/lrn_mkldnn_op.cc   | 11 ++++++--
 .../operators/mkldnn/matmul_mkldnn_op.cc      |  8 +++++-
 .../fluid/operators/mkldnn/mul_mkldnn_op.cc   | 14 +++++++---
 .../fluid/operators/mkldnn/pool_mkldnn_op.cc  |  1 -
 .../operators/mkldnn/softmax_mkldnn_op.cc     | 12 ++++++--
 .../fluid/operators/mkldnn/sum_mkldnn_op.cc   | 14 +++++++---
 .../fluid/operators/nccl/nccl_gpu_common.cc   |  1 -
 .../fluid/operators/op_debug_string_test.cc   |  2 +-
 paddle/fluid/operators/print_op.cc            | 16 ++++++++---
 paddle/fluid/operators/rank_loss_op.cc        | 16 ++++++++++-
 .../fluid/operators/reader/buffered_reader.h  |  1 +
 paddle/fluid/operators/reader/py_reader.cc    |  1 -
 paddle/fluid/operators/reader/py_reader.h     |  3 ++
 .../reader/reader_blocking_queue_test.cc      |  5 +---
 .../operators/reader/reader_op_registry.cc    |  8 ++++--
 .../operators/reader/reader_op_registry.h     |  8 ++++++
 paddle/fluid/operators/recurrent_op.cc        | 10 +++++--
 paddle/fluid/operators/recurrent_op.h         |  6 ++++
 .../operators/reduce_ops/frobenius_norm_op.cc | 15 +++++++++-
 .../operators/reduce_ops/reduce_all_op.cc     | 15 ++++++++++
 .../operators/reduce_ops/reduce_any_op.cc     | 15 ++++++++++
 .../operators/reduce_ops/reduce_prod_op.cc    | 13 +++++++++
 .../operators/reduce_ops/reduce_sum_op.cc     | 15 +++++++++-
 .../reorder_lod_tensor_by_rank_op.cc          | 13 ++++++++-
 paddle/fluid/operators/reshape_op.cc          | 17 ++++++++++-
 .../fluid/operators/rnn_memory_helper_op.cc   | 11 ++++++++
 paddle/fluid/operators/scale_op.cc            | 14 +++++++++-
 paddle/fluid/operators/scatter_test.cc        |  5 ++--
 .../sequence_ops/sequence_concat_op.cu.cc     |  7 +++++
 paddle/fluid/operators/softmax_cudnn_op.cu.cc |  8 +++++-
 paddle/fluid/operators/split_lod_tensor_op.cc | 13 ++++++++-
 paddle/fluid/operators/strided_memcpy.h       |  2 ++
 paddle/fluid/operators/strided_memcpy_test.cc |  2 +-
 paddle/fluid/operators/tensor_formatter.cc    |  3 +-
 paddle/fluid/operators/tensor_formatter.h     |  6 ++++
 .../operators/tensorrt/tensorrt_engine_op.cc  |  3 --
 .../operators/tensorrt/tensorrt_engine_op.h   | 12 ++++++++
 .../test_leaky_relu_grad_grad_functor.h       |  1 +
 paddle/fluid/platform/bfloat16.h              |  6 ++++
 paddle/fluid/platform/bfloat16_test.cc        |  2 --
 paddle/fluid/platform/collective_helper.cc    |  4 ---
 paddle/fluid/platform/cpu_info.cc             |  1 -
 paddle/fluid/platform/cuda_resource_pool.h    |  1 +
 paddle/fluid/platform/cudnn_desc.h            |  7 +++++
 paddle/fluid/platform/cudnn_helper.h          |  6 ++++
 paddle/fluid/platform/device_code.cc          |  2 ++
 paddle/fluid/platform/device_code.h           |  1 +
 paddle/fluid/platform/device_context.h        |  9 +++++-
 .../fluid/platform/device_memory_aligment.h   |  1 +
 paddle/fluid/platform/device_tracer.h         |  2 ++
 paddle/fluid/platform/dynload/cublas.h        |  1 +
 paddle/fluid/platform/dynload/cuda_driver.h   |  1 +
 paddle/fluid/platform/dynload/cudnn.h         |  4 +--
 paddle/fluid/platform/dynload/cupti.cc        |  1 -
 paddle/fluid/platform/dynload/curand.h        |  3 +-
 paddle/fluid/platform/dynload/cusolver.h      |  3 +-
 .../fluid/platform/dynload/dynamic_loader.cc  |  3 --
 paddle/fluid/platform/dynload/mklml.h         |  1 +
 paddle/fluid/platform/dynload/nccl.h          |  2 +-
 paddle/fluid/platform/dynload/nvrtc.h         |  1 +
 paddle/fluid/platform/dynload/warpctc.h       |  1 +
 paddle/fluid/platform/enforce.h               |  6 ++++
 paddle/fluid/platform/errors_test.cc          |  1 -
 paddle/fluid/platform/float16_test.cc         |  4 +--
 paddle/fluid/platform/lodtensor_printer.cc    | 10 +++++--
 paddle/fluid/platform/lodtensor_printer.h     |  7 +++++
 .../fluid/platform/lodtensor_printer_test.cc  |  2 +-
 paddle/fluid/platform/monitor.cc              |  1 -
 paddle/fluid/platform/monitor.h               |  1 +
 paddle/fluid/platform/place_test.cc           |  2 +-
 paddle/fluid/platform/profiler.cc             | 13 ---------
 paddle/fluid/platform/profiler.h              |  1 +
 paddle/fluid/platform/profiler_test.cc        |  5 ++--
 paddle/fluid/platform/stream/cuda_stream.h    |  1 +
 paddle/fluid/platform/timer.h                 |  1 +
 paddle/fluid/string/piece_test.cc             |  2 --
 paddle/fluid/string/pretty_log.h              |  1 +
 paddle/fluid/string/string_helper.cc          |  4 +--
 paddle/fluid/string/string_helper.h           |  1 +
 paddle/testing/paddle_gtest_main.cc           |  3 --
 616 files changed, 3308 insertions(+), 766 deletions(-)

diff --git a/paddle/fluid/framework/attribute.cc b/paddle/fluid/framework/attribute.cc
index 9ca3fe31a33..7460686c1a3 100644
--- a/paddle/fluid/framework/attribute.cc
+++ b/paddle/fluid/framework/attribute.cc
@@ -14,8 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/attribute.h"
 
-#include <vector>
-
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index 5c6e4215162..8c8fcadb05b 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -30,6 +30,8 @@ namespace paddle {
 namespace framework {
 
 class ProgramDesc;
+class OpDesc;
+class VarDesc;
 
 // Each Protobuf Message, we provide a XXXBind class. In that class, we optimize
 // read/write speed. Only when we want the protobuf message, the local changes
diff --git a/paddle/fluid/framework/c/c_api.cc b/paddle/fluid/framework/c/c_api.cc
index 0dd2768ccb9..48181dac662 100644
--- a/paddle/fluid/framework/c/c_api.cc
+++ b/paddle/fluid/framework/c/c_api.cc
@@ -12,17 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/c/c_api.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/init.h"
 
 extern "C" {
 
diff --git a/paddle/fluid/framework/c/c_api.h b/paddle/fluid/framework/c/c_api.h
index 04dbfbebe5d..a9ec402f381 100644
--- a/paddle/fluid/framework/c/c_api.h
+++ b/paddle/fluid/framework/c/c_api.h
@@ -24,6 +24,15 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace paddle {
+namespace framework {
+class OpInfoMap;
+}  // namespace framework
+namespace platform {
+class DeviceContextPool;
+}  // namespace platform
+}  // namespace paddle
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h
index 64a645bf8b2..503f1513aad 100644
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -277,7 +277,7 @@ class ChannelObject {
     size_t finished = 0;
     while (finished < n && WaitForWrite(lock)) {
       size_t m =
-          std::min(n - finished, capacity_ + reading_count_ - data_.size());
+          (std::min)(n - finished, capacity_ + reading_count_ - data_.size());
       for (size_t i = 0; i < m; i++) {
         data_.push_back(std::move(p[finished++]));
       }
diff --git a/paddle/fluid/framework/copy_same_tensor_test.cc b/paddle/fluid/framework/copy_same_tensor_test.cc
index 9350c387a6e..5b89166e2f4 100644
--- a/paddle/fluid/framework/copy_same_tensor_test.cc
+++ b/paddle/fluid/framework/copy_same_tensor_test.cc
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <cstring>
 #include <random>
+
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/tensor.h"
diff --git a/paddle/fluid/framework/data_device_transform.h b/paddle/fluid/framework/data_device_transform.h
index 8ff97646cfc..60b52a5e706 100644
--- a/paddle/fluid/framework/data_device_transform.h
+++ b/paddle/fluid/framework/data_device_transform.h
@@ -21,6 +21,8 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+class Tensor;
+
 void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
                      Tensor* out);
 
diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h
index b48d152fe35..da156bfc5c7 100644
--- a/paddle/fluid/framework/data_feed.h
+++ b/paddle/fluid/framework/data_feed.h
@@ -41,6 +41,15 @@ limitations under the License. */
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/string/string_helper.h"
 
+namespace paddle {
+namespace framework {
+class DataFeedDesc;
+class LoDTensor;
+class Scope;
+class Variable;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 
@@ -418,6 +427,7 @@ class MultiSlotType {
 
   std::string DebugString() {
     std::stringstream ss;
+
     ss << "\ntype: " << type_ << "\n";
     ss << "offset: ";
     ss << "[";
diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc
index 1d8aec76240..048d539f9b9 100644
--- a/paddle/fluid/framework/data_feed_factory.cc
+++ b/paddle/fluid/framework/data_feed_factory.cc
@@ -17,10 +17,10 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 
-#include "paddle/fluid/framework/data_feed.h"
-
 namespace paddle {
 namespace framework {
+class DataFeed;
+
 typedef std::shared_ptr<DataFeed> (*Createdata_feedFunction)();
 typedef std::unordered_map<std::string, Createdata_feedFunction> data_feedMap;
 data_feedMap g_data_feed_map;
diff --git a/paddle/fluid/framework/data_feed_factory.h b/paddle/fluid/framework/data_feed_factory.h
index 13678edb0b8..49381a98706 100644
--- a/paddle/fluid/framework/data_feed_factory.h
+++ b/paddle/fluid/framework/data_feed_factory.h
@@ -16,10 +16,13 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/data_feed.h"
 
 namespace paddle {
 namespace framework {
+class DataFeed;
+
 class DataFeedFactory {
  public:
   static std::string DataFeedTypeList();
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index f757e244e38..108cd9ac6d1 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -13,8 +13,8 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/data_layout_transform.h"
+
 #include <string>
-#include <vector>
 
 #include "paddle/fluid/operators/math/math_function.h"
 #ifdef PADDLE_WITH_MKLDNN
diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h
index b92c47c2eb0..238f2d2e679 100644
--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -17,10 +17,18 @@
 #include <map>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/variable.h"
 
+namespace paddle {
+namespace framework {
+class OpKernelType;
+class Tensor;
+}  // namespace framework
+}  // namespace paddle
+
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
diff --git a/paddle/fluid/framework/data_layout_transform_test.cc b/paddle/fluid/framework/data_layout_transform_test.cc
index 8dfad23db65..20443e9a3dc 100644
--- a/paddle/fluid/framework/data_layout_transform_test.cc
+++ b/paddle/fluid/framework/data_layout_transform_test.cc
@@ -15,7 +15,6 @@
 #include "paddle/fluid/framework/data_layout_transform.h"
 
 #include "gtest/gtest.h"
-#include "paddle/fluid/platform/device_context.h"
 
 TEST(DataTransform, DataLayoutFunction) {
   auto place = paddle::platform::CPUPlace();
diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc
index f54311eebfa..3a40de6988f 100644
--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@@ -18,8 +18,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/framework/data_type_transform.h"
 
+namespace paddle {
+namespace framework {
+class Variable;
+}  // namespace framework
+}  // namespace paddle
+
 #ifdef PADDLE_WITH_MKLDNN
-#include <algorithm>
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
 
diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h
index ef2271d530d..2bbdac52ee4 100644
--- a/paddle/fluid/framework/data_transform.h
+++ b/paddle/fluid/framework/data_transform.h
@@ -30,6 +30,10 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+class OpKernelType;
+class Tensor;
+class Variable;
+
 void TransformData(const OpKernelType &expected_kernel_type,
                    const OpKernelType &kernel_type_for_var,
                    const Tensor &input_tensor, Tensor *out);
diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc
index 8188d5cde1b..e4be866dca1 100644
--- a/paddle/fluid/framework/data_type.cc
+++ b/paddle/fluid/framework/data_type.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/data_type.h"
-#include <stdint.h>
 #include <string>
 #include <unordered_map>
 
diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h
index 720e422e114..4477a9cac09 100644
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -15,12 +15,19 @@ limitations under the License. */
 #pragma once
 #include <string>
 #include <typeindex>
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/platform/enforce.h"
 
+#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
 
+namespace paddle {
+namespace platform {
+struct bfloat16;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc
index 331596da33a..5599edcd222 100644
--- a/paddle/fluid/framework/data_type_test.cc
+++ b/paddle/fluid/framework/data_type_test.cc
@@ -14,9 +14,17 @@
 #include "paddle/fluid/framework/data_type.h"
 
 #include <string>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/tensor.h"
 
+namespace paddle {
+namespace platform {
+struct bfloat16;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 TEST(DataType, float16) {
   using paddle::framework::Tensor;
   using paddle::platform::CPUPlace;
diff --git a/paddle/fluid/framework/data_type_transform.h b/paddle/fluid/framework/data_type_transform.h
index 1c281b03ed6..b42b2f594aa 100644
--- a/paddle/fluid/framework/data_type_transform.h
+++ b/paddle/fluid/framework/data_type_transform.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <utility>
+
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/variable.h"
@@ -23,6 +24,9 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+class OpKernelType;
+class Tensor;
+
 using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
 
 void TransDataType(const OpKernelType& kernel_type_for_var,
diff --git a/paddle/fluid/framework/dataset_factory.cc b/paddle/fluid/framework/dataset_factory.cc
index 3a28c101d48..cdb513f70ad 100644
--- a/paddle/fluid/framework/dataset_factory.cc
+++ b/paddle/fluid/framework/dataset_factory.cc
@@ -13,12 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/dataset_factory.h"
-#include <memory>
 #include <string>
 #include <unordered_map>
 
-#include "paddle/fluid/framework/data_set.h"
-
 namespace paddle {
 namespace framework {
 typedef std::unique_ptr<Dataset> (*CreateDatasetFunction)();
diff --git a/paddle/fluid/framework/dataset_factory.h b/paddle/fluid/framework/dataset_factory.h
index d4a36cec22f..425c488daa8 100644
--- a/paddle/fluid/framework/dataset_factory.h
+++ b/paddle/fluid/framework/dataset_factory.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/data_set.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h
index 29c4732f991..e69fb4e7619 100644
--- a/paddle/fluid/framework/ddim.h
+++ b/paddle/fluid/framework/ddim.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/dim.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ddim_test.cc b/paddle/fluid/framework/ddim_test.cc
index b7b42fa019f..e89f77ae496 100644
--- a/paddle/fluid/framework/ddim_test.cc
+++ b/paddle/fluid/framework/ddim_test.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <sstream>
-#include <vector>
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/ddim.h"
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h
index 36f5d3adfad..e0064ec2642 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.h
@@ -20,6 +20,17 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+class NCCLCommunicator;
+}  // namespace platform
+}  // namespace paddle
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
index 588248d6454..1412e2cd9db 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -24,6 +24,20 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace paddle {
+namespace framework {
+namespace details {
+struct VarHandle;
+}  // namespace details
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+struct NCCLContextMap;
+}  // namespace platform
+}  // namespace paddle
+
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h
index 6d14c7e4e7b..e455879a68f 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h
@@ -21,13 +21,15 @@
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
-
 #include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
+struct DummyVarHandle;
+struct VarHandle;
+
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 01d496d4ea7..87b27eaa440 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -27,6 +27,18 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 
+namespace paddle {
+namespace framework {
+namespace ir {
+class Graph;
+class PassBuilder;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+class NCCLCommunicator;
+}  // namespace platform
+}  // namespace paddle
+
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
index 0b653e57f6d..2256b826ed5 100644
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -19,6 +19,8 @@
 namespace paddle {
 namespace framework {
 namespace details {
+struct VarHandleBase;
+
 ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
                                          platform::Place place,
                                          size_t scope_idx)
diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h
index 41c51b95800..3c219ee27d3 100644
--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -24,9 +24,21 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace paddle {
+namespace framework {
+class OperatorBase;
+class Scope;
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
+struct VarHandleBase;
+
 class ComputationOpHandle : public OpHandleBase {
  public:
   ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h
index a048799a280..8edce6782de 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.h
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h
@@ -19,12 +19,23 @@
 #include <string>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
 
+namespace paddle {
+namespace platform {
+class CUDADeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 class Scope;
+class GarbageCollector;
+namespace ir {
+class Node;
+}  // namespace ir
 
 namespace ir {
 class MemOptVarInfo;
diff --git a/paddle/fluid/framework/details/exception_holder_test.cc b/paddle/fluid/framework/details/exception_holder_test.cc
index c20563a0860..3db358667eb 100644
--- a/paddle/fluid/framework/details/exception_holder_test.cc
+++ b/paddle/fluid/framework/details/exception_holder_test.cc
@@ -13,8 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/exception_holder.h"
-#include <memory>
-#include <unordered_map>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/allocator.h"
 
diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc
index 6aae523365e..09aedafc6bb 100644
--- a/paddle/fluid/framework/details/fetch_async_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc
@@ -15,9 +15,14 @@
 #include "paddle/fluid/framework/details/fetch_async_op_handle.h"
 #include <string>
 #include <utility>
-#include <vector>
 #include "paddle/fluid/platform/profiler.h"
 
+namespace paddle {
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.h b/paddle/fluid/framework/details/fetch_async_op_handle.h
index 691a3286c27..ff9271942da 100644
--- a/paddle/fluid/framework/details/fetch_async_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_async_op_handle.h
@@ -22,6 +22,18 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace paddle {
+namespace framework {
+class LoDTensor;
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
diff --git a/paddle/fluid/framework/details/fetch_barrier_op_handle.cc b/paddle/fluid/framework/details/fetch_barrier_op_handle.cc
index 127183a32e9..fc836ade786 100644
--- a/paddle/fluid/framework/details/fetch_barrier_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_barrier_op_handle.cc
@@ -19,6 +19,8 @@
 namespace paddle {
 namespace framework {
 namespace details {
+struct VarHandleBase;
+
 FetchBarrierOpHandle::FetchBarrierOpHandle(
     ir::Node *node, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places)
diff --git a/paddle/fluid/framework/details/fetch_barrier_op_handle.h b/paddle/fluid/framework/details/fetch_barrier_op_handle.h
index d1f7e08b28e..7ce790f38e8 100644
--- a/paddle/fluid/framework/details/fetch_barrier_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_barrier_op_handle.h
@@ -24,6 +24,15 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -32,6 +41,8 @@ namespace details {
 // all places if there are multiple places, must init with
 // multiple dev_ctxes_ !!!!
 
+struct VarHandleBase;
+
 struct FetchBarrierOpHandle : public OpHandleBase {
  public:
   FetchBarrierOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h
index 31ffd1211d2..41deeb0af27 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -22,6 +22,17 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
index 16c13ac1c03..9bed792a42f 100644
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
@@ -17,10 +17,22 @@
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+class NCCLCommunicator;
+}  // namespace platform
+}  // namespace paddle
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
index 8a59d2bfa9a..8fd3ec56d18 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
@@ -25,6 +25,17 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+struct NCCLContextMap;
+}  // namespace platform
+}  // namespace paddle
+
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
index cbded074f20..761a5b5a30a 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
@@ -17,11 +17,20 @@
 #include <unordered_map>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
+#include "paddle/fluid/framework/details/op_handle_base.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
 
 namespace paddle {
 namespace framework {
 namespace details {
 
+struct VarHandle;
+
 struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
   std::vector<std::string> out_varnames_;
   std::vector<std::unique_ptr<ir::Node>> nodes_;
diff --git a/paddle/fluid/framework/details/gather_op_handle.h b/paddle/fluid/framework/details/gather_op_handle.h
index ac87b246b50..9cbd94cd6b8 100644
--- a/paddle/fluid/framework/details/gather_op_handle.h
+++ b/paddle/fluid/framework/details/gather_op_handle.h
@@ -24,6 +24,14 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc
index 5d8562e7046..f3fcc1a436d 100644
--- a/paddle/fluid/framework/details/gather_op_handle_test.cc
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -17,11 +17,11 @@
 #include <unordered_map>
 #include "gtest/gtest.h"
 
-#include "paddle/fluid/platform/device_context.h"
-
 namespace paddle {
 namespace framework {
 namespace details {
+struct DummyVarHandle;
+
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 
diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h
index 21e781877a4..c3a18433cf8 100644
--- a/paddle/fluid/framework/details/multi_devices_helper.h
+++ b/paddle/fluid/framework/details/multi_devices_helper.h
@@ -20,16 +20,21 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/details/var_handle.h"
-
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/platform/place.h"
 
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
+namespace paddle {
+namespace framework {
+class OpDesc;
+}  // namespace framework
+}  // namespace paddle
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.h b/paddle/fluid/framework/details/nan_inf_utils_detail.h
index 15d00932f1c..b4459e5a7c1 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.h
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h
@@ -19,6 +19,12 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/place.h"
 
+namespace paddle {
+namespace framework {
+class Tensor;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index 097f54d5d58..eb3d9c32ffc 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -24,10 +24,22 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/macros.h"
 
+namespace paddle {
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 
 class Scope;
+namespace details {
+struct VarHandleBase;
+}  // namespace details
+namespace ir {
+class Node;
+}  // namespace ir
 
 namespace details {
 
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
index 8b92bdef475..e76a48d207d 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -24,6 +24,21 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+class SelectedRows;
+namespace details {
+struct VarHandle;
+}  // namespace details
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+struct NCCLContextMap;
+}  // namespace platform
+}  // namespace paddle
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
diff --git a/paddle/fluid/framework/details/rpc_op_handle.h b/paddle/fluid/framework/details/rpc_op_handle.h
index d86d33dd676..909f565f2c0 100644
--- a/paddle/fluid/framework/details/rpc_op_handle.h
+++ b/paddle/fluid/framework/details/rpc_op_handle.h
@@ -24,6 +24,16 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/scope.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+class Scope;
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index 6e2f2327abd..287667d5ee9 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -13,9 +13,17 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
+
 #include <string>
+
 #include "paddle/fluid/platform/profiler.h"
 
+namespace paddle {
+namespace framework {
+class Tensor;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
index d4f28dbe2b2..02e5aa88443 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
@@ -21,6 +21,18 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.h b/paddle/fluid/framework/details/scope_buffered_monitor.h
index 1246c35af6a..3a94534eff4 100644
--- a/paddle/fluid/framework/details/scope_buffered_monitor.h
+++ b/paddle/fluid/framework/details/scope_buffered_monitor.h
@@ -17,7 +17,9 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/scope.h"
+
 namespace paddle {
 namespace framework {
 namespace details {
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
index 5fbaf3cbfe0..bf93d8f85b1 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
+++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
@@ -22,6 +22,17 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/enforce.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+class Tensor;
+class Variable;
+namespace ir {
+class MemOptVarInfo;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.h b/paddle/fluid/framework/details/share_tensor_buffer_functor.h
index be49d1c432b..0db69d07bf6 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_functor.h
+++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.h
@@ -25,6 +25,15 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+namespace ir {
+class MemOptVarInfo;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
index be3f5515a97..3d53bb62855 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
+++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
@@ -23,10 +23,20 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/enforce.h"
 
+namespace paddle {
+namespace framework {
+namespace ir {
+class MemOptVarInfo;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
 
+class ComputationOpHandle;
+
 ComputationOpHandle *GetUniquePendingComputationOpHandle(
     ShareTensorBufferOpHandle *share_tensor_op) {
   ComputationOpHandle *result_op = nullptr;
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h
index a02c346485e..d14cbc31d82 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h
+++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h
@@ -22,10 +22,22 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+namespace ir {
+class MemOptVarInfo;
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
 
+class ComputationOpHandle;
+
 class ShareTensorBufferOpHandle : public OpHandleBase {
  public:
   ShareTensorBufferOpHandle(
diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
index b24b457d21d..8bfea0f1ae8 100644
--- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
@@ -23,6 +23,17 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 
+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+namespace platform {
+class NCCLCommunicator;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
index bb38424d3ae..a35ac0bd732 100644
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -24,6 +24,14 @@
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/place.h"
 
+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc
index fba0c1bf463..71e5dd28ede 100644
--- a/paddle/fluid/framework/details/variable_visitor.cc
+++ b/paddle/fluid/framework/details/variable_visitor.cc
@@ -15,6 +15,14 @@
 #include "paddle/fluid/framework/details/variable_visitor.h"
 
 #include "paddle/fluid/framework/selected_rows.h"
+
+namespace paddle {
+namespace framework {
+class LoDTensor;
+class Variable;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
diff --git a/paddle/fluid/framework/details/variable_visitor.h b/paddle/fluid/framework/details/variable_visitor.h
index ca9a19bdcf1..a882d5120bc 100644
--- a/paddle/fluid/framework/details/variable_visitor.h
+++ b/paddle/fluid/framework/details/variable_visitor.h
@@ -17,6 +17,13 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/variable.h"
 
+namespace paddle {
+namespace framework {
+class Tensor;
+class Variable;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace details {
diff --git a/paddle/fluid/framework/device_worker.cc b/paddle/fluid/framework/device_worker.cc
index aeec6161714..fbaae5a21c2 100644
--- a/paddle/fluid/framework/device_worker.cc
+++ b/paddle/fluid/framework/device_worker.cc
@@ -13,11 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/device_worker.h"
-#include "xxhash.h"  // NOLINT
 
 namespace paddle {
 namespace framework {
 
+class LoDTensor;
+class Scope;
+
 void DeviceWorker::SetRootScope(Scope* root_scope) { root_scope_ = root_scope; }
 
 void DeviceWorker::SetDataFeed(DataFeed* data_feed) {
diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index efe6fa1b2da..ee2ef9a0c3d 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -39,6 +39,18 @@ limitations under the License. */
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/timer.h"
 
+namespace paddle {
+namespace framework {
+class LoDTensor;
+class ProgramDesc;
+class Scope;
+class Tensor;
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc
index 67be8db6e80..3b60cb65e34 100644
--- a/paddle/fluid/framework/device_worker_factory.cc
+++ b/paddle/fluid/framework/device_worker_factory.cc
@@ -20,6 +20,8 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+class DeviceWorker;
+
 typedef std::shared_ptr<DeviceWorker> (*Createdevice_workerFunction)();
 typedef std::unordered_map<std::string, Createdevice_workerFunction>
     device_workerMap;
diff --git a/paddle/fluid/framework/device_worker_factory.h b/paddle/fluid/framework/device_worker_factory.h
index 9d0613385e7..6a31c3ea7a4 100644
--- a/paddle/fluid/framework/device_worker_factory.h
+++ b/paddle/fluid/framework/device_worker_factory.h
@@ -16,11 +16,14 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/device_worker.h"
 
 namespace paddle {
 namespace framework {
 
+class DeviceWorker;
+
 class DeviceWorkerFactory {
  public:
   static std::string DeviceWorkerTypeList();
diff --git a/paddle/fluid/framework/device_worker_test.cc b/paddle/fluid/framework/device_worker_test.cc
index b488e4cfe7a..461d329a371 100644
--- a/paddle/fluid/framework/device_worker_test.cc
+++ b/paddle/fluid/framework/device_worker_test.cc
@@ -13,9 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/device_worker.h"
+
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/trainer.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc
index 915589b3242..ac1e39ad2c1 100644
--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ b/paddle/fluid/framework/dlpack_tensor.cc
@@ -11,10 +11,17 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "paddle/fluid/framework/dlpack_tensor.h"
 #include <unordered_map>
-
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/dlpack_tensor.h"
+
+namespace paddle {
+namespace platform {
+struct bfloat16;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h
index 5346ba62894..e342523718b 100644
--- a/paddle/fluid/framework/dlpack_tensor.h
+++ b/paddle/fluid/framework/dlpack_tensor.h
@@ -15,11 +15,14 @@
 #pragma once
 
 #include <dlpack/dlpack.h>
+
 #include "paddle/fluid/framework/tensor.h"
 
 namespace paddle {
 namespace framework {
 
+class Tensor;
+
 class DLPackTensor {
  public:
   using LaneType = decltype(::DLTensor::dtype.lanes);  // uint16_t
diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc
index 4dead063b47..4a1f151f69b 100644
--- a/paddle/fluid/framework/dlpack_tensor_test.cc
+++ b/paddle/fluid/framework/dlpack_tensor_test.cc
@@ -17,6 +17,12 @@
 #include <gtest/gtest.h>
 #include <vector>
 
+namespace paddle {
+namespace platform {
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc
index 1c64bf1d3f7..00f721701a4 100644
--- a/paddle/fluid/framework/downpour_worker.cc
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -13,10 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/device_worker.h"
-#include "paddle/fluid/framework/device_worker_factory.h"
-#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
 #include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/string/string_helper.h"
+
+namespace paddle {
+namespace framework {
+class LoDTensor;
+class Variable;
+}  // namespace framework
+}  // namespace paddle
 
 #if defined _WIN32 || defined __APPLE__
 #else
diff --git a/paddle/fluid/framework/downpour_worker_opt.cc b/paddle/fluid/framework/downpour_worker_opt.cc
index b40a00ef9cb..afe6ddfa3d9 100644
--- a/paddle/fluid/framework/downpour_worker_opt.cc
+++ b/paddle/fluid/framework/downpour_worker_opt.cc
@@ -12,18 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <set>
-#include <unordered_map>
-#include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/device_worker.h"
-#include "paddle/fluid/framework/device_worker_factory.h"
-#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
 #include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/platform/lodtensor_printer.h"
 
 namespace paddle {
 namespace framework {
 
+class OpDesc;
+class OperatorBase;
+class ProgramDesc;
+
 bool HasDependentOutput(const OpDesc& op_desc,
                         const std::unordered_set<std::string>& dependent_vars) {
   for (auto& var : op_desc.Outputs()) {
diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h
index 0e3edfb95cb..a6abda8a83b 100644
--- a/paddle/fluid/framework/eigen.h
+++ b/paddle/fluid/framework/eigen.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <stdint.h>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index fa6a65d5892..7593b60abff 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/data_set.h"
 #include "paddle/fluid/framework/executor_gc_helper.h"
 #include "paddle/fluid/framework/garbage_collector.h"
@@ -32,6 +33,11 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+class Dataset;
+class ProgramDesc;
+class Scope;
+class TrainerBase;
+
 struct ExecutorPrepareContext {
   ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
 
diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc
index 706248229bc..c80eedb1b86 100644
--- a/paddle/fluid/framework/executor_gc_helper.cc
+++ b/paddle/fluid/framework/executor_gc_helper.cc
@@ -13,16 +13,19 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/executor_gc_helper.h"
+
 #include <deque>
 #include <string>
-#include <unordered_map>
 #include <unordered_set>
 #include <utility>
-#include <vector>
+
 #include "glog/logging.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/executor_gc_helper.h b/paddle/fluid/framework/executor_gc_helper.h
index a4c71c5304e..e44edc5aa1c 100644
--- a/paddle/fluid/framework/executor_gc_helper.h
+++ b/paddle/fluid/framework/executor_gc_helper.h
@@ -18,6 +18,7 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/garbage_collector.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/scope.h"
@@ -26,6 +27,10 @@ namespace paddle {
 namespace framework {
 
 // Result map: op -> variable names that can be deleted after op runs
+class GarbageCollector;
+class OperatorBase;
+class Scope;
+
 std::unordered_map<const OperatorBase *, std::vector<std::string>>
 GetUnusedVars(const BlockDesc &block,
               const std::vector<std::unique_ptr<OperatorBase>> &ops,
diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
index fd857f7735c..3bd85b2b24b 100644
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -13,16 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/feed_fetch_method.h"
+
 #include <string>
-#include <vector>
+
 #include "glog/logging.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace framework {
 
+class LoDTensor;
+class Variable;
+
 void SetFeedVariable(Scope* scope, const LoDTensor& input,
                      const std::string& var_name, size_t index) {
   // If var_name Variable is not found in GlobalScope, a new variable will
diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h
index 65c8b255ffb..a52ef517c8b 100644
--- a/paddle/fluid/framework/feed_fetch_method.h
+++ b/paddle/fluid/framework/feed_fetch_method.h
@@ -15,12 +15,16 @@ limitations under the License. */
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/scope.h"
 
 namespace paddle {
 namespace framework {
 
+class LoDTensor;
+class Scope;
+
 void SetFeedVariable(Scope* scope, const LoDTensor& input,
                      const std::string& var_name, size_t index);
 
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc
index 34fff042770..3c076805932 100644
--- a/paddle/fluid/framework/fleet/fleet_wrapper.cc
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc
@@ -29,12 +29,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/fleet/fleet_wrapper.h"
 #include <algorithm>
 #include <utility>
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/data_feed.h"
-#include "paddle/fluid/framework/io/fs.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/timer.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h
index cc13a50160a..be87bdf1e75 100644
--- a/paddle/fluid/framework/fleet/fleet_wrapper.h
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.h
@@ -35,6 +35,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.cc b/paddle/fluid/framework/fleet/gloo_wrapper.cc
index f195dde4084..f4b2d2d7d18 100644
--- a/paddle/fluid/framework/fleet/gloo_wrapper.cc
+++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc
@@ -10,10 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
-#include <thread>  // NOLINT
-#include <vector>
 #include "paddle/fluid/framework/io/fs.h"
-#include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/string/string_helper.h"
 
 namespace gloo {
diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc
index b70d5e5fc1a..7a27b6a9d7a 100644
--- a/paddle/fluid/framework/fleet/heter_wrapper.cc
+++ b/paddle/fluid/framework/fleet/heter_wrapper.cc
@@ -27,15 +27,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/fleet/heter_wrapper.h"
-#include <algorithm>
-#include <utility>
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/data_feed.h"
-#include "paddle/fluid/framework/device_worker.h"
-#include "paddle/fluid/framework/io/fs.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/timer.h"
 #ifdef PADDLE_WITH_PSLIB
 
 namespace paddle {
@@ -122,14 +113,15 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
   if (platform::is_cpu_place(tensor->place())) {
     memcpy(data_ptr, tensor->data<void>(),
            tensor->numel() * SizeOfType(tensor->type()));
-  }
 #ifdef PADDLE_WITH_CUDA
-  else {
+  } else {
     memory::Copy(platform::CPUPlace(), data_ptr,
                  BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
                  tensor->data<void>(),
                  tensor->numel() * SizeOfType(tensor->type()), nullptr);
   }
+#else
+  }
 #endif
 }
 
@@ -239,7 +231,7 @@ void HeterWrapper::CallRemoteXpu(std::shared_ptr<HeterTask> task,
   request.set_cur_batch(task->cur_batch_);
 
   OnHeterRpcDone* done = new OnHeterRpcDone([this, task, worker](void* done) {
-    auto* closure = (OnHeterRpcDone*)done;
+    auto* closure = reinterpret_cast<OnHeterRpcDone*>(done);
     if (closure->cntl.Failed()) {
       VLOG(0) << "call xpu fail: " << closure->cntl.ErrorText();
     } else {
diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc
index 33a91388fd8..ed92e2e9aad 100644
--- a/paddle/fluid/framework/fleet/nccl_wrapper.cc
+++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc
@@ -13,9 +13,6 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/fleet/nccl_wrapper.h"
-#include <utility>
-#include "paddle/fluid/framework/data_feed.h"
-#include "paddle/fluid/framework/scope.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.h b/paddle/fluid/framework/fleet/nccl_wrapper.h
index a55921f1ac2..3725a225dbe 100644
--- a/paddle/fluid/framework/fleet/nccl_wrapper.h
+++ b/paddle/fluid/framework/fleet/nccl_wrapper.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include <random>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable_helper.h"
@@ -29,6 +30,12 @@ limitations under the License. */
 #endif
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h
index 4f773965282..884d230816b 100644
--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
@@ -19,9 +19,16 @@
 #include <memory>
 #include <mutex>  // NOLINT
 #include <utility>
+
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace paddle {
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/generator.h b/paddle/fluid/framework/generator.h
index a279c2e4e14..862e63c4c6a 100644
--- a/paddle/fluid/framework/generator.h
+++ b/paddle/fluid/framework/generator.h
@@ -16,7 +16,6 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <stdint.h>
-
 #include <atomic>
 #include <deque>
 #include <iostream>  // temp for debug
diff --git a/paddle/fluid/framework/hetercpu_worker.cc b/paddle/fluid/framework/hetercpu_worker.cc
index 83838f4df67..747fd434ae7 100644
--- a/paddle/fluid/framework/hetercpu_worker.cc
+++ b/paddle/fluid/framework/hetercpu_worker.cc
@@ -12,13 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/device_worker.h"
-#include "paddle/fluid/framework/device_worker_factory.h"
-#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
-#include "paddle/fluid/framework/fleet/heter_wrapper.h"
-#include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/string/string_helper.h"
-
 #ifdef PADDLE_WITH_PSLIB
 
 #if defined _WIN32 || defined __APPLE__
diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc
index 5ca1aa66319..fbed74800b4 100644
--- a/paddle/fluid/framework/heterxpu_trainer.cc
+++ b/paddle/fluid/framework/heterxpu_trainer.cc
@@ -12,16 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <cstdlib>
-#include <ctime>
-#include <string>
-#include <vector>
-#include "io/fs.h"
-#include "paddle/fluid/framework/data_feed_factory.h"
-#include "paddle/fluid/framework/data_set.h"
-#include "paddle/fluid/framework/device_worker_factory.h"
-#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
-#include "paddle/fluid/framework/trainer.h"
 #if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 
@@ -334,7 +324,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
   std::shared_ptr<HeterServiceContext> context = object_pool_.Get();
 
   if (!context->scope_) {
-    int num = rand() % places_.size();
+    int num = rand_r() % places_.size();
     context->place_num_ = num;
     auto place = places_[num];
     context->scope_ = &(place_scopes_[num]->NewScope());
diff --git a/paddle/fluid/framework/inlined_vector.h b/paddle/fluid/framework/inlined_vector.h
index 2a7f26b9f96..f8e937fa107 100644
--- a/paddle/fluid/framework/inlined_vector.h
+++ b/paddle/fluid/framework/inlined_vector.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <cstdint>
 #include <vector>
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/inlined_vector_test.cc b/paddle/fluid/framework/inlined_vector_test.cc
index 003c0d7bbea..581e7d8934d 100644
--- a/paddle/fluid/framework/inlined_vector_test.cc
+++ b/paddle/fluid/framework/inlined_vector_test.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/inlined_vector.h"
+
 #include <cstdlib>
 #include <ctime>
-#include <iostream>
-#include <vector>
+
 #include "gtest/gtest.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/io/crypto/cipher_utils.cc b/paddle/fluid/framework/io/crypto/cipher_utils.cc
index e0c653e0016..ee9f06b2f3e 100644
--- a/paddle/fluid/framework/io/crypto/cipher_utils.cc
+++ b/paddle/fluid/framework/io/crypto/cipher_utils.cc
@@ -15,8 +15,6 @@
 #include "paddle/fluid/framework/io/crypto/cipher_utils.h"
 
 #include <cryptopp/osrng.h>
-
-#include <fstream>
 #include <sstream>
 
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/io/crypto/cipher_utils.h b/paddle/fluid/framework/io/crypto/cipher_utils.h
index 936f62f6ba6..52db03f530c 100644
--- a/paddle/fluid/framework/io/crypto/cipher_utils.h
+++ b/paddle/fluid/framework/io/crypto/cipher_utils.h
@@ -17,6 +17,7 @@
 #include <sstream>
 #include <string>
 #include <unordered_map>
+
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/io/fs.h b/paddle/fluid/framework/io/fs.h
index c88636e2674..bb6d720ca58 100644
--- a/paddle/fluid/framework/io/fs.h
+++ b/paddle/fluid/framework/io/fs.h
@@ -18,6 +18,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/io/shell.h"
 #include "paddle/fluid/string/string_helper.h"
diff --git a/paddle/fluid/framework/io/shell.h b/paddle/fluid/framework/io/shell.h
index dc486275d6f..7db5cd7661c 100644
--- a/paddle/fluid/framework/io/shell.h
+++ b/paddle/fluid/framework/io/shell.h
@@ -32,6 +32,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/string/string_helper.h"
 
diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
index 47ed9f0393f..48e3989a531 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
@@ -20,6 +20,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 class AttentionLSTMFusePass : public FusePassBase {
  protected:
   void ApplyImpl(ir::Graph* graph) const override;
diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
index fd8b55a6b7d..9c984a23e37 100644
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
@@ -13,19 +13,28 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h"
+
 #include <cmath>
-#include <functional>
-#include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/platform/enforce.h"
 
+namespace paddle {
+namespace framework {
+class LoDTensor;
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace ir {
 
+class Node;
+
 #define GET_CONV_BN_NODES(pattern_name)                                    \
   /* OPERATORS */                                                          \
   GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name);                     \
diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
index d607020a47b..916384ec447 100644
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -26,6 +27,8 @@ namespace ir {
 /*
  * Fuse the Conv and ConvAffineChannel.
  */
+class Graph;
+
 class ConvAffineChannelFusePass : public FusePassBase {
  public:
   virtual ~ConvAffineChannelFusePass() {}
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
index fb787e08814..a915015bf55 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
@@ -13,15 +13,22 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/conv_bn_fuse_pass.h"
-#include <algorithm>
-#include <functional>
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/platform/enforce.h"
 
+namespace paddle {
+namespace framework {
+class LoDTensor;
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace ir {
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
index 57a9f69ca15..342cd8dad5f 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -26,6 +27,8 @@ namespace ir {
 /*
  * Fuse the Conv and BatchNorm to a ConvBNMKLDNNOp.
  */
+class Graph;
+
 class ConvBNFusePass : public FusePassBase {
  public:
   virtual ~ConvBNFusePass() {}
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
index ea9e465d8d7..e68f57d4ae9 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
@@ -20,6 +20,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 class ConvElementwiseAdd2ActFusePass : public FusePassBase {
  public:
   virtual ~ConvElementwiseAdd2ActFusePass() {}
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
index c5fa47ec55f..93e6e13ff70 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h"
 #include <string>
+
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
index 8b34c3551d8..933092c7db7 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
@@ -20,6 +20,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 class ConvElementwiseAddActFusePass : public FusePassBase {
  public:
   virtual ~ConvElementwiseAddActFusePass() {}
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
index 38c0b773dde..e4396f227f7 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h"
 #include <string>
+
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
index 66a562cdd19..7198a7488e0 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
@@ -20,6 +20,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 class ConvElementwiseAddFusePass : public FusePassBase {
  public:
   virtual ~ConvElementwiseAddFusePass() {}
diff --git a/paddle/fluid/framework/ir/cudnn_placement_pass.h b/paddle/fluid/framework/ir/cudnn_placement_pass.h
index 99dd3a175d3..8d84c2bf707 100644
--- a/paddle/fluid/framework/ir/cudnn_placement_pass.h
+++ b/paddle/fluid/framework/ir/cudnn_placement_pass.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <unordered_set>
+
 #include "paddle/fluid/framework/ir/placement_pass_base.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc
index 4dfbd5e00c1..886b080c662 100644
--- a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc
+++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc
@@ -12,10 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <string>
-
 #include "paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h"
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include <string>
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h
index 938ada6453e..fea0498fdec 100644
--- a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h
+++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <vector>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 
@@ -21,6 +22,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 class DeleteQuantDequantOpPass : public FusePassBase {
  public:
   virtual ~DeleteQuantDequantOpPass() {}
diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
index 3f88a460d14..51861b402d5 100644
--- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h"
-#include <memory>
 #include <string>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_version_registry.h"
diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h
index 644eb1cf892..25049d7468b 100644
--- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h
@@ -17,10 +17,19 @@
 #include <memory>
 #include <string>
 #include <utility>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 
+namespace paddle {
+namespace framework {
+namespace ir {
+class Graph;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace ir {
diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
index 65cb4439727..a9cde13758b 100644
--- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
@@ -27,6 +27,8 @@ namespace ir {
 // Fusing of Embedding , FC and LSTM op
 
 // Just FC without bias
+class Graph;
+
 class EmbeddingFCLSTMFusePass : public FusePassBase {
  public:
   virtual ~EmbeddingFCLSTMFusePass() {}
diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc
index 5c18a0d6c7f..bedb9689641 100644
--- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h"
+
 #include <string>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h
index ac4d0b39ee2..12e4c44b84e 100644
--- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h
@@ -20,6 +20,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 class FCElementwiseLayerNormFusePass : public FusePassBase {
  public:
   virtual ~FCElementwiseLayerNormFusePass() {}
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc
index d60510a4074..0248aeedd0a 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/fc_fuse_pass.h"
-#include <memory>
+
 #include <string>
-#include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h
index ef6636d109a..f564bbb1518 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.h
@@ -26,6 +26,8 @@ namespace ir {
 /*
  * Fuse the MUL and ELEMENTWISE_ADD to a FCOp.
  */
+class Graph;
+
 class FCFusePass : public FusePassBase {
  public:
   virtual ~FCFusePass() {}
diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
index f5fea90ac2f..c4515bbc455 100644
--- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h"
 #include <string>
 #include <unordered_set>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
@@ -22,6 +23,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Node;
+
 static int BuildFusion(Graph* graph, const std::string& name_scope,
                        Scope* scope, bool with_fc_bias) {
   GraphPatternDetector gpd;
diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
index e11cdac7ea9..73f00504d34 100644
--- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -25,6 +26,8 @@ namespace ir {
 
 // The MulGRUFusePass and MulGRUFusePass will fuse to the same FusionGRU op.
 
+class Graph;
+
 class FCGRUFusePass : public FusePassBase {
  public:
   virtual ~FCGRUFusePass() {}
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
index a3c57e14e1a..2b451da7bfa 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
 #include <string>
 #include <unordered_set>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
@@ -22,6 +23,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Node;
+
 int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
                 bool with_fc_bias) {
   GraphPatternDetector gpd;
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
index 5dea7c91a86..d37f53b15f0 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
@@ -27,6 +27,8 @@ namespace ir {
 // The MulLstmFusePass and MulLstmFusePass will fuse to the same FusionLstm op.
 
 // Just FC without bias
+class Graph;
+
 class FCLstmFusePass : public FusePassBase {
  public:
   virtual ~FCLstmFusePass() {}
diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc
index 54c05046a2c..db3c711201d 100644
--- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc
@@ -15,12 +15,17 @@
 #include "paddle/fluid/framework/ir/fuse_bn_act_pass.h"
 #include <algorithm>
 #include <string>
-#include <unordered_set>
-#include <utility>
-#include <vector>
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cudnn_helper.h"
 #endif
diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.h b/paddle/fluid/framework/ir/fuse_bn_act_pass.h
index 427ff03a803..7e5f046ecaa 100644
--- a/paddle/fluid/framework/ir/fuse_bn_act_pass.h
+++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.h
@@ -18,6 +18,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -30,6 +31,9 @@ namespace ir {
 /*
  * Fuse the BatchNorm and activation.
  */
+class Graph;
+class Node;
+
 class FuseBatchNormActPass : public FusePassBase {
  public:
   virtual ~FuseBatchNormActPass() {}
diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
index dc73f1fda03..d9356b7bd72 100644
--- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
+++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
@@ -17,6 +17,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -29,6 +30,9 @@ namespace ir {
 /*
  * Fuse the ElewiseAdd and activation
  */
+class Graph;
+class Node;
+
 class FuseElewiseAddActPass : public FusePassBase {
  public:
   virtual ~FuseElewiseAddActPass() {}
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc
index c284c1f4587..0094b674c2a 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc
@@ -11,19 +11,22 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include <algorithm>
+#include <sys/types.h>
 #include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
 
+#include "glog/logging.h"
 #include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h"
-#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
+class Node;
+
 class FuseAdamOpPass : public FuseOptimizerOpPass {
  private:
   const std::string GetOpType() const { return "adam"; }
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc
index 43ec8bff5ed..f87d31cbc40 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc
@@ -12,18 +12,21 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <algorithm>
 #include <string>
-#include <unordered_map>
-#include <vector>
 
+#include "glog/logging.h"
 #include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h"
-#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
+class Node;
+
 class FuseMomentumOpPass : public FuseOptimizerOpPass {
  private:
   virtual const std::string GetOpType() const { return "momentum"; }
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h
index 0b5bf8a3a4b..5b7e1b7d384 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h
@@ -19,14 +19,25 @@
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/build_strategy.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph.h"
 
+namespace paddle {
+namespace framework {
+class BlockDesc;
+class VarDesc;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+class Node;
+
 constexpr char kGrad[] = "Grad";
 constexpr char kParam[] = "Param";
 constexpr char kLearningRate[] = "LearningRate";
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc
index 70d4d2b8652..720d252c9a6 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc
@@ -11,17 +11,21 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include <algorithm>
 #include <string>
-#include <unordered_map>
-#include <vector>
 
+#include "glog/logging.h"
 #include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h"
-#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/platform/enforce.h"
+
 namespace paddle {
 namespace framework {
 namespace ir {
 
+class Node;
+
 class FuseSgdOpPass : public FuseOptimizerOpPass {
  private:
   virtual const std::string GetOpType() const { return "sgd"; }
diff --git a/paddle/fluid/framework/ir/fuse_pass_base.cc b/paddle/fluid/framework/ir/fuse_pass_base.cc
index e6fb1302e27..f3db4f02b1c 100644
--- a/paddle/fluid/framework/ir/fuse_pass_base.cc
+++ b/paddle/fluid/framework/ir/fuse_pass_base.cc
@@ -15,10 +15,18 @@
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include <unordered_map>
 
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 void FusePassBase::Init(const std::string& repr, Graph* graph) const {
   repr_ = repr;
   graph_ = graph;
diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h
index 3a1022bbcbd..ce7635bb35c 100644
--- a/paddle/fluid/framework/ir/fuse_pass_base.h
+++ b/paddle/fluid/framework/ir/fuse_pass_base.h
@@ -15,14 +15,24 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/scope.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+class Node;
+
 static const char kParamScopeAttr[] = "__param_scope__";
 static const char kFuseStatisAttr[] = "__fuse_statis__";
 // When we use trt or other third_party lib, the parameters are managed by
diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
index d37c153dd2a..0b1dfaa6928 100644
--- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
+++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
@@ -15,6 +15,7 @@
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -27,6 +28,8 @@ namespace ir {
 /*
  * Fuse the relu and depthwise conv
  */
+class Graph;
+
 class FuseReluDepthwiseConvPass : public FusePassBase {
  public:
   virtual ~FuseReluDepthwiseConvPass() {}
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.h b/paddle/fluid/framework/ir/fusion_group/code_generator.h
index 21773f239b9..15d21cf6829 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator.h
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/ir/fusion_group/code_generator_helper.h"
 #include "paddle/fluid/framework/ir/fusion_group/subgraph.h"
 
@@ -27,6 +28,8 @@ namespace framework {
 namespace ir {
 namespace fusion_group {
 
+class SubGraph;
+
 class CodeGenerator {
  public:
   CodeGenerator();
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
index ebc89b14c26..2a7a0748cf0 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
@@ -12,18 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/ir/fusion_group/code_generator.h"
 #include <gtest/gtest.h>
 #include <cmath>
 #include <string>
 #include <vector>
+
+#include "paddle/fluid/framework/ir/fusion_group/code_generator.h"
 #include "paddle/fluid/framework/ir/fusion_group/operation.h"
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/platform/device_code.h"
 #include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/init.h"
+
+namespace paddle {
+namespace framework {
+class LoDTensor;
+}  // namespace framework
+}  // namespace paddle
 
 #ifdef PADDLE_WITH_CUDA
 
diff --git a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h
index 0861c2f7e96..96b38f65013 100644
--- a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h
+++ b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h
@@ -15,9 +15,18 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/node.h"
 
+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace ir {
diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h
index 3438783c180..5ca785846a5 100644
--- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h
+++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <unordered_set>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/fusion_group/subgraph.h"
 
@@ -23,6 +24,11 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+namespace fusion_group {
+class SubGraph;
+}  // namespace fusion_group
+
 class FusionGroupPass : public FusePassBase {
  protected:
   void ApplyImpl(Graph* graph) const override;
diff --git a/paddle/fluid/framework/ir/fusion_group/operation.h b/paddle/fluid/framework/ir/fusion_group/operation.h
index 74abbdaad0b..d99fe737504 100644
--- a/paddle/fluid/framework/ir/fusion_group/operation.h
+++ b/paddle/fluid/framework/ir/fusion_group/operation.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 137ab7a56d7..593ac214e56 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -25,6 +25,13 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/variant.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+class VarDesc;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h
index 074ad320fb1..0c43febca70 100644
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -28,6 +28,8 @@ namespace framework {
 namespace ir {
 
 // Compare nodes via node id.
+class Graph;
+
 struct NodeComp {
   bool operator()(ir::Node *const &node1, ir::Node *const &node2) const {
     return node1->id() < node2->id();
diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc
index d8973d5aeda..0a2dcfed000 100644
--- a/paddle/fluid/framework/ir/graph_helper_test.cc
+++ b/paddle/fluid/framework/ir/graph_helper_test.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/graph.h"
-#include <string>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/program_desc.h"
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 053c1fe832b..7116b8a2a6f 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -27,11 +27,21 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/inference/analysis/dot.h"
 
+namespace paddle {
+namespace framework {
+namespace ir {
+class Graph;
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace ir {
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
index 6c466fb21fb..5ac5a5d9839 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
@@ -12,14 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
 namespace paddle {
 namespace framework {
 namespace ir {
 
+class Node;
+
 void BuildGraph(Graph* g) {
   ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
   ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.h b/paddle/fluid/framework/ir/graph_to_program_pass.h
index 52c8f4e0fca..6b17c0076f6 100644
--- a/paddle/fluid/framework/ir/graph_to_program_pass.h
+++ b/paddle/fluid/framework/ir/graph_to_program_pass.h
@@ -20,6 +20,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 const char kGraphToProgramVarsToRemove[] =
     "__graph_to_program_vars_to_remove__";
 const char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__";
diff --git a/paddle/fluid/framework/ir/graph_to_program_pass_test.cc b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc
index 5ee6b8a5f1e..80d7839d700 100644
--- a/paddle/fluid/framework/ir/graph_to_program_pass_test.cc
+++ b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/graph_to_program_pass.h"
-
 #include <memory>
 #include <string>
 #include <unordered_set>
@@ -25,6 +24,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Node;
+
 void BuildNoCircleGraph(Graph* g) {
   OpDesc op1;
   op1.SetType("op1");
diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc
index 4b403c46260..3fa84554d99 100644
--- a/paddle/fluid/framework/ir/graph_traits.cc
+++ b/paddle/fluid/framework/ir/graph_traits.cc
@@ -13,10 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/graph_traits.h"
-
 #include <set>
-#include <utility>
-#include <vector>
 
 namespace paddle {
 namespace framework {
@@ -25,6 +22,8 @@ namespace ir {
 //
 // NodesDFSIterator
 //
+class Node;
+
 NodesDFSIterator::NodesDFSIterator(const std::vector<Node *> &source) {
   for (auto *x : source) stack_.push(x);
 }
diff --git a/paddle/fluid/framework/ir/graph_traits.h b/paddle/fluid/framework/ir/graph_traits.h
index bb4212bcd33..a54cc61a63f 100644
--- a/paddle/fluid/framework/ir/graph_traits.h
+++ b/paddle/fluid/framework/ir/graph_traits.h
@@ -26,6 +26,9 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+class Node;
+
 template <typename IteratorT>
 class iterator_range {
   IteratorT begin_, end_;
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h
index 7091aa6a95b..118c1bc6f3c 100644
--- a/paddle/fluid/framework/ir/graph_viz_pass.h
+++ b/paddle/fluid/framework/ir/graph_viz_pass.h
@@ -28,6 +28,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 const char kGraphvizMarkedNodeAttr[] = "__graphviz__marked_node__";
 
 class GraphVizPass : public Pass {
diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
index c8dfa02f469..08d09fce5de 100644
--- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
+++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
@@ -20,6 +20,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 void IdentityScaleOpCleanPass::ApplyImpl(ir::Graph* graph) const {
   FusePassBase::Init("identity_scale_op_clean", graph);
 
diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
index d66b411257e..7e3d4e19fa8 100644
--- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
+++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
@@ -20,6 +20,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 class IdentityScaleOpCleanPass : public FusePassBase {
  protected:
   void ApplyImpl(ir::Graph* graph) const override;
diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc
index bf6fe999c1e..9c1640efcd8 100644
--- a/paddle/fluid/framework/ir/is_test_pass.cc
+++ b/paddle/fluid/framework/ir/is_test_pass.cc
@@ -20,6 +20,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 void IsTestPass::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Sets is_test attrbiute to true and if it is missing, inserts it "
              "for activations and pooling.";
diff --git a/paddle/fluid/framework/ir/is_test_pass.h b/paddle/fluid/framework/ir/is_test_pass.h
index 80cedbf9f85..abf48480d71 100644
--- a/paddle/fluid/framework/ir/is_test_pass.h
+++ b/paddle/fluid/framework/ir/is_test_pass.h
@@ -20,6 +20,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 class IsTestPass : public Pass {
  protected:
   void ApplyImpl(ir::Graph* graph) const override;
diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h
index f38f48fcd92..26ec61fd36e 100644
--- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h
+++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h
@@ -27,6 +27,7 @@ namespace framework {
 namespace ir {
 
 class Node;
+class Graph;
 
 /*
 * Remove the sum op of all gradients of the backward op.
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc
index b1afa47910f..b12b84d4a49 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc
@@ -12,24 +12,23 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <algorithm>
 #include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
+
+#include "glog/logging.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/details/var_handle.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 using OpHandleBase = details::OpHandleBase;
 using ComputationOpHandle = details::ComputationOpHandle;
 using VarHandle = details::VarHandle;
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc
index ce7f27d2755..0cdde5c757a 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <string>
+
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
@@ -22,13 +23,15 @@
 #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h"
-#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 class BufferSharedInplaceOpPass : public MemoryReusePass {
  protected:
   std::string ReuseType() const override { return "inplace"; }
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc
index 3e3b9864a7b..72e29dfe156 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc
@@ -16,10 +16,15 @@
 
 #include <functional>
 #include <map>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
+
+namespace paddle {
+namespace framework {
+namespace details {
+class ComputationOpHandle;
+class ShareTensorBufferOpHandle;
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h
index 1c0c6ae6020..4a77d116f1e 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h
@@ -28,6 +28,17 @@
 #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/ir/pass.h"
 
+namespace paddle {
+namespace framework {
+class VarDesc;
+namespace details {
+class ComputationOpHandle;
+class ShareTensorBufferOpHandle;
+struct VarHandle;
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -71,6 +82,8 @@ namespace ir {
  * a pass to clean all ShareTensorBufferOpHandles and move sharing to
  * ComputationOpHandle::Run() in the future.
  */
+class Graph;
+
 class MemoryReusePass : public Pass {
  protected:
   void ApplyImpl(Graph *graph) const final;
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc
index 11c2508afb5..7de62d6e482 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc
@@ -13,8 +13,14 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h"
-#include <queue>
-#include <utility>
+
+namespace paddle {
+namespace framework {
+namespace details {
+class OpHandleBase;
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h
index 5fb2caedba8..d6f286afc55 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h
@@ -18,8 +18,17 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/details/op_handle_base.h"
 
+namespace paddle {
+namespace framework {
+namespace details {
+class OpHandleBase;
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace ir {
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc
index ed87f73adf1..e85be0272de 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc
@@ -13,8 +13,6 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
-#include "paddle/fluid/framework/details/var_handle.h"
-#include "paddle/fluid/framework/var_desc.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h
index 0e8f4e78d22..d00e4f53022 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h
@@ -30,6 +30,9 @@ namespace paddle {
 namespace framework {
 
 class VarDesc;
+namespace details {
+struct VarHandle;
+}  // namespace details
 
 namespace ir {
 
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
index 45ff275d530..a5beec87c39 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
@@ -13,14 +13,21 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h"
-#include <string>
 #include <vector>
 #include "paddle/fluid/platform/enforce.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 void ConvActivationFusePass::ApplyImpl(ir::Graph* graph) const {
   PADDLE_ENFORCE_NOT_NULL(
       graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h
index ac15fc04512..be6b1e07c02 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -26,6 +27,8 @@ namespace ir {
 /*
  * Fuse Conv and Activation base class.
  */
+class Graph;
+
 class ConvActivationFusePass : public FusePassBase {
  public:
   virtual ~ConvActivationFusePass() {}
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
index f7a8e3e3f6c..63524294b68 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
@@ -14,7 +14,6 @@
 
 #include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h"
 #include <functional>
-#include <string>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_version_registry.h"
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
index 833fbc748eb..9a83310ebfb 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
@@ -13,16 +13,20 @@
 // limitations under the License.
 #pragma once
 #include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/pass.h"
+
 namespace paddle {
 namespace framework {
 namespace ir {
 /*
 * Fuse the Conv and Elementwise_add to a ConvBiasOp.
 */
+class Graph;
+
 class ConvBiasFusePass : public FusePassBase {
  public:
   virtual ~ConvBiasFusePass() {}
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
index af64cb22054..5fadd9607e9 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
@@ -20,6 +20,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 void ConvConcatReLUFusePass::FindConcatWithConvs(
     ir::Graph* graph,
     std::unordered_map<const Node*, int>* concat_with_convs_counter) const {
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h
index 91ff0760f04..f1faa84f3d5 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h
@@ -16,6 +16,7 @@
 
 #include <string>
 #include <unordered_map>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -30,6 +31,8 @@ namespace ir {
  * to a:
  * (multi ConvReLU) -> Concat -> next_op.
  */
+class Graph;
+
 class ConvConcatReLUFusePass : public FusePassBase {
  public:
   virtual ~ConvConcatReLUFusePass() {}
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h
index b95aec34d30..2ba4c80678f 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h
@@ -28,6 +28,13 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+class GraphPatternDetector;
+class Node;
+namespace patterns {
+struct Conv;
+}  // namespace patterns
+
 using graph_ptr = ir::Graph*;
 using GraphWithStats = std::pair<ir::Graph*, int>;
 
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
index aa0979b4be6..0254b5e7573 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -13,12 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"
-#include <limits>
+
 #include <sstream>
 #include <utility>
 #include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/platform/errors.h"
+
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/string/pretty_log.h"
 
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
index 21219e7dca8..bd87b31b781 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
@@ -18,6 +18,7 @@
 #include <string>
 #include <unordered_map>
 #include <utility>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -31,6 +32,9 @@ namespace ir {
  * bool denotes whether quantization of the variable should be done to unsigned
  * type.
  */
+class Graph;
+class Node;
+
 using VarQuantScale =
     std::unordered_map<std::string, std::pair<bool, LoDTensor>>;
 
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
index bc268a83478..2146d833ddf 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h"
-#include <string>
 #include <unordered_set>
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Marks operators which are to be quantized.";
   const auto& excluded_ids_list =
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h
index f3229e59d6f..474fa63f60e 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -26,6 +27,8 @@ namespace ir {
 /*
  * Specifies which operators should be quantized.
  */
+class Graph;
+
 class CPUQuantizePlacementPass : public FusePassBase {
  protected:
   void ApplyImpl(ir::Graph* graph) const override;
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
index bc24c10d9d0..54ab244a99b 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
@@ -14,9 +14,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h"
-#include <algorithm>
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/string/pretty_log.h"
 
@@ -24,6 +25,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 using string::PrettyLogDetail;
 
 void CPUQuantizeSquashPass::FindNodesToKeep(
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h
index 98a518e4e53..d1465f9da5c 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h
@@ -17,6 +17,7 @@
 #include <memory>
 #include <string>
 #include <unordered_map>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -29,6 +30,8 @@ namespace ir {
 /*
  * Squash dequantize->quantize pair pattern into requantize op
  */
+class Graph;
+
 class CPUQuantizeSquashPass : public FusePassBase {
  public:
   virtual ~CPUQuantizeSquashPass() {}
diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
index df5ba3314e6..b2c0afdc754 100644
--- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h"
+
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
@@ -20,6 +21,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 #define GET_NODE(id, pattern)                                     \
   PADDLE_ENFORCE_NE(subgraph.count(pattern.RetrieveNode(#id)), 0, \
                     platform::errors::InvalidArgument(            \
diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h
index ca314afde57..0f4ecc71ad7 100644
--- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h
@@ -20,6 +20,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 class DepthwiseConvMKLDNNPass : public FusePassBase {
  public:
   virtual ~DepthwiseConvMKLDNNPass() {}
diff --git a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc
index 95afc548376..6efa9f6b749 100644
--- a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc
@@ -13,18 +13,21 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h"
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/lod_tensor.h"
+
 #include "paddle/fluid/platform/enforce.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 void FCMKLDNNPass::ApplyImpl(ir::Graph* graph) const {
   PADDLE_ENFORCE_NOT_NULL(graph,
                           platform::errors::InvalidArgument(
diff --git a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h
index 97c6b242989..df02250394a 100644
--- a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 #include <memory>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -25,6 +26,8 @@ namespace ir {
 /*
  * Transpose weights of FC to comply with MKL-DNN interface
  */
+class Graph;
+
 class FCMKLDNNPass : public FusePassBase {
  public:
   virtual ~FCMKLDNNPass() {}
diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h
index 77e30b35346..ef469bac40c 100644
--- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -22,6 +23,8 @@
 namespace paddle {
 namespace framework {
 namespace ir {
+class Graph;
+
 class MatmulTransposeReshapeMKLDNNPass : public FusePassBase {
  public:
   virtual ~MatmulTransposeReshapeMKLDNNPass() {}
@@ -30,6 +33,6 @@ class MatmulTransposeReshapeMKLDNNPass : public FusePassBase {
   void ApplyImpl(Graph* graph) const override;
   const std::string name_scope_{"matmul_transpose_reshape_fuse"};
 };
-}
+}  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
index de699430189..ca56a8900ca 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <unordered_set>
+
 #include "paddle/fluid/framework/ir/placement_pass_base.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h
index eab9f095623..7a53b3c4984 100644
--- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -25,6 +26,8 @@ namespace ir {
 /*
  * Fuse Reshape->Transpose->MatMul when MatMul uses mkldnn.
  */
+class Graph;
+
 class ReshapeTransposeMatmulMkldnnFusePass : public FusePassBase {
  public:
   virtual ~ReshapeTransposeMatmulMkldnnFusePass() {}
diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc
index 6c87e437caa..790821e3fa4 100644
--- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h"
+
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/string/pretty_log.h"
 
@@ -22,6 +24,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 using string::PrettyLogDetail;
 
 void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const {
diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h
index fe97b9681ce..32ff78d9a73 100644
--- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h
@@ -20,6 +20,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 class ScaleMatmulFusePass : public FusePassBase {
  public:
   virtual ~ScaleMatmulFusePass() {}
diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.h b/paddle/fluid/framework/ir/multi_batch_merge_pass.h
index a89616683d9..ae2e68c6003 100644
--- a/paddle/fluid/framework/ir/multi_batch_merge_pass.h
+++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.h
@@ -31,6 +31,8 @@ namespace ir {
 // sync training, we can simulate even large batch size as if we have more
 // GPUs.
 
+class Graph;
+
 class BatchMergePass : public Pass {
  public:
   virtual ~BatchMergePass() {}
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
index bb6c8079074..bb3586ba804 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
@@ -20,17 +20,32 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/build_strategy.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph.h"
 
+namespace paddle {
+namespace framework {
+namespace details {
+class OpHandleBase;
+struct VarHandle;
+}  // namespace details
+namespace ir {
+class Graph;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace platform {
 class NCCLContextMap;
+class NCCLCommunicator;
 }
 
 namespace framework {
 class Scope;
+
 namespace ir {
 
 constexpr char kLossVarName[] = "loss_var_name";
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc
index bcbd1e066cc..78e90f82bfb 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc
@@ -12,13 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/framework/op_proto_maker.h"
+
+namespace paddle {
+namespace framework {
+class OpDesc;
+}  // namespace framework
+}  // namespace paddle
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h
index 0038790cae2..d37b21f0584 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h
@@ -17,10 +17,18 @@
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/scope.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 void InitReaderQueueDeviceCount(Graph *graph, const Scope &scope,
                                 size_t dev_cnt);
 
diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
index 9d2b4ebaf8c..d1fbc8396ba 100644
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
@@ -13,11 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h"
-#include <memory>
+
 #include <string>
 #include <unordered_set>
 #include <vector>
-#include "paddle/fluid/framework/ddim.h"
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/errors.h"
diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h
index 0afa00fc62a..f5327dc7108 100644
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h
@@ -16,10 +16,19 @@
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 
+namespace paddle {
+namespace framework {
+namespace ir {
+class Graph;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace ir {
diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc
index 45d81b93739..7143c9a7a3e 100644
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/node.h"
-#include "paddle/fluid/framework/op_info.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index 87e7e64acb7..d0db3bd36e1 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -19,10 +19,18 @@ limitations under the License. */
 #include <typeindex>
 #include <typeinfo>
 #include <vector>
+
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/platform/macros.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+class VarDesc;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 namespace ir {
diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc
index a5ca13f1ce2..0e5f5867f47 100644
--- a/paddle/fluid/framework/ir/pass.cc
+++ b/paddle/fluid/framework/ir/pass.cc
@@ -14,11 +14,15 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/ir/pass.h"
 
-#include <memory>
-#include <utility>
-
 #include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+class Graph;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
index 0f5ef551f04..668dc74eab2 100644
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -29,6 +30,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
+class Graph;
 template <typename PassType>
 struct PassRegistrar;
 
diff --git a/paddle/fluid/framework/ir/pass_builder.cc b/paddle/fluid/framework/ir/pass_builder.cc
index 6457bd230c5..4e99271a2ec 100644
--- a/paddle/fluid/framework/ir/pass_builder.cc
+++ b/paddle/fluid/framework/ir/pass_builder.cc
@@ -20,6 +20,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Pass;
+
 std::shared_ptr<Pass> PassBuilder::AppendPass(const std::string& pass_type) {
   VLOG(1) << "Append " << pass_type;
   auto pass = ir::PassRegistry::Instance().Get(pass_type);
diff --git a/paddle/fluid/framework/ir/pass_builder.h b/paddle/fluid/framework/ir/pass_builder.h
index 733d3a3ad1a..0e68767db3f 100644
--- a/paddle/fluid/framework/ir/pass_builder.h
+++ b/paddle/fluid/framework/ir/pass_builder.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/ir/pass.h"
@@ -22,6 +23,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Pass;
+
 class PassBuilder {
  public:
   PassBuilder() {}
diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc
index 0c5286b3f77..65b9c427869 100644
--- a/paddle/fluid/framework/ir/pass_test.cc
+++ b/paddle/fluid/framework/ir/pass_test.cc
@@ -13,15 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/pass.h"
-#include <memory>
+
 #include <string>
-#include <utility>
+
 #include "gtest/gtest.h"
-#include "paddle/fluid/framework/ir/graph.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
+class Graph;
+class Node;
+
 void BuildCircleGraph(Graph* g) {
   ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
   ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
diff --git a/paddle/fluid/framework/ir/placement_pass_base.h b/paddle/fluid/framework/ir/placement_pass_base.h
index 5cdd7963e54..ef1a920db3f 100644
--- a/paddle/fluid/framework/ir/placement_pass_base.h
+++ b/paddle/fluid/framework/ir/placement_pass_base.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <unordered_set>
+
 #include "paddle/fluid/framework/ir/pass.h"
 
 namespace paddle {
@@ -25,6 +26,8 @@ namespace ir {
 /*
  * Specifies which operators should use cuDNN.
  */
+class Graph;
+
 class PlacementPassBase : public Pass {
  protected:
   void ApplyImpl(ir::Graph* graph) const override;
diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
index 9f6032ffa5b..af4a2f40605 100644
--- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
@@ -13,10 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h"
-#include <algorithm>  // for max
+#include <algorithm>
 #include <string>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h
index ae777bccebe..0be217cc748 100644
--- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h
+++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -26,6 +27,8 @@ namespace ir {
 /**
  * Fuse Repeated FC Relu
  */
+class Graph;
+
 class RepeatedFCReluFusePass : public FusePassBase {
  public:
   virtual ~RepeatedFCReluFusePass() {}
diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.h b/paddle/fluid/framework/ir/runtime_context_cache_pass.h
index e4783166e0c..741adcce8d7 100644
--- a/paddle/fluid/framework/ir/runtime_context_cache_pass.h
+++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.h
@@ -15,12 +15,15 @@ limitations under the License. */
 #pragma once
 
 #include <memory>
+
 #include "paddle/fluid/framework/ir/pass.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 class RuntimeContextCachePass : public Pass {
  protected:
   void ApplyImpl(ir::Graph* graph) const override;
diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
index 19ec2d818a3..7daa9b5eff7 100644
--- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
@@ -16,10 +16,7 @@
 #include <set>
 #include <string>
 #include <unordered_set>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
-#include "paddle/fluid/framework/lod_tensor.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
index d68840a5547..a7041153645 100644
--- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
@@ -22,6 +22,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 class SeqConcatFcFusePass : public FusePassBase {
  public:
   virtual ~SeqConcatFcFusePass() {}
diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
index 75ab04f1b91..d9a1348e05a 100644
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h"
 #include <string>
 #include <unordered_set>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
@@ -22,6 +23,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Node;
+
 int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) {
   GraphPatternDetector gpd;
   auto* pattern = gpd.mutable_pattern();
diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
index fde9b586c85..6f623625f51 100644
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -23,6 +24,8 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 class SeqConvEltAddReluFusePass : public FusePassBase {
  public:
   virtual ~SeqConvEltAddReluFusePass() {}
diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
index 1c220ee4d57..b6badf745c6 100644
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
@@ -16,7 +16,14 @@
 #include <string>
 #include <unordered_set>
 #include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
 
 #define MAX_CONCAT_INPUTS 200
 
diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
index 40a9edc5e64..482fd5cb580 100644
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -37,6 +38,8 @@ namespace ir {
  *   FusionSeqPoolConcat
  *           |
  */
+class Graph;
+
 class SeqPoolConcatFusePass : public FusePassBase {
  public:
   virtual ~SeqPoolConcatFusePass() {}
diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc
index 8261bfc1534..d639d410466 100644
--- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc
@@ -13,16 +13,17 @@
  * limitations under the License. */
 
 #include "paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h"
-#include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+class Node;
+
 namespace {
 static PDNode* BuildCVMConcatPattern(PDPattern* pattern) {
   auto cvm_behind_x = [](Node* x) -> bool {
diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h
index 88a41983c6b..b0a3573fb59 100644
--- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -39,6 +40,8 @@ namespace ir {
  * FusionSeqPoolCVMConcat
  *           |
  */
+class Graph;
+
 class SeqPoolCVMConcatFusePass : public FusePassBase {
  public:
   virtual ~SeqPoolCVMConcatFusePass() {}
diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
index 8bdf3940928..92e995579fa 100644
--- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
+++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
@@ -12,9 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <string>
-
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h
index 008f8013efd..dc375988cdd 100644
--- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h
+++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h
@@ -14,6 +14,7 @@
 
 #pragma once
 #include <vector>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 
diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc
index 2e5c18d3352..5cc6b6171ac 100644
--- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc
+++ b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h"
 
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/pass_tester_helper.h"
 
 namespace paddle {
 namespace framework {
@@ -30,6 +29,8 @@ namespace ir {
  * - remove dropout_op (upscale_in_train) or
  *   replace dropout_op with scale_op (downgrade_in_infer) when is_test is true
  */
+class Graph;
+
 void SimplifyWithBasicOpsPass::ApplyImpl(Graph* graph) const {
   VLOG(3) << "Simplify the Graph with basic ops.";
   std::unordered_set<const Node*> del_node_set;
diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h
index f5185622468..6a245c444a7 100644
--- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h
+++ b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h
@@ -16,12 +16,16 @@ limitations under the License. */
 
 #include <string>
 #include <unordered_set>
+
 #include "paddle/fluid/framework/ir/pass.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+class Node;
+
 class SimplifyWithBasicOpsPass : public Pass {
  protected:
   void ApplyImpl(Graph* graph) const override;
diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
index 2e3cd16d5ce..e5f348dfeb1 100644
--- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
@@ -13,9 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h"
+
 #include <string>
 #include <unordered_set>
-#include <vector>
+
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h
index 2de8d376221..3a3e5005239 100644
--- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h
@@ -29,6 +29,8 @@ namespace ir {
 //                 |                                   |
 //             other_op3
 //                 |
+class Graph;
+
 class SkipLayerNormFusePass : public FusePassBase {
  public:
   virtual ~SkipLayerNormFusePass() {}
diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
index 56b7ec9b843..90def957df4 100644
--- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
+++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -26,6 +27,8 @@ namespace ir {
 /**
  * Fuse ( (A * B).^2 - (A.^2 * B.^2) ) .* scalar
  */
+class Graph;
+
 class SquaredMatSubFusePass : public FusePassBase {
  public:
   virtual ~SquaredMatSubFusePass() {}
diff --git a/paddle/fluid/framework/ir/subgraph_detector.cc b/paddle/fluid/framework/ir/subgraph_detector.cc
index 7979953d7be..6ebe900e26b 100644
--- a/paddle/fluid/framework/ir/subgraph_detector.cc
+++ b/paddle/fluid/framework/ir/subgraph_detector.cc
@@ -13,18 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/subgraph_detector.h"
-#include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/node.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+class Node;
+
 std::pair<std::vector<Node *>, std::vector<Node *>>
 ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
   std::unordered_set<Node *> nodes(graph.begin(), graph.end());
diff --git a/paddle/fluid/framework/ir/subgraph_detector.h b/paddle/fluid/framework/ir/subgraph_detector.h
index 3d279e27e6a..6bd73c758b3 100644
--- a/paddle/fluid/framework/ir/subgraph_detector.h
+++ b/paddle/fluid/framework/ir/subgraph_detector.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_traits.h"
 #include "paddle/fluid/framework/ir/node.h"
@@ -24,6 +25,9 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+class Node;
+
 const char kIsFunctionNode[] = "__is_function_node__";
 const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__";
 const char kSubgraphSplitterMarkerAttrName[] =
diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc
index 222c73761b4..3fa008c300c 100644
--- a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc
+++ b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc
@@ -12,15 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <memory>
-#include <string>
-#include <utility>
 #include "paddle/fluid/framework/ir/pass.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
+class Graph;
+
 class SyncBatchNormPass : public Pass {
  protected:
   void ApplyImpl(ir::Graph *graph) const override {
diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
index 405cefa99eb..2db6d0230e3 100644
--- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <memory>
 #include <string>
 #include <unordered_set>
 #include <vector>
diff --git a/paddle/fluid/framework/lod_rank_table.h b/paddle/fluid/framework/lod_rank_table.h
index 8c6e8b0c66e..9a7c1285e30 100644
--- a/paddle/fluid/framework/lod_rank_table.h
+++ b/paddle/fluid/framework/lod_rank_table.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <iosfwd>
 #include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index 40615d772e5..a044812dd31 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -12,19 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/lod_tensor.h"
 #include <stdint.h>
-#include <string.h>
 #include <algorithm>
-#include <iterator>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/framework/version.h"
 
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/memory/memory.h"
+namespace paddle {
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
index da97efb6168..e09a628f491 100644
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -24,6 +24,7 @@ limitations under the License. */
 #endif
 
 #include <glog/logging.h>
+
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -31,6 +32,15 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 
+namespace paddle {
+namespace framework {
+class LoDTensor;
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index c93c3f2673b..e3223e67fc9 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -14,9 +14,6 @@
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include <algorithm>
-#include <memory>
-#include <vector>
 
 #include "paddle/fluid/framework/lod_tensor.h"
 
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index 280996d34dd..3a6e80f718d 100644
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -20,14 +20,14 @@ limitations under the License. */
 #include <mutex>  // NOLINT
 #include <utility>
 #include <vector>
+
+#include "glog/logging.h"
 #include "paddle/fluid/framework/details/cow_ptr.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/memory/memcpy.h"
 
-#include "glog/logging.h"
-
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/mixed_vector_test.cc b/paddle/fluid/framework/mixed_vector_test.cc
index 0599c8d3846..a40a3ff33fe 100644
--- a/paddle/fluid/framework/mixed_vector_test.cc
+++ b/paddle/fluid/framework/mixed_vector_test.cc
@@ -12,11 +12,9 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include <memory>
-
+#include "paddle/fluid/framework/mixed_vector.h"
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-#include "paddle/fluid/framework/mixed_vector.h"
 
 template <typename T>
 using vec = paddle::framework::Vector<T>;
diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h
index 81402a650a3..75677ef5243 100644
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@@ -14,8 +14,10 @@
 
 #pragma once
 
+#include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
@@ -28,6 +30,10 @@ namespace framework {
  * Simple, intuitive and effective. Only single thread is supported, and
  * currently designed for inference.
  */
+class LoDTensor;
+class ProgramDesc;
+class Scope;
+
 class NaiveExecutor {
  public:
   explicit NaiveExecutor(const platform::Place& place) : place_(place) {}
diff --git a/paddle/fluid/framework/no_need_buffer_vars_inference.h b/paddle/fluid/framework/no_need_buffer_vars_inference.h
index 5d30f34090e..21ba0381fe6 100644
--- a/paddle/fluid/framework/no_need_buffer_vars_inference.h
+++ b/paddle/fluid/framework/no_need_buffer_vars_inference.h
@@ -18,6 +18,7 @@
 #include <string>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/op_call_stack.h b/paddle/fluid/framework/op_call_stack.h
index d48cf27285a..f633538e700 100644
--- a/paddle/fluid/framework/op_call_stack.h
+++ b/paddle/fluid/framework/op_call_stack.h
@@ -15,9 +15,16 @@ limitations under the License. */
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/platform/enforce.h"
 
+namespace paddle {
+namespace platform {
+struct EnforceNotMet;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/op_compatible_info.h b/paddle/fluid/framework/op_compatible_info.h
index 08b5734b5bf..01fbdef99cb 100644
--- a/paddle/fluid/framework/op_compatible_info.h
+++ b/paddle/fluid/framework/op_compatible_info.h
@@ -14,6 +14,7 @@
 
 #include <map>
 #include <string>
+
 #include "paddle/fluid/framework/program_desc.h"
 
 #pragma once
@@ -21,6 +22,10 @@
 namespace paddle {
 namespace framework {
 
+namespace proto {
+class OpCompatibleMap;
+}  // namespace proto
+
 enum class OpCompatibleType {
   compatible = 0,       //   support previous version
   DEFIN_NOT = 1,        //   definitely can't support previous version
diff --git a/paddle/fluid/framework/op_compatible_info_test.cc b/paddle/fluid/framework/op_compatible_info_test.cc
index 43959c8b2ab..98f3f5071ad 100644
--- a/paddle/fluid/framework/op_compatible_info_test.cc
+++ b/paddle/fluid/framework/op_compatible_info_test.cc
@@ -13,14 +13,17 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/op_compatible_info.h"
-#include <iostream>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/platform/macros.h"
 
 namespace paddle {
 namespace framework {
 
+namespace proto {
+class OpCompatibleMap;
+}  // namespace proto
+
 TEST(test_op_compatible_info, test_op_compatible) {
   auto comp_map = OpCompatibleMap();
   comp_map.InitOpCompatibleMap();
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index e15f0012fdc..95c33bca6c7 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/framework/var_desc.h"
@@ -27,6 +28,7 @@ namespace framework {
 
 class BlockDesc;
 class ProgramDesc;
+
 class OpDesc {
  public:
   OpDesc() {}
diff --git a/paddle/fluid/framework/op_info.cc b/paddle/fluid/framework/op_info.cc
index c815e194d43..820a83586b3 100644
--- a/paddle/fluid/framework/op_info.cc
+++ b/paddle/fluid/framework/op_info.cc
@@ -14,8 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_info.h"
 #include <set>
-#include <string>
-#include <vector>
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h
index 89b49997579..af657232e91 100644
--- a/paddle/fluid/framework/op_info.h
+++ b/paddle/fluid/framework/op_info.h
@@ -20,13 +20,18 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/framework/attribute.h"
+#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
 #include "paddle/fluid/framework/type_defs.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
 
 namespace paddle {
 namespace framework {
 
+class InferShapeContext;
+class OpAttrChecker;
+
 class InferShapeBase {
  public:
   virtual ~InferShapeBase() = default;
diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h
index 9edc1a3e150..f4e60bb9b78 100644
--- a/paddle/fluid/framework/op_kernel_type.h
+++ b/paddle/fluid/framework/op_kernel_type.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/library_type.h"
diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc
index 40db85400d2..2979750fba7 100644
--- a/paddle/fluid/framework/op_kernel_type_test.cc
+++ b/paddle/fluid/framework/op_kernel_type_test.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_kernel_type.h"
+
 #include <gtest/gtest.h>
-#include <iostream>
 
 TEST(OpKernelType, ToString) {
   using OpKernelType = paddle::framework::OpKernelType;
diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc
index a8030d377fd..56f940e3997 100644
--- a/paddle/fluid/framework/op_proto_maker_test.cc
+++ b/paddle/fluid/framework/op_proto_maker_test.cc
@@ -16,6 +16,12 @@ limitations under the License. */
 
 #include "gtest/gtest.h"
 
+namespace paddle {
+namespace platform {
+struct EnforceNotMet;
+}  // namespace platform
+}  // namespace paddle
+
 class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
  public:
   void Make() {
diff --git a/paddle/fluid/framework/op_registry.cc b/paddle/fluid/framework/op_registry.cc
index 81cfaf92e39..72dd6fa6bbd 100644
--- a/paddle/fluid/framework/op_registry.cc
+++ b/paddle/fluid/framework/op_registry.cc
@@ -14,8 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 
-#include <vector>
-
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index 6408fadf90a..77383386fa1 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -35,6 +35,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/shape_inference.h"
 
+namespace paddle {
+namespace framework {
+class ExecutionContext;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc
index 239dbc43578..d6b18751cef 100644
--- a/paddle/fluid/framework/op_version_registry_test.cc
+++ b/paddle/fluid/framework/op_version_registry_test.cc
@@ -12,7 +12,6 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include <glog/logging.h>
 #include <gtest/gtest.h>
 
 #include "paddle/fluid/framework/op_version_registry.h"
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index bd52d7ffef5..d493f350e69 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -39,6 +39,15 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/variant.h"
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpInfo;
+class Scope;
+class Variable;
+}  // namespace framework
+}  // namespace paddle
+
 DECLARE_int32(inner_op_parallelism);
 
 namespace paddle {
@@ -105,8 +114,8 @@ inline std::string GradOriginalVarName(const std::string& grad_var_name) {
 const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var);
 Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var);
 
-class OperatorBase;
 class ExecutionContext;
+class OperatorBase;
 
 class RuntimeContext {
  public:
diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h
index d5e9c755f12..5cafc9111da 100644
--- a/paddle/fluid/framework/program_desc.h
+++ b/paddle/fluid/framework/program_desc.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/proto_desc.h"
diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc
index 48bde2785e6..0ba1099b032 100644
--- a/paddle/fluid/framework/program_desc_test.cc
+++ b/paddle/fluid/framework/program_desc_test.cc
@@ -13,11 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/program_desc.h"
+
 #include "gtest/gtest.h"
-#include "paddle/fluid/framework/block_desc.h"
 
 namespace paddle {
 namespace framework {
+class VarDesc;
+
 TEST(ProgramDesc, copy_ctor) {
   ProgramDesc program;
   auto* global_block = program.MutableBlock(0);
diff --git a/paddle/fluid/framework/prune.h b/paddle/fluid/framework/prune.h
index 63e8aaad735..57f282536bf 100644
--- a/paddle/fluid/framework/prune.h
+++ b/paddle/fluid/framework/prune.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <set>
 #include <string>
 #include <tuple>
+
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc
index 12fa0c61f81..618eaba3c5b 100644
--- a/paddle/fluid/framework/prune_test.cc
+++ b/paddle/fluid/framework/prune_test.cc
@@ -15,13 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/prune.h"
 
 #include <gtest/gtest.h>
-#include <set>
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/operator.h"
-
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc
index 9f4c817db7d..c399c5d02eb 100644
--- a/paddle/fluid/framework/pull_dense_worker.cc
+++ b/paddle/fluid/framework/pull_dense_worker.cc
@@ -12,12 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <time.h>
+
 #include "paddle/fluid/framework/device_worker.h"
-#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
 
 namespace paddle {
 namespace framework {
 
+class LoDTensor;
+class Scope;
+class Variable;
+
 std::shared_ptr<PullDenseWorker> PullDenseWorker::s_instance_ = NULL;
 std::mutex PullDenseWorker::mutex_for_version_;
 std::map<uint64_t, uint64_t> PullDenseWorker::last_versions_;
diff --git a/paddle/fluid/framework/rw_lock_test.cc b/paddle/fluid/framework/rw_lock_test.cc
index 16f9cbb6522..601b10787be 100644
--- a/paddle/fluid/framework/rw_lock_test.cc
+++ b/paddle/fluid/framework/rw_lock_test.cc
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/rw_lock.h"
 #include <gtest/gtest.h>
-#include <chrono>  // NOLINT
 #include <thread>  // NOLINT
 #include <vector>
 
diff --git a/paddle/fluid/framework/save_load_util.h b/paddle/fluid/framework/save_load_util.h
index 6b50c93ddbd..0b96e002e30 100644
--- a/paddle/fluid/framework/save_load_util.h
+++ b/paddle/fluid/framework/save_load_util.h
@@ -28,6 +28,8 @@
 namespace paddle {
 namespace framework {
 
+class Scope;
+
 bool SaveStaticNameListToDisk(
     const std::string& file_name,
     const std::vector<std::string>& vec_tensor_name_list, const Scope& scope);
diff --git a/paddle/fluid/framework/save_load_util_test.cc b/paddle/fluid/framework/save_load_util_test.cc
index 4a54e2d4668..10a34d7ce91 100644
--- a/paddle/fluid/framework/save_load_util_test.cc
+++ b/paddle/fluid/framework/save_load_util_test.cc
@@ -13,12 +13,9 @@
 // limitations under the License.
 #include <stdlib.h>
 #include <time.h>
-#include <iostream>
-#include <memory>
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/save_load_util.h"
-#include "paddle/fluid/platform/macros.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index db7010ecceb..922e9a9b272 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -30,6 +30,12 @@ extern "C" {
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/macros.h"
 
+namespace paddle {
+namespace framework {
+class Variable;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/scope_pool.cc b/paddle/fluid/framework/scope_pool.cc
index 4bb077a2c52..cf0b3ebcddd 100644
--- a/paddle/fluid/framework/scope_pool.cc
+++ b/paddle/fluid/framework/scope_pool.cc
@@ -11,10 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include <memory>
-
 #include "paddle/fluid/framework/scope_pool.h"
-#include "paddle/fluid/framework/threadpool.h"
+#include <memory>
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/scope_pool.h b/paddle/fluid/framework/scope_pool.h
index a8b468699ab..19faa9aa6a4 100644
--- a/paddle/fluid/framework/scope_pool.h
+++ b/paddle/fluid/framework/scope_pool.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <memory>
 #include <mutex>  // NOLINT
 #include <unordered_set>
 #include "paddle/fluid/framework/scope.h"
@@ -21,6 +22,8 @@
 namespace paddle {
 namespace framework {
 
+class Scope;
+
 class ScopePool {
  public:
   static ScopePool &Instance();  // NOLINT
diff --git a/paddle/fluid/framework/scope_test.cc b/paddle/fluid/framework/scope_test.cc
index 26817fc558d..a61e68279a2 100644
--- a/paddle/fluid/framework/scope_test.cc
+++ b/paddle/fluid/framework/scope_test.cc
@@ -13,9 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/scope.h"
-#include "glog/logging.h"
+
 #include "gtest/gtest.h"
 
+namespace paddle {
+namespace framework {
+class Variable;
+}  // namespace framework
+}  // namespace paddle
+
 using paddle::framework::Scope;
 using paddle::framework::Variable;
 
diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc
index 1f402ea9dd3..4c30c40ad58 100644
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
@@ -14,6 +14,12 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/selected_rows.h"
 
+namespace paddle {
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h
index 285af1d5530..48353b43f56 100644
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
@@ -26,9 +26,18 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/memory/memcpy.h"
 
+namespace paddle {
+namespace platform {
+class DeviceContext;
+class Place;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace framework {
 
+class Tensor;
+
 class SelectedRows {
   /*
    * @brief We can use the SelectedRows structure to reproduce a sparse table.
diff --git a/paddle/fluid/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc
index f5bb3f68007..02e4ce914b8 100644
--- a/paddle/fluid/framework/shape_inference.cc
+++ b/paddle/fluid/framework/shape_inference.cc
@@ -13,11 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/shape_inference.h"
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/grad_op_desc_maker.h"
-#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index 0b22bab2678..9f5d8d30c9c 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -13,7 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/var_type.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+class Allocation;
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index f2ccff2c133..faecba6295d 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <typeindex>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/framework.pb.h"
@@ -28,6 +29,14 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 
+namespace paddle {
+namespace memory {
+namespace allocation {
+class Allocation;
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
+
 namespace paddle {
 
 namespace framework {
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc
index cc972dd93d0..92a29d5165c 100644
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@@ -13,9 +13,15 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/tensor.h"
+
 #include <gtest/gtest.h>
 #include <string>
-#include "paddle/fluid/platform/float16.h"
+
+namespace paddle {
+namespace platform {
+struct float16;
+}  // namespace platform
+}  // namespace paddle
 
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 0e3d11b9f02..a073dbd733f 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -664,7 +664,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
     uint64_t size = tensor.numel() * framework::SizeOfType(tensor.type());
 
     auto* data_ptr = tensor.data<void>();
-    PADDLE_ENFORCE_LT(size, std::numeric_limits<std::streamsize>::max(),
+    PADDLE_ENFORCE_LT(size, (std::numeric_limits<std::streamsize>::max)(),
                       platform::errors::ResourceExhausted(
                           "tensor size %d overflow when writing tensor", size));
     if (platform::is_gpu_place(tensor.place())) {
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index a0408dbc3db..1e811a41e90 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <vector>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/dlpack_tensor.h"
 #include "paddle/fluid/framework/eigen.h"
@@ -30,6 +31,8 @@ namespace framework {
 // If ctx_place and src_place are the same, src_ctx.Wait() is added
 // after memory::Copy; if ctx_place and dst_place are the same,
 // src_ctx.Wait() is added before memory::Copy.
+class Tensor;
+
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 const platform::DeviceContext& ctx, Tensor* dst);
 
diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc
index ef9964b3ae7..e389cb34679 100644
--- a/paddle/fluid/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/tensor_util.h"
+
 #include <gtest/gtest.h>
 #include <cmath>
-#include <string>
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc
index 4682bfc264b..3db8f3e36b7 100644
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@@ -13,8 +13,6 @@
    limitations under the License. */
 
 #include "paddle/fluid/framework/threadpool.h"
-#include <memory>
-#include <utility>
 
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h
index 09528b6fc35..7fecf07475b 100644
--- a/paddle/fluid/framework/threadpool.h
+++ b/paddle/fluid/framework/threadpool.h
@@ -23,6 +23,7 @@ limitations under the License. */
 #include <thread>  // NOLINT
 #include <utility>
 #include <vector>
+
 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h
index 077fe751720..d041ef48e2c 100644
--- a/paddle/fluid/framework/trainer.h
+++ b/paddle/fluid/framework/trainer.h
@@ -14,14 +14,15 @@ limitations under the License. */
 
 #pragma once
 
+#include <ctime>
 #include <fstream>
+#include <map>
 #include <memory>
 #include <mutex>  // NOLINT
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
 
-#include <ctime>
 #include "paddle/fluid/framework/data_feed.h"
 #include "paddle/fluid/framework/data_set.h"
 #include "paddle/fluid/framework/device_worker.h"
@@ -38,6 +39,15 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+class Dataset;
+class LoDTensor;
+class ProgramDesc;
+class PullDenseWorker;
+class Scope;
+class VarDesc;
+template <class T>
+class ChannelObject;
+
 class TrainerBase {
  public:
   TrainerBase() {}
diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc
index 31ac11e78cf..15584620d86 100644
--- a/paddle/fluid/framework/trainer_factory.cc
+++ b/paddle/fluid/framework/trainer_factory.cc
@@ -17,11 +17,11 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 
-#include "paddle/fluid/framework/trainer.h"
-
 namespace paddle {
 namespace framework {
 
+class TrainerBase;
+
 typedef std::shared_ptr<TrainerBase> (*CreatetrainerFunction)();
 typedef std::unordered_map<std::string, CreatetrainerFunction> trainerMap;
 trainerMap g_trainer_map;
diff --git a/paddle/fluid/framework/trainer_factory.h b/paddle/fluid/framework/trainer_factory.h
index 9c772a4f19e..3ef61c03817 100644
--- a/paddle/fluid/framework/trainer_factory.h
+++ b/paddle/fluid/framework/trainer_factory.h
@@ -16,11 +16,14 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+
 #include "paddle/fluid/framework/trainer.h"
 
 namespace paddle {
 namespace framework {
 
+class TrainerBase;
+
 class TrainerFactory {
  public:
   static std::string TrainerTypeList();
diff --git a/paddle/fluid/framework/transfer_scope_cache.h b/paddle/fluid/framework/transfer_scope_cache.h
index 9a5d4526321..7ff6020ff2a 100644
--- a/paddle/fluid/framework/transfer_scope_cache.h
+++ b/paddle/fluid/framework/transfer_scope_cache.h
@@ -17,12 +17,16 @@
 #include <thread>  // NOLINT
 #include <unordered_map>
 #include <unordered_set>
+
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/scope.h"
 
 namespace paddle {
 namespace framework {
 
+class OpKernelType;
+class Scope;
+
 std::unordered_map<size_t, Scope*>& global_transfer_data_cache();
 
 std::unordered_set<Scope*>& global_transfer_scope_cache();
diff --git a/paddle/fluid/framework/tuple.h b/paddle/fluid/framework/tuple.h
index 508ee931c6e..35b8220d453 100644
--- a/paddle/fluid/framework/tuple.h
+++ b/paddle/fluid/framework/tuple.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/var_desc.h"
diff --git a/paddle/fluid/framework/tuple_test.cc b/paddle/fluid/framework/tuple_test.cc
index cfdd4dc56e4..9060bd3fc89 100644
--- a/paddle/fluid/framework/tuple_test.cc
+++ b/paddle/fluid/framework/tuple_test.cc
@@ -11,11 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <sstream>
-#include <vector>
-
-#include "gtest/gtest.h"
 #include "paddle/fluid/framework/tuple.h"
+#include "gtest/gtest.h"
 
 TEST(Tuple, Make) {
   std::vector<paddle::framework::ElementVar> element_type;
diff --git a/paddle/fluid/framework/unroll_array_ops.h b/paddle/fluid/framework/unroll_array_ops.h
index ab176410805..a9c047cc6c6 100644
--- a/paddle/fluid/framework/unroll_array_ops.h
+++ b/paddle/fluid/framework/unroll_array_ops.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <cstddef>
 #include <type_traits>
+
 #include "paddle/fluid/platform/hostdevice.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/unroll_array_ops_test.cc b/paddle/fluid/framework/unroll_array_ops_test.cc
index be811478eec..c4fdfdb425f 100644
--- a/paddle/fluid/framework/unroll_array_ops_test.cc
+++ b/paddle/fluid/framework/unroll_array_ops_test.cc
@@ -13,10 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/unroll_array_ops.h"
+
 #include <gtest/gtest.h>
-#include <algorithm>
 #include <array>
-#include <cstdint>
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/unused_var_check.cc b/paddle/fluid/framework/unused_var_check.cc
index e7e964b4181..ac455b9ffd7 100644
--- a/paddle/fluid/framework/unused_var_check.cc
+++ b/paddle/fluid/framework/unused_var_check.cc
@@ -16,11 +16,11 @@ limitations under the License. */
 
 #include <gflags/gflags.h>
 #include <glog/logging.h>
-
 #include <string>
-#include <unordered_set>
 #include <vector>
 
+#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
+#include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/framework/unused_var_check.h b/paddle/fluid/framework/unused_var_check.h
index 2f44a3bcde0..d78b4d928f3 100644
--- a/paddle/fluid/framework/unused_var_check.h
+++ b/paddle/fluid/framework/unused_var_check.h
@@ -16,7 +16,6 @@ limitations under the License. */
 
 #include <gflags/gflags.h>
 #include <glog/logging.h>
-
 #include <string>
 #include <unordered_set>
 
diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc
index 2ee0b17b64b..457c0c77b3c 100644
--- a/paddle/fluid/framework/var_desc.cc
+++ b/paddle/fluid/framework/var_desc.cc
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <google/protobuf/util/message_differencer.h>
-
 #include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h
index 6e8be0fdd42..b37a09a17a9 100644
--- a/paddle/fluid/framework/var_desc.h
+++ b/paddle/fluid/framework/var_desc.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/framework.pb.h"
 
diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h
index 9312ac075de..f649c9388f0 100644
--- a/paddle/fluid/framework/var_type_inference.h
+++ b/paddle/fluid/framework/var_type_inference.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/type_defs.h"
@@ -24,8 +25,8 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-class OpDesc;
 class BlockDesc;
+class OpDesc;
 class StaticGraphVarTypeInference;
 // default infer var type context
 
diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc
index dc86d58f600..5483ef01c08 100644
--- a/paddle/fluid/framework/var_type_inference_test.cc
+++ b/paddle/fluid/framework/var_type_inference_test.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/var_type_inference.h"
+
 #include <string>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -22,6 +24,8 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+class Scope;
+
 class NOP : public OperatorBase {
  public:
   NOP(const std::string& type, const VariableNameMap& inputs,
diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h
index 4875956096a..07387f87411 100644
--- a/paddle/fluid/framework/var_type_traits.h
+++ b/paddle/fluid/framework/var_type_traits.h
@@ -19,6 +19,7 @@
 #include <tuple>
 #include <typeindex>
 #include <vector>
+
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
@@ -43,12 +44,12 @@ class NCCLCommunicator;
 }  // namespace platform
 
 namespace framework {
-class Tensor;
-class LoDTensor;
-class SelectedRows;
 class LoDRankTable;
+class LoDTensor;
 class ReaderHolder;
 class Scope;
+class SelectedRows;
+class Tensor;
 }  // namespace framework
 
 namespace operators {
diff --git a/paddle/fluid/framework/variable_helper.h b/paddle/fluid/framework/variable_helper.h
index 01a5d09e072..6e65bc2c932 100644
--- a/paddle/fluid/framework/variable_helper.h
+++ b/paddle/fluid/framework/variable_helper.h
@@ -21,6 +21,8 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+class Variable;
+
 void InitializeVariable(Variable* var, proto::VarType::Type var_type);
 void CopyVariable(const Variable& src_var, Variable* dst_var);
 
diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc
index 511c9c52146..98a8ff9cf3e 100644
--- a/paddle/fluid/framework/variable_test.cc
+++ b/paddle/fluid/framework/variable_test.cc
@@ -12,12 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <memory>
-#include <string>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/variable.h"
+#include "gtest/gtest.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/version.cc b/paddle/fluid/framework/version.cc
index 7ba00f52e67..92042e47259 100644
--- a/paddle/fluid/framework/version.cc
+++ b/paddle/fluid/framework/version.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/version.h"
-#include <algorithm>
+
 #include <sstream>
 
 namespace paddle {
diff --git a/paddle/fluid/imperative/all_reduce.h b/paddle/fluid/imperative/all_reduce.h
index 81880d0fb89..249fb4e11f1 100644
--- a/paddle/fluid/imperative/all_reduce.h
+++ b/paddle/fluid/imperative/all_reduce.h
@@ -25,9 +25,17 @@
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/imperative/nccl_context.h"
 
+namespace paddle {
+namespace framework {
+class Variable;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace imperative {
 
+struct ParallelStrategy;
+
 void AllReduce(const framework::Variable &src, framework::Variable *dst,
                const ParallelStrategy &strategy);
 
diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc
index c980b014b82..d4a1519b07e 100644
--- a/paddle/fluid/imperative/amp_auto_cast.cc
+++ b/paddle/fluid/imperative/amp_auto_cast.cc
@@ -14,20 +14,17 @@
 
 #include "paddle/fluid/imperative/amp_auto_cast.h"
 
-#include <algorithm>
 #include <memory>
-#include <set>
 #include <string>
-#include <unordered_set>
 #include <utility>
 
-#include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/imperative/tracer.h"
-#include "paddle/fluid/imperative/variable_wrapper.h"
 
 namespace paddle {
 namespace imperative {
 
+class VarBase;
+
 AmpOperators::AmpOperators()
     : allow_ops_(new std::unordered_set<std::string>()),
       block_ops_(new std::unordered_set<std::string>()) {}
diff --git a/paddle/fluid/imperative/amp_auto_cast.h b/paddle/fluid/imperative/amp_auto_cast.h
index d1da97e5a39..7ab876c1ce8 100644
--- a/paddle/fluid/imperative/amp_auto_cast.h
+++ b/paddle/fluid/imperative/amp_auto_cast.h
@@ -26,6 +26,8 @@ namespace paddle {
 namespace imperative {
 
 // Singleton implementation with C++ 11
+class Tracer;
+
 class AmpOperators {
  public:
   ~AmpOperators();
diff --git a/paddle/fluid/imperative/data_loader.cc b/paddle/fluid/imperative/data_loader.cc
index 3b8239e566d..a2fccf7901f 100644
--- a/paddle/fluid/imperative/data_loader.cc
+++ b/paddle/fluid/imperative/data_loader.cc
@@ -16,13 +16,9 @@
 
 #include "paddle/fluid/imperative/data_loader.h"
 
-#include <string.h>
 #include <sys/wait.h>
-
-#include <atomic>
 #include <csignal>
 #include <map>
-#include <set>
 
 #include "paddle/fluid/memory/allocation/mmap_allocator.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/imperative/jit/op_desc_meta.cc b/paddle/fluid/imperative/jit/op_desc_meta.cc
index f5c00985900..d256dafb891 100644
--- a/paddle/fluid/imperative/jit/op_desc_meta.cc
+++ b/paddle/fluid/imperative/jit/op_desc_meta.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "paddle/fluid/imperative/jit/op_desc_meta.h"
-#include "paddle/fluid/framework/op_info.h"
 
 namespace paddle {
 namespace imperative {
diff --git a/paddle/fluid/imperative/jit/op_desc_meta.h b/paddle/fluid/imperative/jit/op_desc_meta.h
index 506afee6a99..c59b2c885d2 100644
--- a/paddle/fluid/imperative/jit/op_desc_meta.h
+++ b/paddle/fluid/imperative/jit/op_desc_meta.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/imperative/type_defs.h"
 
diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.cc b/paddle/fluid/imperative/jit/program_desc_tracer.cc
index be93a787d46..9f4cf713f7c 100644
--- a/paddle/fluid/imperative/jit/program_desc_tracer.cc
+++ b/paddle/fluid/imperative/jit/program_desc_tracer.cc
@@ -15,7 +15,12 @@
 #include "paddle/fluid/imperative/jit/program_desc_tracer.h"
 #include <unordered_map>
 #include <unordered_set>
-#include <utility>
+
+namespace paddle {
+namespace imperative {
+class VarBase;
+}  // namespace imperative
+}  // namespace paddle
 
 namespace paddle {
 namespace imperative {
diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.h b/paddle/fluid/imperative/jit/program_desc_tracer.h
index d07acec2230..8e2e59a49ed 100644
--- a/paddle/fluid/imperative/jit/program_desc_tracer.h
+++ b/paddle/fluid/imperative/jit/program_desc_tracer.h
@@ -21,12 +21,19 @@
 #include <tuple>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/imperative/jit/op_desc_meta.h"
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/platform/macros.h"
 
+namespace paddle {
+namespace imperative {
+class VarBase;
+}  // namespace imperative
+}  // namespace paddle
+
 namespace paddle {
 namespace imperative {
 namespace jit {
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 03e83301d44..d4df052a40d 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -24,6 +24,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/framework/var_type.h"
@@ -35,10 +36,18 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
 
+namespace paddle {
+namespace framework {
+class Variable;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace imperative {
 
 class OpBase;
+class GradOpNode;
+class VariableWrapper;
 
 class ThreadSafeNameSet {
  public:
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index bf02eebdbb6..3bf032e642b 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -17,12 +17,23 @@
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/imperative/type_defs.h"
 
+namespace paddle {
+namespace framework {
+class Tensor;
+class Variable;
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace imperative {
 
diff --git a/paddle/fluid/imperative/profiler.cc b/paddle/fluid/imperative/profiler.cc
index 34570b3a60e..85063a68216 100644
--- a/paddle/fluid/imperative/profiler.cc
+++ b/paddle/fluid/imperative/profiler.cc
@@ -19,8 +19,7 @@
 #endif
 #include <gflags/gflags.h>
 #include <glog/logging.h>
-#include <mutex>   // NOLINT
-#include <thread>  // NOLINT
+#include <mutex>  // NOLINT
 
 DEFINE_string(
     tracer_profile_fname, "xxgperf",
diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc
index 8ce257a6c37..93ea988d638 100644
--- a/paddle/fluid/imperative/tests/nccl_context_test.cc
+++ b/paddle/fluid/imperative/tests/nccl_context_test.cc
@@ -13,8 +13,8 @@
 // limitations under the License.
 
 #include "paddle/fluid/imperative/nccl_context.h"
+
 #include "gtest/gtest.h"
-#include "paddle/fluid/platform/device_context.h"
 
 namespace imperative = paddle::imperative;
 namespace platform = paddle::platform;
diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
index ebb19fd486c..ee109310483 100644
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc
@@ -18,7 +18,14 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
 #include <algorithm>
-#include <string>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h
index 1257562972e..621c631b853 100644
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h
@@ -27,6 +27,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/graph_traits.h"
 #include "paddle/fluid/framework/ir/node.h"
 
+namespace paddle {
+namespace framework {
+class BlockDesc;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace analysis {
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
index b6b67ce8ece..bb0248008e0 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
@@ -18,11 +18,21 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 
+namespace paddle {
+namespace framework {
+namespace ir {
+class Graph;
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace analysis {
diff --git a/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h b/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h
index 65d1c545313..66d5667a37c 100644
--- a/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h
+++ b/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h
@@ -30,6 +30,8 @@ namespace analysis {
  * The default cudnn workspace is 4G, we set it to 64M in this pass, which
  * is applicable for most inference tasks.
  */
+struct Argument;
+
 class AdjustCudnnWorkSpacePass : public AnalysisPass {
  public:
   void RunImpl(Argument *argument) override;
diff --git a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h
index 7fbdd88e014..bbdf8b1009a 100644
--- a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h
+++ b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h
@@ -32,6 +32,8 @@ namespace analysis {
  * So, We added the corresponding inference impl to these ops separately.
  * This pass replaces these ops with corresponding inference ops.
  */
+struct Argument;
+
 class InferenceOpReplacePass : public AnalysisPass {
  public:
   void RunImpl(Argument *argument) override;
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h
index a9d58aa2f4c..a4d60e91e84 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h
@@ -16,12 +16,15 @@
 
 #include <string>
 #include <unordered_set>
+
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 
+struct Argument;
+
 class IrInferCleanGraphPass : public AnalysisPass {
  public:
   void RunImpl(Argument *argument) override;
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h
index 838ebdbc9d7..613eb04497e 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h
@@ -14,12 +14,15 @@
 
 #pragma once
 
+#include <string>
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 
+struct Argument;
+
 class IrGraphToProgramPass : public AnalysisPass {
  public:
   void RunImpl(Argument *argument) override;
diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
index f432188131e..defa0a525f6 100644
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
@@ -13,23 +13,24 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
+
 #include <algorithm>
-#include <fstream>
 #include <functional>
 #include <limits>
-#include <map>
 #include <set>
 #include <string>
-#include <type_traits>
 #include <utility>
-#include <vector>
+
 #include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
-#include "paddle/fluid/framework/ir/graph_traits.h"
-#include "paddle/fluid/inference/analysis/helper.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/string/pretty_log.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+class Graph;
+class Node;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
index 77da5d40d8d..6d20aee295b 100644
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
@@ -18,9 +18,18 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/platform/port.h"
 
+namespace paddle {
+namespace framework {
+namespace ir {
+class Graph;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace analysis {
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 2184574aa1f..4abe293c930 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -12,14 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
 namespace paddle {
+struct MkldnnQuantizerConfig;
+
 extern const std::vector<std::string> kTRTSubgraphPasses;
 extern const std::vector<std::string> kLiteSubgraphPasses;
 
diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h
index 96b94777304..be771ac48fc 100644
--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -19,18 +19,26 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
+#include "paddle/fluid/inference/api/paddle_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/init.h"
+#include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 
+namespace framework {
+class LoDTensor;
+class Scope;
+}  // namespace framework
+
 class NativePaddlePredictor : public PaddlePredictor {
  public:
   explicit NativePaddlePredictor(const NativeConfig &config)
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc
index 03c2aa3fb80..ee5c10b7bf6 100644
--- a/paddle/fluid/inference/api/details/reset_tensor_array.cc
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc
@@ -14,6 +14,12 @@
 
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace details {
 
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h
index 213c6891d0e..be5fe1d64f9 100644
--- a/paddle/fluid/inference/api/details/reset_tensor_array.h
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.h
@@ -16,10 +16,18 @@
 
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
 
+namespace paddle {
+namespace framework {
+class LoDTensor;
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace details {
 
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index 014985661fd..061b83e1d1e 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -27,6 +27,7 @@
 #include <sstream>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
index 76cf1661f30..30c6c21ec87 100644
--- a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
+++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <string>
+
 #include "paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index 90732535204..b10c290b226 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -17,6 +17,7 @@
 #include <sstream>
 #include <string>
 #include <vector>
+
 #include "paddle_infer_declare.h"  // NOLINT
 
 ///
diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
index 1a428e205cb..57aeee99ba2 100644
--- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
@@ -12,8 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/op_registry.h"
+#include <string>
+
+#include "glog/logging.h"
+#include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/tensorrt/helper.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace nvinfer1 {
+class IActivationLayer;
+class ITensor;
+}  // namespace nvinfer1
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
index f67370b3382..2f4f9320607 100644
--- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
@@ -12,9 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <math.h>
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
+namespace nvinfer1 {
+class IScaleLayer;
+}  // namespace nvinfer1
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
index 5d63aa2ace8..5ecf1923388 100644
--- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
@@ -14,6 +14,15 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index aa03bc44bd6..f582d7e0705 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -14,6 +14,15 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
index 0541d891705..d11dbc16e87 100644
--- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
@@ -14,6 +14,15 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
index 9fff558c583..b846b3033f6 100644
--- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
@@ -13,6 +13,15 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index 0dccd3cc639..cd16ed73965 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -14,6 +14,15 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc
index 7927b6cd1bb..aad822b3354 100644
--- a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc
@@ -15,6 +15,18 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h"
 
+namespace nvinfer1 {
+class ILayer;
+}  // namespace nvinfer1
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/convert/hard_sigmoid_op.cc b/paddle/fluid/inference/tensorrt/convert/hard_sigmoid_op.cc
index 3b6e464a117..f2c1bafb4ae 100644
--- a/paddle/fluid/inference/tensorrt/convert/hard_sigmoid_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/hard_sigmoid_op.cc
@@ -14,6 +14,15 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc
index 39820068ca8..967f79a1643 100644
--- a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc
@@ -15,6 +15,18 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h"
 
+namespace nvinfer1 {
+class ILayer;
+}  // namespace nvinfer1
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/convert/instance_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/instance_norm_op.cc
index 8a0b50f4744..d746c51c5c5 100644
--- a/paddle/fluid/inference/tensorrt/convert/instance_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/instance_norm_op.cc
@@ -15,6 +15,18 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h"
 
+namespace nvinfer1 {
+class IPluginLayer;
+}  // namespace nvinfer1
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
index 29826909c3c..e348de9877f 100644
--- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
@@ -14,6 +14,18 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
+namespace nvinfer1 {
+class ILayer;
+}  // namespace nvinfer1
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
index 5b6aaad4983..c99528b207b 100644
--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
@@ -14,6 +14,15 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc
index dd594404d33..7ddedf969fd 100644
--- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc
@@ -14,6 +14,15 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
index 864e440920e..c031630f36d 100644
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -15,6 +15,15 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/convert/scale_op.cc b/paddle/fluid/inference/tensorrt/convert/scale_op.cc
index f9a1fe41ddc..18c97890d72 100644
--- a/paddle/fluid/inference/tensorrt/convert/scale_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/scale_op.cc
@@ -14,6 +14,15 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc
index 326915dfff5..7090e298ddc 100644
--- a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc
@@ -14,6 +14,15 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
index 9f4a048961f..0388154427e 100644
--- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
@@ -14,6 +14,15 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/convert/swish_op.cc b/paddle/fluid/inference/tensorrt/convert/swish_op.cc
index e220d80f0d7..ab82a6578fb 100644
--- a/paddle/fluid/inference/tensorrt/convert/swish_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/swish_op.cc
@@ -15,6 +15,18 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h"
 
+namespace nvinfer1 {
+class ILayer;
+}  // namespace nvinfer1
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 754979f77ac..1f7ea7ea044 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -15,10 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/engine.h"
 
 #include <NvInfer.h>
-#include <cuda.h>
 #include <glog/logging.h>
 #include <string>
-#include "paddle/fluid/inference/analysis/helper.h"
+
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -26,6 +25,10 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
+namespace plugin {
+class PluginTensorRT;
+}  // namespace plugin
+
 int TensorRTEngine::runtime_batch_ = 1;
 
 void TensorRTEngine::InitNetwork() {
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index a85ed483c1d..71625210054 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -23,6 +23,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
@@ -33,10 +34,20 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 
+namespace paddle {
+namespace framework {
+class Tensor;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
 
+namespace plugin {
+class PluginTensorRT;
+}  // namespace plugin
+
 using FluidDT = framework::proto::VarType_Type;
 using TRT_DT = nvinfer1::DataType;
 
@@ -94,6 +105,7 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
 }  // NOLINT
 
 class TRTInt8Calibrator;
+
 /*
  * TensorRT Engine.
  *
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 31128ba8c5d..23aacedd693 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -14,6 +14,12 @@
 
 #include "paddle/fluid/inference/tensorrt/op_teller.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h
index 76784c7445e..9113525a5c9 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.h
+++ b/paddle/fluid/inference/tensorrt/op_teller.h
@@ -17,9 +17,16 @@
 #include <string>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h
index ed825801fc4..076dfbcf8f0 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h
@@ -26,6 +26,7 @@
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/variant.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index 72962c733ec..7c763858bb2 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -12,13 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <cuda.h>
-#include <cuda_runtime_api.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h
index 5815bc9a146..b4b7ee50dc3 100644
--- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h
+++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h
@@ -23,6 +23,7 @@
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc
index 074a397e323..8c76a03d298 100644
--- a/paddle/fluid/inference/utils/benchmark.cc
+++ b/paddle/fluid/inference/utils/benchmark.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/utils/benchmark.h"
-#include <sstream>
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/utils/io_utils.h b/paddle/fluid/inference/utils/io_utils.h
index 853aba168b5..de2c7b26d33 100644
--- a/paddle/fluid/inference/utils/io_utils.h
+++ b/paddle/fluid/inference/utils/io_utils.h
@@ -16,9 +16,14 @@
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/api/paddle_api.h"
 
+namespace paddle {
+struct PaddleTensor;
+}  // namespace paddle
+
 namespace paddle {
 namespace inference {
 
diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h
index 71250766034..6fef5cae8d6 100644
--- a/paddle/fluid/memory/allocation/aligned_allocator.h
+++ b/paddle/fluid/memory/allocation/aligned_allocator.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <memory>
 #include <utility>
+
 #include "paddle/fluid/memory/allocation/allocator.h"
 
 namespace paddle {
diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h
index 379c8d00960..e54748a5367 100644
--- a/paddle/fluid/memory/allocation/allocator.h
+++ b/paddle/fluid/memory/allocation/allocator.h
@@ -18,6 +18,7 @@
 #include <type_traits>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/inlined_vector.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc
index 3e10be037bd..00299911162 100644
--- a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
-#include <gflags/gflags.h>
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+
 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
diff --git a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc
index 3748805b1ce..1e793d1617a 100644
--- a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
-#include <gflags/gflags.h>
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+
 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc
index e1c7b243bec..518b31e9430 100644
--- a/paddle/fluid/memory/allocation/allocator_strategy.cc
+++ b/paddle/fluid/memory/allocation/allocator_strategy.cc
@@ -13,8 +13,8 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
+
 #include "gflags/gflags.h"
-#include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 
 DECLARE_string(allocator_strategy);
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h
index 27257883d55..cbc126264ac 100644
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h
@@ -19,6 +19,7 @@
 #include <memory>
 #include <mutex>  // NOLINT
 #include <utility>
+
 #include "paddle/fluid/memory/allocation/allocator.h"
 
 namespace paddle {
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
index 8865bf0b0db..685248a88f7 100644
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
+
 #include <cstdlib>
 #include <vector>
+
 #include "gtest/gtest.h"
 
 DECLARE_bool(free_idle_chunk);
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc
index 957dac4d03e..2b8d2164f68 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc
@@ -13,11 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
-
-#include <cmath>
-#include <list>
-#include <map>
-#include <string>
+#include <math.h>
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h
index 64a552e4fd2..a6015417b12 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.h
@@ -16,8 +16,15 @@
 #include <array>
 #include <list>
 #include <map>
+
 #include "paddle/fluid/memory/allocation/allocator.h"
 
+namespace paddle {
+namespace platform {
+class Place;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace memory {
 namespace allocation {
diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc
index 7eed81a712a..d463ad1f5eb 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator.cc
+++ b/paddle/fluid/memory/allocation/buffered_allocator.cc
@@ -13,9 +13,6 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/buffered_allocator.h"
-
-#include <algorithm>
-#include <limits>
 #include <utility>
 
 namespace paddle {
diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h
index fd0996f7748..5e1733bd839 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator.h
+++ b/paddle/fluid/memory/allocation/buffered_allocator.h
@@ -18,6 +18,7 @@
 #include <map>
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"
 
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc
index b096fbc112c..407f0f25935 100644
--- a/paddle/fluid/memory/allocation/cpu_allocator.cc
+++ b/paddle/fluid/memory/allocation/cpu_allocator.cc
@@ -16,8 +16,6 @@
 
 #include <stdlib.h>
 
-#include <string>
-
 namespace paddle {
 namespace memory {
 namespace allocation {
diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h
index 4af77e6e057..1b8418bc849 100644
--- a/paddle/fluid/memory/allocation/locked_allocator.h
+++ b/paddle/fluid/memory/allocation/locked_allocator.h
@@ -15,6 +15,7 @@
 #include <memory>
 #include <mutex>   // NOLINT
 #include <thread>  // NOLINT
+
 #include "paddle/fluid/memory/allocation/allocator.h"
 
 namespace paddle {
diff --git a/paddle/fluid/memory/allocation/mmap_allocator.cc b/paddle/fluid/memory/allocation/mmap_allocator.cc
index 0ef084bafd0..77e8d9943d0 100644
--- a/paddle/fluid/memory/allocation/mmap_allocator.cc
+++ b/paddle/fluid/memory/allocation/mmap_allocator.cc
@@ -19,13 +19,8 @@
 #include <fcntl.h>
 #include <stdlib.h>
 #include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
 #include <random>
 #include <string>
-#include <utility>
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/fluid/memory/allocation/mmap_allocator_test.cc b/paddle/fluid/memory/allocation/mmap_allocator_test.cc
index 5b66920be2a..bcb02e04792 100644
--- a/paddle/fluid/memory/allocation/mmap_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/mmap_allocator_test.cc
@@ -16,8 +16,6 @@
 
 #include "paddle/fluid/memory/allocation/mmap_allocator.h"
 
-#include <sys/types.h>
-
 #include "gtest/gtest.h"
 
 namespace paddle {
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
index 913d583099c..4cf1bd6123e 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
@@ -18,8 +18,10 @@
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/place.h"
+
 namespace paddle {
 namespace memory {
 namespace allocation {
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc
index 3ea2ecf3538..42dd50af729 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
@@ -14,9 +14,6 @@
 
 #include "paddle/fluid/memory/allocation/pinned_allocator.h"
 
-#include <cuda.h>
-#include <cuda_runtime.h>
-
 namespace paddle {
 namespace memory {
 namespace allocation {
diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h
index bc07ad0c4dc..10ca4b828a4 100644
--- a/paddle/fluid/memory/allocation/thread_local_allocator.h
+++ b/paddle/fluid/memory/allocation/thread_local_allocator.h
@@ -16,6 +16,7 @@
 
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
 #include "paddle/fluid/memory/detail/system_allocator.h"
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index faa87f1f01a..6ac99744d79 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -13,10 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
-
 #include <algorithm>
 #include <utility>
-
 #include "glog/logging.h"
 
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc
index 780126f57c5..1722acd10aa 100644
--- a/paddle/fluid/memory/detail/buddy_allocator_test.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc
@@ -21,12 +21,9 @@ limitations under the License. */
 #endif
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
-#include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
 #ifdef PADDLE_WITH_CUDA
-#include <cuda_runtime.h>
-
 #include <fstream>
 #include <string>
 
diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc
index 34bb40d549d..ea4897494f7 100644
--- a/paddle/fluid/memory/detail/system_allocator_test.cc
+++ b/paddle/fluid/memory/detail/system_allocator_test.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/memory/detail/system_allocator.h"
 
 #include <memory>
-#include <vector>
 
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h
index 9ba572acaca..73487795f75 100644
--- a/paddle/fluid/memory/malloc.h
+++ b/paddle/fluid/memory/malloc.h
@@ -15,8 +15,10 @@ limitations under the License. */
 #pragma once
 
 #include <memory>
+
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/place.h"
+
 namespace paddle {
 
 namespace platform {
diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc
index 33d8fb828f8..1903b9e30d8 100644
--- a/paddle/fluid/operators/activation_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc
@@ -12,13 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <memory>
-#include <string>
-#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/platform/cudnn_desc.h"
 
+namespace paddle {
+namespace platform {
+struct CUDAPlace;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 using framework::Tensor;
diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc
index ea1a93b5826..89475e3a2ec 100644
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
@@ -12,20 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <paddle/fluid/operators/math/concat_and_split.h>
-#include <numeric>
 
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+class Scope;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
 using LoD = framework::LoD;
 
 struct ArrayToLoDFunctor;
+
 template <typename DeviceContext>
 struct ArrayToLoDFunctorImpl {
   const ArrayToLoDFunctor *prev_functor_;
diff --git a/paddle/fluid/operators/assert_op.cc b/paddle/fluid/operators/assert_op.cc
index da0e5fda636..3e4250389fc 100644
--- a/paddle/fluid/operators/assert_op.cc
+++ b/paddle/fluid/operators/assert_op.cc
@@ -13,10 +13,24 @@
    limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/controlflow/while_op_helper.h"
 #include "paddle/fluid/operators/tensor_formatter.h"
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class LoDTensor;
+class OpDesc;
+class Scope;
+class Variable;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
+
 const char kCond[] = "Cond";
 const char kData[] = "Data";
 const char kSummarize[] = "summarize";
diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc
index f8c1216e972..e5bceae1c95 100644
--- a/paddle/fluid/operators/assign_op.cc
+++ b/paddle/fluid/operators/assign_op.cc
@@ -14,9 +14,23 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/assign_op.h"
 
-#include <memory>
 #include <string>
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+class Variable;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+struct CPUPlace;
+struct CUDAPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/assign_op.h b/paddle/fluid/operators/assign_op.h
index c2154f78bbe..ed4ba24a74b 100644
--- a/paddle/fluid/operators/assign_op.h
+++ b/paddle/fluid/operators/assign_op.h
@@ -19,6 +19,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace paddle {
+namespace framework {
+class LoDTensor;
+class Variable;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 class AssignFunctor {
diff --git a/paddle/fluid/operators/assign_op_test.cc b/paddle/fluid/operators/assign_op_test.cc
index 58f360ad605..f0ec04a1f20 100644
--- a/paddle/fluid/operators/assign_op_test.cc
+++ b/paddle/fluid/operators/assign_op_test.cc
@@ -14,8 +14,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/assign_op.h"
 
 #include <gtest/gtest.h>
-#include <iostream>
-#include <string>
 #include <vector>
 
 #include "paddle/fluid/framework/ddim.h"
diff --git a/paddle/fluid/operators/assign_value_op.cc b/paddle/fluid/operators/assign_value_op.cc
index a35e5a80a9e..1589f9e8911 100644
--- a/paddle/fluid/operators/assign_value_op.cc
+++ b/paddle/fluid/operators/assign_value_op.cc
@@ -13,8 +13,23 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/assign_value_op.h"
+
 #include <string>
-#include <vector>
+
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/assign_value_op.h b/paddle/fluid/operators/assign_value_op.h
index 1418d96b67b..6c2e5b9ad68 100644
--- a/paddle/fluid/operators/assign_value_op.h
+++ b/paddle/fluid/operators/assign_value_op.h
@@ -16,6 +16,7 @@
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index c177dad8028..3cb3f1d48bf 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -12,12 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <algorithm>
 #include <string>
 
 #include "paddle/fluid/operators/beam_search_decode_op.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+class Scope;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cc
index bcb529f1570..835b49e57bc 100644
--- a/paddle/fluid/operators/collective/c_allreduce_max_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cc
@@ -14,6 +14,21 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+struct CPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc
index 34054103aa0..ec8d6518195 100644
--- a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc
@@ -14,6 +14,13 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
+namespace paddle {
+namespace platform {
+struct CUDAPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cc
index 9d27a9ceb30..efc19659c83 100644
--- a/paddle/fluid/operators/collective/c_allreduce_min_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cc
@@ -14,6 +14,21 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+struct CPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc
index 4e8b6f9d0a9..7935a1f722e 100644
--- a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc
@@ -14,6 +14,13 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
+namespace paddle {
+namespace platform {
+struct CUDAPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc
index 3cfb1723f18..5ab07ef026b 100644
--- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc
@@ -14,6 +14,21 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+struct CPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc
index 61f76c178d0..1a78427cd19 100644
--- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc
@@ -14,6 +14,13 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
+namespace paddle {
+namespace platform {
+struct CUDAPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc
index c9ed8b67647..68061e6ae6b 100644
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc
@@ -14,6 +14,19 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+struct CPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc
index 8fe7fce21e4..06e90cdff80 100644
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc
@@ -14,6 +14,13 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
+namespace paddle {
+namespace platform {
+struct CUDAPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc
index a3bf1f4dfb1..ccad96320a7 100644
--- a/paddle/fluid/operators/collective/c_comm_init_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_op.cc
@@ -14,19 +14,17 @@ limitations under the License. */
 #if defined(PADDLE_WITH_NCCL)
 #include <nccl.h>
 #endif
-#include <stdint.h>
-#include <ostream>
 #include <string>
 
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/request_handler_impl.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
 namespace paddle {
diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
index e2b09be5a9d..2822129b198 100644
--- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
@@ -11,25 +11,23 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#if defined(PADDLE_WITH_NCCL)
-#include <nccl.h>
-#endif
-
-#include <stdint.h>
-#include <ostream>
 #include <string>
 
+#include "glog/logging.h"
 #include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
-
-#if defined(PADDLE_WITH_NCCL)
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
+#include "paddle/fluid/operators/distributed/rpc_client.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cc
index 42535187768..41a07f94399 100644
--- a/paddle/fluid/operators/collective/c_reduce_max_op.cc
+++ b/paddle/fluid/operators/collective/c_reduce_max_op.cc
@@ -14,6 +14,21 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_reduce_op.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+struct CPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc
index 7e260346b4b..e03da37360f 100644
--- a/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_reduce_max_op.cu.cc
@@ -14,6 +14,13 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_reduce_op.h"
 
+namespace paddle {
+namespace platform {
+struct CUDAPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cc
index 8e849641e63..77bb96347f9 100644
--- a/paddle/fluid/operators/collective/c_reduce_min_op.cc
+++ b/paddle/fluid/operators/collective/c_reduce_min_op.cc
@@ -14,6 +14,21 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_reduce_op.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+struct CPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc
index 77a75ed0b7a..83f7fce1ec6 100644
--- a/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_reduce_min_op.cu.cc
@@ -14,6 +14,13 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_reduce_op.h"
 
+namespace paddle {
+namespace platform {
+struct CUDAPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cc
index 64935df856e..f6c1c5d50e8 100644
--- a/paddle/fluid/operators/collective/c_reduce_prod_op.cc
+++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cc
@@ -14,6 +14,21 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_reduce_op.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+struct CPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc
index 07e431f7bc8..83db107b36f 100644
--- a/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_reduce_prod_op.cu.cc
@@ -14,6 +14,13 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_reduce_op.h"
 
+namespace paddle {
+namespace platform {
+struct CUDAPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cc
index 3e20cee7e18..e59ec85fefd 100644
--- a/paddle/fluid/operators/collective/c_reduce_sum_op.cc
+++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cc
@@ -14,6 +14,21 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_reduce_op.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+struct CPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc
index d9826422c16..39c8716a92a 100644
--- a/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_reduce_sum_op.cu.cc
@@ -14,6 +14,13 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_reduce_op.h"
 
+namespace paddle {
+namespace platform {
+struct CUDAPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc
index 64b60165722..00f366e6212 100644
--- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc
+++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc
@@ -11,19 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#if defined(PADDLE_WITH_NCCL)
-#include <nccl.h>
-#endif
-
 #include <string>
 
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 
-#if defined(PADDLE_WITH_NCCL)
-#include "paddle/fluid/platform/collective_helper.h"
-#endif
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
index 5405ea70ef6..d8617492fb1 100644
--- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
+++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
@@ -11,18 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#if defined(PADDLE_WITH_NCCL)
-#include <nccl.h>
-#endif
-
 #include <string>
 
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
 namespace paddle {
diff --git a/paddle/fluid/operators/common_infer_shape_functions.cc b/paddle/fluid/operators/common_infer_shape_functions.cc
index 22b212fc1b9..ce622d7501f 100644
--- a/paddle/fluid/operators/common_infer_shape_functions.cc
+++ b/paddle/fluid/operators/common_infer_shape_functions.cc
@@ -13,10 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
-
 #include <algorithm>
 #include <vector>
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+}  // namespace framework
+}  // namespace paddle
+
 // This file almostly contains all the infershape functions that are used in
 // operators.
 
diff --git a/paddle/fluid/operators/common_infer_shape_functions.h b/paddle/fluid/operators/common_infer_shape_functions.h
index 2cb9eab2865..922d5262abc 100644
--- a/paddle/fluid/operators/common_infer_shape_functions.h
+++ b/paddle/fluid/operators/common_infer_shape_functions.h
@@ -17,6 +17,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+}  // namespace framework
+}  // namespace paddle
+
 // This file almostly contains all the infershape functions that are used in
 // operators.
 
diff --git a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc
index aa31b887562..62019be26cd 100644
--- a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc
+++ b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc
@@ -14,6 +14,18 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/controlflow/conditional_block_op.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+class Scope;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h
index 659e098c6dd..c8ab2c91e91 100644
--- a/paddle/fluid/operators/controlflow/conditional_block_op.h
+++ b/paddle/fluid/operators/controlflow/conditional_block_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/var_type.h"
diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc b/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc
index 155a5dbfec3..00b86121c0d 100644
--- a/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc
@@ -13,12 +13,19 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
+
 #include <string>
 #include <unordered_set>
 #include <utility>
-#include <vector>
+
 #include "paddle/fluid/operators/controlflow/op_variant.h"
 
+namespace paddle {
+namespace framework {
+class ProgramDesc;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h b/paddle/fluid/operators/controlflow/conditional_block_op_helper.h
index f7dfba6f364..abaaa897606 100644
--- a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h
+++ b/paddle/fluid/operators/controlflow/conditional_block_op_helper.h
@@ -16,9 +16,16 @@
 
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/controlflow/conditional_block_op.h"
 
+namespace paddle {
+namespace framework {
+class ProgramDesc;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_test.cc b/paddle/fluid/operators/controlflow/conditional_block_op_test.cc
index a5ca4a289de..068d853f1cc 100644
--- a/paddle/fluid/operators/controlflow/conditional_block_op_test.cc
+++ b/paddle/fluid/operators/controlflow/conditional_block_op_test.cc
@@ -13,14 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/controlflow/conditional_block_op.h"
-#include <memory>
-#include <string>
-#include <vector>
+
 #include "gtest/gtest.h"
-#include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/var_type.h"
 
 USE_NO_KERNEL_OP(conditional_block);
 USE_NO_KERNEL_OP(conditional_block_grad);
diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc
index 088413ce223..9597dd25ec5 100644
--- a/paddle/fluid/operators/controlflow/feed_op.cc
+++ b/paddle/fluid/operators/controlflow/feed_op.cc
@@ -12,10 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+class Scope;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 class FeedOp : public framework::OperatorBase {
diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc
index 4ab9f9fc863..e8829e1e1fa 100644
--- a/paddle/fluid/operators/controlflow/get_places_op.cc
+++ b/paddle/fluid/operators/controlflow/get_places_op.cc
@@ -12,9 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <thread>  // NOLINT
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+class Scope;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
diff --git a/paddle/fluid/operators/controlflow/op_variant.h b/paddle/fluid/operators/controlflow/op_variant.h
index 9af993f1006..28c27437de1 100644
--- a/paddle/fluid/operators/controlflow/op_variant.h
+++ b/paddle/fluid/operators/controlflow/op_variant.h
@@ -20,6 +20,13 @@
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/platform/variant.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+class OperatorBase;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc
index f2973add84e..c96b7c6a08c 100644
--- a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc
@@ -13,14 +13,15 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
-
 #include <algorithm>
 #include <string>
-#include <unordered_set>
-#include <utility>
 
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/recurrent_op.h"
+namespace paddle {
+namespace framework {
+class BlockDesc;
+class ProgramDesc;
+}  // namespace framework
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/controlflow/recurrent_op_helper.h b/paddle/fluid/operators/controlflow/recurrent_op_helper.h
index aacca0762ca..3d9404e57aa 100644
--- a/paddle/fluid/operators/controlflow/recurrent_op_helper.h
+++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.h
@@ -26,6 +26,12 @@
 #include "paddle/fluid/platform/variant.h"
 #include "paddle/fluid/string/string_helper.h"
 
+namespace paddle {
+namespace framework {
+class ProgramDesc;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
index 9f7702a5d6b..c4451c3b583 100644
--- a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
+++ b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
@@ -12,7 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/array_operator.h"
-#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace framework {
+class OpDesc;
+class Scope;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index e4a1397f5c4..b85e740ada9 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -12,14 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <vector>
 #include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/controlflow/while_op_helper.h"
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+class VarDesc;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h
index 4f9d93c91f6..d2e9953e647 100644
--- a/paddle/fluid/operators/controlflow/while_op_helper.h
+++ b/paddle/fluid/operators/controlflow/while_op_helper.h
@@ -17,9 +17,17 @@
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/variant.h"
 
+namespace paddle {
+namespace framework {
+class LoDTensor;
+class ProgramDesc;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
index 6457d9295dc..6ac75b78d70 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -16,8 +16,13 @@ limitations under the License. */
 #include "paddle/fluid/operators/cudnn_lstm_cache.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/utils.h"
-#include "paddle/fluid/platform/cudnn_desc.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace platform {
+class CUDADeviceContext;
+struct CUDAPlace;
+}  // namespace platform
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/delete_var_op.cc b/paddle/fluid/operators/delete_var_op.cc
index ec60569be20..aa5649e4e9c 100644
--- a/paddle/fluid/operators/delete_var_op.cc
+++ b/paddle/fluid/operators/delete_var_op.cc
@@ -12,6 +12,19 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+class Scope;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 class DeleteVarOp : public framework::OperatorBase {
diff --git a/paddle/fluid/operators/dequantize_abs_max_op.cc b/paddle/fluid/operators/dequantize_abs_max_op.cc
index 0d4d68d9f62..c8bca25b6b0 100644
--- a/paddle/fluid/operators/dequantize_abs_max_op.cc
+++ b/paddle/fluid/operators/dequantize_abs_max_op.cc
@@ -13,8 +13,23 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/dequantize_abs_max_op.h"
+
 #include <string>
-#include <vector>
+
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/dequantize_abs_max_op.h b/paddle/fluid/operators/dequantize_abs_max_op.h
index 796ca93b000..0d9d20fc120 100644
--- a/paddle/fluid/operators/dequantize_abs_max_op.h
+++ b/paddle/fluid/operators/dequantize_abs_max_op.h
@@ -15,10 +15,17 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
+namespace paddle {
+namespace framework {
+class Tensor;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
@@ -35,6 +42,7 @@ class DequantizeMaxAbsKernel : public framework::OpKernel<T> {
   virtual void Compute(const framework::ExecutionContext& ctx) const {
     auto* in = ctx.Input<framework::Tensor>("X");
     auto* scale = ctx.Input<framework::Tensor>("Scale");
+
     auto* out = ctx.Output<framework::Tensor>("Out");
 
     float max_range = ctx.Attr<float>("max_range");
diff --git a/paddle/fluid/operators/dequantize_log_op.cc b/paddle/fluid/operators/dequantize_log_op.cc
index 2ecd54f7edd..c12dd9e6d21 100644
--- a/paddle/fluid/operators/dequantize_log_op.cc
+++ b/paddle/fluid/operators/dequantize_log_op.cc
@@ -13,9 +13,23 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/dequantize_log_op.h"
-#include <math.h>
+
 #include <string>
-#include <vector>
+
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/dequantize_log_op.h b/paddle/fluid/operators/dequantize_log_op.h
index f6590ecf61f..67ce9cc84d3 100644
--- a/paddle/fluid/operators/dequantize_log_op.h
+++ b/paddle/fluid/operators/dequantize_log_op.h
@@ -15,9 +15,16 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/op_registry.h"
 
+namespace paddle {
+namespace framework {
+class Tensor;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h
index eadd842c7f6..cef6590ae21 100644
--- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h
+++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <ThreadPool.h>
 #include <functional>
 #include <future>  // NOLINT
 #include <memory>
@@ -23,8 +24,6 @@
 #include <utility>
 #include <vector>
 
-#include <ThreadPool.h>
-
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc
index 17d8973303b..2d78559625c 100644
--- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc
+++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc
@@ -13,9 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
-
 #include <algorithm>
-
 #include "gtest/gtest.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed/collective_client.cc b/paddle/fluid/operators/distributed/collective_client.cc
index 6d3f5343111..fcd3e6abead 100644
--- a/paddle/fluid/operators/distributed/collective_client.cc
+++ b/paddle/fluid/operators/distributed/collective_client.cc
@@ -12,11 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <condition_variable>  // NOLINT
-#include <string>
-#include "gflags/gflags.h"
-
 #include "paddle/fluid/operators/distributed/collective_client.h"
+#include <memory>
+#include "gflags/gflags.h"
 
 DECLARE_int32(rpc_deadline);
 
diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h
index 6a3a450a1fd..e7d8bb8df98 100644
--- a/paddle/fluid/operators/distributed/collective_client.h
+++ b/paddle/fluid/operators/distributed/collective_client.h
@@ -15,16 +15,27 @@
 #pragma once
 
 #include <condition_variable>  // NOLINT
+#include <memory>
 #include <string>
 #include <vector>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+class SelectedRows;
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 DECLARE_int32(rpc_deadline);
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed/collective_server.cc b/paddle/fluid/operators/distributed/collective_server.cc
index c95652400c2..cdd37742d2d 100644
--- a/paddle/fluid/operators/distributed/collective_server.cc
+++ b/paddle/fluid/operators/distributed/collective_server.cc
@@ -12,14 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <stdio.h>  // for removing the port file
-#include <csignal>
-#include <cstdlib>
-#include <fstream>
-#include <thread>  // NOLINT
-#include <vector>
-
 #include "paddle/fluid/operators/distributed/collective_server.h"
+#include <memory>
 
 DEFINE_int32(collective_get_thread_num, 5, "number of threads for rpc get");
 
diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h
index 03c688a78e1..1015007ba0c 100644
--- a/paddle/fluid/operators/distributed/collective_server.h
+++ b/paddle/fluid/operators/distributed/collective_server.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <map>
+#include <memory>
 #include <set>
 #include <string>
 #include <thread>  // NOLINT
@@ -22,12 +23,17 @@ limitations under the License. */
 #include <vector>
 
 #include "gflags/gflags.h"
-
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/distributed/rpc_server.h"
 
+namespace paddle {
+namespace framework {
+class Variable;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace distributed {
diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc
index be8c7a7dd40..92b2eb4b51e 100644
--- a/paddle/fluid/operators/distributed/collective_server_test.cc
+++ b/paddle/fluid/operators/distributed/collective_server_test.cc
@@ -13,22 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <stdlib.h>
-#include <unistd.h>
 #include <memory>
 #include <string>
-#include <thread>  // NOLINT
 
 #include "gtest/gtest.h"
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/tensor_util.h"
-
 #include "paddle/fluid/operators/distributed/collective_client.h"
 #include "paddle/fluid/operators/distributed/collective_server.h"
-#include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace framework {
+class Variable;
+}  // namespace framework
+}  // namespace paddle
 
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h
index 4a9a9eb1701..07fd4ed4960 100644
--- a/paddle/fluid/operators/distributed/communicator.h
+++ b/paddle/fluid/operators/distributed/communicator.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <ThreadPool.h>
+#include <stdint.h>
 #include <atomic>
 #include <deque>
 #include <map>
@@ -26,8 +27,8 @@ limitations under the License. */
 #include <unordered_set>
 #include <utility>
 #include <vector>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/framework/variable_helper.h"
diff --git a/paddle/fluid/operators/distributed/communicator_test.cc b/paddle/fluid/operators/distributed/communicator_test.cc
index b9a6afa4cc3..38b7c8b0031 100644
--- a/paddle/fluid/operators/distributed/communicator_test.cc
+++ b/paddle/fluid/operators/distributed/communicator_test.cc
@@ -12,11 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <glog/logging.h>
 #include <gtest/gtest.h>
-#include <algorithm>
-#include <memory>
-#include <vector>
 
 #include "paddle/fluid/operators/distributed/communicator.h"
 
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc
index c2cb0d7f04e..7d6756b4136 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc
@@ -19,6 +19,10 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
 
+namespace grpc {
+class ByteBuffer;
+}  // namespace grpc
+
 namespace paddle {
 namespace operators {
 namespace distributed {
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h
index e9074574cdd..486870de7a5 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h
@@ -26,8 +26,12 @@ limitations under the License. */
 #include "grpc++/grpc++.h"
 #include "paddle/fluid/operators/distributed/variable_response.h"
 
+struct grpc_byte_buffer;
+
 namespace grpc {
 // A ZeroCopyInputStream that reads from grpc_byte_buffer
+class ByteBuffer;
+
 class GrpcBufferReader final
     : public ::google::protobuf::io::ZeroCopyInputStream {
   typedef void (CoreCodegenInterface::*OldReaderInitAPI)(
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h
index 6b6249540c6..22ca74a67e7 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h
@@ -16,7 +16,6 @@ limitations under the License. */
 
 #include <time.h>
 #include <atomic>
-
 #include <chrono>              // NOLINT
 #include <condition_variable>  // NOLINT
 #include <ctime>
@@ -47,6 +46,18 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 
+namespace grpc {
+class Channel;
+}  // namespace grpc
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace distributed {
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
index eddd89cf20c..0372846ce0d 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
@@ -17,19 +17,26 @@ limitations under the License. */
 #endif
 #include <limits>
 #include <memory>
-#include <thread>  // NOLINT
-
-#include "google/protobuf/io/coded_stream.h"
-#include "google/protobuf/io/zero_copy_stream.h"
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
+#include "grpcpp/impl/codegen/byte_buffer.h"
+#include "grpcpp/impl/codegen/slice.h"
 #include "paddle/fluid/operators/distributed/grpc/grpc_serde.h"
 #include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h"
 #include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+class Variable;
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace distributed {
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.h b/paddle/fluid/operators/distributed/grpc/grpc_serde.h
index 30e6907656e..932f3e2f069 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.h
@@ -24,10 +24,22 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/distributed/distributed_pb.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/platform/port.h"
 
-#include "paddle/fluid/operators/distributed/distributed_pb.h"
+namespace grpc {
+class ByteBuffer;
+}  // namespace grpc
+namespace paddle {
+namespace framework {
+class Scope;
+class Variable;
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc
index 5c0232a50a9..47e114ff4b2 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc
@@ -20,6 +20,20 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/grpc/grpc_serde.h"
 #include "paddle/fluid/operators/distributed/grpc/grpc_server.h"
 
+namespace grpc {
+class ChannelArguments;
+}  // namespace grpc
+namespace paddle {
+namespace framework {
+class Variable;
+}  // namespace framework
+namespace operators {
+namespace distributed {
+class GRPCVariableResponse;
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
+
 using ::grpc::ServerAsyncResponseWriter;
 
 DECLARE_bool(rpc_disable_reuse_port);
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.h b/paddle/fluid/operators/distributed/grpc/grpc_server.h
index ee6950205b3..3d68b7e8ceb 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_server.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_server.h
@@ -37,6 +37,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/platform/profiler.h"
 
+namespace grpc {
+class ServerCompletionQueue;
+}  // namespace grpc
+
 namespace paddle {
 namespace operators {
 namespace distributed {
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc
index e46d2fbe01c..7d7723f1945 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc
@@ -12,16 +12,29 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <stdint.h>
 #include <string>
 #include <utility>
 #include <vector>
-#ifdef PADDLE_WITH_NCCL
-#include <nccl.h>
-#endif
 
+#include "google/protobuf/io/coded_stream.h"
+#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
 #include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
 
+namespace google {
+namespace protobuf {
+namespace io {
+class ZeroCopyInputStream;
+}  // namespace io
+}  // namespace protobuf
+}  // namespace google
+namespace grpc {
+class ByteBuffer;
+}  // namespace grpc
+
 namespace paddle {
 namespace operators {
 namespace distributed {
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h
index 3ca1d89f750..4d12b4a4bac 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h
@@ -16,19 +16,30 @@
 
 #include <string>
 
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/var_type.h"
-
-#include "google/protobuf/io/coded_stream.h"
-#include "google/protobuf/io/zero_copy_stream.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/distributed/distributed_pb.h"
 #include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
 #include "paddle/fluid/operators/distributed/variable_response.h"
 
+namespace grpc {
+class ByteBuffer;
+}  // namespace grpc
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace distributed {
diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor.cc b/paddle/fluid/operators/distributed/heart_beat_monitor.cc
index 6736ea4336b..84ba9793c4e 100644
--- a/paddle/fluid/operators/distributed/heart_beat_monitor.cc
+++ b/paddle/fluid/operators/distributed/heart_beat_monitor.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/distributed/heart_beat_monitor.h"
-#include <chrono>  // NOLINT
+
 #include <ctime>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor.h b/paddle/fluid/operators/distributed/heart_beat_monitor.h
index 211e40757fc..cfef492de0e 100644
--- a/paddle/fluid/operators/distributed/heart_beat_monitor.h
+++ b/paddle/fluid/operators/distributed/heart_beat_monitor.h
@@ -14,21 +14,18 @@
 
 #pragma once
 
+#include <ThreadPool.h>
 #include <gflags/gflags.h>
-
 #include <functional>
 #include <future>  // NOLINT
 #include <memory>
 #include <string>
+#include <thread>  // NOLINT
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
 #include <vector>
 
-#include <thread>  // NOLINT
-
-#include <ThreadPool.h>
-
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc b/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc
index 699c03f6f28..7c2c0fbff11 100644
--- a/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc
+++ b/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc
@@ -14,9 +14,6 @@
 
 #include "paddle/fluid/operators/distributed/heart_beat_monitor.h"
 
-#include <algorithm>
-#include <thread>  // NOLINT
-
 #include "gtest/gtest.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed/large_scale_kv.h b/paddle/fluid/operators/distributed/large_scale_kv.h
index 9e39e68cba7..b4388c0002a 100644
--- a/paddle/fluid/operators/distributed/large_scale_kv.h
+++ b/paddle/fluid/operators/distributed/large_scale_kv.h
@@ -16,7 +16,6 @@
 
 #include <ThreadPool.h>
 #include <gflags/gflags.h>
-
 #include <functional>
 #include <future>  // NOLINT
 #include <memory>
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index a9378d61c3c..6b33c1f5fcd 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -12,30 +12,28 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <algorithm>
+#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
 #include <memory>
 #include <set>
-#include <string>
 #include <unordered_map>
 #include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
-
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor.h"
-
 #include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/rpc_client.h"
-#include "paddle/fluid/operators/distributed/variable_response.h"
-#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
+
+namespace paddle {
+namespace framework {
+class ExecutionContext;
+class Scope;
+}  // namespace framework
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
 namespace distributed {
 
+class RPCClient;
+
 using LoDTensor = framework::LoDTensor;
 using LoDTensor = framework::LoDTensor;
 using SelectedRows = framework::SelectedRows;
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h
index 8605bcdcd86..6fd3a998813 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.h
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.h
@@ -20,6 +20,13 @@
 
 #include "paddle/fluid/framework/operator.h"
 
+namespace paddle {
+namespace framework {
+class ExecutionContext;
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace distributed {
diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc
index 3b8479c91b0..a91df5b3c47 100644
--- a/paddle/fluid/operators/distributed/parameter_recv.cc
+++ b/paddle/fluid/operators/distributed/parameter_recv.cc
@@ -12,29 +12,28 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <sys/types.h>
 #include <algorithm>
 #include <memory>
-#include <set>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/operators/distributed/parameter_recv.h"
 
+#include "glog/logging.h"
+#include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor.h"
-
+#include "paddle/fluid/operators/distributed/communicator_common.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/rpc_client.h"
-#include "paddle/fluid/operators/distributed/variable_response.h"
-#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
+#include "paddle/fluid/operators/distributed/parameter_recv.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace operators {
 namespace distributed {
 
+class RPCClient;
+
 using LoDTensor = framework::LoDTensor;
 using LoDTensor = framework::LoDTensor;
 using SelectedRows = framework::SelectedRows;
diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc
index 545b1f5e803..99af06bc7cc 100644
--- a/paddle/fluid/operators/distributed/parameter_send.cc
+++ b/paddle/fluid/operators/distributed/parameter_send.cc
@@ -14,26 +14,31 @@
 
 #include "paddle/fluid/operators/distributed/parameter_send.h"
 #include <memory>
-#include <set>
-#include <string>
 #include <utility>
-#include <vector>
-
+#include "glog/logging.h"
+#include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor.h"
-
+#include "paddle/fluid/operators/distributed/communicator_common.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/rpc_client.h"
-#include "paddle/fluid/operators/distributed/variable_response.h"
-#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
-#include "paddle/fluid/string/printf.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+class Tensor;
+}  // namespace framework
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
 namespace distributed {
 
+class RPCClient;
+
 using LoDTensor = framework::LoDTensor;
 using LoDTensor = framework::LoDTensor;
 using SelectedRows = framework::SelectedRows;
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h
index 42621724e68..6d239673f91 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.h
+++ b/paddle/fluid/operators/distributed/request_handler_impl.h
@@ -15,7 +15,6 @@
 #pragma once
 
 #include <time.h>
-
 #include <functional>
 #include <memory>
 #include <string>
@@ -33,6 +32,13 @@
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+class Variable;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace distributed {
diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h
index 69a5e327431..6a6a795a46b 100644
--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@@ -17,13 +17,22 @@
 #include <condition_variable>  // NOLINT
 #include <memory>
 #include <string>
-#include "gflags/gflags.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 DECLARE_int32(rpc_deadline);
 DECLARE_int32(rpc_retry_times);
 
diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc
index 691c2c1048f..52b4456f7b1 100644
--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
@@ -15,15 +15,23 @@
 #include "paddle/fluid/operators/distributed/rpc_server.h"
 
 #include <fstream>
-#include <iostream>
-#include <limits>
 #include <string>
-#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
 namespace distributed {
 
+class RequestHandler;
+
 void RPCServer::ShutDown() {
   VLOG(3) << "RPCServer ShutDown ";
   ShutDownImpl();
diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h
index 8c7b7f1d7ee..f83144f6268 100644
--- a/paddle/fluid/operators/distributed/rpc_server.h
+++ b/paddle/fluid/operators/distributed/rpc_server.h
@@ -18,6 +18,7 @@
 #include <set>
 #include <string>
 #include <thread>  // NOLINT
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -25,10 +26,21 @@
 #include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace distributed {
 
+class RequestHandler;
+
 struct MonomerHandle {
   std::string var_name_;
   std::string rpc_name_;
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
index b52fb93e5bf..2e9d958ebfb 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
@@ -11,17 +11,20 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#ifdef PADDLE_WITH_NCCL
-#include <nccl.h>
-#endif
 #include <memory>
-#include <thread>  // NOLINT
 
-#include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-#include "paddle/fluid/operators/distributed/variable_response.h"
-#include "paddle/fluid/platform/port.h"
+
+namespace paddle {
+namespace framework {
+class Variable;
+}  // namespace framework
+namespace memory {
+namespace allocation {
+class Allocation;
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
 
 DEFINE_bool(rpc_disable_reuse_port, false, "Disable SO_REUSEPORT or not.");
 DEFINE_int32(rpc_retry_bind_port, 3,
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h
index 5457101a5c9..8a382baa5be 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.h
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <iostream>
+#include <memory>
 #include <string>
 #include <typeindex>
 #include <vector>
@@ -27,6 +28,21 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/distributed_pb.h"
 #include "paddle/fluid/platform/port.h"
 
+namespace paddle {
+namespace framework {
+class Tensor;
+class Variable;
+}  // namespace framework
+namespace memory {
+namespace allocation {
+class Allocation;
+}  // namespace allocation
+}  // namespace memory
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace distributed {
diff --git a/paddle/fluid/operators/distributed/varhandle_test.cc b/paddle/fluid/operators/distributed/varhandle_test.cc
index a0fcaf88647..7c52ef74b4c 100644
--- a/paddle/fluid/operators/distributed/varhandle_test.cc
+++ b/paddle/fluid/operators/distributed/varhandle_test.cc
@@ -12,11 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <unistd.h>
-#include <string>
-#include <thread>  // NOLINT
-
-#include "google/protobuf/text_format.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 
diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h
index d979cd8a881..be67a2396f7 100644
--- a/paddle/fluid/operators/distributed/variable_response.h
+++ b/paddle/fluid/operators/distributed/variable_response.h
@@ -16,17 +16,33 @@
 
 #include <string>
 
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/var_type.h"
-
-#include "google/protobuf/io/coded_stream.h"
-#include "google/protobuf/io/zero_copy_stream.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/distributed/distributed_pb.h"
 
+namespace google {
+namespace protobuf {
+namespace io {
+class CodedInputStream;
+class ZeroCopyInputStream;
+}  // namespace io
+}  // namespace protobuf
+}  // namespace google
+namespace paddle {
+namespace framework {
+class Variable;
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 DECLARE_string(rpc_server_profile_path);
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc b/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc
index 2ed2acb96dc..abc8d912840 100644
--- a/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc
+++ b/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc
@@ -9,15 +9,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <future>  // NOLINT
-#include <ostream>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
-#include "paddle/fluid/string/printf.h"
+
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+class Scope;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
index c9f9daf3b3c..755cbf017d9 100644
--- a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
@@ -12,19 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <future>  // NOLINT
-#include <ostream>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+class Scope;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
 
+namespace distributed {
+class RPCClient;
+}  // namespace distributed
+
 class FetchBarrierOp : public framework::OperatorBase {
  public:
   FetchBarrierOp(const std::string& type,
diff --git a/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h
index 1199a63d16a..25ad16e3fce 100644
--- a/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h
+++ b/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h
@@ -31,9 +31,25 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/rpc_server.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace paddle {
+namespace framework {
+class Executor;
+class ProgramDesc;
+class Scope;
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
+namespace distributed {
+class RPCServer;
+class RequestHandler;
+}  // namespace distributed
+
 constexpr char kOptimizeBlocks[] = "optimize_blocks";
 
 void FlRunServer(std::shared_ptr<distributed::RPCServer> service);
diff --git a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
index e63f8824783..db8c2f3f2d8 100644
--- a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
@@ -12,18 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <nccl.h>
-#include <stdint.h>
 #include <ostream>
 #include <string>
 
+#include "glog/logging.h"
 #include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
-#include "paddle/fluid/platform/nccl_helper.h"
+#include "paddle/fluid/operators/distributed/rpc_client.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
index b41e4e87722..bacfd32cc73 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
@@ -31,9 +31,25 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/rpc_server.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace paddle {
+namespace framework {
+class Executor;
+class ProgramDesc;
+class Scope;
+}  // namespace framework
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
+namespace distributed {
+class RPCServer;
+class RequestHandler;
+}  // namespace distributed
+
 constexpr char kOptimizeBlocks[] = "optimize_blocks";
 constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
 constexpr char kCheckpointBlockId[] = "checkpint_block_id";
diff --git a/paddle/fluid/operators/distributed_ops/prefetch_op.cc b/paddle/fluid/operators/distributed_ops/prefetch_op.cc
index 6037ab1523e..007dbbbfbf5 100644
--- a/paddle/fluid/operators/distributed_ops/prefetch_op.cc
+++ b/paddle/fluid/operators/distributed_ops/prefetch_op.cc
@@ -12,18 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <future>  // NOLINT
-#include <ostream>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
+
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+class Scope;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
 
+namespace distributed {
+class RPCClient;
+}  // namespace distributed
+
 class PrefetchOp : public framework::OperatorBase {
  public:
   PrefetchOp(const std::string& type, const framework::VariableNameMap& inputs,
diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc
index 2547ba3acb1..9729d0dadd7 100644
--- a/paddle/fluid/operators/distributed_ops/recv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/recv_op.cc
@@ -12,22 +12,30 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <future>  // NOLINT
-#include <ostream>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/distributed/communicator.h"
-#include "paddle/fluid/operators/distributed/communicator_common.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/parameter_recv.h"
-#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+class Scope;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
 
+namespace distributed {
+class RPCClient;
+}  // namespace distributed
+
 class RecvOp : public framework::OperatorBase {
  public:
   RecvOp(const std::string &type, const framework::VariableNameMap &inputs,
diff --git a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc
index befdf4e9388..4727b3bb249 100644
--- a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc
+++ b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc
@@ -13,8 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h"
+
 #include <string>
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h
index 43dd9c3c98a..d8639627c3e 100644
--- a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h
+++ b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <stdio.h>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc
index a8e9379d214..5aa2ba26aa4 100644
--- a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc
+++ b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc
@@ -12,20 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <future>  // NOLINT
-#include <ostream>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
 
-#include "paddle/fluid/platform/profiler.h"
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+class Scope;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
 
+namespace distributed {
+class RPCClient;
+}  // namespace distributed
+
 class SendBarrierOp : public framework::OperatorBase {
  public:
   SendBarrierOp(const std::string& type,
diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc
index 53e3d70f960..a4192c18afa 100644
--- a/paddle/fluid/operators/distributed_ops/send_op.cc
+++ b/paddle/fluid/operators/distributed_ops/send_op.cc
@@ -12,23 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <future>  // NOLINT
-#include <ostream>
-
-#include "paddle/fluid/framework/blocking_queue.h"
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/distributed/communicator.h"
-#include "paddle/fluid/operators/distributed/communicator_common.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
-#include "paddle/fluid/operators/distributed/parameter_send.h"
-#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
-#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+class Scope;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
 
+namespace distributed {
+class RPCClient;
+}  // namespace distributed
+
 class SendOp : public framework::OperatorBase {
  public:
   SendOp(const std::string& type, const framework::VariableNameMap& inputs,
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc
index 97624944ca1..68a98e7c6bc 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc
@@ -20,6 +20,19 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
index 152f20273a1..c4efc4ab72d 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/blas.h"
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
index 5a398fa50fe..ddd69203fd3 100644
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
@@ -13,9 +13,26 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h"
+
 #include <string>
+
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 class ElementwiseFloorDivOpMaker : public ElementwiseOpMaker {
diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc
index 692bc015c5b..be6a6330547 100644
--- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc
@@ -13,10 +13,24 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise/elementwise_max_op.h"
-#include <memory>
+
 #include <string>
+
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc
index 1b2364a5a53..bd40763e05a 100644
--- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc
@@ -13,10 +13,24 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise/elementwise_min_op.h"
-#include <memory>
+
 #include <string>
+
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.h b/paddle/fluid/operators/elementwise/elementwise_min_op.h
index 1a49a601398..5a3e7f90f3c 100644
--- a/paddle/fluid/operators/elementwise/elementwise_min_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_min_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
index 8c2e62bed19..d8ad0a353c9 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
@@ -13,9 +13,26 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise/elementwise_mod_op.h"
+
 #include <string>
+
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 class ElementwiseModOpMaker : public ElementwiseOpMaker {
diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op.cc
index 1321eee8457..ea0e8e7c013 100644
--- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cc
@@ -10,10 +10,24 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise/elementwise_pow_op.h"
-#include <memory>
+
 #include <string>
+
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.h b/paddle/fluid/operators/elementwise/elementwise_pow_op.h
index a910c326196..535d838209d 100755
--- a/paddle/fluid/operators/elementwise/elementwise_pow_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.h
@@ -13,6 +13,7 @@ limitations under the License. */
 
 #include <cmath>
 #include <type_traits>
+
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc
index 9603b022d5d..90f4ebb99ec 100644
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc
@@ -13,10 +13,24 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
-#include <memory>
+
 #include <string>
+
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
index caaaf2c931d..3dcf5bf6a32 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
@@ -14,6 +14,16 @@
 
 #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
 
+namespace paddle {
+namespace framework {
+class ExecutionContext;
+}  // namespace framework
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 template <typename T>
diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc
index be8b8d6c2f7..15c31a4cece 100644
--- a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc
+++ b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc
@@ -12,20 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <algorithm>
-#include <cstdlib>
-#include <memory>
-#include <random>
-#include <string>
-#include <vector>
 #include "gtest/gtest.h"
-#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 
 USE_OP(elementwise_add);
diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc
index 6ec8f2c2355..cf9e9dbb04b 100644
--- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc
+++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc
@@ -12,14 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <algorithm>
-#include <cstdlib>
-#include <memory>
 #include <random>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/operators/enqueue_op.cc b/paddle/fluid/operators/enqueue_op.cc
index 9b367a72fb5..a7920488f2e 100644
--- a/paddle/fluid/operators/enqueue_op.cc
+++ b/paddle/fluid/operators/enqueue_op.cc
@@ -13,13 +13,24 @@
 // limitations under the License.
 
 #include <string>
-#include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
+
 using LoDTensor = paddle::framework::LoDTensor;
 using LoDTensorBlockingQueueHolder =
     paddle::operators::reader::LoDTensorBlockingQueueHolder;
diff --git a/paddle/fluid/operators/eye_op.h b/paddle/fluid/operators/eye_op.h
index 0eefe7d2163..d5ad27596d6 100644
--- a/paddle/fluid/operators/eye_op.h
+++ b/paddle/fluid/operators/eye_op.h
@@ -51,7 +51,7 @@ class EyeKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     set_zero(dev_ctx, out_tensor, static_cast<T>(0));
 
-    int64_t num_eyes = std::min(num_rows, num_columns);
+    int64_t num_eyes = (std::min)(num_rows, num_columns);
     platform::ForRange<DeviceContext> for_range(dev_ctx, num_eyes);
     EyeFunctor<T> functor(num_columns, out_data);
     for_range(functor);
diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
index 17cb4556d45..37a442a7815 100644
--- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
+++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
@@ -13,10 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h"
-#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 
+namespace paddle {
+namespace platform {
+struct CUDAPlace;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc
index f6b156eb30d..ea1fa813928 100644
--- a/paddle/fluid/operators/gather_test.cc
+++ b/paddle/fluid/operators/gather_test.cc
@@ -13,10 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <iostream>
-#include <string>
 
-#include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
index 93f9e108723..f0903bdfce9 100644
--- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
@@ -15,6 +15,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 
+namespace paddle {
+namespace framework {
+class Tensor;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc
index ba918b3def2..bdc5debaea7 100644
--- a/paddle/fluid/operators/gru_op.cu.cc
+++ b/paddle/fluid/operators/gru_op.cu.cc
@@ -14,6 +14,13 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/gru_op.h"
 
+namespace paddle {
+namespace platform {
+class CUDADeviceContext;
+struct CUDAPlace;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc
index b6017a6eafc..47b480c11c2 100644
--- a/paddle/fluid/operators/hash_op.cc
+++ b/paddle/fluid/operators/hash_op.cc
@@ -13,8 +13,24 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/hash_op.h"
+
 #include <string>
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/hash_op.h b/paddle/fluid/operators/hash_op.h
index c2d53000491..b5b3f3de70c 100644
--- a/paddle/fluid/operators/hash_op.h
+++ b/paddle/fluid/operators/hash_op.h
@@ -18,6 +18,7 @@ extern "C" {
 #include <xxhash.h>
 }
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc
index b4f33dad927..e8edfb99f9f 100644
--- a/paddle/fluid/operators/increment_op.cc
+++ b/paddle/fluid/operators/increment_op.cc
@@ -13,9 +13,23 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/increment_op.h"
-#include <memory>
+
 #include <string>
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc
index 9b92ce3e538..1ac1c26796c 100644
--- a/paddle/fluid/operators/isfinite_op.cc
+++ b/paddle/fluid/operators/isfinite_op.cc
@@ -13,8 +13,24 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/isfinite_op.h"
+
 #include <string>
-#include <vector>
+
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h
index 2fc0d58669b..a54134910d0 100644
--- a/paddle/fluid/operators/isfinite_op.h
+++ b/paddle/fluid/operators/isfinite_op.h
@@ -15,12 +15,19 @@
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/transform.h"
 
+namespace paddle {
+namespace framework {
+class Tensor;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc
index 72da43e3bc6..fcbb4c5bf6a 100644
--- a/paddle/fluid/operators/isfinite_v2_op.cc
+++ b/paddle/fluid/operators/isfinite_v2_op.cc
@@ -13,10 +13,31 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/isfinite_v2_op.h"
+
 #include <string>
-#include <vector>
+
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
-#include "paddle/fluid/platform/float16.h"
+
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace operators {
+template <typename DeviceContext, typename T, typename Functor>
+class OverflowKernel;
+}  // namespace operators
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
 
 namespace plat = paddle::platform;
 
diff --git a/paddle/fluid/operators/isfinite_v2_op.h b/paddle/fluid/operators/isfinite_v2_op.h
index 9f0aa63ce80..332c50d7551 100644
--- a/paddle/fluid/operators/isfinite_v2_op.h
+++ b/paddle/fluid/operators/isfinite_v2_op.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
@@ -22,6 +23,12 @@
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/transform.h"
 
+namespace paddle {
+namespace framework {
+class Tensor;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/jit/gen/act.h b/paddle/fluid/operators/jit/gen/act.h
index 585196e939c..cd360caa39b 100644
--- a/paddle/fluid/operators/jit/gen/act.h
+++ b/paddle/fluid/operators/jit/gen/act.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "glog/logging.h"
 #include "paddle/fluid/operators/jit/gen/jitcode.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/operators/jit/gen/blas.h b/paddle/fluid/operators/jit/gen/blas.h
index ff4a13a3679..d3d9eddd2ee 100644
--- a/paddle/fluid/operators/jit/gen/blas.h
+++ b/paddle/fluid/operators/jit/gen/blas.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "glog/logging.h"
 #include "paddle/fluid/operators/jit/gen/jitcode.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/operators/jit/gen/embseqpool.cc b/paddle/fluid/operators/jit/gen/embseqpool.cc
index 331a4b0d075..b4e63d87eac 100644
--- a/paddle/fluid/operators/jit/gen/embseqpool.cc
+++ b/paddle/fluid/operators/jit/gen/embseqpool.cc
@@ -13,10 +13,11 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/gen/embseqpool.h"
+
 #include <stddef.h>  // offsetof
 #include <memory>
 #include <vector>
-#include "paddle/fluid/operators/jit/gen/act.h"  // for exp_float_consts ones
+
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
diff --git a/paddle/fluid/operators/jit/gen/embseqpool.h b/paddle/fluid/operators/jit/gen/embseqpool.h
index 67a39350952..8353e5846f7 100644
--- a/paddle/fluid/operators/jit/gen/embseqpool.h
+++ b/paddle/fluid/operators/jit/gen/embseqpool.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "glog/logging.h"
 #include "paddle/fluid/operators/jit/gen/jitcode.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/operators/jit/gen/gru.cc b/paddle/fluid/operators/jit/gen/gru.cc
index b5b0cffa806..fbdf49d5d58 100644
--- a/paddle/fluid/operators/jit/gen/gru.cc
+++ b/paddle/fluid/operators/jit/gen/gru.cc
@@ -13,8 +13,10 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/gen/gru.h"
+
 #include <stddef.h>  // offsetof
 #include <memory>
+
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
diff --git a/paddle/fluid/operators/jit/gen/gru.h b/paddle/fluid/operators/jit/gen/gru.h
index e047a65cb4b..588d11820b2 100644
--- a/paddle/fluid/operators/jit/gen/gru.h
+++ b/paddle/fluid/operators/jit/gen/gru.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "glog/logging.h"
 #include "paddle/fluid/operators/jit/gen/act.h"
 #include "paddle/fluid/operators/jit/gen/jitcode.h"
diff --git a/paddle/fluid/operators/jit/gen/hopv.h b/paddle/fluid/operators/jit/gen/hopv.h
index 575dec68a58..113c66a2558 100644
--- a/paddle/fluid/operators/jit/gen/hopv.h
+++ b/paddle/fluid/operators/jit/gen/hopv.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "glog/logging.h"
 #include "paddle/fluid/operators/jit/gen/jitcode.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/operators/jit/gen/lstm.cc b/paddle/fluid/operators/jit/gen/lstm.cc
index 2c3bc985e9a..211dfc5ecad 100644
--- a/paddle/fluid/operators/jit/gen/lstm.cc
+++ b/paddle/fluid/operators/jit/gen/lstm.cc
@@ -13,8 +13,10 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/gen/lstm.h"
+
 #include <stddef.h>  // offsetof
 #include <memory>
+
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
diff --git a/paddle/fluid/operators/jit/gen/lstm.h b/paddle/fluid/operators/jit/gen/lstm.h
index 6f232906569..c980670d482 100644
--- a/paddle/fluid/operators/jit/gen/lstm.h
+++ b/paddle/fluid/operators/jit/gen/lstm.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "glog/logging.h"
 #include "paddle/fluid/operators/jit/gen/act.h"
 #include "paddle/fluid/operators/jit/gen/jitcode.h"
diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc
index 3a455334f58..047d0d3e1ca 100644
--- a/paddle/fluid/operators/jit/gen/matmul.cc
+++ b/paddle/fluid/operators/jit/gen/matmul.cc
@@ -13,9 +13,10 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/gen/matmul.h"
+
 #include <stddef.h>  // offsetof
 #include <memory>
-#include <vector>
+
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h
index 881cea581ac..4f04f7606d2 100644
--- a/paddle/fluid/operators/jit/gen/matmul.h
+++ b/paddle/fluid/operators/jit/gen/matmul.h
@@ -17,6 +17,7 @@
 #include <stdlib.h>  // for malloc and free
 #include <string>
 #include <vector>
+
 #include "glog/logging.h"
 #include "paddle/fluid/operators/jit/gen/jitcode.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/operators/jit/gen/seqpool.h b/paddle/fluid/operators/jit/gen/seqpool.h
index a0f1bb8299a..cb562c4c9a6 100644
--- a/paddle/fluid/operators/jit/gen/seqpool.h
+++ b/paddle/fluid/operators/jit/gen/seqpool.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "glog/logging.h"
 #include "paddle/fluid/operators/jit/gen/jitcode.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/operators/jit/gen/sgd.cc b/paddle/fluid/operators/jit/gen/sgd.cc
index 40f8298af39..1452d4139b0 100644
--- a/paddle/fluid/operators/jit/gen/sgd.cc
+++ b/paddle/fluid/operators/jit/gen/sgd.cc
@@ -13,9 +13,10 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/gen/sgd.h"
+
 #include <stddef.h>  // offsetof
 #include <memory>
-#include <vector>
+
 #include "paddle/fluid/operators/jit/registry.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
diff --git a/paddle/fluid/operators/jit/gen/sgd.h b/paddle/fluid/operators/jit/gen/sgd.h
index 80b1809bbbf..403d97b8fec 100644
--- a/paddle/fluid/operators/jit/gen/sgd.h
+++ b/paddle/fluid/operators/jit/gen/sgd.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "glog/logging.h"
 #include "paddle/fluid/operators/jit/gen/jitcode.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.h b/paddle/fluid/operators/jit/gen/vbroadcast.h
index 27c75f6f710..7d30fe5751b 100644
--- a/paddle/fluid/operators/jit/gen/vbroadcast.h
+++ b/paddle/fluid/operators/jit/gen/vbroadcast.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "glog/logging.h"
 #include "paddle/fluid/operators/jit/gen/jitcode.h"
 
diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h
index 033c603c07c..27b85763415 100644
--- a/paddle/fluid/operators/jit/gen_base.h
+++ b/paddle/fluid/operators/jit/gen_base.h
@@ -18,6 +18,7 @@
 #include <memory>  // for unique_ptr
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/operators/jit/kernel_base.h"
 
 DECLARE_bool(dump_jitcode);
diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h
index 39e5ee2be15..b6dd49b7772 100644
--- a/paddle/fluid/operators/jit/helper.h
+++ b/paddle/fluid/operators/jit/helper.h
@@ -21,6 +21,7 @@
 #include <unordered_map>
 #include <utility>  // for std::move
 #include <vector>
+
 #include "paddle/fluid/operators/jit/gen_base.h"
 #include "paddle/fluid/operators/jit/kernel_base.h"
 #include "paddle/fluid/operators/jit/kernel_key.h"
@@ -31,6 +32,8 @@ namespace paddle {
 namespace operators {
 namespace jit {
 
+class GenBase;
+
 template <typename KernelTuple, typename PlaceType>
 inline typename std::enable_if<
     std::is_same<typename KernelTuple::data_type, float>::value &&
diff --git a/paddle/fluid/operators/jit/kernel_pool.cc b/paddle/fluid/operators/jit/kernel_pool.cc
index f1719be9873..7b6b13c3d98 100644
--- a/paddle/fluid/operators/jit/kernel_pool.cc
+++ b/paddle/fluid/operators/jit/kernel_pool.cc
@@ -13,9 +13,6 @@
  * limitations under the License. */
 
 #include "paddle/fluid/operators/jit/kernel_pool.h"
-#include <memory>  // for shared_ptr
-#include <string>
-#include <unordered_map>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/jit/kernel_pool.h b/paddle/fluid/operators/jit/kernel_pool.h
index 48435cf6ef2..763be7d3cf5 100644
--- a/paddle/fluid/operators/jit/kernel_pool.h
+++ b/paddle/fluid/operators/jit/kernel_pool.h
@@ -20,6 +20,7 @@
 #include <unordered_map>
 #include <utility>  // for move
 #include <vector>
+
 #include "paddle/fluid/operators/jit/gen_base.h"
 #include "paddle/fluid/operators/jit/kernel_base.h"
 #include "paddle/fluid/operators/jit/kernel_key.h"
@@ -29,6 +30,8 @@ namespace paddle {
 namespace operators {
 namespace jit {
 
+struct KernelKey;
+
 extern std::map<size_t, std::shared_ptr<void>>& GetJITCodesMap();
 
 template <KernelType KT>
diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
index 49b1a1fea4b..6e0c972e27a 100644
--- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
+++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <type_traits>
+
 #include "paddle/fluid/operators/jit/kernel_base.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h
index 7b9f676050d..6a44bb25612 100644
--- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h
+++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <type_traits>
+
 #include "paddle/fluid/operators/jit/kernel_base.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/jit/more/mix/mix.h b/paddle/fluid/operators/jit/more/mix/mix.h
index 035425317ed..240cbbcda4f 100644
--- a/paddle/fluid/operators/jit/more/mix/mix.h
+++ b/paddle/fluid/operators/jit/more/mix/mix.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <type_traits>
+
 #include "paddle/fluid/operators/jit/kernel_base.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h
index b38cc107b8e..ee31c8df2f8 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.h
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.h
@@ -17,6 +17,7 @@
 #include <cmath>
 #include <type_traits>
 #include <vector>
+
 #include "paddle/fluid/operators/jit/kernel_base.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h
index 136b99e0aef..b8d5e2c2407 100644
--- a/paddle/fluid/operators/jit/refer/refer.h
+++ b/paddle/fluid/operators/jit/refer/refer.h
@@ -17,6 +17,7 @@
 #include <cmath>
 #include <limits>
 #include <string>
+
 #include "paddle/fluid/operators/jit/helper.h"
 #include "paddle/fluid/operators/jit/kernel_base.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc
index ac0405b9a6e..231ff941278 100644
--- a/paddle/fluid/operators/label_smooth_op.cc
+++ b/paddle/fluid/operators/label_smooth_op.cc
@@ -13,9 +13,23 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/label_smooth_op.h"
-#include <memory>
+
 #include <string>
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h
index 6968c1a5b13..931cd6d1794 100644
--- a/paddle/fluid/operators/layer_norm_op.h
+++ b/paddle/fluid/operators/layer_norm_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
@@ -27,6 +28,14 @@ limitations under the License. */
 #endif
 #include "paddle/fluid/operators/math/math_function.h"
 
+namespace paddle {
+namespace platform {
+class CPUDeviceContext;
+class CUDADeviceContext;
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/lod_array_length_op.cc b/paddle/fluid/operators/lod_array_length_op.cc
index 49e8cbbbaab..70da0149cad 100644
--- a/paddle/fluid/operators/lod_array_length_op.cc
+++ b/paddle/fluid/operators/lod_array_length_op.cc
@@ -12,9 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+class Scope;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc
index 7cbfbd03e1d..a9128b15bdb 100644
--- a/paddle/fluid/operators/lod_rank_table_op.cc
+++ b/paddle/fluid/operators/lod_rank_table_op.cc
@@ -11,8 +11,21 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+class Scope;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc
index b130e84933b..db07a104a89 100644
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
@@ -11,14 +11,19 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <algorithm>
-#include <map>
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/port.h"
+
+namespace paddle {
+namespace framework {
+class OpDesc;
+class Scope;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc
index 550de1aadde..5271da91b8c 100644
--- a/paddle/fluid/operators/math/beam_search.cc
+++ b/paddle/fluid/operators/math/beam_search.cc
@@ -13,8 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/beam_search.h"
-#include <algorithm>
-#include <map>
+
+namespace paddle {
+namespace framework {
+class LoDTensor;
+class Tensor;
+}  // namespace framework
+namespace platform {
+class CPUDeviceContext;
+}  // namespace platform
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc
index 7ea8eb8b00d..97ce3d3f878 100644
--- a/paddle/fluid/operators/math/beam_search_test.cc
+++ b/paddle/fluid/operators/math/beam_search_test.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/beam_search.h"
+
 #include <gtest/gtest.h>
-#include <vector>
 
 void PrepareCPUTensors(paddle::framework::LoDTensor* ids,
                        paddle::framework::LoDTensor* scores,
diff --git a/paddle/fluid/operators/math/blas.cc b/paddle/fluid/operators/math/blas.cc
index 2a7ce83967f..3bc1b4f4048 100644
--- a/paddle/fluid/operators/math/blas.cc
+++ b/paddle/fluid/operators/math/blas.cc
@@ -13,8 +13,8 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/math/blas.h"
-
 #include <utility>
+
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
index 42a60e9220c..562e2de3bd3 100644
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -17,6 +17,13 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/tensor.h"
 
+namespace paddle {
+namespace framework {
+class ExecutionContext;
+class Tensor;
+}  // namespace framework
+}  // namespace paddle
+
 #ifdef PADDLE_WITH_MKLML
 #include "paddle/fluid/platform/dynload/mklml.h"
 #endif
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 515d6a2435e..c53c453897f 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -695,9 +695,9 @@ void Blas<platform::CPUDeviceContext>::BatchedGEMM(
     CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
     T alpha, const T **A, const T **B, T beta, T **C, int batchCount) const {
 #ifdef PADDLE_WITH_MKLML
-  const int lda = std::max((transA == CblasNoTrans) ? K : M, 1);
-  const int ldb = std::max((transB == CblasNoTrans) ? N : K, 1);
-  const int ldc = std::max(N, 1);
+  const int lda = (std::max)((transA == CblasNoTrans) ? K : M, 1);
+  const int ldb = (std::max)((transB == CblasNoTrans) ? N : K, 1);
+  const int ldc = (std::max)(N, 1);
   CBlas<T>::GEMM_BATCH(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha, A,
                        &lda, B, &ldb, &beta, C, &ldc, 1 /* group_count */,
                        &batchCount);
diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc
index b6bd58d118e..3b0c3c1686a 100644
--- a/paddle/fluid/operators/math/concat_and_split.cc
+++ b/paddle/fluid/operators/math/concat_and_split.cc
@@ -13,7 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/concat_and_split.h"
-#include <vector>
+
+namespace paddle {
+namespace framework {
+class Tensor;
+}  // namespace framework
+namespace platform {
+class CPUDeviceContext;
+struct bfloat16;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc
index 270a9d3f80a..094e2059c4d 100644
--- a/paddle/fluid/operators/math/concat_test.cc
+++ b/paddle/fluid/operators/math/concat_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <vector>
+
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 
diff --git a/paddle/fluid/operators/math/context_project.cc b/paddle/fluid/operators/math/context_project.cc
index 537d0b47868..927d610e2ce 100644
--- a/paddle/fluid/operators/math/context_project.cc
+++ b/paddle/fluid/operators/math/context_project.cc
@@ -14,6 +14,12 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/context_project.h"
 
+namespace paddle {
+namespace platform {
+class CPUDeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h
index 051c6019d74..08bb555c593 100644
--- a/paddle/fluid/operators/math/context_project.h
+++ b/paddle/fluid/operators/math/context_project.h
@@ -16,8 +16,10 @@ limitations under the License. */
 
 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/operators/math/blas.h"
+
 #include "paddle/fluid/operators/math/im2col.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/math/cos_sim_functor.cc b/paddle/fluid/operators/math/cos_sim_functor.cc
index cbe16999124..f7770050bee 100644
--- a/paddle/fluid/operators/math/cos_sim_functor.cc
+++ b/paddle/fluid/operators/math/cos_sim_functor.cc
@@ -14,6 +14,12 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/cos_sim_functor.h"
 
+namespace paddle {
+namespace platform {
+class CPUDeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/cos_sim_functor.h b/paddle/fluid/operators/math/cos_sim_functor.h
index d74662e68e7..9a24bfc3312 100644
--- a/paddle/fluid/operators/math/cos_sim_functor.h
+++ b/paddle/fluid/operators/math/cos_sim_functor.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <math.h>
 #include <stdlib.h>
+
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
@@ -43,6 +44,7 @@ struct CosSimFunctor {
         tep_x = x[i];
         tep_y = y[i];
         xx += tep_x * tep_x;
+
         yy += tep_y * tep_y;
         xy += tep_x * tep_y;
       }
diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index 925f3b6161a..eb7c622e596 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <cmath>
 #include <functional>
 #include <string>
+
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc
index 6490d81cec7..07fe9c30f39 100644
--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
@@ -16,12 +16,10 @@ limitations under the License. */
 #include <cstring>
 #include <random>
 #include <vector>
-#include "gflags/gflags.h"
+
 #include "glog/logging.h"
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/operators/math/cpu_vec.h"
-#include "paddle/fluid/platform/port.h"
 
 inline double GetCurrentUS() {
   struct timeval time;
diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc
index 7a1ed47d182..23840143a44 100644
--- a/paddle/fluid/operators/math/cross_entropy.cc
+++ b/paddle/fluid/operators/math/cross_entropy.cc
@@ -14,6 +14,12 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/cross_entropy.h"
 
+namespace paddle {
+namespace platform {
+class CPUDeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc
index 07c5cbf3337..4b8a6274cce 100644
--- a/paddle/fluid/operators/math/gru_compute.cc
+++ b/paddle/fluid/operators/math/gru_compute.cc
@@ -10,10 +10,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/gru_compute.h"
+
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
 #include "paddle/fluid/operators/math/detail/gru_kernel.h"
 
+namespace paddle {
+namespace platform {
+class CPUDeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc
index 6fb393d791c..8efd35ca108 100644
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -13,9 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/im2col.h"
-#include <vector>
+
 #include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
 
+namespace paddle {
+namespace platform {
+class CPUDeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/lstm_compute.cc b/paddle/fluid/operators/math/lstm_compute.cc
index 94bbcbb5067..7e74f688019 100644
--- a/paddle/fluid/operators/math/lstm_compute.cc
+++ b/paddle/fluid/operators/math/lstm_compute.cc
@@ -13,9 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/lstm_compute.h"
+
 #include "paddle/fluid/operators/math/detail/lstm_cpu_kernel.h"
 #include "paddle/fluid/operators/math/detail/lstm_kernel.h"
 
+namespace paddle {
+namespace platform {
+class CPUDeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc
index d6f51c6e5c6..7c50ba630db 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.cc
+++ b/paddle/fluid/operators/math/matrix_bit_code.cc
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/matrix_bit_code.h"
-#include <iostream>
-#include <map>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/matrix_inverse.cu.cc b/paddle/fluid/operators/math/matrix_inverse.cu.cc
index 614f89a048c..950aed0aa49 100644
--- a/paddle/fluid/operators/math/matrix_inverse.cu.cc
+++ b/paddle/fluid/operators/math/matrix_inverse.cu.cc
@@ -12,14 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/math/matrix_inverse.h"
-#include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/math/blas.h"
 
+namespace paddle {
+namespace platform {
+class CUDADeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace math {
 
+template <typename DeviceContext, typename T>
+class MatrixInverseFunctor;
+
 template <typename T>
 class MatrixInverseFunctor<platform::CUDADeviceContext, T> {
  public:
diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc
index 1e86c2e7a32..40cea7483f3 100644
--- a/paddle/fluid/operators/math/pooling.cc
+++ b/paddle/fluid/operators/math/pooling.cc
@@ -13,9 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/math/pooling.h"
 #include <algorithm>
-#include <string>
-#include <vector>
-#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/sample_prob.cc b/paddle/fluid/operators/math/sample_prob.cc
index 99aa318453e..16342493e45 100644
--- a/paddle/fluid/operators/math/sample_prob.cc
+++ b/paddle/fluid/operators/math/sample_prob.cc
@@ -14,6 +14,12 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/sample_prob.h"
 
+namespace paddle {
+namespace platform {
+class CPUDeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h
index e5a6d84cb2b..7b08df660a0 100644
--- a/paddle/fluid/operators/math/sample_prob.h
+++ b/paddle/fluid/operators/math/sample_prob.h
@@ -16,10 +16,17 @@ limitations under the License. */
 #include <iostream>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/eigen.h"
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/math/sampler.h"
+namespace paddle {
+namespace platform {
+class CUDADeviceContext;
+}  // namespace platform
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc
index 81ad620466e..5cb1cc5dc03 100644
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/fluid/operators/math/sequence2batch.cc
index 300a3692012..852700fa7ff 100644
--- a/paddle/fluid/operators/math/sequence2batch.cc
+++ b/paddle/fluid/operators/math/sequence2batch.cc
@@ -14,6 +14,12 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/sequence2batch.h"
 
+namespace paddle {
+namespace platform {
+class CPUDeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc
index 076df017642..e29313e9f74 100644
--- a/paddle/fluid/operators/math/sequence_padding.cc
+++ b/paddle/fluid/operators/math/sequence_padding.cc
@@ -14,6 +14,16 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/sequence_padding.h"
 
+namespace paddle {
+namespace framework {
+class LoDTensor;
+class Tensor;
+}  // namespace framework
+namespace platform {
+class CPUDeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc
index eab4553ae8b..8892a17886a 100644
--- a/paddle/fluid/operators/math/sequence_padding_test.cc
+++ b/paddle/fluid/operators/math/sequence_padding_test.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/sequence_padding.h"
+
 #include <gtest/gtest.h>
-#include <vector>
 
 template <typename DeviceContext, typename T>
 void TestSequencePadding(const DeviceContext &context,
diff --git a/paddle/fluid/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc
index ee5b22ca855..78cbdf311ad 100644
--- a/paddle/fluid/operators/math/sequence_scale.cc
+++ b/paddle/fluid/operators/math/sequence_scale.cc
@@ -14,6 +14,12 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/sequence_scale.h"
 
+namespace paddle {
+namespace framework {
+class LoDTensor;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/sequence_scale.h b/paddle/fluid/operators/math/sequence_scale.h
index 202243985c1..d84513e024d 100644
--- a/paddle/fluid/operators/math/sequence_scale.h
+++ b/paddle/fluid/operators/math/sequence_scale.h
@@ -17,6 +17,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace paddle {
+namespace framework {
+class LoDTensor;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace math {
@@ -40,6 +46,7 @@ namespace math {
  *                      total_sequence_length is the sum of all sequences'
  *                      length.
  * \param scales        Array<T>. The i-th sequence will be scaled by scales[i].
+
  * \param num_seq       Number of sequence
  *
  */
diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc
index 794fc647172..42bf1f471de 100644
--- a/paddle/fluid/operators/math/vol2col.cc
+++ b/paddle/fluid/operators/math/vol2col.cc
@@ -13,7 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/vol2col.h"
-#include <vector>
+
+namespace paddle {
+namespace platform {
+class CPUDeviceContext;
+}  // namespace platform
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc
index aa979c4f109..6ed5a0943eb 100644
--- a/paddle/fluid/operators/math/vol2col_test.cc
+++ b/paddle/fluid/operators/math/vol2col_test.cc
@@ -13,9 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/vol2col.h"
+
 #include <gtest/gtest.h>
-#include <iostream>
-#include <vector>
 
 template <typename DeviceContext, typename Place>
 void testVol2col() {
diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h
index dc83e4d9648..8cd4fa12be4 100644
--- a/paddle/fluid/operators/matmul_v2_op.h
+++ b/paddle/fluid/operators/matmul_v2_op.h
@@ -65,7 +65,7 @@ static void GetBroadcastFromDims(const int x_ndim, const std::int64_t* x_dims,
                                  std::int64_t* x_bd_dims,
                                  std::int64_t* y_bd_dims,
                                  std::int64_t* out_bd_dims) {
-  const int ndim = std::max(x_ndim, y_ndim);
+  const int ndim = (std::max)(x_ndim, y_ndim);
   std::fill(x_bd_dims, x_bd_dims + ndim - x_ndim, 1);
   std::fill(y_bd_dims, y_bd_dims + ndim - y_ndim, 1);
   std::copy(x_dims, x_dims + x_ndim, x_bd_dims + ndim - x_ndim);
@@ -79,7 +79,7 @@ static void GetBroadcastFromDims(const int x_ndim, const std::int64_t* x_dims,
     if (x_bd_dims[i] == 0 || y_bd_dims[i] == 0) {
       out_bd_dims[i] = 0;
     } else {
-      out_bd_dims[i] = std::max(x_bd_dims[i], y_bd_dims[i]);
+      out_bd_dims[i] = (std::max)(x_bd_dims[i], y_bd_dims[i]);
     }
   }
 }
@@ -229,7 +229,7 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
                                                  "Input(X) has error dim."));
   }
   const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1];
-  const int ndim = std::max(x_ndim, y_ndim);
+  const int ndim = (std::max)(x_ndim, y_ndim);
   std::vector<std::int64_t> x_broadcast_dims(ndim);
   std::vector<std::int64_t> y_broadcast_dims(ndim);
   std::vector<std::int64_t> out_broadcast_dims(ndim);
diff --git a/paddle/fluid/operators/max_sequence_len_op.cc b/paddle/fluid/operators/max_sequence_len_op.cc
index b47ec8bc70a..4f73de086f2 100644
--- a/paddle/fluid/operators/max_sequence_len_op.cc
+++ b/paddle/fluid/operators/max_sequence_len_op.cc
@@ -12,10 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+class Scope;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc
index 87d914aa797..584de34c5d3 100644
--- a/paddle/fluid/operators/merge_lod_tensor_op.cc
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
@@ -13,7 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/memcpy.h"
+
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class LoDTensor;
+class OpDesc;
+class Scope;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
index 487deb11b48..aecf67fc3bb 100644
--- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
@@ -15,6 +15,15 @@
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 
+namespace paddle {
+namespace framework {
+class Tensor;
+}  // namespace framework
+namespace platform {
+class MKLDNNDeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
index 8a02a697cbb..98f368aa7a9 100644
--- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
@@ -12,10 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "mkldnn.hpp"
 #include "paddle/fluid/operators/batch_norm_op.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 
+namespace paddle {
+namespace framework {
+class Tensor;
+}  // namespace framework
+namespace platform {
+class MKLDNNDeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index 19ee8764e27..a6cda154e55 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -12,12 +12,16 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include <unordered_map>
 #include "paddle/fluid/framework/data_layout_transform.h"
-#include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/conv_op.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 
+namespace paddle {
+namespace platform {
+class MKLDNNDeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
index 40737f4cd02..0bec5619f54 100644
--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
@@ -12,13 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <mkldnn/include/mkldnn_types.h>
 #include <memory>
-#include "paddle/fluid/framework/tensor.h"
+
 #include "paddle/fluid/operators/fc_op.h"
-#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
-#include "paddle/fluid/platform/variant.h"
+
+namespace paddle {
+namespace framework {
+class LoDTensor;
+class Tensor;
+}  // namespace framework
+namespace platform {
+class MKLDNNDeviceContext;
+}  // namespace platform
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
index 00c10cecbf4..9ee653ec589 100644
--- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
@@ -12,10 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/lrn_op.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 
+namespace paddle {
+namespace framework {
+class Tensor;
+}  // namespace framework
+namespace platform {
+class MKLDNNDeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc
index 5ca0ed1182e..3ae34fe0e90 100644
--- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc
@@ -12,12 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "mkldnn.hpp"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
 
+namespace paddle {
+namespace platform {
+class MKLDNNDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc
index 1dd1ad11786..4f0b7cab47e 100644
--- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc
@@ -13,12 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <string>
-#include <vector>
-#include "paddle/fluid/framework/data_layout_transform.h"
-#include "paddle/fluid/memory/malloc.h"
+
 #include "paddle/fluid/operators/mul_op.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace framework {
+class Tensor;
+}  // namespace framework
+namespace platform {
+class MKLDNNDeviceContext;
+}  // namespace platform
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
index 9df30b3295c..bf12c61a4d9 100644
--- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/operators/pool_op.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
index 5014381a4e2..0b159f9dcfa 100644
--- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
@@ -12,12 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <iostream>
-#include <numeric>
-#include "mkldnn.hpp"
 #include "paddle/fluid/operators/softmax_op.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 
+namespace paddle {
+namespace framework {
+class Tensor;
+}  // namespace framework
+namespace platform {
+class MKLDNNDeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
index 1e0e13abb7c..414312fe97e 100644
--- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
@@ -24,13 +24,19 @@
   See the License for the specific language governing permissions and
   limitations under the License. */
 
-#include "mkldnn.hpp"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/operators/sum_op.h"
-#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
 
+namespace paddle {
+namespace framework {
+class Tensor;
+}  // namespace framework
+namespace platform {
+class CPUDeviceContext;
+class MKLDNNDeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.cc b/paddle/fluid/operators/nccl/nccl_gpu_common.cc
index 70d80e26e5c..169af47e95a 100644
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
-#include "paddle/fluid/platform/gpu_info.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/operators/op_debug_string_test.cc b/paddle/fluid/operators/op_debug_string_test.cc
index 67d71fe82ad..7c1cf9109c5 100644
--- a/paddle/fluid/operators/op_debug_string_test.cc
+++ b/paddle/fluid/operators/op_debug_string_test.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 
 #include <string>
+
 #include "glog/logging.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
 
 USE_OP(elementwise_add_grad);
 
diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc
index 3fb18365e52..80faf833be5 100644
--- a/paddle/fluid/operators/print_op.cc
+++ b/paddle/fluid/operators/print_op.cc
@@ -12,13 +12,21 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include <algorithm>
-#include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/assign_op.h"
 #include "paddle/fluid/operators/tensor_formatter.h"
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class LoDTensor;
+class OpDesc;
+class Scope;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 using framework::GradVarName;
diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc
index 9b7a923fb4b..ec9d1fde453 100644
--- a/paddle/fluid/operators/rank_loss_op.cc
+++ b/paddle/fluid/operators/rank_loss_op.cc
@@ -13,9 +13,23 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/rank_loss_op.h"
-#include <memory>
+
 #include <string>
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h
index 42c087b9e47..041d36a9343 100644
--- a/paddle/fluid/operators/reader/buffered_reader.h
+++ b/paddle/fluid/operators/reader/buffered_reader.h
@@ -18,6 +18,7 @@
 #include <memory>
 #include <queue>
 #include <vector>
+
 #include "ThreadPool.h"
 #include "paddle/fluid/framework/reader.h"
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/reader/py_reader.cc b/paddle/fluid/operators/reader/py_reader.cc
index 2100aeb7cf4..ad79f6bbc4c 100644
--- a/paddle/fluid/operators/reader/py_reader.cc
+++ b/paddle/fluid/operators/reader/py_reader.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/reader/py_reader.h"
-#include <memory>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/reader/py_reader.h b/paddle/fluid/operators/reader/py_reader.h
index d827cdd0b87..3492d578048 100644
--- a/paddle/fluid/operators/reader/py_reader.h
+++ b/paddle/fluid/operators/reader/py_reader.h
@@ -17,6 +17,7 @@
 #include <atomic>
 #include <memory>
 #include <vector>
+
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 
@@ -24,6 +25,8 @@ namespace paddle {
 namespace operators {
 namespace reader {
 
+class LoDTensorBlockingQueue;
+
 class PyReader : public framework::FileReader {
  public:
   explicit PyReader(
diff --git a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
index dc0940ac0b7..86c9f38ad3f 100644
--- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
+++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
@@ -12,12 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <chrono>  // NOLINT
-#include <set>
 #include <thread>  // NOLINT
-#include <vector>
-#include "gtest/gtest.h"
 
+#include "gtest/gtest.h"
 #include "paddle/fluid/operators/reader/blocking_queue.h"
 
 using paddle::operators::reader::BlockingQueue;
diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc
index 952ed466288..bccaae70229 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
@@ -13,8 +13,12 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
-#include <string>
-#include <vector>
+
+namespace paddle {
+namespace framework {
+class VarDesc;
+}  // namespace framework
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h
index 35a0dacaec2..fec496446ac 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.h
+++ b/paddle/fluid/operators/reader/reader_op_registry.h
@@ -18,9 +18,17 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class ReaderBase;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 namespace reader {
diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
index c1adaf2037a..35f52ffa522 100644
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -13,9 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/recurrent_op.h"
-
 #include <algorithm>
-#include "paddle/fluid/string/string_helper.h"
+
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class LoDTensor;
+class OpDesc;
+}  // namespace framework
+}  // namespace paddle
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/recurrent_op.h b/paddle/fluid/operators/recurrent_op.h
index 1ca66527e1b..e3f512d45c0 100644
--- a/paddle/fluid/operators/recurrent_op.h
+++ b/paddle/fluid/operators/recurrent_op.h
@@ -22,6 +22,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_registry.h"
 
+namespace paddle {
+namespace platform {
+class DeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc
index 57df2664824..2d7cce68e81 100644
--- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc
+++ b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc
@@ -13,9 +13,22 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/reduce_ops/frobenius_norm_op.h"
-#include <memory>
+
 #include <string>
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc
index 30265b3cc71..10095bc9550 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc
@@ -14,6 +14,21 @@
 
 #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
 // kernel's device type is decided by input tensor place, to be consistent with
 // compare and logical ops
 REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_all, UseInputPlace);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc
index cbc18f18b8e..f288fce7538 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc
@@ -14,6 +14,21 @@
 
 #include "paddle/fluid/operators/reduce_ops/reduce_any_op.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+template <typename T>
+class EmptyGradOpMaker;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
 // kernel's device type is decided by input tensor place, to be consistent with
 // compare and logical ops
 REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_any, UseInputPlace);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc
index 88935107df1..f27cd6b125b 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc
@@ -14,6 +14,19 @@
 
 #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
 REGISTER_REDUCE_OP(reduce_prod);
 REGISTER_OP_CPU_KERNEL(reduce_prod,
                        ops::ReduceKernel<paddle::platform::CPUDeviceContext,
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
index 54818470b27..a3850c5e264 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
@@ -13,9 +13,22 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
-#include <memory>
+
 #include <string>
 
+namespace paddle {
+namespace framework {
+class OpDesc;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
index cb1a2962d9b..d8d4e641aeb 100644
--- a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
+++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
@@ -12,10 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace paddle {
+namespace framework {
+class LoDRankTable;
+class LoDTensor;
+class OpDesc;
+class Scope;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 01a33a46521..e03824ca8c3 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -13,9 +13,24 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <string>
-#include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+struct CPUPlace;
+struct CUDAPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc
index eea2d2ac57a..95b23a0b8cc 100644
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
@@ -15,6 +15,17 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+class Scope;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 class RNNMemoryHelperOp : public framework::OperatorBase {
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index 9d51f3e292f..55e35e43eb9 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -14,9 +14,21 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/scale_op.h"
 
-#include <memory>
 #include <string>
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class OpDesc;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+namespace platform {
+class CPUDeviceContext;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc
index eb248e59b6c..c83726180ba 100644
--- a/paddle/fluid/operators/scatter_test.cc
+++ b/paddle/fluid/operators/scatter_test.cc
@@ -13,10 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/scatter.h"
+
 #include <gtest/gtest.h>
-#include <iostream>
-#include <string>
-#include "paddle/fluid/framework/ddim.h"
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/place.h"
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
index 43fd84a711f..6eda8595b17 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc
@@ -13,6 +13,13 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/sequence_ops/sequence_concat_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace platform {
+class CUDADeviceContext;
+}  // namespace platform
+}  // namespace paddle
 
 template <typename T>
 using Kernel =
diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
index 94e54266f0f..5b857960706 100644
--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
@@ -14,9 +14,15 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/softmax_op.h"
-#include "paddle/fluid/platform/cudnn_desc.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 
+namespace paddle {
+namespace platform {
+struct CUDAPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc
index aa8c0b13dbb..4adbbacc844 100644
--- a/paddle/fluid/operators/split_lod_tensor_op.cc
+++ b/paddle/fluid/operators/split_lod_tensor_op.cc
@@ -13,9 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace paddle {
+namespace framework {
+class InferShapeContext;
+class LoDTensor;
+class OpDesc;
+class Scope;
+}  // namespace framework
+namespace imperative {
+class OpBase;
+}  // namespace imperative
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h
index 142b00b4de6..48d6cf8b361 100644
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
@@ -11,8 +11,10 @@ limitations under the License. */
 
 #pragma once
 #include <vector>
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/detail/strided_memcpy.h"
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/strided_memcpy_test.cc b/paddle/fluid/operators/strided_memcpy_test.cc
index 3a450773a9d..83480b44d5b 100644
--- a/paddle/fluid/operators/strided_memcpy_test.cc
+++ b/paddle/fluid/operators/strided_memcpy_test.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/strided_memcpy.h"
+
 #include "gtest/gtest.h"
-#include "paddle/fluid/memory/memory.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/tensor_formatter.cc b/paddle/fluid/operators/tensor_formatter.cc
index 7b8b484a11e..e4fa4a96a5c 100644
--- a/paddle/fluid/operators/tensor_formatter.cc
+++ b/paddle/fluid/operators/tensor_formatter.cc
@@ -12,11 +12,10 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
+#include "paddle/fluid/operators/tensor_formatter.h"
 #include <algorithm>
 #include <string>
 
-#include "paddle/fluid/operators/tensor_formatter.h"
-
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/tensor_formatter.h b/paddle/fluid/operators/tensor_formatter.h
index 1731348479d..aee5eec0d1c 100644
--- a/paddle/fluid/operators/tensor_formatter.h
+++ b/paddle/fluid/operators/tensor_formatter.h
@@ -18,6 +18,12 @@
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/var_type.h"
 
+namespace paddle {
+namespace framework {
+class LoDTensor;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
index 708fccf9715..d4eb79aa0f2 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
@@ -14,9 +14,6 @@
 
 #ifdef PADDLE_WITH_CUDA
 
-#include <string>
-#include <vector>
-
 #include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 9cfe47da5db..922340b08c6 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -31,6 +31,18 @@
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+class TRTCalibratorEngine;
+class TRTCalibratorEngineManager;
+class TRTInt8Calibrator;
+}  // namespace tensorrt
+template <typename T>
+struct Singleton;
+}  // namespace inference
+}  // namespace paddle
+
 namespace paddle {
 
 namespace operators {
diff --git a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h
index cc2fe4cdbdb..ce94ba1ce9e 100644
--- a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h
+++ b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h
@@ -16,6 +16,7 @@
 
 #include <algorithm>
 #include <random>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/platform/for_range.h"
diff --git a/paddle/fluid/platform/bfloat16.h b/paddle/fluid/platform/bfloat16.h
index 742329abb2d..4460139219f 100644
--- a/paddle/fluid/platform/bfloat16.h
+++ b/paddle/fluid/platform/bfloat16.h
@@ -23,9 +23,15 @@
 #endif
 
 #include <cstring>
+
 #include "paddle/fluid/platform/hostdevice.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
+namespace Eigen {
+template <typename T>
+struct NumTraits;
+}  // namespace Eigen
+
 namespace paddle {
 namespace platform {
 
diff --git a/paddle/fluid/platform/bfloat16_test.cc b/paddle/fluid/platform/bfloat16_test.cc
index bdb508ee336..fc964d7df35 100644
--- a/paddle/fluid/platform/bfloat16_test.cc
+++ b/paddle/fluid/platform/bfloat16_test.cc
@@ -10,14 +10,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/bfloat16.h"
-
 #include <vector>
 
 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/init.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc
index 4cb6ee3143a..54dac976276 100644
--- a/paddle/fluid/platform/collective_helper.cc
+++ b/paddle/fluid/platform/collective_helper.cc
@@ -14,12 +14,8 @@
 
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
-
-#include <memory>
 #include <utility>
 
-#include "paddle/fluid/platform/dynload/nccl.h"
-
 namespace paddle {
 namespace platform {
 
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index b86fd70c9ae..e379832593c 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/cpu_info.h"
 
 #ifdef PADDLE_WITH_XBYAK
-#include "xbyak/xbyak.h"
 #include "xbyak/xbyak_util.h"
 #endif
 
diff --git a/paddle/fluid/platform/cuda_resource_pool.h b/paddle/fluid/platform/cuda_resource_pool.h
index 22b53445d84..570b68b08fc 100644
--- a/paddle/fluid/platform/cuda_resource_pool.h
+++ b/paddle/fluid/platform/cuda_resource_pool.h
@@ -20,6 +20,7 @@
 #include <memory>
 #include <type_traits>
 #include <vector>
+
 #include "paddle/fluid/platform/resource_pool.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/cudnn_desc.h b/paddle/fluid/platform/cudnn_desc.h
index e0ba1aaa6bd..0e0218dcca3 100644
--- a/paddle/fluid/platform/cudnn_desc.h
+++ b/paddle/fluid/platform/cudnn_desc.h
@@ -22,8 +22,15 @@
 #include <numeric>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/platform/cudnn_helper.h"
 
+namespace paddle {
+namespace framework {
+class Tensor;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace platform {
 using framework::Tensor;
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index 4b9c5c429da..e983e368953 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -23,6 +23,12 @@ limitations under the License. */
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/macros.h"
 
+namespace paddle {
+namespace platform {
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
 DECLARE_bool(cudnn_deterministic);
 
 namespace paddle {
diff --git a/paddle/fluid/platform/device_code.cc b/paddle/fluid/platform/device_code.cc
index 9d5a0954b00..2474903edf7 100644
--- a/paddle/fluid/platform/device_code.cc
+++ b/paddle/fluid/platform/device_code.cc
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/device_code.h"
+
 #include <sys/stat.h>
 #include <algorithm>
 #include <set>
 #include <utility>
+
 #include "paddle/fluid/platform/enforce.h"
 
 DECLARE_string(cuda_dir);
diff --git a/paddle/fluid/platform/device_code.h b/paddle/fluid/platform/device_code.h
index 6128d8b78db..4199317a8ce 100644
--- a/paddle/fluid/platform/device_code.h
+++ b/paddle/fluid/platform/device_code.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/platform/device_context.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cuda_driver.h"
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 28d94627f95..e1438a1eefa 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/memory/malloc.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_helper.h"
@@ -35,6 +36,7 @@ limitations under the License. */
 #endif
 
 #include <map>
+
 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
@@ -44,6 +46,11 @@ limitations under the License. */
 #define EIGEN_USE_THREADS
 #include "unsupported/Eigen/CXX11/Tensor"
 
+namespace Eigen {
+struct DefaultDevice;
+struct GpuDevice;
+}  // namespace Eigen
+
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/xpu_header.h"
 #endif
@@ -118,8 +125,8 @@ struct DefaultDeviceContextType<platform::XPUPlace> {
 
 #ifdef PADDLE_WITH_CUDA
 
-class EigenCudaStreamDevice;
 class CudnnWorkspaceHandle;
+class EigenCudaStreamDevice;
 
 class CUDAContext {
  public:
diff --git a/paddle/fluid/platform/device_memory_aligment.h b/paddle/fluid/platform/device_memory_aligment.h
index 2c19a2b1062..5cc33fd31f1 100644
--- a/paddle/fluid/platform/device_memory_aligment.h
+++ b/paddle/fluid/platform/device_memory_aligment.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <stddef.h>
+
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
index 85168a046fb..9bae7a87052 100644
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -28,6 +28,8 @@ namespace platform {
 ///////////////////////
 // WARN: Under Development. Don't depend on it yet.
 //////////////////////
+class Event;
+
 inline uint64_t PosixInNsec() {
   struct timeval tv;
   gettimeofday(&tv, nullptr);
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
index 562e7542012..045caab7488 100644
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <cuda.h>
 #include <mutex>  // NOLINT
 #include <type_traits>
+
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 #include "paddle/fluid/platform/port.h"
 
diff --git a/paddle/fluid/platform/dynload/cuda_driver.h b/paddle/fluid/platform/dynload/cuda_driver.h
index 056fcc069db..5799b084f5f 100644
--- a/paddle/fluid/platform/dynload/cuda_driver.h
+++ b/paddle/fluid/platform/dynload/cuda_driver.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <cuda.h>
 #include <mutex>  // NOLINT
+
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 #include "paddle/fluid/platform/port.h"
 
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index dd0a2e19685..88b545b48e5 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include <glog/logging.h>
-
 #include <cudnn.h>
+#include <glog/logging.h>
 #include <mutex>  // NOLINT
+
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 #include "paddle/fluid/platform/port.h"
 
diff --git a/paddle/fluid/platform/dynload/cupti.cc b/paddle/fluid/platform/dynload/cupti.cc
index a25660c6ed4..d8381580c90 100644
--- a/paddle/fluid/platform/dynload/cupti.cc
+++ b/paddle/fluid/platform/dynload/cupti.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUPTI
 
 #include "paddle/fluid/platform/dynload/cupti.h"
-#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h
index 48076e5478a..7a160664bc2 100644
--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
@@ -14,11 +14,10 @@ limitations under the License. */
 #pragma once
 
 #include <curand.h>
-
 #include <mutex>  // NOLINT
-#include "paddle/fluid/platform/port.h"
 
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/dynload/cusolver.h b/paddle/fluid/platform/dynload/cusolver.h
index ba2d38729c1..561f20af45a 100644
--- a/paddle/fluid/platform/dynload/cusolver.h
+++ b/paddle/fluid/platform/dynload/cusolver.h
@@ -15,11 +15,10 @@ limitations under the License. */
 
 #include <cuda.h>
 #include <cusolverDn.h>
-
 #include <mutex>  // NOLINT
-#include "paddle/fluid/platform/port.h"
 
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index 82e4f6ac75e..0c8a64ccf69 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
-#include <memory>
-#include <mutex>  // NOLINT
 #include <string>
 #include <vector>
 
@@ -22,7 +20,6 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "paddle/fluid/platform/dynload/cupti_lib_path.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/port.h"
 
 DEFINE_string(cudnn_dir, "",
               "Specify path for loading libcudnn.so. For instance, "
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
index 2be95b113b2..9369cf131da 100644
--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <mkl.h>
 #include <mutex>  // NOLINT
+
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 #include "paddle/fluid/platform/port.h"
 
diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h
index 1d5fa45ecf6..407f34f0ac3 100644
--- a/paddle/fluid/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
@@ -14,8 +14,8 @@ limitations under the License. */
 #pragma once
 
 #include <nccl.h>
-
 #include <mutex>  // NOLINT
+
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 #include "paddle/fluid/platform/port.h"
 
diff --git a/paddle/fluid/platform/dynload/nvrtc.h b/paddle/fluid/platform/dynload/nvrtc.h
index 9464a23ba1e..720450d28b1 100644
--- a/paddle/fluid/platform/dynload/nvrtc.h
+++ b/paddle/fluid/platform/dynload/nvrtc.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <nvrtc.h>
 #include <mutex>  // NOLINT
+
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 #include "paddle/fluid/platform/port.h"
 
diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h
index bc1977b05de..e10a7233b62 100644
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <mutex>  // NOLINT
+
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 #include "paddle/fluid/platform/port.h"
 #include "warpctc/include/ctc.h"
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index ce1ec507307..a3ae9e48eea 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -70,6 +70,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/imperative/type_defs.h"
 
+namespace paddle {
+namespace platform {
+class ErrorSummary;
+}  // namespace platform
+}  // namespace paddle
+
 DECLARE_int32(call_stack_level);
 
 namespace paddle {
diff --git a/paddle/fluid/platform/errors_test.cc b/paddle/fluid/platform/errors_test.cc
index 3c84215b5e5..a73c1ba3d34 100644
--- a/paddle/fluid/platform/errors_test.cc
+++ b/paddle/fluid/platform/errors_test.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <functional>
 #include <string>
 
 #include "gtest/gtest.h"
diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc
index 261ec68483f..ec8a98eeb1a 100644
--- a/paddle/fluid/platform/float16_test.cc
+++ b/paddle/fluid/platform/float16_test.cc
@@ -10,13 +10,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/platform/float16.h"
 
-#include <vector>
-
 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
+#include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/init.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/lodtensor_printer.cc b/paddle/fluid/platform/lodtensor_printer.cc
index 33d0fe62680..0be4233269e 100644
--- a/paddle/fluid/platform/lodtensor_printer.cc
+++ b/paddle/fluid/platform/lodtensor_printer.cc
@@ -13,9 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/lodtensor_printer.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
+
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/variable.h"
+
+namespace paddle {
+namespace framework {
+class LoDTensor;
+class Variable;
+}  // namespace framework
+}  // namespace paddle
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/lodtensor_printer.h b/paddle/fluid/platform/lodtensor_printer.h
index e070e3540c9..e0bd1fff197 100644
--- a/paddle/fluid/platform/lodtensor_printer.h
+++ b/paddle/fluid/platform/lodtensor_printer.h
@@ -14,8 +14,15 @@ limitations under the License. */
 
 #pragma once
 #include <string>
+
 #include "paddle/fluid/framework/scope.h"
 
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace platform {
 void PrintVar(framework::Scope* scope, const std::string& var_name,
diff --git a/paddle/fluid/platform/lodtensor_printer_test.cc b/paddle/fluid/platform/lodtensor_printer_test.cc
index 19e85284b8f..5b2af270740 100644
--- a/paddle/fluid/platform/lodtensor_printer_test.cc
+++ b/paddle/fluid/platform/lodtensor_printer_test.cc
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/platform/lodtensor_printer.h"
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/variable.h"
 
 TEST(LodTensorPrinter, PrintVar) {
   paddle::framework::Scope scope;
diff --git a/paddle/fluid/platform/monitor.cc b/paddle/fluid/platform/monitor.cc
index e65e09f45c0..76554012bf5 100644
--- a/paddle/fluid/platform/monitor.cc
+++ b/paddle/fluid/platform/monitor.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "paddle/fluid/platform/monitor.h"
-#include <utility>
 
 namespace paddle {
 namespace platform {}  // namespace platform
diff --git a/paddle/fluid/platform/monitor.h b/paddle/fluid/platform/monitor.h
index 2f186420b41..b57fae9daac 100644
--- a/paddle/fluid/platform/monitor.h
+++ b/paddle/fluid/platform/monitor.h
@@ -22,6 +22,7 @@
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "glog/logging.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/place_test.cc b/paddle/fluid/platform/place_test.cc
index 13f28c73f45..41e084efa57 100644
--- a/paddle/fluid/platform/place_test.cc
+++ b/paddle/fluid/platform/place_test.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/platform/place.h"
-#include <sstream>
+
 #include "gtest/gtest.h"
 
 TEST(Place, Equality) {
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 85759bc6e2e..56a6275b582 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -12,28 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <algorithm>
-#include <iomanip>
 #include <limits>
-#include <map>
 #include <mutex>  // NOLINT
 #include <random>
-#include <stack>
 #include <string>
-#include <vector>
-#ifdef PADDLE_WITH_CUDA
-#include <cuda.h>
-#endif  // PADDLE_WITH_CUDA
 
-#include "glog/logging.h"
-#include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/platform/device_tracer.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/errors.h"
-#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/platform/profiler_helper.h"
-#include "paddle/fluid/string/printf.h"
 
 DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
 
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index 12049d815cf..0185328ff32 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -23,6 +23,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/event.h"
diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc
index 4d340a06342..2ce898d4617 100644
--- a/paddle/fluid/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
@@ -13,10 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/profiler.h"
+
 #include <string>
-#ifdef PADDLE_WITH_CUDA
-#include <cuda_runtime.h>
-#endif
+
 #include "gtest/gtest.h"
 
 TEST(Event, CpuElapsedTime) {
diff --git a/paddle/fluid/platform/stream/cuda_stream.h b/paddle/fluid/platform/stream/cuda_stream.h
index 4272d5fd0b1..c65d107cf45 100644
--- a/paddle/fluid/platform/stream/cuda_stream.h
+++ b/paddle/fluid/platform/stream/cuda_stream.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <cstdint>
 #include <memory>
+
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/platform/timer.h b/paddle/fluid/platform/timer.h
index ff0e1d95c29..09dcc4369be 100644
--- a/paddle/fluid/platform/timer.h
+++ b/paddle/fluid/platform/timer.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <stdlib.h>
+
 #include "paddle/fluid/platform/port.h"
 
 #ifdef _WIN32
diff --git a/paddle/fluid/string/piece_test.cc b/paddle/fluid/string/piece_test.cc
index 80b712b08cc..544b5985ed2 100644
--- a/paddle/fluid/string/piece_test.cc
+++ b/paddle/fluid/string/piece_test.cc
@@ -14,8 +14,6 @@
 
 #include "paddle/fluid/string/piece.h"
 
-#include <sstream>
-
 #include "gtest/gtest.h"
 
 TEST(StringPiece, Construct) {
diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h
index da4c1f326fb..5e2aedb22ad 100644
--- a/paddle/fluid/string/pretty_log.h
+++ b/paddle/fluid/string/pretty_log.h
@@ -18,6 +18,7 @@
 #include <sstream>
 #include <string>
 #include <utility>
+
 #include "paddle/fluid/string/printf.h"
 
 DECLARE_bool(color);
diff --git a/paddle/fluid/string/string_helper.cc b/paddle/fluid/string/string_helper.cc
index 712db90d2f4..8731e8fca8a 100644
--- a/paddle/fluid/string/string_helper.cc
+++ b/paddle/fluid/string/string_helper.cc
@@ -13,12 +13,12 @@
 // limitations under the License.
 
 #include "paddle/fluid/string/string_helper.h"
+
 #include <ctype.h>
 #include <stdio.h>
 #include <cstring>
 #include <string>
-#include <vector>
-#include "boost/lexical_cast.hpp"
+
 #include "glog/logging.h"
 
 namespace paddle {
diff --git a/paddle/fluid/string/string_helper.h b/paddle/fluid/string/string_helper.h
index 8bf379a6b34..499539226bd 100644
--- a/paddle/fluid/string/string_helper.h
+++ b/paddle/fluid/string/string_helper.h
@@ -20,6 +20,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "boost/lexical_cast.hpp"
 #include "glog/logging.h"
 
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index c19bd56fbbf..5400c55a0b1 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -12,12 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <cstring>
-
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
-#include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/init.h"
 
 int main(int argc, char** argv) {
-- 
GitLab


From 081fb2f96342efa2ff273019b12aacbf83a33284 Mon Sep 17 00:00:00 2001
From: Guo Sheng <whucsgs@163.com>
Date: Thu, 24 Sep 2020 13:50:41 +0800
Subject: [PATCH 200/261] Remove dependency on nltk for paddle __init__.
 (#27388)

* Remove dependency on nltk for paddle __init__.
test=develop

* Remove nltk.movie_reivew sentiment dataset to remove dependency on nltk.
test=develop
---
 python/paddle/dataset/__init__.py             |   2 -
 python/paddle/dataset/sentiment.py            | 150 ---------------
 python/paddle/dataset/tests/test_sentiment.py |  58 ------
 .../tests/unittests/test_dataset_sentiment.py |  42 -----
 .../tests/test_dataset_movie_reviews.py       |  50 -----
 python/paddle/text/datasets/__init__.py       |   3 -
 python/paddle/text/datasets/movie_reviews.py  | 173 ------------------
 7 files changed, 478 deletions(-)
 delete mode 100644 python/paddle/dataset/sentiment.py
 delete mode 100644 python/paddle/dataset/tests/test_sentiment.py
 delete mode 100644 python/paddle/fluid/tests/unittests/test_dataset_sentiment.py
 delete mode 100644 python/paddle/tests/test_dataset_movie_reviews.py
 delete mode 100644 python/paddle/text/datasets/movie_reviews.py

diff --git a/python/paddle/dataset/__init__.py b/python/paddle/dataset/__init__.py
index 54aa3edc51d..d1e59758565 100644
--- a/python/paddle/dataset/__init__.py
+++ b/python/paddle/dataset/__init__.py
@@ -22,7 +22,6 @@ import paddle.dataset.cifar
 import paddle.dataset.movielens
 import paddle.dataset.conll05
 import paddle.dataset.uci_housing
-import paddle.dataset.sentiment
 import paddle.dataset.wmt14
 import paddle.dataset.wmt16
 import paddle.dataset.mq2007
@@ -37,7 +36,6 @@ __all__ = [
     'cifar',
     'movielens',
     'conll05',
-    'sentiment',
     'uci_housing',
     'wmt14',
     'wmt16',
diff --git a/python/paddle/dataset/sentiment.py b/python/paddle/dataset/sentiment.py
deleted file mode 100644
index 721cb5a8192..00000000000
--- a/python/paddle/dataset/sentiment.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# /usr/bin/env python
-# -*- coding:utf-8 -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-The script fetch and preprocess movie_reviews data set that provided by NLTK
-
-TODO(yuyang18): Complete dataset.
-"""
-
-from __future__ import print_function
-
-import six
-import collections
-from itertools import chain
-
-import os
-import nltk
-from nltk.corpus import movie_reviews
-import zipfile
-from functools import cmp_to_key
-
-import paddle.dataset.common
-
-URL = "https://corpora.bj.bcebos.com/movie_reviews%2Fmovie_reviews.zip"
-MD5 = '155de2b77c6834dd8eea7cbe88e93acb'
-
-__all__ = ['train', 'test', 'get_word_dict']
-NUM_TRAINING_INSTANCES = 1600
-NUM_TOTAL_INSTANCES = 2000
-
-
-def download_data_if_not_yet():
-    """
-    Download the data set, if the data set is not download.
-    """
-    try:
-        # download and extract movie_reviews.zip
-        paddle.dataset.common.download(
-            URL, 'corpora', md5sum=MD5, save_name='movie_reviews.zip')
-        path = os.path.join(paddle.dataset.common.DATA_HOME, 'corpora')
-        filename = os.path.join(path, 'movie_reviews.zip')
-        zip_file = zipfile.ZipFile(filename)
-        zip_file.extractall(path)
-        zip_file.close()
-        # make sure that nltk can find the data
-        if paddle.dataset.common.DATA_HOME not in nltk.data.path:
-            nltk.data.path.append(paddle.dataset.common.DATA_HOME)
-        movie_reviews.categories()
-    except LookupError:
-        print("Downloading movie_reviews data set, please wait.....")
-        nltk.download(
-            'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
-        print("Download data set success.....")
-        print("Path is " + nltk.data.find('corpora/movie_reviews').path)
-
-
-def get_word_dict():
-    """
-    Sorted the words by the frequency of words which occur in sample
-    :return:
-        words_freq_sorted
-    """
-    words_freq_sorted = list()
-    word_freq_dict = collections.defaultdict(int)
-    download_data_if_not_yet()
-
-    for category in movie_reviews.categories():
-        for field in movie_reviews.fileids(category):
-            for words in movie_reviews.words(field):
-                word_freq_dict[words] += 1
-    words_sort_list = list(six.iteritems(word_freq_dict))
-    words_sort_list.sort(key=cmp_to_key(lambda a, b: b[1] - a[1]))
-    for index, word in enumerate(words_sort_list):
-        words_freq_sorted.append((word[0], index))
-    return words_freq_sorted
-
-
-def sort_files():
-    """
-    Sorted the sample for cross reading the sample
-    :return:
-        files_list
-    """
-    files_list = list()
-    neg_file_list = movie_reviews.fileids('neg')
-    pos_file_list = movie_reviews.fileids('pos')
-    files_list = list(
-        chain.from_iterable(list(zip(neg_file_list, pos_file_list))))
-    return files_list
-
-
-def load_sentiment_data():
-    """
-    Load the data set
-    :return:
-        data_set
-    """
-    data_set = list()
-    download_data_if_not_yet()
-    words_ids = dict(get_word_dict())
-    for sample_file in sort_files():
-        words_list = list()
-        category = 0 if 'neg' in sample_file else 1
-        for word in movie_reviews.words(sample_file):
-            words_list.append(words_ids[word.lower()])
-        data_set.append((words_list, category))
-    return data_set
-
-
-def reader_creator(data):
-    """
-    Reader creator, generate an iterator for data set
-    :param data:
-        train data set or test data set
-    """
-    for each in data:
-        yield each[0], each[1]
-
-
-def train():
-    """
-    Default training set reader creator
-    """
-    data_set = load_sentiment_data()
-    return reader_creator(data_set[0:NUM_TRAINING_INSTANCES])
-
-
-def test():
-    """
-    Default test set reader creator
-    """
-    data_set = load_sentiment_data()
-    return reader_creator(data_set[NUM_TRAINING_INSTANCES:])
-
-
-def fetch():
-    nltk.download('movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
diff --git a/python/paddle/dataset/tests/test_sentiment.py b/python/paddle/dataset/tests/test_sentiment.py
deleted file mode 100644
index 3540ea06b07..00000000000
--- a/python/paddle/dataset/tests/test_sentiment.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# /usr/bin/env python
-# -*- coding:utf-8 -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import nltk
-import paddle.dataset.sentiment as st
-from nltk.corpus import movie_reviews
-
-
-class TestSentimentMethods(unittest.TestCase):
-    def test_get_word_dict(self):
-        word_dict = st.get_word_dict()[0:10]
-        test_word_list = [(',', 0), ('the', 1), ('.', 2), ('a', 3), ('and', 4),
-                          ('of', 5), ('to', 6), ("'", 7), ('is', 8), ('in', 9)]
-        for idx, each in enumerate(word_dict):
-            self.assertEqual(each, test_word_list[idx])
-        self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path)
-
-    def test_sort_files(self):
-        last_label = ''
-        for sample_file in st.sort_files():
-            current_label = sample_file.split("/")[0]
-            self.assertNotEqual(current_label, last_label)
-            last_label = current_label
-
-    def test_data_set(self):
-        data_set = st.load_sentiment_data()
-        last_label = -1
-
-        for each in st.test():
-            self.assertNotEqual(each[1], last_label)
-            last_label = each[1]
-
-        self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES)
-        self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES)
-        self.assertEqual(
-            len(list(st.test())),
-            (st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dataset_sentiment.py b/python/paddle/fluid/tests/unittests/test_dataset_sentiment.py
deleted file mode 100644
index b5d5d33fa3f..00000000000
--- a/python/paddle/fluid/tests/unittests/test_dataset_sentiment.py
+++ /dev/null
@@ -1,42 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-TestCases for Dataset,
-including create, config, run, etc.
-"""
-
-from __future__ import print_function
-import numpy as np
-import unittest
-import os
-import paddle
-import zipfile
-import paddle.dataset.common
-
-URL = "https://corpora.bj.bcebos.com/movie_reviews%2Fmovie_reviews.zip"
-MD5 = '155de2b77c6834dd8eea7cbe88e93acb'
-
-
-class TestDatasetSentiment(unittest.TestCase):
-    """  TestCases for Sentiment. """
-
-    def test_get_word_dict(self):
-        """ Testcase for get_word_dict. """
-        words_freq_sorted = paddle.dataset.sentiment.get_word_dict()
-        print(words_freq_sorted)
-        self.assertTrue(len(words_freq_sorted) == 39768)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/tests/test_dataset_movie_reviews.py b/python/paddle/tests/test_dataset_movie_reviews.py
deleted file mode 100644
index e6e6667013f..00000000000
--- a/python/paddle/tests/test_dataset_movie_reviews.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-
-from paddle.text.datasets import *
-
-
-class TestMovieReviewsTrain(unittest.TestCase):
-    def test_main(self):
-        movie_reviews = MovieReviews(mode='train')
-        self.assertTrue(len(movie_reviews) == 1600)
-
-        # traversal whole dataset may cost a
-        # long time, randomly check 1 sample
-        idx = np.random.randint(0, 1600)
-        data = movie_reviews[idx]
-        self.assertTrue(len(data) == 2)
-        self.assertTrue(len(data[0].shape) == 1)
-        self.assertTrue(int(data[1]) in [0, 1])
-
-
-class TestMovieReviewsTest(unittest.TestCase):
-    def test_main(self):
-        movie_reviews = MovieReviews(mode='test')
-        self.assertTrue(len(movie_reviews) == 400)
-
-        # traversal whole dataset may cost a
-        # long time, randomly check 1 sample
-        idx = np.random.randint(0, 400)
-        data = movie_reviews[idx]
-        self.assertTrue(len(data) == 2)
-        self.assertTrue(len(data[0].shape) == 1)
-        self.assertTrue(int(data[1]) in [0, 1])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/text/datasets/__init__.py b/python/paddle/text/datasets/__init__.py
index b5cea40a4f4..71571d09b5c 100644
--- a/python/paddle/text/datasets/__init__.py
+++ b/python/paddle/text/datasets/__init__.py
@@ -16,7 +16,6 @@ from . import conll05
 from . import imdb
 from . import imikolov
 from . import movielens
-from . import movie_reviews
 from . import uci_housing
 from . import wmt14
 from . import wmt16
@@ -25,7 +24,6 @@ from .conll05 import *
 from .imdb import *
 from .imikolov import *
 from .movielens import *
-from .movie_reviews import *
 from .uci_housing import *
 from .wmt14 import *
 from .wmt16 import *
@@ -34,7 +32,6 @@ __all__ = conll05.__all__ \
           + imdb.__all__ \
           + imikolov.__all__ \
           + movielens.__all__ \
-          + movie_reviews.__all__ \
           + uci_housing.__all__ \
           + wmt14.__all__ \
           + wmt16.__all__
diff --git a/python/paddle/text/datasets/movie_reviews.py b/python/paddle/text/datasets/movie_reviews.py
deleted file mode 100644
index db5b15654f9..00000000000
--- a/python/paddle/text/datasets/movie_reviews.py
+++ /dev/null
@@ -1,173 +0,0 @@
-#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-import six
-import numpy as np
-import collections
-import nltk
-from nltk.corpus import movie_reviews
-import zipfile
-from functools import cmp_to_key
-from itertools import chain
-
-import paddle
-from paddle.io import Dataset
-
-__all__ = ['MovieReviews']
-
-URL = "https://corpora.bj.bcebos.com/movie_reviews%2Fmovie_reviews.zip"
-MD5 = '155de2b77c6834dd8eea7cbe88e93acb'
-
-NUM_TRAINING_INSTANCES = 1600
-NUM_TOTAL_INSTANCES = 2000
-
-
-class MovieReviews(Dataset):
-    """
-    Implementation of `NLTK movie reviews <http://www.nltk.org/nltk_data/>`_ dataset.
-
-    Args:
-        data_file(str): path to data tar file, can be set None if
-            :attr:`download` is True. Default None
-        mode(str): 'train' 'test' mode. Default 'train'.
-        download(bool): whether auto download cifar dataset if
-            :attr:`data_file` unset. Default True.
-
-    Returns:
-        Dataset: instance of movie reviews dataset
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle
-            from paddle.text.datasets import MovieReviews
-
-            class SimpleNet(paddle.nn.Layer):
-                def __init__(self):
-                    super(SimpleNet, self).__init__()
-
-                def forward(self, word, category):
-                    return paddle.sum(word), category
-
-            paddle.disable_static()
-
-            movie_reviews = MovieReviews(mode='train')
-
-            for i in range(10):
-                word_list, category = movie_reviews[i]
-                word_list = paddle.to_tensor(word_list)
-                category = paddle.to_tensor(category)
-
-                model = SimpleNet()
-                word_list, category = model(word_list, category)
-                print(word_list.numpy().shape, category.numpy())
-
-    """
-
-    def __init__(self, mode='train'):
-        assert mode.lower() in ['train', 'test'], \
-            "mode should be 'train', 'test', but got {}".format(mode)
-        self.mode = mode.lower()
-
-        self._download_data_if_not_yet()
-
-        # read dataset into memory
-        self._load_sentiment_data()
-
-    def _get_word_dict(self):
-        """
-        Sorted the words by the frequency of words which occur in sample
-        :return:
-            words_freq_sorted
-        """
-        words_freq_sorted = list()
-        word_freq_dict = collections.defaultdict(int)
-
-        for category in movie_reviews.categories():
-            for field in movie_reviews.fileids(category):
-                for words in movie_reviews.words(field):
-                    word_freq_dict[words] += 1
-        words_sort_list = list(six.iteritems(word_freq_dict))
-        words_sort_list.sort(key=cmp_to_key(lambda a, b: b[1] - a[1]))
-        for index, word in enumerate(words_sort_list):
-            words_freq_sorted.append((word[0], index))
-        return words_freq_sorted
-
-    def _sort_files(self):
-        """
-        Sorted the sample for cross reading the sample
-        :return:
-            files_list
-        """
-        files_list = list()
-        neg_file_list = movie_reviews.fileids('neg')
-        pos_file_list = movie_reviews.fileids('pos')
-        files_list = list(
-            chain.from_iterable(list(zip(neg_file_list, pos_file_list))))
-        return files_list
-
-    def _load_sentiment_data(self):
-        """
-        Load the data set
-        :return:
-            data_set
-        """
-        self.data = []
-        words_ids = dict(self._get_word_dict())
-        for sample_file in self._sort_files():
-            words_list = list()
-            category = 0 if 'neg' in sample_file else 1
-            for word in movie_reviews.words(sample_file):
-                words_list.append(words_ids[word.lower()])
-            self.data.append((words_list, category))
-
-    def _download_data_if_not_yet(self):
-        """
-        Download the data set, if the data set is not download.
-        """
-        try:
-            # download and extract movie_reviews.zip
-            paddle.dataset.common.download(
-                URL, 'corpora', md5sum=MD5, save_name='movie_reviews.zip')
-            path = os.path.join(paddle.dataset.common.DATA_HOME, 'corpora')
-            filename = os.path.join(path, 'movie_reviews.zip')
-            zip_file = zipfile.ZipFile(filename)
-            zip_file.extractall(path)
-            zip_file.close()
-            # make sure that nltk can find the data
-            if paddle.dataset.common.DATA_HOME not in nltk.data.path:
-                nltk.data.path.append(paddle.dataset.common.DATA_HOME)
-            movie_reviews.categories()
-        except LookupError:
-            print("Downloading movie_reviews data set, please wait.....")
-            nltk.download(
-                'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
-            print("Download data set success.....")
-            print("Path is " + nltk.data.find('corpora/movie_reviews').path)
-
-    def __getitem__(self, idx):
-        if self.mode == 'test':
-            idx += NUM_TRAINING_INSTANCES
-        data = self.data[idx]
-        return np.array(data[0]), np.array(data[1])
-
-    def __len__(self):
-        if self.mode == 'train':
-            return NUM_TRAINING_INSTANCES
-        else:
-            return NUM_TOTAL_INSTANCES - NUM_TRAINING_INSTANCES
-- 
GitLab


From 8f7bb52bd2e0cdf23c3441f3ec85b733a0f900f0 Mon Sep 17 00:00:00 2001
From: Shibo Tao <62922815+T8T9@users.noreply.github.com>
Date: Thu, 24 Sep 2020 13:52:48 +0800
Subject: [PATCH 201/261] fix tensorrt 6 build error. test=develop (#27511)

* fix tensorrt 6 build error. test=develop

* fix. test=develop

* bug fix

* test=develop
---
 paddle/fluid/platform/dynload/tensorrt.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h
index 67a79ce4bb1..cb751071062 100644
--- a/paddle/fluid/platform/dynload/tensorrt.h
+++ b/paddle/fluid/platform/dynload/tensorrt.h
@@ -35,7 +35,7 @@ extern void* tensorrt_dso_handle;
 #define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name)                            \
   struct DynLoad__##__name {                                                  \
     template <typename... Args>                                               \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {          \
+    void* operator()(Args... args) {                                          \
       std::call_once(tensorrt_dso_flag, []() {                                \
         tensorrt_dso_handle = paddle::platform::dynload::GetTensorRtHandle(); \
       });                                                                     \
@@ -44,7 +44,8 @@ extern void* tensorrt_dso_handle;
         return nullptr;                                                       \
       }                                                                       \
       using tensorrt_func = decltype(&::__name);                              \
-      return reinterpret_cast<tensorrt_func>(p_##__name)(args...);            \
+      auto ret = reinterpret_cast<tensorrt_func>(p_##__name)(args...);        \
+      return static_cast<void*>(ret);                                         \
     }                                                                         \
   };                                                                          \
   extern DynLoad__##__name __name
-- 
GitLab


From f91c37e6655473066261b7e1d248844d9eaabb84 Mon Sep 17 00:00:00 2001
From: Aurelius84 <zhangliujie@baidu.com>
Date: Thu, 24 Sep 2020 14:04:32 +0800
Subject: [PATCH 202/261] Refine error message of MatchMatrix and PyramidHash
 (#27484)

---
 .../fluid/operators/match_matrix_tensor_op.cc | 128 ++++++++++++------
 paddle/fluid/operators/pyramid_hash_op.cc     |  16 ++-
 2 files changed, 101 insertions(+), 43 deletions(-)

diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc
index 8bad10d7235..e95aef8eb56 100644
--- a/paddle/fluid/operators/match_matrix_tensor_op.cc
+++ b/paddle/fluid/operators/match_matrix_tensor_op.cc
@@ -28,34 +28,54 @@ using LoDTensor = framework::LoDTensor;
 using LoD = framework::LoD;
 
 void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                    "X(Input) of MatchMatrix should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("Y"), true,
-                    "Y(Input) of MatchMatrix should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true,
-                    "W(Input) of MatchMatrix should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                    "Out(Output) of MatchMatrix should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasOutput("Tmp"), true,
-                    "Tmp(Output) of MatchMatrix should not be null.");
+  OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "match_matrix_tensor");
+  OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "match_matrix_tensor");
+  OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "match_matrix_tensor");
+  OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "match_matrix_tensor");
+  OP_INOUT_CHECK(ctx->HasOutput("Tmp"), "Output", "Tmp", "match_matrix_tensor");
 
   auto x_dims = ctx->GetInputDim("X");
   PADDLE_ENFORCE_EQ(x_dims.size(), 2,
-                    "The rank of Input(X) can't be less than 2.");
+                    platform::errors::InvalidArgument(
+                        "The dimensions of Input(X) should be equal to 2, "
+                        "but received %d.",
+                        x_dims.size()));
 
   auto y_dims = ctx->GetInputDim("Y");
   PADDLE_ENFORCE_EQ(y_dims.size(), 2,
-                    "The rank of Input(Y) can't be less than 2.");
+                    platform::errors::InvalidArgument(
+                        "The dimensions of Input(Y) should be equal to 2, "
+                        "but received %d.",
+                        y_dims.size()));
 
   auto w_dims = ctx->GetInputDim("W");
-  PADDLE_ENFORCE_EQ(w_dims.size(), 3UL, "W should be 3-D tensor");
+  PADDLE_ENFORCE_EQ(w_dims.size(), 3,
+                    platform::errors::InvalidArgument(
+                        "The dimensions of Input(W) should be equal to 3, "
+                        "but received %d.",
+                        w_dims.size()));
 
   int dim_t = ctx->Attrs().Get<int>("dim_t");
-  PADDLE_ENFORCE_EQ(w_dims[0], x_dims[1],
-                    "W 's shape must satisfy: W[0] = X[1]");
-  PADDLE_ENFORCE_EQ(w_dims[1], dim_t, "W 's shape must satisfy: W[1] = dim_t");
-  PADDLE_ENFORCE_EQ(w_dims[2], y_dims[1],
-                    "W 's shape must satisfy: W[2] = Y[1]");
+  PADDLE_ENFORCE_EQ(
+      w_dims[0], x_dims[1],
+      platform::errors::InvalidArgument(
+          "The first dimension of Input(W) should be equal to the second "
+          "dimension of Input(X). But received the first dimension of Input(W) "
+          "is %d, the second dimension of Input(X) is %d.",
+          w_dims[0], x_dims[1]));
+  PADDLE_ENFORCE_EQ(
+      w_dims[1], dim_t,
+      platform::errors::InvalidArgument(
+          "The second dimension of Input(W) should be equal to 'dim_t', but "
+          "received the second dimension of Input(W) is %d, 'dim_t' is %d.",
+          w_dims[1], dim_t));
+  PADDLE_ENFORCE_EQ(
+      w_dims[2], y_dims[1],
+      platform::errors::InvalidArgument(
+          "The last dimension of Input(W) should be equal to "
+          "the second dimension of Input(Y). But received the last dimension "
+          "of Input(W) is %d, the second dimension of Input(Y) is %d.",
+          w_dims[2], y_dims[1]));
 
   int64_t out_dim_0 = -1;
   int64_t tmp_dim_0 = -1;
@@ -63,27 +83,52 @@ void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const {
     framework::Variable* x_var =
         BOOST_GET(framework::Variable*, ctx->GetInputVarPtrs("X")[0]);
     const auto& x_lod = x_var->Get<LoDTensor>().lod();
-    PADDLE_ENFORCE_EQ(x_lod.empty(), false, "The Input(X) must hold lod info.");
+    PADDLE_ENFORCE_EQ(x_lod.empty(), false,
+                      platform::errors::InvalidArgument(
+                          "The Input(X) should hold LoD information, but "
+                          "received Input(X).lod() is empty."));
     const auto& x_lod_0 = x_lod[0];
     PADDLE_ENFORCE_GE(x_lod_0.size(), 2,
-                      "The Input(X)'s lod info is corrupted.");
-    PADDLE_ENFORCE_EQ(
-        x_dims[0], static_cast<int64_t>(x_lod_0.back()),
-        "The Input(X)'s lod info mismatches the actual tensor shape.");
+                      platform::errors::InvalidArgument(
+                          "The dimensions of Input(X)'s LoD data should be "
+                          "equal to 2, but received %d.",
+                          x_lod_0.size()));
+    PADDLE_ENFORCE_EQ(x_dims[0], static_cast<int64_t>(x_lod_0.back()),
+                      platform::errors::InvalidArgument(
+                          "The last element of Input(X)'s LoD data should be "
+                          "equal to the first dimension of Input(X). "
+                          "But received the last element of Input(X)'s LoD "
+                          "data is %d, the first dimension of Input(X) is %d.",
+                          x_lod_0.back(), x_dims[0]));
 
     framework::Variable* y_var =
         BOOST_GET(framework::Variable*, ctx->GetInputVarPtrs("Y")[0]);
     const auto& y_lod = y_var->Get<LoDTensor>().lod();
-    PADDLE_ENFORCE_EQ(y_lod.empty(), false, "The Input(Y) must hold lod info.");
+    PADDLE_ENFORCE_EQ(y_lod.empty(), false,
+                      platform::errors::InvalidArgument(
+                          "The Input(Y) should hold LoD information, but "
+                          "received Input(Y).lod() is empty."));
     const auto& y_lod_0 = y_lod[0];
     PADDLE_ENFORCE_GE(y_lod_0.size(), 2,
-                      "The Input(Y)'s lod info is corrupted.");
-    PADDLE_ENFORCE_EQ(
-        y_dims[0], static_cast<int64_t>(y_lod_0.back()),
-        "The Input(Y)'s lod info mismatches the actual tensor shape.");
+                      platform::errors::InvalidArgument(
+                          "The dimensions of Input(Y)'s LoD data should be "
+                          "equal to 2, but received %d.",
+                          y_lod_0.size()));
+    PADDLE_ENFORCE_EQ(y_dims[0], static_cast<int64_t>(y_lod_0.back()),
+                      platform::errors::InvalidArgument(
+                          "The last element of Input(Y)'s LoD data should be "
+                          "equal to the first dimension of Input(Y). "
+                          "But received the last element of Input(Y)'s LoD "
+                          "data is %d, the first dimension of Input(Y) is %d.",
+                          y_lod_0.back(), y_dims[0]));
 
     PADDLE_ENFORCE_EQ(x_lod_0.size(), y_lod_0.size(),
-                      "The Length of X and Y must be equal.");
+                      platform::errors::InvalidArgument(
+                          "The dimensions of Input(X)'s and Input(Y)'s LoD "
+                          "data should be equal. "
+                          "But received the dimensions of Input(X)'s LoD is "
+                          "%d, the dimensions of Input(Y)'s LoD is %d.",
+                          x_lod_0.size(), y_lod_0.size()));
 
     out_dim_0 = 0;
     for (size_t i = 1; i < x_lod_0.size(); i++) {
@@ -98,10 +143,18 @@ void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const {
     // compile time
     framework::VarDesc* x_desc =
         BOOST_GET(framework::VarDesc*, ctx->GetInputVarPtrs("X")[0]);
-    PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1);
+    PADDLE_ENFORCE_GE(
+        x_desc->GetLoDLevel(), 1,
+        platform::errors::InvalidArgument("The LoD level of Input(X) should be "
+                                          "greater than 1, but reviced %d.",
+                                          x_desc->GetLoDLevel()));
     framework::VarDesc* y_desc =
         BOOST_GET(framework::VarDesc*, ctx->GetInputVarPtrs("Y")[0]);
-    PADDLE_ENFORCE_GE(y_desc->GetLoDLevel(), 1);
+    PADDLE_ENFORCE_GE(
+        y_desc->GetLoDLevel(), 1,
+        platform::errors::InvalidArgument("The LoD level of Input(Y) should be "
+                                          "greater than 1, but reviced %d.",
+                                          y_desc->GetLoDLevel()));
     ctx->ShareLoD("X", "Out");
   }
 
@@ -115,14 +168,11 @@ void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const {
 
 void MatchMatrixTensorOpGrad::InferShape(
     framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                    "Input(X) of SequencePadGradOp should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("Y"), true,
-                    "Input(Y) of SequencePadGradOp should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true,
-                    "Input(W) of SequencePadGradOp should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
-                    "Input(Out@GRAD) of SequencePadGradOp should not be null.");
+  OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "match_matrix_tensor_grad");
+  OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "match_matrix_tensor_grad");
+  OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "match_matrix_tensor_grad");
+  OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                 "Out@GRAD", "match_matrix_tensor_grad");
 
   if (ctx->HasOutput(framework::GradVarName("X"))) {
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
diff --git a/paddle/fluid/operators/pyramid_hash_op.cc b/paddle/fluid/operators/pyramid_hash_op.cc
index 0cae060bc8e..c50a8b731d5 100644
--- a/paddle/fluid/operators/pyramid_hash_op.cc
+++ b/paddle/fluid/operators/pyramid_hash_op.cc
@@ -285,13 +285,21 @@ class CPUPyramidHashOPKernel : public framework::OpKernel<T> {
     if (use_filter) {
       if (white_list_len != 0) {
         _filter = (math::bloomfilter*)_blobs_1->data<float>();
-        PADDLE_ENFORCE_EQ(math::bloomfilter_check(_filter), 1,
-                          "white filter not load");
+        PADDLE_ENFORCE_EQ(
+            math::bloomfilter_check(_filter), 1,
+            platform::errors::PreconditionNotMet(
+                "The white filter is not loaded successfully, please make sure "
+                "'white_list_len': %d is valid for Input(WhiteList).",
+                white_list_len));
       }
       if (black_list_len != 0) {
         _black_filter = (math::bloomfilter*)_blobs_2->data<float>();
-        PADDLE_ENFORCE_EQ(math::bloomfilter_check(_black_filter), 1,
-                          "black filter not load");
+        PADDLE_ENFORCE_EQ(
+            math::bloomfilter_check(_black_filter), 1,
+            platform::errors::PreconditionNotMet(
+                "The black filter is not loaded successfully, please make sure "
+                "'black_list_len': %d is valid for Input(BlackList).",
+                black_list_len));
       }
     }
 
-- 
GitLab


From 42363674010d9c8f29135dcecf316aa70004ac77 Mon Sep 17 00:00:00 2001
From: tianshuo78520a <707759223@qq.com>
Date: Thu, 24 Sep 2020 14:15:16 +0800
Subject: [PATCH 203/261] fix approveals message (#27531)

* fix approveals message;test=document_fix

* fix approveals message;test=document_fix
---
 tools/check_api_approvals.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh
index 943b8c01e8c..1db3f6d3d27 100644
--- a/tools/check_api_approvals.sh
+++ b/tools/check_api_approvals.sh
@@ -39,8 +39,8 @@ fi
 
 api_spec_diff=`python ${PADDLE_ROOT}/tools/check_api_source_without_core_ops.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.source.md5  ${PADDLE_ROOT}/paddle/fluid/API_PR.source.md5` 
 if [ "$api_spec_diff" != "" ]; then
-    echo_line="${echo_line}Related APIs: ${api_spec_diff}\n"
-    echo_line="You must have one RD (zhiqiu (Recommend) or phlrain) approval for the api change for the opreator-related api without 'core.ops'.\n"
+    echo_line="APIs without core.ops: \n${api_spec_diff}\n"
+    echo_line="${echo_line}You must have one RD (zhiqiu (Recommend) or phlrain) approval for the api change for the opreator-related api without 'core.ops'.\n"
     echo_line="${echo_line}For more details, please click [https://github.com/PaddlePaddle/Paddle/wiki/paddle_api_development_manual.md]\n"
     check_approval 1 6888866 43953930
 fi
-- 
GitLab


From e1fb77d123898e3d1d0a6e42e68e9c91dbcda746 Mon Sep 17 00:00:00 2001
From: ruri <shipeng1108@163.com>
Date: Thu, 24 Sep 2020 14:25:40 +0800
Subject: [PATCH 204/261] [2.0RC]refine error message in shuffle channel OP
 (#27505)

* refine err msg in shuffle channel op
---
 paddle/fluid/operators/shuffle_channel_op.cc | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc
index c01fed108f3..119d2e72369 100644
--- a/paddle/fluid/operators/shuffle_channel_op.cc
+++ b/paddle/fluid/operators/shuffle_channel_op.cc
@@ -21,13 +21,13 @@ class ShuffleChannelOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ShuffleChannelOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ShuffleChannelOp should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ShuffleChannelOp");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ShuffleChannelOp");
 
     auto input_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
+    PADDLE_ENFORCE_EQ(
+        input_dims.size(), 4,
+        platform::errors::InvalidArgument("The layout of input is NCHW."));
 
     ctx->SetOutputDim("Out", input_dims);
   }
@@ -53,7 +53,8 @@ class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("group", "the number of groups.")
         .SetDefault(1)
         .AddCustomChecker([](const int& group) {
-          PADDLE_ENFORCE_GE(group, 1, "group should be larger than 0.");
+          PADDLE_ENFORCE_GE(group, 1, platform::errors::InvalidArgument(
+                                          "group should be larger than 0."));
         });
 
     AddComment(R"DOC(
@@ -76,7 +77,9 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     auto input_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
+    PADDLE_ENFORCE_EQ(
+        input_dims.size(), 4,
+        platform::errors::InvalidArgument("The layout of input is NCHW."));
 
     ctx->SetOutputDim(framework::GradVarName("X"), input_dims);
   }
-- 
GitLab


From b6ecf35627d80e18470723a8d43c11826aabd6c8 Mon Sep 17 00:00:00 2001
From: YUNSHEN XIE <1084314248@qq.com>
Date: Thu, 24 Sep 2020 14:40:55 +0800
Subject: [PATCH 205/261] disable test_paddle_save_load,test=document_fix
 (#27534)

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 8d236dca22f..94bc6235ad1 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -334,6 +334,8 @@ list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op)
 # disable this unittest temporarily
 list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception)
 list(REMOVE_ITEM TEST_OPS test_sampling_id_op)
+list(REMOVE_ITEM TEST_OPS test_paddle_save_load)
+
 
 
 if (APPLE OR WIN32)
-- 
GitLab


From ec4155d7d000da2660bf465b6995ac3424235dba Mon Sep 17 00:00:00 2001
From: Wilber <jiweibo@baidu.com>
Date: Thu, 24 Sep 2020 15:40:03 +0800
Subject: [PATCH 206/261] windows lib size crop from 5.4G to 3.9G (#27477)

---
 cmake/generic.cmake                       |  9 +++
 cmake/inference_lib.cmake                 | 11 ++-
 cmake/init.cmake                          |  3 +
 cmake/paddle_win.props                    | 91 +++++++++++++++++++++++
 paddle/fluid/inference/CMakeLists.txt     |  9 +--
 paddle/fluid/inference/api/demo_ci/run.sh |  5 --
 6 files changed, 112 insertions(+), 16 deletions(-)
 create mode 100644 cmake/paddle_win.props

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index b0a6dfe2902..3bdf7c209b4 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -446,6 +446,9 @@ function(nv_library TARGET_NAME)
         message(FATAL "Please specify source file or library in nv_library.")
       endif()
     endif(nv_library_SRCS)
+    if (WIN32)
+      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
+    endif(WIN32)
   endif()
 endfunction(nv_library)
 
@@ -461,6 +464,9 @@ function(nv_binary TARGET_NAME)
       add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
       common_link(${TARGET_NAME})
     endif()
+    if (WIN32)
+      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
+    endif(WIN32)
   endif()
 endfunction(nv_binary)
 
@@ -482,6 +488,9 @@ function(nv_test TARGET_NAME)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
+    if (WIN32)
+      set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS})
+    endif(WIN32)
   endif()
 endfunction(nv_test)
 
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index f19f0eb43d3..f4603051a0e 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -19,9 +19,8 @@ set(PADDLE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_install_dir" CACHE STRING
 set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_dir" CACHE STRING
   "A path setting paddle inference shared and static libraries")
   
-# TODO(zhaolong)
-# At present, the size of static lib in Windows exceeds the system limit,
-# so the generation of static lib is temporarily turned off.
+# At present, the size of static lib in Windows is very large,
+# so we need to crop the library size.
 if(WIN32)
     #todo: remove the option 
     option(WITH_STATIC_LIB "Compile demo with static/shared library, default use dynamic."   OFF)
@@ -196,7 +195,11 @@ set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_insta
 copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_C_INSTALL_DIR})
 
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
-set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_fluid_c.*)
+if(WIN32)
+  set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/${CMAKE_BUILD_TYPE}/paddle_fluid_c.*)
+else(WIN32)
+  set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_fluid_c.*)
+endif(WIN32)
 
 copy(inference_lib_dist
       SRCS  ${src_dir}/inference/capi/paddle_c_api.h  ${paddle_fluid_c_lib}
diff --git a/cmake/init.cmake b/cmake/init.cmake
index 7dfe60f9dd8..902dfb11fc0 100644
--- a/cmake/init.cmake
+++ b/cmake/init.cmake
@@ -26,4 +26,7 @@ if(WITH_GPU)
     set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG")
 endif()
 
+if(WIN32)
+    set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props)
+endif()
 
diff --git a/cmake/paddle_win.props b/cmake/paddle_win.props
new file mode 100644
index 00000000000..7e434c6d907
--- /dev/null
+++ b/cmake/paddle_win.props
@@ -0,0 +1,91 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+    <ItemDefinitionGroup>
+        <CudaCompile>
+            <!-- Project schema: Host properties -->
+            <UseHostDefines>true</UseHostDefines>
+            <Emulation>false</Emulation>
+            <HostDebugInfo Condition="'$(Configuration)' == 'Debug'">true</HostDebugInfo>
+            <HostDebugInfo Condition="'$(Configuration)' != 'Debug'">false</HostDebugInfo>
+            <FastMath>false</FastMath>
+            <Optimization>InheritFromHost</Optimization>
+            <Runtime>InheritFromHost</Runtime>
+            <RuntimeChecks>InheritFromHost</RuntimeChecks>
+            <TypeInfo>InheritFromHost</TypeInfo>
+            <Warning>InheritFromHost</Warning>
+
+            <BaseCommandLineTemplate>-ccbin "%(VCBinDir)" -x cu [GenerateRelocatableDeviceCode] [Include] [RequiredIncludes] [InterleaveSourceInPTX] [GPUDebugInfo] [GenerateLineInfo] [Keep] [KeepDir] [MaxRegCount] [PtxAsOptionV] [TargetMachinePlatform] [NvccCompilation] [CudaRuntime] [AdditionalOptions]</BaseCommandLineTemplate>
+            <BuildCommandLineTemplate>--use-local-env</BuildCommandLineTemplate>
+            <BuildDynamicCommandLineTemplate>[CodeGeneration]</BuildDynamicCommandLineTemplate>
+            <CleanCommandLineTemplate>-clean</CleanCommandLineTemplate>
+            <!-- <HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] [ProgramDataBaseFileName] $(CudaForceSynchronousPdbWrites) /Zi [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate> -->
+            <HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] [ProgramDataBaseFileName] $(CudaForceSynchronousPdbWrites) [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate>
+
+            <DriverApiCommandLineTemplate>%(BaseCommandLineTemplate) [CompileOut] "%(FullPath)"</DriverApiCommandLineTemplate>
+            <RuntimeApiCommandLineTemplate>%(BaseCommandLineTemplate) [HostDebugInfo] [Emulation] [FastMath] [Defines] %(HostCommandLineTemplate) [CompileOut] "%(FullPath)"</RuntimeApiCommandLineTemplate>
+
+            <CommandLineTemplate>
+# (Approximate command-line.  Settings inherited from host are not visible below.)
+# (Please see the output window after a build for the full command-line)
+
+# Driver API (NVCC Compilation Type is .cubin, .gpu, or .ptx)
+set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)"
+"$(CudaToolkitNvccPath)" %(BuildCommandLineTemplate) %(DriverApiCommandLineTemplate)
+
+# Runtime API (NVCC Compilation Type is hybrid object or .c file)
+set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)"
+"$(CudaToolkitNvccPath)" %(BuildCommandLineTemplate) %(RuntimeApiCommandLineTemplate)
+            </CommandLineTemplate>
+            <ExecutionDescription>Compiling CUDA source file %(Identity)...</ExecutionDescription>
+            <ExclusionDescription>Skipping CUDA source file %(Identity) (excluded from build).</ExclusionDescription>
+
+            <!-- Miscellaneous -->
+            <PropsCacheOutputFile>%(Filename)%(Extension).cache</PropsCacheOutputFile>
+            <PropsCacheOutputPath>$(IntDir)%(PropsCacheOutputFile)</PropsCacheOutputPath>
+
+            <CudaCompileCoreProject>$(MSBuildProjectFullPath)</CudaCompileCoreProject>
+        </CudaCompile>
+
+        <CudaLink>
+            <PerformDeviceLink>true</PerformDeviceLink>
+            <LinkOut>$(IntDir)$(TargetName).device-link.obj</LinkOut>
+
+            <AdditionalLibraryDirectories></AdditionalLibraryDirectories>
+            <UseHostLibraryDirectories>true</UseHostLibraryDirectories>
+            <AdditionalDependencies></AdditionalDependencies>
+            <UseHostLibraryDependencies>true</UseHostLibraryDependencies>
+
+            <GPUDebugInfo>InheritFromProject</GPUDebugInfo>
+            <Optimization>InheritFromProject</Optimization>
+
+            <!-- Implicitly inherited from the project via @(CudaCompile) -->
+            <CodeGeneration></CodeGeneration>
+            <RuntimeChecks></RuntimeChecks>
+            <Runtime></Runtime>
+            <TargetMachinePlatform></TargetMachinePlatform>
+            <TypeInfo></TypeInfo>
+            <Warning></Warning>
+
+            <Inputs></Inputs>
+
+            <!-- <HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] /Zi [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate> -->
+            <HostCommandLineTemplate>-Xcompiler &quot;/EHsc [Warning] /nologo [Optimization] [RuntimeChecks] [Runtime] [TypeInfo]&quot;</HostCommandLineTemplate>
+            <LinkCommandLineTemplate>"$(CudaToolkitNvccPath)" -dlink [LinkOut] %(HostCommandLineTemplate) [AdditionalLibraryDirectories] [AdditionalDependencies] [AdditionalOptions] [CodeGeneration] [GPUDebugInfo] [TargetMachinePlatform] [Inputs]</LinkCommandLineTemplate>
+            <CommandLineTemplate>
+# (Approximate command-line.  Settings inherited from host are not visible below.)
+# (Please see the output window after a build for the full command-line)
+
+%(LinkCommandLineTemplate)
+            </CommandLineTemplate>
+        </CudaLink>
+
+        <Link>
+            <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories);$(CudaToolkitLibDir)</AdditionalLibraryDirectories>
+        </Link>
+
+        <ClCompile>
+            <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir)</AdditionalIncludeDirectories>
+        </ClCompile>
+    </ItemDefinitionGroup>
+</Project>
+
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index cf6fcb7b643..f85e1f65116 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -44,14 +44,9 @@ add_subdirectory(api)
 set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor
      zero_copy_tensor reset_tensor_array 
         analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg})
-# TODO(xingzhaolong, jiweibo): remove this and create_static_lib(paddle_fluid) on windows GPU
-if(WIN32 AND WITH_GPU)
-  cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_API}) 
-else()
-  create_static_lib(paddle_fluid ${fluid_modules} ${STATIC_INFERENCE_API}) 
-endif()
+create_static_lib(paddle_fluid ${fluid_modules} ${STATIC_INFERENCE_API})
 
-if(NOT APPLE AND NOT WIN32)
+if(NOT APPLE)
   # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
   set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
   set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index a3e7bec398a..6d283ca56cb 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -68,11 +68,6 @@ rm -rf *
 
 for WITH_STATIC_LIB in ON OFF; do
   if [ $(echo `uname` | grep "Win") != "" ]; then
-    # TODO(xingzhaolong, jiweibo): remove this if windows GPU library is ready.
-    if [ $TEST_GPU_CPU == ON] && [ $WITH_STATIC_LIB ==ON ]; then
-      return 0
-    fi
-    
     # -----simple_on_word2vec on windows-----
     cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
       -DWITH_MKL=$TURN_ON_MKL \
-- 
GitLab


From 59c049995e036f80fc7e068a432037a9c8a4a014 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 24 Sep 2020 15:40:31 +0800
Subject: [PATCH 207/261] change test main process wait time (#27516)

---
 .../fluid/tests/unittests/test_imperative_signal_handler.py     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py b/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py
index b47834ffab8..a0da4b0efee 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py
@@ -49,7 +49,7 @@ class TestDygraphDataLoaderSingalHandler(unittest.TestCase):
             test_process.start()
 
             set_child_signal_handler(id(self), test_process.pid)
-            time.sleep(3)
+            time.sleep(5)
         except core.EnforceNotMet as ex:
             self.assertIn("FatalError", cpt.get_exception_message(ex))
             exception = ex
-- 
GitLab


From d7f422c9844b33efcb02c7f388ec97f68e42ce99 Mon Sep 17 00:00:00 2001
From: Kaipeng Deng <dengkaipeng@baidu.com>
Date: Thu, 24 Sep 2020 16:07:57 +0800
Subject: [PATCH 208/261] fix error message in conv/conv_transpose.
 test=develop (#27464)

* fix error message in conv/conv_transpose. test=develop
---
 paddle/fluid/operators/conv_cudnn_op.cu       | 60 +++++++++++--------
 paddle/fluid/operators/conv_op.h              | 18 ++++--
 .../operators/conv_transpose_cudnn_op.cu      | 20 +++----
 paddle/fluid/operators/conv_transpose_op.h    | 12 +++-
 4 files changed, 70 insertions(+), 40 deletions(-)

diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu
index 00af724ac7f..f8b76f387cc 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu
+++ b/paddle/fluid/operators/conv_cudnn_op.cu
@@ -50,8 +50,9 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      "It must use CUDAPlace.");
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
     const Tensor* input = ctx.Input<Tensor>("Input");
     auto* filter = ctx.Input<Tensor>("Filter");
     auto* output = ctx.Output<Tensor>("Output");
@@ -60,14 +61,16 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
     int groups = ctx.Attr<int>("groups");
+
     bool exhaustive_search =
         FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
+    bool deterministic = FLAGS_cudnn_deterministic;
+    auto exhaustive_deterministic = exhaustive_search && deterministic;
+    PADDLE_ENFORCE_EQ(exhaustive_deterministic, false,
+                      platform::errors::InvalidArgument(
+                          "Cann't set exhaustive_search True and "
+                          "FLAGS_cudnn_deterministic True at same time."));
 
-    if (exhaustive_search && FLAGS_cudnn_deterministic) {
-      PADDLE_THROW(
-          "Cann't set exhaustive_search True and "
-          "FLAGS_cudnn_deterministic True at same time.");
-    }
     const std::string padding_algorithm =
         ctx.Attr<std::string>("padding_algorithm");
     const std::string data_format = ctx.Attr<std::string>("data_format");
@@ -197,7 +200,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
               &transformed_input);
         } break;
         default:
-          PADDLE_THROW("ConvOp only support tensors with 4 or 5 dimensions.");
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "ConvOp only support tensors with 4 or 5 dimensions."));
       }
 
     } else {
@@ -317,8 +321,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      "It must use CUDAPlace.");
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
     auto input = ctx.Input<Tensor>("Input");
     auto filter = ctx.Input<Tensor>("Filter");
     auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
@@ -337,14 +342,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
     int groups = ctx.Attr<int>("groups");
+
     bool exhaustive_search =
         FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
     bool deterministic = FLAGS_cudnn_deterministic;
-    if (exhaustive_search && deterministic) {
-      PADDLE_THROW(
-          "Can't set exhaustive_search True and "
-          "FLAGS_cudnn_deterministic True at same time.");
-    }
+    auto exhaustive_deterministic = exhaustive_search && deterministic;
+    PADDLE_ENFORCE_EQ(exhaustive_deterministic, false,
+                      platform::errors::InvalidArgument(
+                          "Cann't set exhaustive_search True and "
+                          "FLAGS_cudnn_deterministic True at same time."));
+
     const std::string data_format = ctx.Attr<std::string>("data_format");
     const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
 
@@ -495,7 +502,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
               &transformed_input);
         } break;
         default:
-          PADDLE_THROW("ConvOp only support tensors with 4 or 5 dimensions.");
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "ConvOp only support tensors with 4 or 5 dimensions."));
       }
     } else {
       transformed_input.ShareDataWith(transformed_input_channel);
@@ -701,8 +709,9 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      "It must use CUDAPlace.");
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
     auto X = ctx.Input<Tensor>("Input");
     auto W = ctx.Input<Tensor>("Filter");
     auto dO = ctx.Input<Tensor>("DOutput");
@@ -736,14 +745,16 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
     const std::vector<int>& strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
     int groups = ctx.Attr<int>("groups");
+
     bool exhaustive_search =
         FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
     bool deterministic = FLAGS_cudnn_deterministic;
-    if (exhaustive_search && deterministic) {
-      PADDLE_THROW(
-          "Can't set exhaustive_search True and "
-          "FLAGS_cudnn_deterministic True at same time.");
-    }
+    auto exhaustive_deterministic = exhaustive_search && deterministic;
+    PADDLE_ENFORCE_EQ(exhaustive_deterministic, false,
+                      platform::errors::InvalidArgument(
+                          "Cann't set exhaustive_search True and "
+                          "FLAGS_cudnn_deterministic True at same time."));
+
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
 
     std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
@@ -878,7 +889,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
           }
         } break;
         default:
-          PADDLE_THROW("ConvOp only support tensors with 4 or 5 dimensions.");
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "ConvOp only support tensors with 4 or 5 dimensions."));
       }
 
     } else {
diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h
index 8a5345e3cf8..662fac9e77e 100644
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -685,8 +685,9 @@ class GemmConvDoubleGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-    PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true,
-                      "It must use CPUPlace.");
+    PADDLE_ENFORCE_EQ(
+        platform::is_cpu_place(ctx.GetPlace()), true,
+        paddle::platform::errors::PreconditionNotMet("It must use CPUPlace."));
     const Tensor* X = ctx.Input<Tensor>("Input");
     const Tensor* dY = ctx.Input<Tensor>("DOutput");
     const Tensor* ddX = ctx.Input<Tensor>("DDInput");
@@ -982,11 +983,20 @@ class DepthwiseConvKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_EQ(
           output->dims()[output->dims().size() - 1] %
               input->dims()[input->dims().size() - 1],
-          0, "The output channels must be a multiple of the input channels");
+          0, platform::errors::InvalidArgument(
+                 "ShapeError: The output channels must be a multiple of the "
+                 "input channels. But receivced output channel number is %d "
+                 "and input channel number is %d",
+                 output->dims()[output->dims().size() - 1],
+                 input->dims()[input->dims().size() - 1]));
     } else {
       PADDLE_ENFORCE_EQ(
           output->dims()[1] % input->dims()[1], 0,
-          "The output channels must be a multiple of the input channels");
+          platform::errors::InvalidArgument(
+              "ShapeError: The output channels must be a multiple of the "
+              "input channels. But receivced output channel number is %d "
+              "and input channel number is %d",
+              output->dims()[1], input->dims()[1]));
     }
     // transform tensor
     Tensor transformed_input(input->type());
diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu
index 99ec1e04810..5249264b1c9 100644
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu
@@ -51,8 +51,9 @@ template <typename T>
 class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      "It must use CUDAPlace.");
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
     auto* input = ctx.Input<Tensor>("Input");
     auto* filter = ctx.Input<Tensor>("Filter");
     auto* output = ctx.Output<Tensor>("Output");
@@ -145,9 +146,8 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
               ctx, input_pad, input_transpose, pad_value, &transformed_input);
         } break;
         default:
-          PADDLE_ENFORCE_EQ(
-              rank == 4 || rank == 5, true,
-              "Op(ConvTranspose) only supports 4-D or 5-D input Tensor.");
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "Op(ConvTranspose) only supports 4-D or 5-D input Tensor."));
       }
     } else {
       transformed_input = input_transpose;
@@ -290,8 +290,9 @@ template <typename T>
 class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
     auto input = ctx.Input<Tensor>("Input");
     auto filter = ctx.Input<Tensor>("Filter");
     auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
@@ -393,9 +394,8 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
               &transformed_output_grad);
         } break;
         default:
-          PADDLE_ENFORCE_EQ(
-              rank == 4 || rank == 5, true,
-              "Op(ConvTranspose) only supports 4-D or 5-D input Tensor.");
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "Op(ConvTranspose) only supports 4-D or 5-D input Tensor."));
       }
     } else {
       transformed_output_grad = output_grad_transpose;
diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h
index 59b3677acc4..1ea869e002a 100644
--- a/paddle/fluid/operators/conv_transpose_op.h
+++ b/paddle/fluid/operators/conv_transpose_op.h
@@ -580,7 +580,12 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
     output->mutable_data<T>(context.GetPlace());
 
     int groups = context.Attr<int>("groups");
-    PADDLE_ENFORCE_EQ(groups, filter.dims()[0]);
+    PADDLE_ENFORCE_EQ(
+        groups, filter.dims()[0],
+        platform::errors::InvalidArgument(
+            "groups should be error to the 1st dimension of filter. But "
+            "received groups is %d and filter dimension[0] is %d",
+            groups, filter.dims()[0]));
 
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
@@ -588,7 +593,10 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
     std::string padding_algorithm =
         context.Attr<std::string>("padding_algorithm");
     for (auto v : dilations) {
-      PADDLE_ENFORCE_EQ(v, 1);
+      PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument(
+                                  "dilations should be 1 in depthwise conv. "
+                                  "But received dilations is %d",
+                                  v));
     }
 
     auto in_dims = input->dims();
-- 
GitLab


From 273f58a3c5cbd805ba4c2e4524d950cd34bb6674 Mon Sep 17 00:00:00 2001
From: Huihuang Zheng <zhhsplendid@gmail.com>
Date: Thu, 24 Sep 2020 16:11:10 +0800
Subject: [PATCH 209/261] Decrease Random Failure Probability for
 test_parallel_executor_mnist, test=develop (#27498)

As the title, decrease random failure probability for test_parallel_executor_mnist

The old code set larger delta when comparing reduce and all reduce, but didn't set all. I added it.

On my linux machine, I run 100 times, no failure occurs. In addition, we only saw this random failure on CI two times since I worked. I thought it was rare and I just increased the delta.
---
 .../fluid/tests/unittests/test_parallel_executor_mnist.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
index a2f1d774608..da7e30ff106 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -124,8 +124,10 @@ class TestMNIST(TestParallelExecutorBase):
 
     def test_simple_fc_with_new_strategy(self):
         # use_cuda, use_reduce
-        self._compare_reduce_and_allreduce(simple_fc_net, True)
-        self._compare_reduce_and_allreduce(simple_fc_net, False)
+        # NOTE: the computation result of nccl_reduce is non-deterministic,
+        # related issue: https://github.com/NVIDIA/nccl/issues/157
+        self._compare_reduce_and_allreduce(simple_fc_net, True, 1e-5, 1e-2)
+        self._compare_reduce_and_allreduce(simple_fc_net, False, 1e-5, 1e-2)
 
     def check_simple_fc_parallel_accuracy(self, use_cuda):
         if use_cuda and not core.is_compiled_with_cuda():
@@ -179,7 +181,7 @@ class TestMNIST(TestParallelExecutorBase):
         # NOTE: the computation result of nccl_reduce is non-deterministic,
         # related issue: https://github.com/NVIDIA/nccl/issues/157
         self._compare_reduce_and_allreduce(fc_with_batchnorm, True, 1e-5, 1e-2)
-        self._compare_reduce_and_allreduce(fc_with_batchnorm, False)
+        self._compare_reduce_and_allreduce(fc_with_batchnorm, False, 1e-5, 1e-2)
 
 
 if __name__ == '__main__':
-- 
GitLab


From b7319ef51893a8b575dde283360fde2dbc316ce5 Mon Sep 17 00:00:00 2001
From: ruri <shipeng1108@163.com>
Date: Thu, 24 Sep 2020 16:43:04 +0800
Subject: [PATCH 210/261] fix err msg in pixel shuffle op (#27503)

---
 paddle/fluid/operators/pixel_shuffle_op.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc
index 70d232ad6a5..111a82c6cce 100644
--- a/paddle/fluid/operators/pixel_shuffle_op.cc
+++ b/paddle/fluid/operators/pixel_shuffle_op.cc
@@ -46,14 +46,14 @@ class PixelShuffleOp : public framework::OperatorWithKernel {
           platform::errors::InvalidArgument(
               "The square of upscale_factor[%u] should divide the "
               "number of channel[%u]",
-              input_dims[1], upscale_factor * upscale_factor));
+              upscale_factor * upscale_factor, input_dims[1]));
     } else {
       PADDLE_ENFORCE_EQ(
           input_dims[3] % (upscale_factor * upscale_factor), 0,
           platform::errors::InvalidArgument(
               "The square of upscale_factor[%u] should divide the "
               "number of channel[%u]",
-              input_dims[3], upscale_factor * upscale_factor));
+              upscale_factor * upscale_factor, input_dims[3]));
     }
     auto output_dims = input_dims;
     output_dims[0] = input_dims[0];
-- 
GitLab


From df7fabeedc87c663b3d8e285836b3770ceb10957 Mon Sep 17 00:00:00 2001
From: Wilber <jiweibo@baidu.com>
Date: Thu, 24 Sep 2020 17:19:20 +0800
Subject: [PATCH 211/261] Fix memory leak for mkldnn. (#27493)

---
 .../fluid/inference/api/analysis_predictor.cc | 34 ++++++++++++++++---
 .../fluid/inference/api/analysis_predictor.h  | 11 ++++++
 2 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 42e62011f84..6c68b385bcb 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -245,7 +245,18 @@ bool AnalysisPredictor::PrepareExecutor() {
 
 void AnalysisPredictor::MkldnnPreSet(const std::vector<PaddleTensor> &inputs) {
 #ifdef PADDLE_WITH_MKLDNN
-  VLOG(2) << "AnalysisPredictor::Run get_cur_mkldnn_session_id="
+  std::vector<std::vector<int>> inputs_shape;
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    inputs_shape.emplace_back(inputs[i].shape);
+  }
+  MkldnnPreSet(inputs_shape);
+#endif
+}
+
+void AnalysisPredictor::MkldnnPreSet(
+    const std::vector<std::vector<int>> &inputs_shape) {
+#ifdef PADDLE_WITH_MKLDNN
+  VLOG(2) << "AnalysisPredictor::ZeroCopyRun get_cur_mkldnn_session_id="
           << platform::MKLDNNDeviceContext::tls().get_cur_mkldnn_session_id();
   // In cache clearing mode.
   if (config_.mkldnn_cache_capacity_ > 0) {
@@ -257,9 +268,9 @@ void AnalysisPredictor::MkldnnPreSet(const std::vector<PaddleTensor> &inputs) {
         config_.mkldnn_cache_capacity_);
     // Set current_input_shape for caching dynamic shape.
     std::stringstream ss;
-    for (size_t i = 0; i < inputs.size(); ++i) {
-      for (size_t j = 0; j < inputs[i].shape.size(); ++j) {
-        ss << inputs[i].shape[j] << "-";
+    for (size_t i = 0; i < inputs_shape.size(); ++i) {
+      for (size_t j = 0; j < inputs_shape[i].size(); ++j) {
+        ss << inputs_shape[i][j] << "-";
       }
     }
     VLOG(2) << "Set input shape=" << ss.str();
@@ -742,6 +753,18 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
 
 bool AnalysisPredictor::ZeroCopyRun() {
   paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
+#ifdef PADDLE_WITH_MKLDNN
+  if (config_.use_mkldnn_) {
+    std::vector<std::vector<int>> shape_vector;
+    auto names = GetInputNames();
+    for (size_t i = 0; i < names.size(); ++i) {
+      auto in_tensor = GetInputTensor(names[i]);
+      shape_vector.emplace_back(in_tensor->shape());
+    }
+    MkldnnPreSet(shape_vector);
+  }
+#endif
+
   executor_->Run();
   // Fix TensorArray reuse not cleaned bug.
   tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
@@ -750,6 +773,9 @@ bool AnalysisPredictor::ZeroCopyRun() {
   // recover the cpu_math_library_num_threads to 1, in order to avoid thread
   // conflict when integrating it into deployment service.
   paddle::platform::SetNumThreads(1);
+#ifdef PADDLE_WITH_MKLDNN
+  if (config_.use_mkldnn_) MkldnnPostReset();
+#endif
 #if defined(PADDLE_WITH_MKLML)
   // Frees unused memory allocated by the Intel® MKL Memory Allocator to
   // avoid memory leak. See:
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 365f86c2110..c4a7173b010 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -317,6 +317,17 @@ class AnalysisPredictor : public PaddlePredictor {
   /// \param[in] inputs tensors
   ///
   void MkldnnPreSet(const std::vector<PaddleTensor> &inputs);
+
+  ///
+  /// \brief PreSet for Mkldnn multi-thread and dynamic shape input.
+  ///
+  /// Used in AnalysisPredictor::Run(), do not support
+  /// AnalysisPredictor::ZeroCopyRun() now.
+  ///
+  /// \param[in] inputs tensor shape
+  ///
+  void MkldnnPreSet(const std::vector<std::vector<int>> &inputs_shape);
+
   ///
   /// \brief PostReset for Mkldnn multi-thread and dynamic shape input.
   ///
-- 
GitLab


From 32ad4f90a4b9c5fc38f6480b5a024ba44f654ee2 Mon Sep 17 00:00:00 2001
From: 123malin <malin10@baidu.com>
Date: Thu, 24 Sep 2020 20:59:21 +0800
Subject: [PATCH 212/261] =?UTF-8?q?=E3=80=90paddle.fleet=E3=80=91=20Usages?=
 =?UTF-8?q?=20Change:=20from=20fleet.util()=20to=20fleet.util=20(#27468)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* test=develop, bug fix
---
 python/paddle/distributed/fleet/__init__.py   |  6 +-
 .../distributed/fleet/base/fleet_base.py      | 32 ++--------
 .../distributed/fleet/base/util_factory.py    | 63 ++++++++++---------
 python/paddle/distributed/fleet/launch.py     | 10 ++-
 .../distributed/fleet/utils/__init__.py       |  2 +
 python/paddle/distributed/fleet/utils/fs.py   | 52 +++++++--------
 .../fluid/tests/unittests/dist_fleet_ctr.py   |  7 +--
 .../tests/unittests/dist_fleet_ctr_ps_gpu.py  |  7 +--
 .../tests/unittests/dist_fleet_heter_ctr.py   |  3 +-
 .../tests/unittests/dist_fleet_simnet_bow.py  |  5 +-
 .../tests/unittests/test_dist_fleet_base.py   |  7 +--
 .../unittests/test_dist_fleet_heter_base.py   |  5 +-
 .../test_dist_fleet_heter_program.py          |  1 -
 .../fluid/tests/unittests/test_fleet_base.py  |  2 +-
 .../unittests/test_fleet_rolemaker_new.py     | 12 ++--
 .../fluid/tests/unittests/test_fleet_util.py  | 50 ++++++++-------
 16 files changed, 120 insertions(+), 144 deletions(-)

diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py
index f3ee09a6d9e..e89cb1f5ec4 100644
--- a/python/paddle/distributed/fleet/__init__.py
+++ b/python/paddle/distributed/fleet/__init__.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 # TODO: define distributed api under this directory,
-from .base.role_maker import UserDefinedRoleMaker, PaddleCloudRoleMaker
+from .base.role_maker import Role, UserDefinedRoleMaker, PaddleCloudRoleMaker
 from .base.distributed_strategy import DistributedStrategy
 from .base.fleet_base import Fleet
 from .base.util_factory import UtilBase
@@ -26,6 +26,7 @@ __all__ = [
     "UserDefinedRoleMaker",
     "PaddleCloudRoleMaker",
     "Fleet",
+    "Role",
 ]
 
 fleet = Fleet()
@@ -39,8 +40,7 @@ server_num = fleet.server_num
 server_index = fleet.server_index
 server_endpoints = fleet.server_endpoints
 is_server = fleet.is_server
-set_util = fleet.set_util
-util = fleet.util
+util = UtilBase()
 barrier_worker = fleet.barrier_worker
 init_worker = fleet.init_worker
 init_server = fleet.init_server
diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
index d00faac8385..d0658efdca3 100644
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -23,7 +23,6 @@ from .strategy_compiler import StrategyCompiler
 from .distributed_strategy import DistributedStrategy
 from .meta_optimizer_factory import MetaOptimizerFactory
 from .runtime_factory import RuntimeFactory
-from .util_factory import UtilFactory
 from paddle.fluid.wrapped_decorator import wrap_decorator
 from paddle.fluid.dygraph import parallel_helper
 
@@ -120,7 +119,6 @@ class Fleet(object):
         self.strategy_compiler = None
         self._is_collective = False
         self._runtime_handle = None
-        self._util = None
 
     def init(self, role_maker=None, is_collective=False):
         """
@@ -182,6 +180,9 @@ class Fleet(object):
                     format(type(role_maker)))
         self._role_maker._generate_role()
 
+        import paddle.distributed.fleet as fleet
+        fleet.util._set_role_maker(self._role_maker)
+
         self.strategy_compiler = StrategyCompiler()
         if paddle.fluid.framework.in_dygraph_mode():
             if parallel_helper._is_parallel_ctx_initialized():
@@ -353,29 +354,6 @@ class Fleet(object):
         return self._role_maker._is_server(
         ) or self._role_maker._is_heter_worker()
 
-    def set_util(self, util):
-        self._util = util
-
-    def util(self):
-        """
-        Utility functions that can be used under certain runtime
-        return util
-
-        Returns:
-            UtilBase: instance of UtilBase, can use distributed ops/tools easily.
-
-        Examples:
-
-            .. code-block:: python
-                import paddle.distributed.fleet as fleet
-                fleet.init()
-                util = fleet.util
-                files = ["1.log", "2.log", "3.log", "4.log"]
-                files = util.get_file_shard()
-
-        """
-        return self._util
-
     def barrier_worker(self):
         """
         barrier all workers
@@ -1102,7 +1080,7 @@ class Fleet(object):
         if self._runtime_handle is None:
             self._runtime_handle = RuntimeFactory()._create_runtime(context)
 
-        if self._util is None:
-            self._util = UtilFactory()._create_util(context)
+        import paddle.distributed.fleet as fleet
+        fleet.util._set_strategy(context["valid_strategy"])
 
         return optimize_ops, params_grads
diff --git a/python/paddle/distributed/fleet/base/util_factory.py b/python/paddle/distributed/fleet/base/util_factory.py
index efaa854c087..bbb7d60ed9c 100644
--- a/python/paddle/distributed/fleet/base/util_factory.py
+++ b/python/paddle/distributed/fleet/base/util_factory.py
@@ -73,11 +73,13 @@ class UtilBase(object):
             .. code-block:: python
 
                 # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` .
-                from paddle.distributed.fleet.base.util_factory import fleet_util
                 import paddle.distributed.fleet as fleet
                 from paddle.distributed.fleet import PaddleCloudRoleMaker
                 import sys
                 import numpy as np
+                import os
+
+                os.environ["PADDLE_WITH_GLOO"] = "2"
 
                 def train():
                     role = PaddleCloudRoleMaker(
@@ -85,19 +87,18 @@ class UtilBase(object):
                         init_gloo=True,
                         path="./tmp_gloo")
                     fleet.init(role)
-                    fleet_util._set_role_maker(role)
 
                     if fleet.is_server():
                         input = [1, 2]
-                        output = fleet_util.all_reduce(input, "sum", "server")
+                        output = fleet.util.all_reduce(input, "sum", "server")
                         print(output)
                         # [2, 4]
                     elif fleet.is_worker():
                         input = np.array([3, 4])
-                        output = fleet_util.all_reduce(input, "sum", "worker")
+                        output = fleet.util.all_reduce(input, "sum", "worker")
                         print(output)
                         # [6, 8]
-                    output = fleet_util.all_reduce(input, "sum", "all")
+                    output = fleet.util.all_reduce(input, "sum", "all")
                     print(output)
                     # [8, 12]
                 if __name__ == "__main__":
@@ -117,10 +118,12 @@ class UtilBase(object):
             .. code-block:: python
                 # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` .
 
-                from paddle.distributed.fleet.base.util_factory import fleet_util
                 import paddle.distributed.fleet as fleet
                 from paddle.distributed.fleet import PaddleCloudRoleMaker
                 import sys
+                import os
+
+                os.environ["PADDLE_WITH_GLOO"] = "2"
 
                 def train():
                     role = PaddleCloudRoleMaker(
@@ -128,15 +131,14 @@ class UtilBase(object):
                         init_gloo=True,
                         path="./tmp_gloo")
                     fleet.init(role)
-                    fleet_util._set_role_maker(role)
 
                     if fleet.is_server():
-                        fleet_util.barrier("server")
+                        fleet.util.barrier("server")
                         print("all server arrive here")
                     elif fleet.is_worker():
-                        fleet_util.barrier("worker")
+                        fleet.util.barrier("worker")
                         print("all server arrive here")
-                    fleet_util.barrier("all")
+                    fleet.util.barrier("all")
                     print("all servers and workers arrive here")
 
                 if __name__ == "__main__":
@@ -160,10 +162,12 @@ class UtilBase(object):
             .. code-block:: python
 
                 # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` .
-                from paddle.distributed.fleet.base.util_factory import fleet_util
                 import paddle.distributed.fleet as fleet
                 from paddle.distributed.fleet import PaddleCloudRoleMaker
                 import sys
+                import os
+
+                os.environ["PADDLE_WITH_GLOO"] = "2"
 
                 def train():
                     role = PaddleCloudRoleMaker(
@@ -171,19 +175,18 @@ class UtilBase(object):
                         init_gloo=True,
                         path="./tmp_gloo")
                     fleet.init(role)
-                    fleet_util._set_role_maker(role)
 
                     if fleet.is_server():
                         input = fleet.server_index()
-                        output = fleet_util.all_gather(input, "server")
+                        output = fleet.util.all_gather(input, "server")
                         print(output)
                         # output = [0, 1]
                     elif fleet.is_worker():
                         input = fleet.worker_index()
-                        output = fleet_util.all_gather(input, "worker")
+                        output = fleet.util.all_gather(input, "worker")
                         # output = [0, 1]
                         print(output)
-                    output = fleet_util.all_gather(input, "all")
+                    output = fleet.util.all_gather(input, "all")
                     print(output)
                     # output = [0, 1, 0, 1]
 
@@ -220,18 +223,20 @@ class UtilBase(object):
 
             .. code-block:: python
 
-                from paddle.distributed.fleet.base.util_factory import fleet_util
-                import paddle.distributed.fleet.base.role_maker as role_maker
+                import paddle.distributed.fleet as fleet
+                from paddle.distributed.fleet import UserDefinedRoleMaker
 
-                role = role_maker.UserDefinedRoleMaker(
+                role = UserDefinedRoleMaker(
                     is_collective=False,
                     init_gloo=False,
                     current_id=0,
-                    role=role_maker.Role.WORKER,
+                    role=fleet.Role.WORKER,
                     worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
                     server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
-                fleet_util._set_role_maker(role)
-                files = fleet_util.get_file_shard(["file1", "file2", "file3"])
+                fleet.init(role)
+
+                files = fleet.util.get_file_shard(["file1", "file2", "file3"])
+                print(files)
                 # files = ["file1", "file2"]
         """
         if not isinstance(files, list):
@@ -267,18 +272,19 @@ class UtilBase(object):
 
             .. code-block:: python
 
-                from paddle.distributed.fleet.base.util_factory import fleet_util
-                import paddle.distributed.fleet.base.role_maker as role_maker
+                import paddle.distributed.fleet as fleet
+                from paddle.distributed.fleet import UserDefinedRoleMaker
 
-                role = role_maker.UserDefinedRoleMaker(
+                role = UserDefinedRoleMaker(
                     is_collective=False,
                     init_gloo=False,
                     current_id=0,
-                    role=role_maker.Role.WORKER,
+                    role=fleet.Role.WORKER,
                     worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
                     server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
-                fleet_util._set_role_maker(role)
-                fleet_util.print_on_rank("I'm worker 0", 0)
+                fleet.init(role)
+
+                fleet.util.print_on_rank("I'm worker 0", 0)
         """
         if self.role_maker._worker_index() != rank_id:
             return
@@ -577,6 +583,3 @@ class UtilBase(object):
                 print("fetch_targets name: %s" % v.name)
                 print("fetch_targets: {}".format(results[i]))
             return results
-
-
-fleet_util = UtilFactory()._create_util(None)
diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
index d63c9f9184c..c589e4f26a0 100644
--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -181,8 +181,8 @@ def get_gpus(gpus):
             cuda_visible_devices_list = cuda_visible_devices.split(',')
             for x in gpus.split(','):
                 assert x in cuda_visible_devices_list, "Can't find "\
-                "your gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
-                % (x, cuda_visible_devices)
+                    "your gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
+                    % (x, cuda_visible_devices)
             res_gpus = [
                 cuda_visible_devices_list.index(x.strip())
                 for x in gpus.split(',')
@@ -348,8 +348,7 @@ def launch_ps(args):
             "PADDLE_PORT": cur_server.endpoint.split(":")[1],
             "TRAINING_ROLE": "PSERVER",
             "PADDLE_TRAINERS_NUM": str(worker_num),
-            "POD_IP": cur_server.endpoint.split(":")[0],
-            "PADDLE_WITH_GLOO": "1"
+            "POD_IP": cur_server.endpoint.split(":")[0]
         }
         current_env.update(proc_env)
 
@@ -388,8 +387,7 @@ def launch_ps(args):
             "PADDLE_TRAINER_ENDPOINTS": worker_endpoints,
             "PADDLE_TRAINERS_NUM": str(worker_num),
             "TRAINING_ROLE": "TRAINER",
-            "PADDLE_TRAINER_ID": str(cur_worker.rank),
-            "PADDLE_WITH_GLOO": "1"
+            "PADDLE_TRAINER_ID": str(cur_worker.rank)
         }
         current_env.update(proc_env)
 
diff --git a/python/paddle/distributed/fleet/utils/__init__.py b/python/paddle/distributed/fleet/utils/__init__.py
index abf198b97e6..a45e1682c3f 100644
--- a/python/paddle/distributed/fleet/utils/__init__.py
+++ b/python/paddle/distributed/fleet/utils/__init__.py
@@ -11,3 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from .fs import LocalFS, HDFSClient
diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py
index b7c50bda3ea..8d4f24fb116 100644
--- a/python/paddle/distributed/fleet/utils/fs.py
+++ b/python/paddle/distributed/fleet/utils/fs.py
@@ -120,7 +120,7 @@ class LocalFS(FS):
     Examples:
         .. code-block:: python
 
-            from paddle.distributed.fleet.utils.fs import LocalFS
+            from paddle.distributed.fleet.utils import LocalFS
 
             client = LocalFS()
             subdirs, files = client.ls_dir("./")
@@ -140,7 +140,7 @@ class LocalFS(FS):
         Examples:
             .. code-block:: python
 
-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS
 
                 client = LocalFS()
                 subdirs, files = client.ls_dir("./")
@@ -160,7 +160,7 @@ class LocalFS(FS):
 
     def mkdirs(self, fs_path):
         """
-        Create a remote HDFS directory.
+        Create a local directory.
 
         Args:
             fs_path(str): The local directory path.
@@ -168,7 +168,7 @@ class LocalFS(FS):
         Examples:
             .. code-block:: python
 
-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS
 
                 client = LocalFS()
                 client.mkdirs("test_mkdirs")
@@ -189,7 +189,7 @@ class LocalFS(FS):
         Examples:
             .. code-block:: python
 
-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS
 
                 client = LocalFS()
                 client.touch("test_rename_src")
@@ -217,7 +217,7 @@ class LocalFS(FS):
         Examples:
             .. code-block:: python
 
-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS
 
                 client = LocalFS()
                 client.mkdirs("test_localFS_mkdirs")
@@ -247,7 +247,7 @@ class LocalFS(FS):
         Examples:
             .. code-block:: python
 
-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS
 
                 client = LocalFS()
                 client.touch("test_is_file")
@@ -269,7 +269,7 @@ class LocalFS(FS):
         Examples:
             .. code-block:: python
 
-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS
 
                 client = LocalFS()
                 client.mkdirs("test_is_dir")
@@ -292,7 +292,7 @@ class LocalFS(FS):
         Examples:
             .. code-block:: python
 
-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS
 
                 client = LocalFS()
                 ret = local_fs.is_exist("test_is_exist")
@@ -311,7 +311,7 @@ class LocalFS(FS):
         Examples:
             .. code-block:: python
 
-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS
 
                 client = LocalFS()
                 client.touch("test_touch")
@@ -332,13 +332,11 @@ class LocalFS(FS):
             src_path(str):  Name of the file or directory, that's needed to be moved.
             dst_path(str):  Name of the file or directory to which to move to.
             overwrite(bool): Whether to re-write `dst_path` if that exists. Default is False.
-            test_exists(bool): Check the existence of `src_path` and `dst_path` . 
-            When `test_exists` is set true, if `src_path` doesn't exist or `dst_path` exists, program will throw an Excetption. 
 
         Examples:
             .. code-block:: python
 
-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS
 
                 client = LocalFS()
                 client.touch("test_mv_src")
@@ -369,7 +367,7 @@ class LocalFS(FS):
         Examples:
             .. code-block:: python
 
-                from paddle.distributed.fleet.utils.fs import LocalFS
+                from paddle.distributed.fleet.utils import LocalFS
 
                 client = LocalFS()
                 subdirs = client.list_dirs("./")
@@ -432,7 +430,7 @@ class HDFSClient(FS):
 
         .. code-block:: text
 
-            from paddle.distributed.fleet.utils.fs import HDFSClient
+            from paddle.distributed.fleet.utils import HDFSClient
             hadoop_home = "/home/client/hadoop-client/hadoop/"
 
             configs = {
@@ -493,7 +491,7 @@ class HDFSClient(FS):
 
             .. code-block:: text
 
-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient
 
                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -526,7 +524,7 @@ class HDFSClient(FS):
 
             .. code-block:: text
 
-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient
 
                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -587,7 +585,7 @@ class HDFSClient(FS):
 
             .. code-block:: text
 
-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient
 
                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -629,7 +627,7 @@ class HDFSClient(FS):
 
             .. code-block:: text
 
-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient
 
                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -661,7 +659,7 @@ class HDFSClient(FS):
 
             .. code-block:: text
 
-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient
 
                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -695,7 +693,7 @@ class HDFSClient(FS):
 
             .. code-block:: text
 
-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient
 
                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -740,7 +738,7 @@ class HDFSClient(FS):
 
             .. code-block:: text
 
-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient
 
                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -784,7 +782,7 @@ class HDFSClient(FS):
 
             .. code-block:: text
 
-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient
 
                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -830,7 +828,7 @@ class HDFSClient(FS):
 
             .. code-block:: text
 
-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient
 
                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -893,7 +891,7 @@ class HDFSClient(FS):
 
             .. code-block:: text
 
-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient
 
                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
@@ -919,12 +917,14 @@ class HDFSClient(FS):
 
         Args:
             fs_path(str): The HDFS file path.
+            exist_ok(bool): When `fs_path` exists, if `exist_ok` is set false,
+            program will throw an Exception. Default is true.
 
         Examples:
 
             .. code-block:: text
 
-                from paddle.distributed.fleet.utils.fs import HDFSClient
+                from paddle.distributed.fleet.utils import HDFSClient
 
                 hadoop_home = "/home/client/hadoop-client/hadoop/"
                 configs = {
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
index 5721445c414..f650dd0f7e9 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
@@ -28,7 +28,6 @@ import numpy as np
 
 import ctr_dataset_reader
 from test_dist_fleet_base import runtime_main, FleetDistRunnerBase
-from paddle.distributed.fleet.base.util_factory import fleet_util
 
 paddle.enable_static()
 
@@ -180,13 +179,13 @@ class TestDistCTR2x2(FleetDistRunnerBase):
                                        fetch_list=[self.avg_cost.name])
                     loss_val = np.mean(loss_val)
                     # TODO(randomly fail)
-                    #   reduce_output = fleet_util.all_reduce(
+                    #   reduce_output = fleet.util.all_reduce(
                     #       np.array(loss_val), mode="sum")
-                    #   loss_all_trainer = fleet_util.all_gather(float(loss_val))
+                    #   loss_all_trainer = fleet.util.all_gather(float(loss_val))
                     #   loss_val = float(reduce_output) / len(loss_all_trainer)
                     message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
                                                                       loss_val)
-                    fleet_util.print_on_rank(message, 0)
+                    fleet.util.print_on_rank(message, 0)
 
                 pass_time = time.time() - pass_start
             except fluid.core.EOFException:
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
index 3852b225234..7accc917f80 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
@@ -29,7 +29,6 @@ import numpy as np
 import ctr_dataset_reader
 from test_dist_fleet_base import runtime_main, FleetDistRunnerBase
 from dist_fleet_ctr import TestDistCTR2x2, fake_ctr_reader
-from paddle.distributed.fleet.base.util_factory import fleet_util
 
 # Fix seed for test
 fluid.default_startup_program().random_seed = 1
@@ -76,13 +75,13 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2):
                     loss_val = exe.run(program=fleet.main_program,
                                        fetch_list=[self.avg_cost.name])
                     loss_val = np.mean(loss_val)
-                    reduce_output = fleet_util.all_reduce(
+                    reduce_output = fleet.util.all_reduce(
                         np.array(loss_val), mode="sum")
-                    loss_all_trainer = fleet_util.all_gather(float(loss_val))
+                    loss_all_trainer = fleet.util.all_gather(float(loss_val))
                     loss_val = float(reduce_output) / len(loss_all_trainer)
                     message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
                                                                       loss_val)
-                    fleet_util.print_on_rank(message, 0)
+                    fleet.util.print_on_rank(message, 0)
 
                 pass_time = time.time() - pass_start
             except fluid.core.EOFException:
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
index 470fb98d799..f62ad66e462 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
@@ -29,7 +29,6 @@ import numpy as np
 import ctr_dataset_reader
 from test_dist_fleet_heter_base import runtime_main, FleetDistHeterRunnerBase
 from dist_fleet_ctr import TestDistCTR2x2, fake_ctr_reader
-from paddle.distributed.fleet.base.util_factory import fleet_util
 
 paddle.enable_static()
 
@@ -182,7 +181,7 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
 
         thread_num = int(os.getenv("CPU_NUM", 2))
         batch_size = 128
-        filelist = fleet_util.get_file_shard(train_file_list)
+        filelist = fleet.util.get_file_shard(train_file_list)
         print("filelist: {}".format(filelist))
 
         # config dataset
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
index ff848488739..fb7ddef862d 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py
@@ -32,7 +32,6 @@ import os
 import signal
 from functools import reduce
 from test_dist_fleet_base import runtime_main, FleetDistRunnerBase
-from paddle.distributed.fleet.base.util_factory import fleet_util
 
 paddle.enable_static()
 
@@ -198,7 +197,7 @@ class TestDistSimnetBow2x2(FleetDistRunnerBase):
     def net(self, args, batch_size=4, lr=0.01):
         avg_cost, _, predict, self.reader = \
             train_network(batch_size=batch_size, is_distributed=False,
-                               is_sparse=True, is_self_contained_lr=False, is_pyreader=(args.reader == "pyreader"))
+                          is_sparse=True, is_self_contained_lr=False, is_pyreader=(args.reader == "pyreader"))
         self.avg_cost = avg_cost
         self.predict = predict
 
@@ -238,7 +237,7 @@ class TestDistSimnetBow2x2(FleetDistRunnerBase):
                     loss_val = np.mean(loss_val)
                     message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
                                                                       loss_val)
-                    fleet_util.print_on_rank(message, 0)
+                    fleet.util.print_on_rank(message, 0)
 
                 pass_time = time.time() - pass_start
             except fluid.core.EOFException:
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
index 3a923dbf3f7..c46d1dc5b0f 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
@@ -34,8 +34,7 @@ import unittest
 import paddle
 import paddle.fluid as fluid
 import paddle.distributed.fleet.base.role_maker as role_maker
-from paddle.distributed.fleet.base.util_factory import fleet_util
-from paddle.distributed.fleet import fleet
+import paddle.distributed.fleet as fleet
 from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
 
 __all__ = ['FleetDistRunnerBase', 'TestFleetBase', 'runtime_main']
@@ -97,7 +96,7 @@ class FleetDistRunnerBase(object):
         self.dump_fields_path = os.getenv("dump_fields_path", "")
         debug = int(os.getenv("Debug", "0"))
         # TODO(update strategy to support dump params)
-        if False:  #debug:
+        if False:  # debug:
             self.strategy.set_debug_opt({
                 "dump_param": self.dump_param,
                 "dump_fields": self.dump_fields,
@@ -372,8 +371,6 @@ def runtime_main(test_class):
     strategy = model.build_strategy(args)
     avg_cost = model.net(args)
     model.build_optimizer(avg_cost, strategy)
-    fleet_util._set_strategy(strategy)
-    fleet_util._set_role_maker(role)
     if args.role == "pserver":
         model.run_pserver(args)
     else:
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py
index 4d744c8299f..ba97c5079bd 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py
@@ -34,8 +34,7 @@ import unittest
 import paddle
 import paddle.fluid as fluid
 import paddle.distributed.fleet.base.role_maker as role_maker
-from paddle.distributed.fleet.base.util_factory import fleet_util
-from paddle.distributed.fleet import fleet
+import paddle.distributed.fleet as fleet
 
 __all__ = ['FleetDistHeterRunnerBase', 'TestFleetHeterBase', 'runtime_main']
 
@@ -376,8 +375,6 @@ def runtime_main(test_class):
     strategy = model.build_strategy(args)
     avg_cost = model.net(args)
     model.build_optimizer(avg_cost, strategy)
-    fleet_util._set_strategy(strategy)
-    fleet_util._set_role_maker(role)
 
     if args.role == "pserver" or args.role == "heter_trainer":
         model.run_pserver(args)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py
index 00301f9b1c6..7f4e5d99e02 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py
@@ -19,7 +19,6 @@ import os
 import math
 import paddle.fluid as fluid
 import paddle.distributed.fleet.base.role_maker as role_maker
-from paddle.distributed.fleet.base.util_factory import fleet_util
 from paddle.distributed.fleet import fleet
 import paddle
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py
index 45597e7253c..ccd57c4d515 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py
@@ -107,7 +107,7 @@ class TestFleetBase(unittest.TestCase):
     def test_util(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        self.assertEqual(fleet.util(), None)
+        self.assertNotEqual(fleet.util, None)
 
     def test_barrier_worker(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
index 4dd254af251..992fbbbe266 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
@@ -436,12 +436,12 @@ class TestGlooWithCloudRoleMaker(unittest.TestCase):
         optimizer.minimize(avg_cost)
 
         comm_world = "server"
-        fleet.util().barrier(comm_world)
+        fleet.util.barrier(comm_world)
 
-        gather = fleet.util().all_gather(1, comm_world)
+        gather = fleet.util.all_gather(1, comm_world)
         self.assertEqual(gather[0], 1)
 
-        all_reduce = fleet.util().all_reduce(1, "sum", comm_world)
+        all_reduce = fleet.util.all_reduce(1, "sum", comm_world)
         self.assertEqual(1, all_reduce)
 
         self.clean(tmp)
@@ -752,12 +752,12 @@ class TestGlooWithCloudRoleMaker(unittest.TestCase):
         optimizer.minimize(avg_cost)
 
         comm_world = "server"
-        fleet.util().barrier(comm_world)
+        fleet.util.barrier(comm_world)
 
-        gather = fleet.util().all_gather(1, comm_world)
+        gather = fleet.util.all_gather(1, comm_world)
         self.assertEqual(gather[0], 1)
 
-        all_reduce = fleet.util().all_reduce(1, "sum", comm_world)
+        all_reduce = fleet.util.all_reduce(1, "sum", comm_world)
         self.assertEqual(1, all_reduce)
 
         self.clean(tmp)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_util.py b/python/paddle/fluid/tests/unittests/test_fleet_util.py
index 1570912e740..b5c22b192a1 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_util.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_util.py
@@ -22,7 +22,6 @@ import tempfile
 import os
 import sys
 from paddle.dataset.common import download, DATA_HOME
-from paddle.distributed.fleet.base.util_factory import fleet_util
 import paddle.distributed.fleet.base.role_maker as role_maker
 
 
@@ -59,8 +58,7 @@ class TestFleetUtil(unittest.TestCase):
         import paddle.distributed.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        default_util = fleet.util()
-        self.assertEqual(default_util, None)
+        self.assertNotEqual(fleet.util, None)
 
     def test_set_user_defined_util(self):
         import paddle.distributed.fleet as fleet
@@ -76,17 +74,19 @@ class TestFleetUtil(unittest.TestCase):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         my_util = UserDefinedUtil()
-        fleet.set_util(my_util)
-        user_id = fleet.util().get_user_id()
+        fleet.util = my_util
+        user_id = fleet.util.get_user_id()
         self.assertEqual(user_id, 10)
 
     def test_fs(self):
-        from paddle.distributed.fleet.utils.fs import LocalFS
+        import paddle.distributed.fleet as fleet
+        from paddle.distributed.fleet.utils import LocalFS
+
         fs = LocalFS()
         dirs, files = fs.ls_dir("test_tmp")
         dirs, files = fs.ls_dir("./")
         self.assertFalse(fs.need_upload_download())
-        fleet_util._set_file_system(fs)
+        fleet.util._set_file_system(fs)
 
     def download_files(self):
         path = download(self.proto_data_url, self.module_name,
@@ -98,7 +98,8 @@ class TestFleetUtil(unittest.TestCase):
         return unzip_folder
 
     def test_get_file_shard(self):
-        self.assertRaises(Exception, fleet_util.get_file_shard, "files")
+        import paddle.distributed.fleet as fleet
+        self.assertRaises(Exception, fleet.util.get_file_shard, "files")
         try:
             import netifaces
         except:
@@ -112,18 +113,20 @@ class TestFleetUtil(unittest.TestCase):
             role=role_maker.Role.WORKER,
             worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
             server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
-        fleet_util._set_role_maker(role)
-        files = fleet_util.get_file_shard(["1", "2", "3"])
+        fleet.init(role)
+
+        files = fleet.util.get_file_shard(["1", "2", "3"])
         self.assertTrue(len(files) == 2 and "1" in files and "2" in files)
 
     def test_program_type_trans(self):
+        import paddle.distributed.fleet as fleet
         data_dir = self.download_files()
         program_dir = os.path.join(data_dir, self.pruned_dir)
         text_program = "pruned_main_program.pbtxt"
         binary_program = "pruned_main_program.bin"
-        text_to_binary = fleet_util._program_type_trans(program_dir,
+        text_to_binary = fleet.util._program_type_trans(program_dir,
                                                         text_program, True)
-        binary_to_text = fleet_util._program_type_trans(program_dir,
+        binary_to_text = fleet.util._program_type_trans(program_dir,
                                                         binary_program, False)
         self.assertTrue(
             os.path.exists(os.path.join(program_dir, text_to_binary)))
@@ -131,6 +134,7 @@ class TestFleetUtil(unittest.TestCase):
             os.path.exists(os.path.join(program_dir, binary_to_text)))
 
     def test_prams_check(self):
+        import paddle.distributed.fleet as fleet
         data_dir = self.download_files()
 
         class config:
@@ -160,11 +164,11 @@ class TestFleetUtil(unittest.TestCase):
         # test saved var's shape
         conf.dump_program_filename = "pruned_main_program.save_var_shape_not_match"
 
-        self.assertRaises(Exception, fleet_util._params_check)
+        self.assertRaises(Exception, fleet.util._params_check)
 
         # test program.proto without feed_op and fetch_op
         conf.dump_program_filename = "pruned_main_program.no_feed_fetch"
-        results = fleet_util._params_check(conf)
+        results = fleet.util._params_check(conf)
         self.assertTrue(len(results) == 1)
         np.testing.assert_array_almost_equal(
             results[0], np.array(
@@ -172,11 +176,11 @@ class TestFleetUtil(unittest.TestCase):
 
         # test feed_var's shape
         conf.dump_program_filename = "pruned_main_program.feed_var_shape_not_match"
-        self.assertRaises(Exception, fleet_util._params_check)
+        self.assertRaises(Exception, fleet.util._params_check)
 
         # test correct case with feed_vars_filelist
         conf.dump_program_filename = "pruned_main_program.pbtxt"
-        results = fleet_util._params_check(conf)
+        results = fleet.util._params_check(conf)
         self.assertTrue(len(results) == 1)
         np.testing.assert_array_almost_equal(
             results[0], np.array(
@@ -186,13 +190,14 @@ class TestFleetUtil(unittest.TestCase):
         conf.feed_config.feeded_vars_filelist = None
         # test feed var with lod_level >= 2
         conf.dump_program_filename = "pruned_main_program.feed_lod2"
-        self.assertRaises(Exception, fleet_util._params_check)
+        self.assertRaises(Exception, fleet.util._params_check)
 
         conf.dump_program_filename = "pruned_main_program.pbtxt"
-        results = fleet_util._params_check(conf)
+        results = fleet.util._params_check(conf)
         self.assertTrue(len(results) == 1)
 
     def test_proto_check(self):
+        import paddle.distributed.fleet as fleet
         data_dir = self.download_files()
 
         class config:
@@ -210,7 +215,7 @@ class TestFleetUtil(unittest.TestCase):
                          "pruned_main_program.save_var_shape_not_match"))
         conf.is_text_pruned_program = True
         conf.draw = False
-        res = fleet_util._proto_check(conf)
+        res = fleet.util._proto_check(conf)
         self.assertFalse(res)
 
         # test match
@@ -222,10 +227,11 @@ class TestFleetUtil(unittest.TestCase):
         else:
             conf.draw = True
             conf.draw_out_name = "pruned_check"
-        res = fleet_util._proto_check(conf)
+        res = fleet.util._proto_check(conf)
         self.assertTrue(res)
 
     def test_visualize(self):
+        import paddle.distributed.fleet as fleet
         if sys.platform == 'win32' or sys.platform == 'sys.platform':
             pass
         else:
@@ -234,10 +240,10 @@ class TestFleetUtil(unittest.TestCase):
                 data_dir,
                 os.path.join(self.train_dir, "join_main_program.pbtxt"))
             is_text = True
-            program = fleet_util._load_program(program_path, is_text)
+            program = fleet.util._load_program(program_path, is_text)
             output_dir = os.path.join(data_dir, self.train_dir)
             output_filename = "draw_prog"
-            fleet_util._visualize_graphviz(program, output_dir, output_filename)
+            fleet.util._visualize_graphviz(program, output_dir, output_filename)
             self.assertTrue(
                 os.path.exists(
                     os.path.join(output_dir, output_filename + ".dot")))
-- 
GitLab


From 3d5522146e34a44aeaa9916fb46f0877cb0894af Mon Sep 17 00:00:00 2001
From: Wilber <jiweibo@baidu.com>
Date: Thu, 24 Sep 2020 21:30:23 +0800
Subject: [PATCH 213/261] register seq_concat_fc_fuse pass. (#27479)

---
 .../framework/ir/seq_concat_fc_fuse_pass.cc   | 13 ++++++++
 .../inference/test_seq_concat_fc_fuse_pass.py | 33 +++++++++++++++++++
 2 files changed, 46 insertions(+)
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_seq_concat_fc_fuse_pass.py

diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
index 7daa9b5eff7..4101d593086 100644
--- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
@@ -17,6 +17,7 @@
 #include <string>
 #include <unordered_set>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -255,3 +256,15 @@ void SeqConcatFcFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(seq_concat_fc_fuse_pass,
               paddle::framework::ir::SeqConcatFcFusePass);
+REGISTER_PASS_CAPABILITY(seq_concat_fc_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("sequence_expand", 0)
+            .EQ("concat", 0)
+            .EQ("mul", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("sigmoid", 0)
+            .EQ("tanh", 0)
+            .EQ("relu", 0)
+            .EQ("identity", 0)
+            .EQ("fusion_seqexpand_concat_fc", 0));
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_seq_concat_fc_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_seq_concat_fc_fuse_pass.py
new file mode 100644
index 00000000000..33f215dafda
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_seq_concat_fc_fuse_pass.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import AnalysisConfig
+from paddle.fluid.core import PassVersionChecker
+
+
+class SeqConcatFCFusePassTest(InferencePassTest):
+    def test_compatible(self):
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('seq_concat_fc_fuse_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab


From 8473ee9daedf46f11089605826531ce1ddc0eeb1 Mon Sep 17 00:00:00 2001
From: Kaipeng Deng <dengkaipeng@baidu.com>
Date: Thu, 24 Sep 2020 22:09:11 +0800
Subject: [PATCH 214/261] make places of DataLoader optional. (#27432)

* make places of DataLoader optional. test=develop
---
 python/paddle/fluid/reader.py                 | 89 ++++++++++---------
 .../test_multiprocess_dataloader_dynamic.py   |  1 -
 ...ess_dataloader_iterable_dataset_dynamic.py |  1 -
 3 files changed, 45 insertions(+), 46 deletions(-)

diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
index 533222531f9..6cc00a7fd37 100644
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -167,10 +167,10 @@ class DataLoader(object):
             The variables should be created by :code:`fluid.data()`.
             :attr:`feed_list` must be set if :attr:`return_list` is
             False. Default None.
-        places(list(Place)|tuple(Place)): a list of Place, to put data
-            onto, :attr:`places` must be set in both static graph and 
-            dynamic graph mode, in dynamic graph mode, place number must
-            be 1. Default None.
+        places(list(Place)|tuple(Place)|optional): a list of Place,
+            to put data onto, :attr:`places` can be None, if 
+            :attr:`places` is None, default place(CPUPlace or CUDAPlace(0))
+            will be used. Default None.
         return_list (bool): whether the return value on each device is 
             presented as a list. If :attr:`return_list=False`, the return
             value on each device would be a dict of str -> LoDTensor, where
@@ -222,6 +222,8 @@ class DataLoader(object):
         .. code-block:: python
 
             import numpy as np
+
+            import paddle
             import paddle.fluid as fluid
             from paddle.io import Dataset, BatchSampler, DataLoader
 
@@ -247,11 +249,48 @@ class DataLoader(object):
                 def __len__(self):
                     return self.num_samples
 
+            dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
+
             # get places
             places = fluid.cuda_places() if USE_GPU else fluid.cpu_places()
 
+            # --------------------- dygraph mode --------------------
+
+            class SimpleNet(fluid.dygraph.Layer):
+                def __init__(self):
+                    super(SimpleNet, self).__init__()
+                    self.fc = fluid.dygraph.nn.Linear(IMAGE_SIZE, CLASS_NUM, act='softmax')
+
+                def forward(self, image, label=None):
+                    return self.fc(image)
+
+            with fluid.dygraph.guard(places[0]):
+                simple_net = SimpleNet()
+                opt = fluid.optimizer.SGD(learning_rate=1e-3,
+                                          parameter_list=simple_net.parameters())
+
+                loader = DataLoader(dataset,
+                                    batch_size=BATCH_SIZE,
+                                    shuffle=True,
+                                    drop_last=True,
+                                    num_workers=2)
+
+                for e in range(EPOCH_NUM):
+                    for i, (image, label) in enumerate(loader()):
+                        out = simple_net(image)
+                        loss = fluid.layers.cross_entropy(out, label)
+                        avg_loss = fluid.layers.reduce_mean(loss)
+                        avg_loss.backward()
+                        opt.minimize(avg_loss)
+                        simple_net.clear_gradients()
+                        print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
+
+            # -------------------------------------------------------
+
             # -------------------- static graph ---------------------
 
+            paddle.enable_static()
+
             def simple_net(image, label):
                 fc_tmp = fluid.layers.fc(image, size=CLASS_NUM, act='softmax')
                 cross_entropy = fluid.layers.softmax_with_cross_entropy(image, label)
@@ -270,11 +309,8 @@ class DataLoader(object):
 
             prog = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name)
 
-            dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
-
             loader = DataLoader(dataset,
                                 feed_list=[image, label],
-                                places=places,
                                 batch_size=BATCH_SIZE, 
                                 shuffle=True,
                                 drop_last=True,
@@ -287,39 +323,6 @@ class DataLoader(object):
 
             # -------------------------------------------------------
                 
-            # --------------------- dygraph mode --------------------
-
-            class SimpleNet(fluid.dygraph.Layer):
-                def __init__(self):
-                    super(SimpleNet, self).__init__()
-                    self.fc = fluid.dygraph.nn.Linear(IMAGE_SIZE, CLASS_NUM, act='softmax')
-
-                def forward(self, image, label=None):
-                    return self.fc(image)
-
-            with fluid.dygraph.guard(places[0]):
-                simple_net = SimpleNet()
-                opt = fluid.optimizer.SGD(learning_rate=1e-3,
-                                          parameter_list=simple_net.parameters())
-
-                loader = DataLoader(dataset,
-                                    places=places[0],
-                                    batch_size=BATCH_SIZE,
-                                    shuffle=True,
-                                    drop_last=True,
-                                    num_workers=2)
-
-                for e in range(EPOCH_NUM):
-                    for i, (image, label) in enumerate(loader()):
-                        out = simple_net(image)
-                        loss = fluid.layers.cross_entropy(out, label)
-                        avg_loss = fluid.layers.reduce_mean(loss)
-                        avg_loss.backward()
-                        opt.minimize(avg_loss)
-                        simple_net.clear_gradients()
-                        print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
-
-            # -------------------------------------------------------
 
     .. note::
         For reading iterable dataset with multiprocess Dataloader,
@@ -356,11 +359,9 @@ class DataLoader(object):
                     "feed_list should be set when return_list=False"
         self.feed_list = feed_list
 
-        assert places is not None, "places cannot be None"
+        if places is None:
+            places = _current_expected_place()
         self.places = _convert_places(places)
-        if in_dygraph_mode():
-            assert len(self.places) == 1, \
-                    "Number of places must be 1 in dygraph mode"
 
         assert num_workers >= 0, "num_workers should be a non-negative value"
         if num_workers > 0 and (sys.platform == 'darwin' or
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py
index 0706eb53d53..1bb720673e4 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py
@@ -76,7 +76,6 @@ class TestDygraphDataLoader(unittest.TestCase):
             dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM)
             dataloader = DataLoader(
                 dataset,
-                places=places,
                 num_workers=num_workers,
                 batch_size=BATCH_SIZE,
                 drop_last=True)
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py
index 8f0209406fd..af332d8e432 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py
@@ -76,7 +76,6 @@ class TestDygraphDataLoader(unittest.TestCase):
             dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM)
             dataloader = DataLoader(
                 dataset,
-                places=places,
                 num_workers=num_workers,
                 batch_size=BATCH_SIZE,
                 drop_last=True)
-- 
GitLab


From d20349b548da26200aad0110d7a5ed7b678a9f5c Mon Sep 17 00:00:00 2001
From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Date: Thu, 24 Sep 2020 22:14:25 +0800
Subject: [PATCH 215/261] add unittest count ,install check on windows (#27492)

* add unittest count of windows

* Reduce the number of retries
---
 cmake/generic.cmake                     |  1 +
 paddle/scripts/installation_validate.py |  1 +
 paddle/scripts/paddle_build.bat         | 17 ++++++++++-------
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 3bdf7c209b4..a2386265367 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -721,6 +721,7 @@ function(proto_library TARGET_NAME)
   set(proto_hdrs)
   paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
   cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
+  add_dependencies(extern_xxhash ${TARGET_NAME})
 endfunction()
 
 function(py_proto_compile TARGET_NAME)
diff --git a/paddle/scripts/installation_validate.py b/paddle/scripts/installation_validate.py
index f84e2f4b176..b765291a3b8 100644
--- a/paddle/scripts/installation_validate.py
+++ b/paddle/scripts/installation_validate.py
@@ -15,4 +15,5 @@
 import paddle.fluid as fluid
 import paddle as pd
 
+fluid.install_check.run_check()
 print(pd.__version__)
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 524c086c079..7ad2787d181 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -40,6 +40,7 @@ if not defined WITH_TPCACHE set WITH_TPCACHE=ON
 
 
 rem -------set cache build work directory-----------
+rmdir build\python /s/q
 if "%WITH_CACHE%"=="OFF" (
     rmdir build /s/q
     goto :mkbuild
@@ -48,10 +49,10 @@ if "%WITH_CACHE%"=="OFF" (
 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%#
 set day_now=%datetime:~6,2%
 set day_before=-1
-set /p day_before=<day.txt
+set /p day_before=< %work_dir%\..\day.txt
 if %day_now% NEQ %day_before% (
-    echo %day_now% > day.txt
-    type day.txt
+    echo %day_now% > %work_dir%\..\day.txt
+    type %work_dir%\..\day.txt
     rmdir build /s/q
 )
 git diff origin/develop --stat --name-only | findstr "cmake CMakeLists.txt paddle_build.bat"
@@ -208,7 +209,7 @@ echo Build third_party the %build_times% time:
 msbuild /m /p:Configuration=Release /verbosity:quiet third_party.vcxproj
 if %ERRORLEVEL% NEQ 0 (
     set /a build_times=%build_times%+1  
-    if %build_times% GTR 3 (
+    if %build_times% GTR 2 (
         exit /b 7
     ) else (
         echo Build third_party failed, will retry!
@@ -223,7 +224,7 @@ echo Build Paddle the %build_times% time:
 msbuild /m:%PARALLEL_PROJECT_COUNT% /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:minimal paddle.sln
 if %ERRORLEVEL% NEQ 0 (
     set /a build_times=%build_times%+1
-    if %build_times% GTR 2 (
+    if %build_times% GTR 1 (
         exit /b 7
     ) else (
         echo Build Paddle failed, will retry!
@@ -301,6 +302,7 @@ goto:eof
 call paddle_winci\Scripts\deactivate.bat 2>NUL
 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%#
 set end=%end:~4,10%
+call :timestamp "%start%" "%end%" "1 card TestCases Total"
 call :timestamp "%start%" "%end%" "TestCases Total"
 echo Running unit tests failed, will exit!
 exit /b 8
@@ -313,6 +315,7 @@ echo    ========================================
 
 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%#
 set end=%end:~4,10%
+call :timestamp "%start%" "%end%" "1 card TestCases Total"
 call :timestamp "%start%" "%end%" "TestCases Total"
 
 cd %work_dir%\paddle\fluid\inference\api\demo_ci
@@ -345,6 +348,8 @@ echo     ============================================ >>  check_change_of_unitte
 echo EOF>>  check_change_of_unittest.sh
 echo spec_path=$(pwd)/UNITTEST_PR.spec>>  check_change_of_unittest.sh
 echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>>  check_change_of_unittest.sh
+echo num=$(awk 'END{print NR}' ${spec_path})>> check_change_of_unittest.sh
+echo echo "Windows 1 card TestCases count is $num">> check_change_of_unittest.sh
 echo UPSTREAM_URL='https://github.com/PaddlePaddle/Paddle'>>  check_change_of_unittest.sh
 echo origin_upstream_url=`git remote -v ^| awk '{print $1, $2}' ^| uniq ^| grep upstream ^| awk '{print $2}'`>>  check_change_of_unittest.sh
 echo if [ "$origin_upstream_url" == "" ]; then>>  check_change_of_unittest.sh
@@ -455,8 +460,6 @@ taskkill /f /im cvtres.exe 2>NUL
 taskkill /f /im rc.exe 2>NUL
 wmic process where name="op_function_generator.exe" call terminate 2>NUL
 taskkill /f /im python.exe  2>NUL
-call paddle_winci\Scripts\deactivate.bat 2>NUL
-del %PADDLE_WHL_FILE_WIN%
 taskkill /f /im python.exe  2>NUL
 echo Windows CI run successfully!
 exit /b 0
-- 
GitLab


From c83ade6d6b0b3ea931af6131990a70243eab3817 Mon Sep 17 00:00:00 2001
From: mapingshuo <mps2012@yeah.net>
Date: Fri, 25 Sep 2020 09:39:49 +0800
Subject: [PATCH 216/261] add AsDuplicable for sync_comm op(#27515)

---
 paddle/fluid/operators/collective/c_sync_comm_stream_op.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
index d8617492fb1..7e5311a2103 100644
--- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
+++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
@@ -54,8 +54,10 @@ class CSyncCommStreamOp : public framework::OperatorBase {
 class CSyncCommStreamOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() {
-    AddInput("X", "(Tensor) Dependency of the variable need to sync");
-    AddOutput("Out", "(Tensor) Dependency of the variable need to sync");
+    AddInput("X", "(Tensor) Dependency of the variable need to sync")
+        .AsDuplicable();
+    AddOutput("Out", "(Tensor) Dependency of the variable need to sync")
+        .AsDuplicable();
     AddAttr<int>("ring_id", "(int default 0) ring id.").SetDefault(0);
     AddComment(R"DOC(
 CSyncCommStream Operator
-- 
GitLab


From a2e0b7cb4ad01f8ceb48920a721697c7ca92b8c5 Mon Sep 17 00:00:00 2001
From: tianshuo78520a <707759223@qq.com>
Date: Fri, 25 Sep 2020 10:02:11 +0800
Subject: [PATCH 217/261] update gcc8 in python3 ci docker (#26979)

* update gcc8 in python3 ci docker

* change cuda 10.2

* update cudnn8

* nvidia error cuda10.2-cudnn8-centos6 images

* fix third cache
---
 cmake/third_party.cmake            |  1 +
 tools/dockerfile/Dockerfile.centos |  6 ++----
 tools/dockerfile/ci_dockerfile.sh  | 11 ++++++++++-
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index 9edfcb967ab..ffd32cc78f0 100644
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -39,6 +39,7 @@ set(third_party_deps)
 #            REPOSITORY ${TARGET_REPOSITORY}
 #            TAG        ${TARGET_TAG}
 #            DIR        ${TARGET_SOURCE_DIR})
+
 FUNCTION(cache_third_party TARGET)
     SET(options "")
     SET(oneValueArgs URL REPOSITORY TAG DIR)
diff --git a/tools/dockerfile/Dockerfile.centos b/tools/dockerfile/Dockerfile.centos
index b10e76a4b4d..a50d08354b8 100644
--- a/tools/dockerfile/Dockerfile.centos
+++ b/tools/dockerfile/Dockerfile.centos
@@ -80,9 +80,7 @@ RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
     make -j8 && make install && \
     ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache
 
-# gcc4.8 TRT
-RUN mkdir -p /opt/compiler && cd /opt/compiler && \
-    wget -q https://paddle-ci.gz.bcebos.com/gcc-4.8.2.tar.gz && \
-    tar xf gcc-4.8.2.tar.gz && rm -f gcc-4.8.2.tar.gz
+# Downgrade gcc&&g++
+<install_gcc>
 
 CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh
index 3716084487e..9c8f8f563ab 100644
--- a/tools/dockerfile/ci_dockerfile.sh
+++ b/tools/dockerfile/ci_dockerfile.sh
@@ -21,7 +21,7 @@ function make_ubuntu_dockerfile(){
 
 function make_centos_dockerfile(){
   dockerfile_name="Dockerfile.cuda9_cudnn7_gcc48_py35_centos6"
-  sed 's/<baseimg>/9.0-cudnn7-devel-centos6/g' Dockerfile.centos >${dockerfile_name}
+  sed 's/<baseimg>/10.2-cudnn7-devel-centos6/g' Dockerfile.centos >${dockerfile_name}
   sed -i 's#COPY build_scripts /build_scripts#COPY tools/dockerfile/build_scripts ./build_scripts#g' ${dockerfile_name} 
   dockerfile_line=`wc -l ${dockerfile_name}|awk '{print $1}'`
   sed -i "${dockerfile_line}i RUN ln -s /usr/lib64/libz.so /usr/local/lib/libz.so && \
@@ -29,6 +29,15 @@ function make_centos_dockerfile(){
      rm -rf /usr/include/NvInfer*" ${dockerfile_name}
   sed -i "${dockerfile_line}i RUN wget --no-check-certificate  -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \
      tar -xzf  hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name}
+  sed -i 's#<install_gcc>#WORKDIR /usr/bin \
+      COPY tools/dockerfile/build_scripts /build_scripts \
+      RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \
+      RUN cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \
+      RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc \
+      RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ \
+      RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \
+      RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \
+      ENV PATH=/usr/local/gcc-8.2/bin:$PATH #g' ${dockerfile_name}
 }
 
 
-- 
GitLab


From 41a7ce8347dcf154f3f33707d96fc4217b5dbd60 Mon Sep 17 00:00:00 2001
From: Leo Chen <chenqiuliang@baidu.com>
Date: Fri, 25 Sep 2020 10:09:31 +0800
Subject: [PATCH 218/261] fix random failure of
 test_buffer_sahred_memory_reuse_pass (#27551)

---
 .../test_buffer_shared_memory_reuse_pass.py        | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
index 2c9168df472..9dd617f90b6 100644
--- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
@@ -115,8 +115,15 @@ class InplaceTestBase(unittest.TestCase):
                         fetch_val2, = exe.run(compiled_prog,
                                               feed=feed_dict,
                                               fetch_list=[fetch_var])
-
-                        self.assertTrue(np.array_equal(fetch_val1, fetch_val2))
+                        #NOTE(zhiqiu): Temporally changed from array_equal to allclose. 
+                        # The real root is fuse_all_reduce and fuse_all_optimizer_opss may 
+                        # result in diff because of the instruction set on the virtual machine.
+                        # And the related unit tests: test_fuse_all_reduce_pass and test_fuse_optimizer_pass use "almostEqual" in their checks.
+                        # There are also some related issues:
+                        # https://github.com/PaddlePaddle/Paddle/issues/21270
+                        # https://github.com/PaddlePaddle/Paddle/issues/21046
+                        # https://github.com/PaddlePaddle/Paddle/issues/21045
+                        self.assertTrue(np.allclose(fetch_val1, fetch_val2))
 
     def check_multi_card_fetch_var(self):
         if self.is_invalid_test():
@@ -160,7 +167,8 @@ class InplaceTestBase(unittest.TestCase):
                         fetch_vals.append(fetch_val)
 
                 for item in fetch_vals:
-                    self.assertTrue(np.array_equal(fetch_vals[0], item))
+                    # save above
+                    self.assertTrue(np.allclose(fetch_vals[0], item))
 
 
 class CUDAInplaceTest(InplaceTestBase):
-- 
GitLab


From 059bfd69a88be5eaae11f3067e2cbc3b5033ceaa Mon Sep 17 00:00:00 2001
From: chentianyu03 <chentianyu21@163.com>
Date: Fri, 25 Sep 2020 10:34:09 +0800
Subject: [PATCH 219/261] change wlist.json, add annotation (#27438)

* modify ci api white list, add annotation

* modify ci api white list, add annotation

* move gpu_not_white list to wlist.json

* add null line
---
 tools/sampcd_processor.py | 23 ++++++++-------
 tools/wlist.json          | 62 ++++++++++++++++++++++++---------------
 2 files changed, 50 insertions(+), 35 deletions(-)

diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py
index 033b4b8723a..d23c18a44e9 100644
--- a/tools/sampcd_processor.py
+++ b/tools/sampcd_processor.py
@@ -534,13 +534,6 @@ def get_incrementapi():
                 f.write('\n')
 
 
-# only white on CPU
-gpu_not_white = [
-    "deformable_conv", "cuda_places", "CUDAPinnedPlace", "CUDAPlace",
-    "cuda_profiler", 'DGCMomentumOptimizer'
-]
-
-
 def get_wlist():
     '''
     this function will get the white list of API.
@@ -552,17 +545,25 @@ def get_wlist():
     '''
     wlist = []
     wlist_file = []
+    # only white on CPU
+    gpu_not_white = []
     with open("wlist.json", 'r') as load_f:
         load_dict = json.load(load_f)
         for key in load_dict:
-            if key == 'wlist_file':
-                wlist_file = wlist_file + load_dict[key]
+            if key == 'wlist_dir':
+                for item in load_dict[key]:
+                    wlist_file.append(item["name"])
+            elif key == "gpu_not_white":
+                gpu_not_white = load_dict[key]
+            elif key == "wlist_api":
+                for item in load_dict[key]:
+                    wlist.append(item["name"])
             else:
                 wlist = wlist + load_dict[key]
-    return wlist, wlist_file
+    return wlist, wlist_file, gpu_not_white
 
 
-wlist, wlist_file = get_wlist()
+wlist, wlist_file, gpu_not_white = get_wlist()
 
 if len(sys.argv) < 2:
     print("Error: inadequate number of arguments")
diff --git a/tools/wlist.json b/tools/wlist.json
index 5591f90da4b..0ed0b4e4069 100644
--- a/tools/wlist.json
+++ b/tools/wlist.json
@@ -1,11 +1,29 @@
 {
-    "wlist_file" : [
-        "../python/paddle/fluid/contrib", 
-        "../python/paddle/verison.py",
-        "../python/paddle/fluid/core_avx.py",
-        "../python/paddle/distributed"
+    "wlist_dir":[
+        {
+            "name":"../python/paddle/fluid/contrib",
+            "annotation":""
+        },
+        {
+            "name":"../python/paddle/verison.py",
+            "annotation":""
+        },
+        {
+            "name":"../python/paddle/fluid/core_avx.py",
+            "annotation":""
+        },
+        {
+            "name":"../python/paddle/distributed",
+            "annotation":""
+        }
     ],
-    "wlist_inneed":[
+    "wlist_api":[
+        {
+            "name":"xxxxx",
+            "annotation":"not a real api, just for example"
+        }
+    ],
+    "wlist_temp_api":[
         "append_LARS",
         "BuildStrategy.debug_graphviz_path",
         "BuildStrategy.enable_sequential_execution",
@@ -63,9 +81,7 @@
         "cuda_places",
         "CUDAPinnedPlace",
         "CUDAPlace",
-        "Program.parse_from_string"
-    ],
-    "wlist_nosample":[
+        "Program.parse_from_string",
         "Compressor",
         "Compressor.config",
         "Compressor.run",
@@ -159,13 +175,9 @@
         "RNN",
         "BiRNN",
         "RNNCellBase",
-        "RNNCellBase.get_initial_states"
-    ],
-    "wlist_no_op_pass":[
+        "RNNCellBase.get_initial_states",
         "gelu",
-        "erf"
-    ],
-    "wlist_ci_nopass":[
+        "erf",
         "DecodeHelper",
         "DecodeHelper.initialize",
         "DecodeHelper.sample",
@@ -188,9 +200,7 @@
         "SampleEmbeddingHelper",
         "BasicDecoder",
         "lstm",
-        "partial_sum"
-    ],
-    "wlist_nopass":[
+        "partial_sum",
         "StateCell",
         "StateCell.compute_state",
         "TrainingDecoder",
@@ -242,9 +252,7 @@
         "GroupNorm",
         "SpectralNorm",
         "TreeConv",
-        "prroi_pool"
-    ],
-    "wlist_temp":[
+        "prroi_pool",
         "to_tensor",
         "ChunkEvaluator",
         "EditDistance",
@@ -322,9 +330,7 @@
         "Conv2DTranspose",
         "QueueDataset.local_shuffle",
         "save_persistables@dygraph/checkpoint.py",
-        "load_persistables@dygraph/checkpoint.py"
-    ],
-    "wlist_ignore":[
+        "load_persistables@dygraph/checkpoint.py",
         "elementwise_pow",
         "WeightedAverage.reset",
         "ChunkEvaluator.eval",
@@ -401,5 +407,13 @@
         "LinearChainCRF.forward",
         "CRFDecoding.forward",
         "SequenceTagging.forward"
+    ],
+    "gpu_not_white":[
+        "deformable_conv",
+        "cuda_places",
+        "CUDAPinnedPlace",
+        "CUDAPlace",
+        "cuda_profiler",
+        "DGCMomentumOptimizer"
     ]
 }
-- 
GitLab


From c5c13473c65520c0439eb009e2e709cca0c4a249 Mon Sep 17 00:00:00 2001
From: cc <52520497+juncaipeng@users.noreply.github.com>
Date: Fri, 25 Sep 2020 10:45:18 +0800
Subject: [PATCH 220/261] Add compatibility check for four mkldnn pass (#27364)

* Add pass compatibility check for four mkldnn pass, test=develop
---
 .../conv_activation_mkldnn_fuse_pass.cc       |  21 ++++
 .../conv_concat_relu_mkldnn_fuse_pass.cc      |   8 ++
 .../matmul_transpose_reshape_fuse_pass.cc     |   8 ++
 .../ir/mkldnn/scale_matmul_fuse_pass.cc       |   7 ++
 .../test_mkldnn_conv_activation_fuse_pass.py  | 106 ++++++++++++++++++
 ...kldnn_conv_concat_relu_mkldnn_fuse_pass.py |  92 +++++++++++++++
 ...ldnn_matmul_transpose_reshape_fuse_pass.py |  81 +++++++++++++
 ... => test_mkldnn_scale_matmul_fuse_pass.py} |  40 ++++++-
 8 files changed, 357 insertions(+), 6 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
 rename python/paddle/fluid/tests/unittests/ir/inference/{test_mkldnn_conv_relu_fuse_pass.py => test_mkldnn_scale_matmul_fuse_pass.py} (50%)

diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
index a5beec87c39..c33398553ec 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h"
 #include <vector>
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -103,12 +104,32 @@ REGISTER_PASS(conv_activation_mkldnn_fuse_pass,
 
 REGISTER_PASS(conv_relu_mkldnn_fuse_pass,
               paddle::framework::ir::ConvActivationFusePass);
+REGISTER_PASS_CAPABILITY(conv_relu_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("relu", 0));
 
 REGISTER_PASS(conv_leaky_relu_mkldnn_fuse_pass,
               paddle::framework::ir::Conv2DLeakyReLUFusePass);
+REGISTER_PASS_CAPABILITY(conv_leaky_relu_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .LE("leaky_relu", 1));
 
 REGISTER_PASS(conv_relu6_mkldnn_fuse_pass,
               paddle::framework::ir::Conv2DReLU6FusePass);
+REGISTER_PASS_CAPABILITY(conv_relu6_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("relu6", 0));
 
 REGISTER_PASS(conv_swish_mkldnn_fuse_pass,
               paddle::framework::ir::Conv2DSwishFusePass);
+REGISTER_PASS_CAPABILITY(conv_swish_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("swish", 0));
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
index 5fadd9607e9..76e10212550 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h"
 #include <vector>
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -123,3 +124,10 @@ void ConvConcatReLUFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(conv_concat_relu_mkldnn_fuse_pass,
               paddle::framework::ir::ConvConcatReLUFusePass);
+
+REGISTER_PASS_CAPABILITY(conv_concat_relu_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("concat", 0)
+            .EQ("relu", 0));
diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc
index 37c14e1d8e3..41b859f0af6 100644
--- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h"
 #include <paddle/fluid/string/pretty_log.h>
 #include <vector>
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -98,3 +99,10 @@ void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(ir::Graph *graph) const {
 
 REGISTER_PASS(matmul_transpose_reshape_fuse_pass,
               paddle::framework::ir::MatmulTransposeReshapeMKLDNNPass);
+
+REGISTER_PASS_CAPABILITY(matmul_transpose_reshape_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("matmul", 0)
+            .EQ("transpose", 0)
+            .EQ("reshape", 0));
diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc
index 790821e3fa4..0784a1a024c 100644
--- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/string/pretty_log.h"
 
 namespace paddle {
@@ -90,3 +91,9 @@ void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(scale_matmul_fuse_pass,
               paddle::framework::ir::ScaleMatmulFusePass);
+
+REGISTER_PASS_CAPABILITY(scale_matmul_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("scale", 0)
+            .EQ("matmul", 0));
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py
new file mode 100644
index 00000000000..5d96994a33b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import AnalysisConfig
+from paddle.fluid.core import PassVersionChecker
+
+
+class ConvActivationMkldnnFusePassTest(InferencePassTest):
+    def setUp(self):
+        self.set_params()
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                data,
+                num_filters=self.conv_num_filters,
+                filter_size=self.conv_filter_size,
+                bias_attr=self.conv_bias_attr,
+                act=self.act)
+
+        self.feeds = {
+            "data": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+
+    def set_params(self):
+        self.conv_num_filters = 3
+        self.conv_filter_size = 3
+        self.conv_bias_attr = False
+        self.act = "relu"
+        self.pass_name = 'conv_relu_mkldnn_fuse_pass'
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+    def test_pass_compatible(self):
+        self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name))
+
+
+class ConvActivationMkldnnFusePassTest_1(ConvActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.conv_num_filters = 5
+        self.conv_filter_size = 5
+        self.conv_bias_attr = True
+        self.act = "relu"
+        self.pass_name = 'conv_relu_mkldnn_fuse_pass'
+
+
+class ConvActivationMkldnnFusePassTest_2(ConvActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.conv_num_filters = 3
+        self.conv_filter_size = 3
+        self.conv_bias_attr = False
+        self.act = "leaky_relu"
+        self.pass_name = 'conv_leaky_relu_mkldnn_fuse_pass'
+
+
+class ConvActivationMkldnnFusePassTest_3(ConvActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.conv_num_filters = 5
+        self.conv_filter_size = 5
+        self.conv_bias_attr = True
+        self.act = "leaky_relu"
+        self.pass_name = 'conv_leaky_relu_mkldnn_fuse_pass'
+
+
+class ConvActivationMkldnnFusePassTest_4(ConvActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.conv_num_filters = 3
+        self.conv_filter_size = 3
+        self.conv_bias_attr = False
+        self.act = "relu6"
+        self.pass_name = 'conv_relu6_mkldnn_fuse_pass'
+
+
+class ConvActivationMkldnnFusePassTest_4(ConvActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.conv_num_filters = 5
+        self.conv_filter_size = 5
+        self.conv_bias_attr = True
+        self.act = "swish"
+        self.pass_name = 'conv_swish_mkldnn_fuse_pass'
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py
new file mode 100644
index 00000000000..45097f6b819
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import AnalysisConfig
+from paddle.fluid.core import PassVersionChecker
+
+
+class ConvConcatReluMkldnnFusePassTest_0(InferencePassTest):
+    def setUp(self):
+        self.set_params()
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data_1 = fluid.data(
+                name="data_1", shape=[-1, 3, 100, 100], dtype="float32")
+            data_2 = fluid.data(
+                name="data_2", shape=[-1, 3, 100, 100], dtype="float32")
+            conv_1 = fluid.layers.conv2d(
+                data_1,
+                num_filters=self.conv1_num_filters,
+                filter_size=self.conv1_filter_size,
+                padding=self.conv1_padding,
+                bias_attr=self.conv1_bias_attr)
+            conv_2 = fluid.layers.conv2d(
+                data_2,
+                num_filters=self.conv2_num_filters,
+                filter_size=self.conv2_filter_size,
+                padding=self.conv2_padding,
+                bias_attr=self.conv2_bias_attr)
+            concat = fluid.layers.concat(
+                [conv_1, conv_2], axis=self.concat_axis)
+            out = fluid.layers.relu(concat)
+
+        self.feeds = {
+            "data_1": np.random.random((1, 3, 100, 100)).astype("float32"),
+            "data_2": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.fetch_list = [out]
+        self.enable_mkldnn = True
+
+    def set_params(self):
+        self.conv1_num_filters = 3
+        self.conv1_filter_size = 3
+        self.conv1_padding = 0
+        self.conv1_bias_attr = False
+        self.conv2_num_filters = 3
+        self.conv2_filter_size = 3
+        self.conv2_padding = 0
+        self.conv2_bias_attr = False
+        self.concat_axis = 0
+        self.pass_name = "conv_concat_relu_mkldnn_fuse_pass"
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+    def test_pass_compatible(self):
+        self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name))
+
+
+class ConvConcatReluMkldnnFusePassTest_1(ConvConcatReluMkldnnFusePassTest_0):
+    def set_params(self):
+        self.conv1_num_filters = 3
+        self.conv1_filter_size = 3
+        self.conv1_padding = 0
+        self.conv1_bias_attr = False
+        self.conv2_num_filters = 5
+        self.conv2_filter_size = 5
+        self.conv2_padding = 1
+        self.conv2_bias_attr = True
+        self.concat_axis = 1
+        self.pass_name = "conv_concat_relu_mkldnn_fuse_pass"
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
new file mode 100644
index 00000000000..a6b5e0e5473
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import AnalysisConfig
+from paddle.fluid.core import PassVersionChecker
+
+
+class MatmulTransposeReshapeMkldnnFusePassTest(InferencePassTest):
+    def setUp(self):
+        self.set_params()
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=self.data_shape, dtype="float32")
+            weight = fluid.layers.create_parameter(
+                shape=self.weight_shape, dtype="float32")
+            matmul = fluid.layers.matmul(
+                data,
+                weight,
+                transpose_x=self.transpose_x,
+                transpose_y=self.transpose_y)
+            transpose = fluid.layers.transpose(matmul, self.tranpose_perm)
+            reshape = fluid.layers.reshape(transpose, shape=self.reshape_shape)
+
+        self.fetch_list = [reshape]
+        self.enable_mkldnn = True
+
+    def set_params(self):
+        self.data_shape = [-1, 3, 100, 110]
+        self.weight_shape = [1, 3, 110, 100]
+        self.feeds = {
+            "data": np.random.random((1, 3, 100, 110)).astype("float32")
+        }
+        self.transpose_x = False
+        self.transpose_y = False
+        self.tranpose_perm = [0, 2, 1, 3]
+        self.reshape_shape = [3, 100, 100]
+        self.pass_name = 'matmul_transpose_reshape_fuse_pass'
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+    def test_pass_compatible(self):
+        self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name))
+
+
+class MatmulTransposeReshapeMkldnnFusePassTest_1(
+        MatmulTransposeReshapeMkldnnFusePassTest):
+    def set_params(self):
+        self.data_shape = [-1, 3, 100, 100]
+        self.weight_shape = [1, 3, 100, 100]
+        self.feeds = {
+            "data": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.transpose_x = True
+        self.transpose_y = True
+        self.tranpose_perm = [0, 2, 1, 3]
+        self.reshape_shape = [6, 50, 100]
+        self.pass_name = 'matmul_transpose_reshape_fuse_pass'
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_relu_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py
similarity index 50%
rename from python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_relu_fuse_pass.py
rename to python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py
index 2346e93d64d..55a6b543f0a 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_relu_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py
@@ -20,26 +20,54 @@ from inference_pass_test import InferencePassTest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.core import AnalysisConfig
+from paddle.fluid.core import PassVersionChecker
 
 
-class ConvBnFusePassMKLDNNTest(InferencePassTest):
+class ScaleMatmulMkldnnFusePassTest(InferencePassTest):
     def setUp(self):
+        self.set_params()
         with fluid.program_guard(self.main_program, self.startup_program):
             data = fluid.data(
-                name="data", shape=[-1, 3, 100, 100], dtype="float32")
-            conv_out = fluid.layers.conv2d(
-                data, num_filters=3, filter_size=3, bias_attr=False, act="relu")
+                name="data", shape=[1, 3, 100, 100], dtype="float32")
+            weight = fluid.layers.create_parameter(
+                shape=[1, 3, 100, 100], dtype="float32")
+            scale = fluid.layers.scale(data, scale=self.scale_scale)
+            matmul = fluid.layers.matmul(
+                scale,
+                weight,
+                transpose_x=self.transpose_x,
+                transpose_y=self.transpose_y)
 
+        self.fetch_list = [matmul]
+        self.enable_mkldnn = True
+
+    def set_params(self):
         self.feeds = {
             "data": np.random.random((1, 3, 100, 100)).astype("float32")
         }
-        self.fetch_list = [conv_out]
-        self.enable_mkldnn = True
+        self.scale_scale = 2.0
+        self.transpose_x = False
+        self.transpose_y = False
+        self.pass_name = "scale_matmul_fuse_pass"
 
     def test_check_output(self):
         use_gpu = False
         self.check_output_with_option(use_gpu)
 
+    def test_pass_compatible(self):
+        self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name))
+
+
+class ScaleMatmulMkldnnFusePassTest_1(ScaleMatmulMkldnnFusePassTest):
+    def set_params(self):
+        self.feeds = {
+            "data": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.scale_scale = 5.0
+        self.transpose_x = True
+        self.transpose_y = True
+        self.pass_name = "scale_matmul_fuse_pass"
+
 
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab


From e550fc02ae93450e9acb6c238f55733dca269c61 Mon Sep 17 00:00:00 2001
From: WangXi <wangxi16@baidu.com>
Date: Fri, 25 Sep 2020 10:58:17 +0800
Subject: [PATCH 221/261] fleet2.0 add fp16 grad compression (#27480)

---
 .../framework/distributed_strategy.proto      |   1 +
 .../fleet/base/distributed_strategy.py        |  23 +++
 .../fleet/meta_optimizers/__init__.py         |   1 +
 .../fp16_allreduce_optimizer.py               | 146 ++++++++++++++++++
 .../fluid/tests/unittests/CMakeLists.txt      |   2 +
 .../unittests/dist_mnist_fp16_allreduce.py    |  63 ++++++++
 .../test_dist_mnist_fp16_allreduce.py         |  33 ++++
 .../test_fleet_distributed_strategy.py        |  10 ++
 ...est_fleet_fp16_allreduce_meta_optimizer.py |  91 +++++++++++
 9 files changed, 370 insertions(+)
 create mode 100755 python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py
 create mode 100644 python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist_fp16_allreduce.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py

diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
index df482f43346..c9ae5a67950 100644
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -127,6 +127,7 @@ message DistributedStrategy {
   optional int32 conv_workspace_size_limit = 22 [ default = 4000 ];
   optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ];
   optional bool adaptive_localsgd = 24 [ default = false ];
+  optional bool fp16_allreduce = 25 [ default = false ];
 
   optional RecomputeConfig recompute_configs = 101;
   optional AMPConfig amp_configs = 102;
diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index f1c836468da..316b6494e34 100755
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -845,6 +845,29 @@ class DistributedStrategy(object):
         check_configs_key(self.strategy.dgc_configs, configs, "dgc_configs")
         assign_configs_value(self.strategy.dgc_configs, configs)
 
+    @property
+    def fp16_allreduce(self):
+        """
+        Indicating whether we are using fp16 gradient allreduce training
+        Default Value: False
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.fp16_allreduce = True # by default this is false
+
+        """
+        return self.strategy.fp16_allreduce
+
+    @fp16_allreduce.setter
+    @is_strict_auto
+    def fp16_allreduce(self, flag):
+        if not isinstance(flag, bool):
+            raise TypeError('fp16_allreduce must be value of bool type')
+        self.strategy.fp16_allreduce = flag
+
     @property
     def gradient_merge(self):
         """
diff --git a/python/paddle/distributed/fleet/meta_optimizers/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/__init__.py
index a3a2dee7038..2e63e82e630 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/__init__.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/__init__.py
@@ -23,3 +23,4 @@ from .lars_optimizer import LarsOptimizer
 from .parameter_server_graph_optimizer import ParameterServerGraphOptimizer
 from .dgc_optimizer import DGCOptimizer
 from .lamb_optimizer import LambOptimizer
+from .fp16_allreduce_optimizer import FP16AllReduceOptimizer
diff --git a/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py
new file mode 100755
index 00000000000..411980ed013
--- /dev/null
+++ b/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py
@@ -0,0 +1,146 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+from paddle.fluid import core, framework, unique_name
+from .meta_optimizer_base import MetaOptimizerBase
+
+
+class FP16AllReduceOptimizer(MetaOptimizerBase):
+    def __init__(self, optimizer):
+        super(FP16AllReduceOptimizer, self).__init__(optimizer)
+        self.inner_opt = optimizer
+        # we do not allow meta optimizer to be inner optimizer currently
+        self.meta_optimizers_white_list = [
+            "LarsOptimizer",
+            "LambOptimizer",
+            "RecomputeOptimizer",
+            "LocalSGDOptimizer",
+            "GradientMergeOptimizer",
+            "GraphExecutionOptimizer",
+            "AdaptiveLocalSGDOptimizer",
+        ]
+        self.meta_optimizers_black_list = ["DGCOptimizer"]
+
+    def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
+                        user_defined_strategy):
+        super(FP16AllReduceOptimizer, self)._set_basic_info(
+            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+
+    def _can_apply(self):
+        if not self.role_maker._is_collective:
+            return False
+
+        if self.user_defined_strategy.fp16_allreduce:
+            return True
+
+        return False
+
+    def _disable_strategy(self, dist_strategy):
+        dist_strategy.fp16_allreduce = False
+
+    def _enable_strategy(self, dist_strategy, context=None):
+        dist_strategy.fp16_allreduce = True
+
+    @staticmethod
+    def fp16_compression(param_and_grads):
+        """
+        Compress fp32 gradients to fp16 during allreduce.
+        """
+        op_maker = core.op_proto_and_checker_maker
+
+        new_param_and_grads = []  # param, grad, is_cast
+        # cast grad from fp32->fp16 before allreduce,
+        for param, grad in param_and_grads:
+            if grad is None or grad.dtype != core.VarDesc.VarType.FP32:
+                new_param_and_grads.append((param, grad, False))
+                continue
+
+            op = grad.op
+            block = grad.block
+            var_attr = op.all_attrs()[op_maker.kOpRoleVarAttrName()]
+            if param.name not in var_attr:
+                new_param_and_grads.append((param, grad, False))
+                continue
+
+            # remove (param, grad) from op_role_var
+            var_attr.remove(param.name)
+            var_attr.remove(grad.name)
+            if len(var_attr) > 1:
+                op._set_attr(op_maker.kOpRoleVarAttrName(), var_attr)
+            else:
+                op._remove_attr(op_maker.kOpRoleVarAttrName())
+
+            new_grad = block.create_var(
+                name=unique_name.generate(grad.name + ".cast_fp16"),
+                dtype=core.VarDesc.VarType.FP16,
+                persistable=False,
+                stop_gradient=True)
+
+            with block.program._backward_role_guard():
+                cast_op = block.append_op(
+                    type="cast",
+                    inputs={"X": grad},
+                    outputs={"Out": new_grad},
+                    attrs={
+                        "in_dtype": core.VarDesc.VarType.FP32,
+                        "out_dtype": core.VarDesc.VarType.FP16
+                    },
+                    stop_gradient=True)
+
+                backward = op_maker.OpRole.Backward
+                cast_op._set_attr(op_maker.kOpRoleAttrName(), backward)
+                cast_op._set_attr(op_maker.kOpRoleVarAttrName(),
+                                  [param.name, new_grad.name])
+                new_grad.op = cast_op
+
+            new_param_and_grads.append((param, new_grad, True))
+
+        ret_param_and_grads = []
+        # cast grad from fp16->fp32 after allreduce.
+        # NOTE. Now we split fp16 compression into two for loops,
+        # if we do not separate them, fuse allreduce will wrong.
+        # This must be the problem of fuse allreduce pass, need
+        # fixed in future.
+        for param, grad, cast in new_param_and_grads:
+            if not cast:
+                ret_param_and_grads.append((param, grad))
+                continue
+
+            block = grad.block
+            new_grad = block.create_var(
+                name=unique_name.generate(grad.name + ".cast_fp32"),
+                dtype=core.VarDesc.VarType.FP32,
+                persistable=False,
+                stop_gradient=True)
+
+            with block.program._optimized_guard(
+                [param, grad]), framework.name_scope('fp16_allreduce'):
+                cast_op = block.append_op(
+                    type="cast",
+                    inputs={"X": grad},
+                    outputs={"Out": new_grad},
+                    attrs={
+                        "in_dtype": core.VarDesc.VarType.FP16,
+                        "out_dtype": core.VarDesc.VarType.FP32
+                    },
+                    stop_gradient=True)
+            ret_param_and_grads.append((param, new_grad))
+
+        return ret_param_and_grads
+
+    def apply_optimize(self, loss, startup_program, params_grads):
+        new_params_grads = self.fp16_compression(params_grads)
+        return self.inner_opt.apply_optimize(
+            loss,
+            startup_program=startup_program,
+            params_grads=new_params_grads)
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 94bc6235ad1..2f8952a4431 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -45,6 +45,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_lars_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_lamb_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_fp16_allreduce_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base)
@@ -458,6 +459,7 @@ if(WITH_DISTRIBUTE)
 	       py_test_modules(test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS ${dist_ENVS})
            py_test_modules(test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS})
            py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS})
+           py_test_modules(test_fleet_fp16_allreduce_meta_optimizer MODULES test_fleet_fp16_allreduce_meta_optimizer ENVS ${dist_ENVS})
     	   py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS})
     	   py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS})
 	   py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS})
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py b/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py
new file mode 100644
index 00000000000..3198c6cac86
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+from paddle.distributed.fleet.meta_optimizers import FP16AllReduceOptimizer as FP16AllReduce
+from test_dist_base import TestDistRunnerBase, runtime_main
+from dist_mnist import cnn_model
+
+DTYPE = "float32"
+paddle.dataset.mnist.fetch()
+
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+
+class TestDistMnist2x2(TestDistRunnerBase):
+    def get_model(self, batch_size=2):
+        # Input data
+        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+        # Train program
+        predict = cnn_model(images)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        # Evaluator
+        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+        batch_acc = fluid.layers.accuracy(
+            input=predict, label=label, total=batch_size_tensor)
+
+        inference_program = fluid.default_main_program().clone()
+        # Optimization
+        opt = fluid.optimizer.MomentumOptimizer(
+            learning_rate=0.001, momentum=0.9)
+        opt = FP16AllReduce(opt)
+
+        # Reader
+        train_reader = paddle.batch(
+            paddle.dataset.mnist.test(), batch_size=batch_size)
+        test_reader = paddle.batch(
+            paddle.dataset.mnist.test(), batch_size=batch_size)
+        opt.minimize(avg_cost)
+        return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
+
+
+if __name__ == "__main__":
+    runtime_main(TestDistMnist2x2)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_fp16_allreduce.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_fp16_allreduce.py
new file mode 100644
index 00000000000..d74d08681c1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_fp16_allreduce.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+from test_dist_base import TestDistBase
+
+
+class TestDistMnist2x2FP16AllReduce(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_reduce = False
+        self._nccl2_mode = True
+
+    def test_dist_train(self):
+        import paddle.fluid as fluid
+        if fluid.core.is_compiled_with_cuda():
+            self.check_with_place("dist_mnist_fp16_allreduce.py", delta=1e-5)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
index b20f33e11b6..deaf342da12 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
@@ -102,6 +102,16 @@ class TestStrategyConfig(unittest.TestCase):
         strategy.dgc = "True"
         self.assertEqual(strategy.dgc, False)
 
+    def test_fp16_allreduce(self):
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.fp16_allreduce = True
+        self.assertEqual(strategy.fp16_allreduce, True)
+        strategy.fp16_allreduce = False
+        self.assertEqual(strategy.fp16_allreduce, False)
+        with self.assertRaises(TypeError):
+            strategy.fp16_allreduce = "True"
+        self.assertEqual(strategy.fp16_allreduce, False)
+
     def test_sync_nccl_allreduce(self):
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.sync_nccl_allreduce = True
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py
new file mode 100644
index 00000000000..efffa9fa88f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
+import unittest
+import paddle
+import paddle.fluid as fluid
+import os
+
+paddle.enable_static()
+
+
+class TestFleetFP16CompressOptimizer(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+
+    def net(self, main_prog, startup_prog, dtype='float32'):
+        with fluid.program_guard(main_prog, startup_prog):
+            input_x = paddle.fluid.layers.data(
+                name="x", shape=[32], dtype=dtype)
+            input_y = paddle.fluid.layers.data(
+                name="y", shape=[1], dtype='int64')
+
+            fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+            fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+            prediction = paddle.fluid.layers.fc(input=[fc_2],
+                                                size=2,
+                                                act='softmax')
+            cost = paddle.fluid.layers.cross_entropy(
+                input=prediction, label=input_y)
+            avg_cost = paddle.fluid.layers.mean(x=cost)
+
+            strategy = paddle.distributed.fleet.DistributedStrategy()
+            strategy.fp16_allreduce = True
+        return avg_cost, strategy
+
+    def test_fp16_allreduce_optimizer(self):
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        ops = [op.type for op in avg_cost.block.ops]
+        cast_out = [
+            op.output('Out')[0] for op in avg_cost.block.ops
+            if op.type == 'cast'
+        ]
+
+        cast_op_count = 0
+        for name in ops:
+            if name == 'cast':
+                cast_op_count += 1
+        self.assertIn('cast', ops)
+        self.assertEqual(cast_op_count, 12)  # 6 + 6, cast_fp16 + cast_fp32
+
+        for name in cast_out:
+            self.assertIn('cast_fp16', name)
+
+    def test_fp16_allreduce_not_apply_fp16_net(self):
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog, dtype='float16')
+
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        ops = [op.type for op in avg_cost.block.ops]
+        self.assertNotIn('cast', ops)
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab


From dd04b160d9492969715f3dc5caff5e73ab321b27 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 25 Sep 2020 11:32:28 +0800
Subject: [PATCH 222/261] fix test imperative se resnext failed (#27538)

---
 .../unittests/test_imperative_se_resnext.py   | 25 +++++++++++++++----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
index a04e1e4e5aa..e47a70054be 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
@@ -25,6 +25,9 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
 from paddle.fluid.dygraph.base import to_variable
 from test_imperative_base import new_program_scope
 
+if fluid.is_compiled_with_cuda():
+    fluid.set_flags({'FLAGS_cudnn_deterministic': True})
+
 batch_size = 8
 train_parameters = {
     "input_size": [3, 224, 224],
@@ -340,7 +343,9 @@ class TestImperativeResneXt(unittest.TestCase):
                     label.stop_gradient = True
 
                     out = se_resnext(img)
-                    loss = fluid.layers.cross_entropy(input=out, label=label)
+                    softmax_out = fluid.layers.softmax(out, use_cudnn=False)
+                    loss = fluid.layers.cross_entropy(
+                        input=softmax_out, label=label)
                     avg_loss = fluid.layers.mean(x=loss)
 
                     dy_out = avg_loss.numpy()
@@ -386,7 +391,8 @@ class TestImperativeResneXt(unittest.TestCase):
                 name='pixel', shape=[3, 224, 224], dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
             out = se_resnext(img)
-            loss = fluid.layers.cross_entropy(input=out, label=label)
+            softmax_out = fluid.layers.softmax(out, use_cudnn=False)
+            loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
             avg_loss = fluid.layers.mean(x=loss)
             optimizer.minimize(avg_loss)
 
@@ -443,7 +449,9 @@ class TestImperativeResneXt(unittest.TestCase):
                         static_grad_value[static_grad_name_list[
                             i - grad_start_pos]] = out[i]
 
-        self.assertTrue(np.allclose(static_out, dy_out))
+        self.assertTrue(
+            np.allclose(static_out, dy_out),
+            "\nstatic_out: {}\ndy_out: {}".format(static_out, dy_out))
 
         self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
 
@@ -455,16 +463,23 @@ class TestImperativeResneXt(unittest.TestCase):
         self.assertEqual(len(dy_grad_value), len(static_grad_value))
 
         for key, value in six.iteritems(static_grad_value):
-            self.assertTrue(np.allclose(value, dy_grad_value[key]))
+            self.assertTrue(
+                np.allclose(value, dy_grad_value[key]),
+                "\nstatic_grad_value: {}\ndy_grad_value: {}".format(
+                    value, dy_grad_value[key]))
             self.assertTrue(np.isfinite(value.all()))
             self.assertFalse(np.isnan(value.any()))
 
         self.assertEqual(len(dy_param_value), len(static_param_value))
         for key, value in six.iteritems(static_param_value):
-            self.assertTrue(np.allclose(value, dy_param_value[key]))
+            self.assertTrue(
+                np.allclose(value, dy_param_value[key]),
+                "\nstatic_param_value: {}\ndy_param_value: {}".format(
+                    value, dy_param_value[key]))
             self.assertTrue(np.isfinite(value.all()))
             self.assertFalse(np.isnan(value.any()))
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
-- 
GitLab


From 597345d17b5f6252878bfb0f62133f77f485d7ef Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Fri, 25 Sep 2020 11:58:53 +0800
Subject: [PATCH 223/261] fix cuda atomic for ARCH<350 for the automic_max

fix cuda atomic for ARCH<350 for the automic_max
---
 paddle/fluid/platform/cuda_primitives.h | 38 +++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h
index f7c77071b12..a5dd19d4363 100644
--- a/paddle/fluid/platform/cuda_primitives.h
+++ b/paddle/fluid/platform/cuda_primitives.h
@@ -134,7 +134,26 @@ USE_CUDA_ATOMIC(Max, int);
 USE_CUDA_ATOMIC(Max, unsigned int);
 // CUDA API uses unsigned long long int, we cannot use uint64_t here.
 // It because unsigned long long int is not necessarily uint64_t
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
 USE_CUDA_ATOMIC(Max, unsigned long long int);  // NOLINT
+#else
+CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) {
+  if (*address >= val) {
+    return;
+  }
+
+  unsigned long long int old = *address, assumed;
+
+  do {
+    assumed = old;
+    if (assumed >= val) {
+      break;
+    }
+
+    old = atomicCAS(address, assumed, val);
+  } while (assumed != old);
+}
+#endif
 
 CUDA_ATOMIC_WRAPPER(Max, int64_t) {
   // Here, we check long long int must be int64_t.
@@ -187,7 +206,26 @@ USE_CUDA_ATOMIC(Min, int);
 USE_CUDA_ATOMIC(Min, unsigned int);
 // CUDA API uses unsigned long long int, we cannot use uint64_t here.
 // It because unsigned long long int is not necessarily uint64_t
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
 USE_CUDA_ATOMIC(Min, unsigned long long int);  // NOLINT
+#else
+CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) {
+  if (*address <= val) {
+    return;
+  }
+
+  unsigned long long int old = *address, assumed;
+
+  do {
+    assumed = old;
+    if (assumed <= val) {
+      break;
+    }
+
+    old = atomicCAS(address, assumed, val);
+  } while (assumed != old);
+}
+#endif
 
 CUDA_ATOMIC_WRAPPER(Min, int64_t) {
   // Here, we check long long int must be int64_t.
-- 
GitLab


From 6e16a0997c7017f3167fe6b672d92c4e5bfefc42 Mon Sep 17 00:00:00 2001
From: zhang wenhui <frankwhzhang@126.com>
Date: Fri, 25 Sep 2020 13:05:22 +0800
Subject: [PATCH 224/261] fix unittest_group_norm_op_v2, test=develop (#27486)

* fix unittest_group_norm_op_v2, test=develop

* fix unittest_group_norm_op_v2, test=develop
---
 python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
index a46b9b0ca78..833eeb33641 100644
--- a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
@@ -56,7 +56,10 @@ class TestDygraphGroupNormv2(unittest.TestCase):
             x = np.random.randn(*shape).astype("float32")
             y1 = compute_v1(x)
             y2 = compute_v2(x)
-            self.assertTrue(np.allclose(y1, y2))
+            result = np.allclose(y1, y2)
+            if not result:
+                print("y1:", y1, "\ty2:", y2)
+            self.assertTrue(result)
             test_weight_bias_false()
 
     def test_static(self):
-- 
GitLab


From 77a36f8997e0c2297144a871d04331e9c9478896 Mon Sep 17 00:00:00 2001
From: Shang Zhizhou <shangzhizhou@baidu.com>
Date: Fri, 25 Sep 2020 13:27:40 +0800
Subject: [PATCH 225/261] [buf fix]:fix some unittests  error (#27540)

* [buf fix]:fix unittest test_activation_op error

* split long-time unittests to smaller ones

* rename some unittests
---
 .../fluid/inference/tests/api/CMakeLists.txt  | 22 +++++-
 ...yzer_seq_pool1_compare_determine_tester.cc | 40 ++++++++++
 .../api/analyzer_seq_pool1_compare_tester.cc  | 39 +++++++++
 ...seq_pool1_fuse_compare_zero_copy_tester.cc | 46 +++++++++++
 .../analyzer_seq_pool1_fuse_statis_tester.cc  | 48 +++++++++++
 .../api/analyzer_seq_pool1_profile_tester.cc  | 42 ++++++++++
 ....cc => analyzer_seq_pool1_tester_helper.h} | 79 ++-----------------
 .../analyzer_transformer_compare_tester.cc    | 44 +++++++++++
 .../api/analyzer_transformer_fuse_tester.cc   | 36 +++++++++
 .../analyzer_transformer_profile_tester.cc    | 45 +++++++++++
 ...c => analyzer_transformer_tester_helper.h} | 61 ++------------
 .../tests/unittests/test_activation_op.py     | 53 +++++++++++++
 12 files changed, 427 insertions(+), 128 deletions(-)
 create mode 100644 paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc
 create mode 100644 paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc
 create mode 100644 paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc
 create mode 100644 paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc
 create mode 100644 paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc
 rename paddle/fluid/inference/tests/api/{analyzer_seq_pool1_tester.cc => analyzer_seq_pool1_tester_helper.h} (70%)
 create mode 100644 paddle/fluid/inference/tests/api/analyzer_transformer_compare_tester.cc
 create mode 100644 paddle/fluid/inference/tests/api/analyzer_transformer_fuse_tester.cc
 create mode 100644 paddle/fluid/inference/tests/api/analyzer_transformer_profile_tester.cc
 rename paddle/fluid/inference/tests/api/{analyzer_transformer_tester.cc => analyzer_transformer_tester_helper.h} (82%)

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 28211d0ce08..5d6970fc4e3 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -132,9 +132,17 @@ if(NOT APPLE AND WITH_MKLML)
     # seq_pool1
     set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool")
     download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz")
-    inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc)
+    inference_analysis_api_test(test_analyzer_seq_pool1_compare_determine ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_compare_determine_tester.cc)
+    inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_compare_tester.cc)
+    inference_analysis_api_test(test_analyzer_seq_pool1_fuse_compare_zero_copy ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc)
+    inference_analysis_api_test(test_analyzer_seq_pool1_fuse_statis ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_statis_tester.cc)
+    inference_analysis_api_test(test_analyzer_seq_pool1_profile ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_profile_tester.cc)
     if(NOT WIN32)
-        set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150)
+        set_tests_properties(test_analyzer_seq_pool1_compare_determine PROPERTIES TIMEOUT 120)
+        set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 120)
+        set_tests_properties(test_analyzer_seq_pool1_fuse_compare_zero_copy PROPERTIES TIMEOUT 120)
+        set_tests_properties(test_analyzer_seq_pool1_fuse_statis PROPERTIES TIMEOUT 120)
+        set_tests_properties(test_analyzer_seq_pool1_profile PROPERTIES TIMEOUT 120)
     endif()
 else()
     # TODO: fix this test on MACOS and OPENBLAS, the reason is that
@@ -215,7 +223,15 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
 # transformer, the dataset only works on batch_size=8 now
 set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer")
 download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp/transformer_model.tar.gz" "temp/transformer_data.txt.tar.gz")
-inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc 
+inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_compare_tester.cc 
+  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+  ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 
+       --cpu_num_threads=${CPU_NUM_THREADS_ON_CI})
+inference_analysis_test(test_analyzer_transformer_fuse SRCS analyzer_transformer_fuse_tester.cc 
+  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+  ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 
+       --cpu_num_threads=${CPU_NUM_THREADS_ON_CI})
+inference_analysis_test(test_analyzer_transformer_profile SRCS analyzer_transformer_profile_tester.cc 
   EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
   ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 
        --cpu_num_threads=${CPU_NUM_THREADS_ON_CI})
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc
new file mode 100644
index 00000000000..8f0778b83e5
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc
@@ -0,0 +1,40 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+namespace seq_pool1_tester {
+
+// Compare Deterministic result
+TEST(Analyzer_seq_pool1_compare_determine, compare_determine) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                       input_slots_all);
+}
+
+}  // namespace seq_pool1_tester
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc
new file mode 100644
index 00000000000..099ff1f31a7
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+namespace seq_pool1_tester {
+
+TEST(Analyzer_seq_pool1_compare, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
+}
+
+}  // namespace seq_pool1_tester
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc
new file mode 100644
index 00000000000..1fbcbf1a3f4
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc
@@ -0,0 +1,46 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+namespace seq_pool1_tester {
+
+// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
+TEST(Analyzer_seq_pool1_compare_zero_copy, compare_zero_copy) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  AnalysisConfig cfg1;
+  SetConfig(&cfg1);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  std::vector<std::string> outputs_name;
+  outputs_name.emplace_back(out_var_name);
+  CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
+                             reinterpret_cast<PaddlePredictor::Config *>(&cfg1),
+                             input_slots_all, outputs_name);
+}
+
+}  // namespace seq_pool1_tester
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc
new file mode 100644
index 00000000000..b8ccb8cee50
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc
@@ -0,0 +1,48 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+namespace seq_pool1_tester {
+
+// Check the fuse status
+TEST(Analyzer_seq_pool1_fuse_statis, fuse_statis) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  int num_ops;
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
+  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+  ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse"));
+  ASSERT_TRUE(fuse_statis.count("squared_mat_sub_fuse"));
+  ASSERT_TRUE(fuse_statis.count("repeated_fc_relu_fuse"));
+  ASSERT_EQ(fuse_statis.at("fc_fuse"), 10);
+  EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2);
+  EXPECT_EQ(fuse_statis.at("squared_mat_sub_fuse"), 2);
+  EXPECT_EQ(fuse_statis.at("repeated_fc_relu_fuse"), 2);
+  LOG(INFO) << "num_ops: " << num_ops;
+  EXPECT_EQ(num_ops, 171);
+}
+
+}  // namespace seq_pool1_tester
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc
new file mode 100644
index 00000000000..0ccd95f2a17
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc
@@ -0,0 +1,42 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+namespace seq_pool1_tester {
+
+void profile(bool use_mkldnn = false) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg, use_mkldnn);
+
+  std::vector<std::vector<PaddleTensor>> outputs;
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);
+}
+
+TEST(Analyzer_seq_pool1_profile, profile) { profile(); }
+
+}  // namespace seq_pool1_tester
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h
similarity index 70%
rename from paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
rename to paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h
index 9f1556cdb87..0dac11bc345 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h
@@ -11,15 +11,20 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
+#pragma once
 #include <algorithm>
 #include <fstream>
 #include <iostream>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
+namespace seq_pool1_tester {
 
 // diff: similarity_norm.tmp_0, for speed: fc_4.tmp_1
 static const char out_var_name[] = "reduce_sum_0.tmp_0";
@@ -164,77 +169,7 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
   cfg->pass_builder()->InsertPass(2, "seqpool_concat_fuse_pass");
 }
 
-void profile(bool use_mkldnn = false) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg, use_mkldnn);
-
-  std::vector<std::vector<PaddleTensor>> outputs;
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, FLAGS_num_threads);
-}
-
-TEST(Analyzer_seq_pool1, profile) { profile(); }
-
-// Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_seq_pool1, compare) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
-}
-
-// Compare Deterministic result
-TEST(Analyzer_seq_pool1, compare_determine) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                       input_slots_all);
-}
-
-// Check the fuse status
-TEST(Analyzer_seq_pool1, fuse_statis) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  int num_ops;
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-  auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
-  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse"));
-  ASSERT_TRUE(fuse_statis.count("squared_mat_sub_fuse"));
-  ASSERT_TRUE(fuse_statis.count("repeated_fc_relu_fuse"));
-  ASSERT_EQ(fuse_statis.at("fc_fuse"), 10);
-  EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2);
-  EXPECT_EQ(fuse_statis.at("squared_mat_sub_fuse"), 2);
-  EXPECT_EQ(fuse_statis.at("repeated_fc_relu_fuse"), 2);
-  LOG(INFO) << "num_ops: " << num_ops;
-  EXPECT_EQ(num_ops, 171);
-}
-
-// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
-TEST(Analyzer_seq_pool1, compare_zero_copy) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  AnalysisConfig cfg1;
-  SetConfig(&cfg1);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  std::vector<std::string> outputs_name;
-  outputs_name.emplace_back(out_var_name);
-  CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
-                             reinterpret_cast<PaddlePredictor::Config *>(&cfg1),
-                             input_slots_all, outputs_name);
-}
-
+}  // namespace seq_pool1_tester
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_compare_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_compare_tester.cc
new file mode 100644
index 00000000000..f26ec57103b
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/analyzer_transformer_compare_tester.cc
@@ -0,0 +1,44 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+namespace transformer_tester {
+
+void compare(bool use_mkldnn = false) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+    cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
+  }
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
+}
+
+TEST(Analyzer_Transformer, compare) { compare(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); }
+#endif
+
+}  // namespace transformer_tester
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_fuse_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_fuse_tester.cc
new file mode 100644
index 00000000000..4e5484c9ea0
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/analyzer_transformer_fuse_tester.cc
@@ -0,0 +1,36 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+namespace transformer_tester {
+
+// Check the fuse status
+TEST(Analyzer_Transformer, fuse_statis) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  int num_ops;
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  auto fuse_statis = GetFuseStatis(
+      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
+}
+
+}  // namespace transformer_tester
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_profile_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_profile_tester.cc
new file mode 100644
index 00000000000..caeba327716
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/analyzer_transformer_profile_tester.cc
@@ -0,0 +1,45 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+namespace transformer_tester {
+
+void profile(bool use_mkldnn = false) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<std::vector<PaddleTensor>> outputs;
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+    cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
+  }
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);
+}
+
+TEST(Analyzer_Transformer, profile) { profile(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_Transformer, profile_mkldnn) { profile(true); }
+#endif
+
+}  // namespace transformer_tester
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h
similarity index 82%
rename from paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
rename to paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h
index 9726109bf89..e43456ed832 100644
--- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h
@@ -11,11 +11,16 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
+#pragma once
+#include <string>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
 namespace inference {
+namespace analysis {
+namespace transformer_tester {
 
 struct DataRecord {
   std::vector<std::vector<int64_t>> src_word, src_pos, trg_word, init_idx;
@@ -182,57 +187,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
   }
 }
 
-// Easy for profiling independently.
-void profile(bool use_mkldnn = false) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  std::vector<std::vector<PaddleTensor>> outputs;
-  if (use_mkldnn) {
-    cfg.EnableMKLDNN();
-    cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
-  }
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, FLAGS_num_threads);
-}
-
-TEST(Analyzer_Transformer, profile) { profile(); }
-#ifdef PADDLE_WITH_MKLDNN
-TEST(Analyzer_Transformer, profile_mkldnn) { profile(true); }
-#endif
-
-// Check the fuse status
-TEST(Analyzer_Transformer, fuse_statis) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  int num_ops;
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-  auto fuse_statis = GetFuseStatis(
-      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
-}
-
-// Compare result of NativeConfig and AnalysisConfig
-void compare(bool use_mkldnn = false) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  if (use_mkldnn) {
-    cfg.EnableMKLDNN();
-    cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
-  }
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
-}
-
-TEST(Analyzer_Transformer, compare) { compare(); }
-#ifdef PADDLE_WITH_MKLDNN
-TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); }
-#endif
-
+}  // namespace transformer_tester
+}  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index f6ba03194aa..791f1ee2dfa 100755
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -28,6 +28,7 @@ from paddle.fluid import compiler, Program, program_guard
 
 class TestSqrtOpError(unittest.TestCase):
     def test_errors(self):
+        paddle.enable_static()
         with program_guard(Program(), Program()):
             # The input type of sqrt op must be Variable or numpy.ndarray.
             in1 = 1
@@ -44,6 +45,7 @@ class TestSqrtOpError(unittest.TestCase):
 
 class TestActivation(OpTest):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "exp"
         self.init_dtype()
         self.init_kernel_type()
@@ -71,6 +73,7 @@ class TestActivation(OpTest):
 
 class TestParameter(object):
     def test_out_name(self):
+        paddle.enable_static()
         with fluid.program_guard(fluid.Program()):
             np_x = np.array([0.1])
             data = fluid.layers.data(name="X", shape=[1])
@@ -92,6 +95,7 @@ class TestParameter(object):
 
 class TestSigmoid(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "sigmoid"
         self.init_dtype()
 
@@ -112,6 +116,7 @@ class TestSigmoid(TestActivation):
 
 class TestLogSigmoid(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "logsigmoid"
         self.init_dtype()
 
@@ -180,6 +185,7 @@ class TestLogSigmoidAPI(unittest.TestCase):
 
 class TestTanh(TestActivation, TestParameter):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "tanh"
         self.init_dtype()
         x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
@@ -255,6 +261,7 @@ class TestTanhAPI(unittest.TestCase):
 
 class TestAtan(TestActivation, TestParameter):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "atan"
         self.init_dtype()
 
@@ -291,6 +298,7 @@ class TestAtan(TestActivation, TestParameter):
 
 class TestSinh(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "sinh"
         self.init_dtype()
 
@@ -349,6 +357,7 @@ class TestSinh(TestActivation):
 
 class TestSinhOpError(unittest.TestCase):
     def test_errors(self):
+        paddle.enable_static()
         with program_guard(Program()):
             # The input type must be Variable.
             self.assertRaises(TypeError, fluid.layers.sinh, 1)
@@ -362,6 +371,7 @@ class TestSinhOpError(unittest.TestCase):
 
 class TestCosh(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "cosh"
         self.init_dtype()
 
@@ -420,6 +430,7 @@ class TestCosh(TestActivation):
 
 class TestCoshOpError(unittest.TestCase):
     def test_errors(self):
+        paddle.enable_static()
         with program_guard(Program()):
             # The input type must be Variable.
             self.assertRaises(TypeError, fluid.layers.cosh, 1)
@@ -438,6 +449,7 @@ def ref_tanhshrink(x):
 
 class TestTanhshrink(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "tanh_shrink"
         self.init_dtype()
 
@@ -512,6 +524,7 @@ def ref_hardshrink(x, threshold):
 
 class TestHardShrink(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "hard_shrink"
         self.init_dtype()
 
@@ -541,6 +554,7 @@ class TestHardShrink_threshold_negative(TestHardShrink):
 class TestHardShrinkAPI(unittest.TestCase):
     # test paddle.nn.Hardshrink, paddle.nn.functional.hardshrink
     def setUp(self):
+        paddle.enable_static()
         self.x_np = np.random.uniform(-1, 1, [10, 12]).astype('float32')
         self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
             else paddle.CPUPlace()
@@ -662,6 +676,7 @@ def ref_softshrink(x, threshold=0.5):
 
 class TestSoftshrink(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "softshrink"
         self.init_dtype()
 
@@ -736,6 +751,7 @@ class TestSoftshrinkAPI(unittest.TestCase):
 
 class TestSqrt(TestActivation, TestParameter):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "sqrt"
         self.init_dtype()
 
@@ -753,6 +769,7 @@ class TestSqrt(TestActivation, TestParameter):
 
 class TestRsqrt(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "rsqrt"
         self.init_dtype()
 
@@ -770,6 +787,7 @@ class TestRsqrt(TestActivation):
 
 class TestAbs(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "abs"
         self.init_dtype()
 
@@ -792,6 +810,7 @@ class TestAbs(TestActivation):
 
 class TestCeil(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "ceil"
         self.init_dtype()
 
@@ -808,6 +827,7 @@ class TestCeil(TestActivation):
 
 class TestFloor(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "floor"
         self.init_dtype()
 
@@ -826,6 +846,7 @@ class TestFloor(TestActivation):
 
 class TestCos(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "cos"
         self.init_dtype()
 
@@ -843,6 +864,7 @@ class TestCos(TestActivation):
 
 class TestAcos(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "acos"
         self.init_dtype()
 
@@ -860,6 +882,7 @@ class TestAcos(TestActivation):
 
 class TestSin(TestActivation, TestParameter):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "sin"
         self.init_dtype()
 
@@ -877,6 +900,7 @@ class TestSin(TestActivation, TestParameter):
 
 class TestAsin(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "asin"
         self.init_dtype()
 
@@ -894,6 +918,7 @@ class TestAsin(TestActivation):
 
 class TestRound(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "round"
         self.init_dtype()
 
@@ -909,6 +934,7 @@ class TestRound(TestActivation):
 
 class TestRelu(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "relu"
         self.init_dtype()
 
@@ -979,6 +1005,7 @@ class TestLeakyRelu(TestActivation):
         return 0.02
 
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "leaky_relu"
         self.init_dtype()
         alpha = self.get_alpha()
@@ -1084,6 +1111,7 @@ def gelu(x, approximate):
 
 class TestGeluApproximate(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "gelu"
         self.init_dtype()
         approximate = True
@@ -1102,6 +1130,7 @@ class TestGeluApproximate(TestActivation):
 
 class TestGelu(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "gelu"
         self.init_dtype()
         approximate = False
@@ -1169,6 +1198,7 @@ class TestGELUAPI(unittest.TestCase):
 
 class TestBRelu(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "brelu"
         self.init_dtype()
 
@@ -1194,6 +1224,7 @@ class TestBRelu(TestActivation):
 
 class TestBReluOpError(unittest.TestCase):
     def test_errors(self):
+        paddle.enable_static()
         with program_guard(Program()):
             # The input type must be Variable.
             self.assertRaises(TypeError, fluid.layers.brelu, 1)
@@ -1215,6 +1246,7 @@ def ref_relu6(x, threshold=6.0):
 
 class TestRelu6(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "relu6"
         self.init_dtype()
 
@@ -1286,6 +1318,7 @@ class TestRelu6API(unittest.TestCase):
 
 class TestHardSwish(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = 'hard_swish'
         self.init_dtype()
 
@@ -1310,6 +1343,7 @@ class TestHardSwish(TestActivation):
 
 class TestHardSwishOpError(unittest.TestCase):
     def test_errors(self):
+        paddle.enable_static()
         with program_guard(Program()):
             # The input type must be Variable.
             self.assertRaises(TypeError, fluid.layers.hard_swish, 1)
@@ -1323,6 +1357,7 @@ class TestHardSwishOpError(unittest.TestCase):
 
 class TestSoftRelu(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "soft_relu"
         self.init_dtype()
 
@@ -1348,6 +1383,7 @@ class TestSoftRelu(TestActivation):
 
 class TestSoftReluOpError(unittest.TestCase):
     def test_errors(self):
+        paddle.enable_static()
         with program_guard(Program()):
             # The input type must be Variable.
             self.assertRaises(TypeError, fluid.layers.soft_relu, 1)
@@ -1366,6 +1402,7 @@ def elu(x, alpha):
 
 class TestELU(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "elu"
         self.init_dtype()
 
@@ -1435,6 +1472,7 @@ class TestELUAPI(unittest.TestCase):
 
 class TestReciprocal(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "reciprocal"
         self.init_dtype()
 
@@ -1452,6 +1490,7 @@ class TestReciprocal(TestActivation):
 
 class TestLog(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "log"
         self.init_dtype()
 
@@ -1478,6 +1517,7 @@ class TestLog(TestActivation):
 
 class TestLog1p(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "log1p"
         self.init_dtype()
 
@@ -1522,6 +1562,7 @@ class TestLog1p(TestActivation):
 
 class TestSquare(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "square"
         self.init_dtype()
 
@@ -1539,6 +1580,7 @@ class TestSquare(TestActivation):
 
 class TestPow(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "pow"
         self.init_dtype()
 
@@ -1557,6 +1599,7 @@ class TestPow(TestActivation):
 
 class TestPow_factor_tensor(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "pow"
         self.init_dtype()
 
@@ -1633,6 +1676,7 @@ class TestPow_factor_tensor(TestActivation):
 
 class TestSTanh(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "stanh"
         self.init_dtype()
 
@@ -1653,6 +1697,7 @@ class TestSTanh(TestActivation):
 
 class TestSTanhOpError(unittest.TestCase):
     def test_errors(self):
+        paddle.enable_static()
         with program_guard(Program()):
             # The input type must be Variable.
             self.assertRaises(TypeError, fluid.layers.stanh, 1)
@@ -1673,6 +1718,7 @@ def ref_softplus(x, beta=1, threshold=20):
 
 class TestSoftplus(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "softplus"
         self.init_dtype()
 
@@ -1751,6 +1797,7 @@ def ref_softsign(x):
 
 class TestSoftsign(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "softsign"
         self.init_dtype()
 
@@ -1818,6 +1865,7 @@ class TestSoftsignAPI(unittest.TestCase):
 
 class TestThresholdedRelu(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "thresholded_relu"
         self.init_dtype()
 
@@ -1841,6 +1889,7 @@ class TestThresholdedRelu(TestActivation):
 
 class TestThresholdedReluOpError(unittest.TestCase):
     def test_errors(self):
+        paddle.enable_static()
         with program_guard(Program()):
             # The input type must be Variable.
             self.assertRaises(TypeError, fluid.layers.thresholded_relu, 1)
@@ -1854,6 +1903,7 @@ class TestThresholdedReluOpError(unittest.TestCase):
 
 class TestHardSigmoid(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "hard_sigmoid"
         self.init_dtype()
 
@@ -1883,6 +1933,7 @@ class TestHardSigmoid(TestActivation):
 
 class TestHardSigmoidOpError(unittest.TestCase):
     def test_errors(self):
+        paddle.enable_static()
         with program_guard(Program()):
             # The input type must be Variable.
             self.assertRaises(TypeError, fluid.layers.hard_sigmoid, 1)
@@ -1896,6 +1947,7 @@ class TestHardSigmoidOpError(unittest.TestCase):
 
 class TestSwish(TestActivation):
     def setUp(self):
+        paddle.enable_static()
         self.op_type = "swish"
         self.init_dtype()
 
@@ -1915,6 +1967,7 @@ class TestSwish(TestActivation):
 
 class TestSwishOpError(unittest.TestCase):
     def test_errors(self):
+        paddle.enable_static()
         with program_guard(Program()):
             # The input type must be Variable.
             self.assertRaises(TypeError, fluid.layers.swish, 1)
-- 
GitLab


From 6bb02e8e3c14cd3ba7bdd80ee44fd93c8a9ade6b Mon Sep 17 00:00:00 2001
From: Leo Chen <chenqiuliang@baidu.com>
Date: Fri, 25 Sep 2020 13:41:54 +0800
Subject: [PATCH 226/261] increase retry time (#27553)

---
 paddle/fluid/memory/allocation/retry_allocator_test.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc
index 0e81f5f2238..5d3e133f97d 100644
--- a/paddle/fluid/memory/allocation/retry_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
+
 #include <algorithm>
 #include <chrono>              // NOLINT
 #include <condition_variable>  // NOLINT
@@ -20,6 +21,7 @@
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
@@ -45,7 +47,7 @@ TEST(RetryAllocator, RetryAllocator) {
 
   size_t thread_num = 4;
   size_t sleep_time = 40;
-  size_t extra_time = 10;
+  size_t extra_time = 20;
 
   // Reserve to perform more tests in the future
   std::vector<std::shared_ptr<Allocator>> allocators;
-- 
GitLab


From effd51b6bebdd79df798ab66eeba5465886f560e Mon Sep 17 00:00:00 2001
From: liym27 <33742067+liym27@users.noreply.github.com>
Date: Fri, 25 Sep 2020 14:06:03 +0800
Subject: [PATCH 227/261] Fix error message in operator/utils.h (#27532)

---
 paddle/fluid/operators/utils.h | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/utils.h b/paddle/fluid/operators/utils.h
index aec995304a7..05d077b173a 100644
--- a/paddle/fluid/operators/utils.h
+++ b/paddle/fluid/operators/utils.h
@@ -41,7 +41,9 @@ inline std::vector<T> GetDataFromTensor(const framework::Tensor* x) {
     // NOTE: Converting int64 to int32 may cause data overflow.
     vec_new_data = std::vector<T>(data, data + x->numel());
   } else {
-    PADDLE_THROW("The dtype of Tensor must be int32 or int64.");
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "The dtype of Tensor must be int32 or int64, but received: %s",
+        x->type()));
   }
   return vec_new_data;
 }
@@ -53,10 +55,11 @@ inline std::vector<T> GetDataFromTensorList(
   for (size_t i = 0; i < list_tensor.size(); ++i) {
     auto tensor = list_tensor[i];
     PADDLE_ENFORCE_EQ(tensor->dims(), framework::make_ddim({1}),
-                      "ShapeError: The shape of Tensor in list must be [1]. "
-                      "But received the shape "
-                      "is [%s]",
-                      tensor->dims());
+                      platform::errors::InvalidArgument(
+                          "The shape of Tensor in list must be [1]. "
+                          "But received its shape "
+                          "is [%s]",
+                          tensor->dims()));
 
     if (tensor->type() == framework::proto::VarType::INT32) {
       if (platform::is_gpu_place(tensor->place())) {
@@ -76,7 +79,10 @@ inline std::vector<T> GetDataFromTensorList(
         vec_new_data.push_back(static_cast<T>(*tensor->data<int64_t>()));
       }
     } else {
-      PADDLE_THROW("The dtype of Tensor in list must be int32 or int64.");
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "The dtype of Tensor in list must be int32 or int64, but received: "
+          "%s",
+          tensor->type()));
     }
   }
   return vec_new_data;
-- 
GitLab


From 36ed83d27005c22a8af19211638c343c4dc5b759 Mon Sep 17 00:00:00 2001
From: GaoWei8 <53294385+GaoWei8@users.noreply.github.com>
Date: Fri, 25 Sep 2020 14:09:10 +0800
Subject: [PATCH 228/261] Refine PADDLE_ENFORCE (#27360)

* refine PADDLE_ENFORCE
---
 paddle/fluid/operators/benchmark/op_tester.cc | 32 ++++++----
 .../operators/benchmark/op_tester_config.cc   | 20 ++++--
 paddle/fluid/operators/jit/benchmark.cc       | 12 +++-
 paddle/fluid/operators/jit/gen/embseqpool.cc  | 30 +++++++--
 paddle/fluid/operators/jit/gen/matmul.cc      | 24 ++++++--
 paddle/fluid/operators/jit/gen/matmul.h       |  5 +-
 paddle/fluid/operators/jit/gen/seqpool.cc     | 10 ++-
 paddle/fluid/operators/jit/gen/seqpool.h      |  9 ++-
 paddle/fluid/operators/jit/gen/sgd.cc         | 21 ++++++-
 paddle/fluid/operators/jit/gen/vbroadcast.cc  |  6 +-
 paddle/fluid/operators/jit/gen_base.cc        | 11 +++-
 paddle/fluid/operators/jit/helper.cc          | 23 +++++--
 paddle/fluid/operators/jit/helper.h           | 25 +++++---
 paddle/fluid/operators/jit/more/mix/mix.cc    |  3 +-
 paddle/fluid/operators/jit/more/mkl/mkl.h     | 61 ++++++++++++++++---
 paddle/fluid/operators/jit/refer/refer.h      | 51 +++++++++++++---
 paddle/fluid/operators/jit/test.cc            | 11 +++-
 17 files changed, 278 insertions(+), 76 deletions(-)

diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc
index 5ec34e57450..654df5ccd5e 100644
--- a/paddle/fluid/operators/benchmark/op_tester.cc
+++ b/paddle/fluid/operators/benchmark/op_tester.cc
@@ -47,8 +47,8 @@ void OpTester::Init(const OpTesterConfig &config) {
     CreateInputVarDesc();
     CreateOutputVarDesc();
   } else {
-    PADDLE_THROW(platform::errors::NotFound("Operator '%s' is not registered.",
-                                            config_.op_type));
+    PADDLE_THROW(platform::errors::NotFound(
+        "Operator '%s' is not registered in OpTester.", config_.op_type));
   }
 
   if (config_.device_id >= 0) {
@@ -81,7 +81,8 @@ void OpTester::Run() {
       platform::EnableProfiler(platform::ProfilerState::kAll);
       platform::SetDeviceId(config_.device_id);
 #else
-      PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+      PADDLE_THROW(platform::errors::PermissionDenied(
+          "'CUDAPlace' is not supported in CPU only device."));
 #endif
     }
 
@@ -162,7 +163,8 @@ framework::proto::VarType::Type OpTester::TransToVarType(std::string str) {
   } else if (str == "fp64") {
     return framework::proto::VarType::FP64;
   } else {
-    PADDLE_THROW("Unsupported dtype %s.", str.c_str());
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported dtype %s in OpTester.", str.c_str()));
   }
 }
 
@@ -233,8 +235,8 @@ void OpTester::CreateOpDesc() {
       case framework::proto::AttrType::INTS:
       case framework::proto::AttrType::FLOATS:
       case framework::proto::AttrType::STRINGS:
-        PADDLE_THROW(
-            platform::errors::Unimplemented("Not supported STRINGS type yet."));
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupported STRINGS type in OpTester yet."));
         break;
       case framework::proto::AttrType::LONG: {
         int64_t value = StringTo<int64_t>(value_str);
@@ -242,7 +244,8 @@ void OpTester::CreateOpDesc() {
       } break;
       case framework::proto::AttrType::LONGS:
       default:
-        PADDLE_THROW("Unsupport attr type %d", type);
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupport attr type %d in OpTester.", type));
     }
   }
 }
@@ -299,7 +302,8 @@ void OpTester::SetupTensor(framework::LoDTensor *tensor,
     }
     is.close();
   } else {
-    PADDLE_THROW("Unsupported initializer %s.", initializer.c_str());
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported initializer %s in OpTester.", initializer.c_str()));
   }
 
   if (!platform::is_cpu_place(place_)) {
@@ -351,7 +355,8 @@ void OpTester::CreateVariables(framework::Scope *scope) {
                           static_cast<double>(1.0), item.second.initializer,
                           item.second.filename);
     } else {
-      PADDLE_THROW("Unsupported dtype %d.", data_type);
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Unsupported dtype %d in OpTester.", data_type));
     }
 
     VLOG(3) << "Set lod for tensor " << var_name;
@@ -473,7 +478,8 @@ std::string OpTester::DebugString() {
            << "\n";
       } break;
       default:
-        PADDLE_THROW("Unsupport attr type %d", attr_type);
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupport attr type %d in OpTester.", attr_type));
     }
     ss << GenSpaces(--count) << "}\n";
   }
@@ -484,8 +490,10 @@ std::string OpTester::DebugString() {
 TEST(op_tester, base) {
   if (!FLAGS_op_config_list.empty()) {
     std::ifstream fin(FLAGS_op_config_list, std::ios::in | std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
-                   FLAGS_op_config_list.c_str());
+    PADDLE_ENFORCE_EQ(
+        static_cast<bool>(fin), true,
+        platform::errors::InvalidArgument("OpTester cannot open file %s",
+                                          FLAGS_op_config_list.c_str()));
     std::vector<OpTesterConfig> op_configs;
     while (!fin.eof()) {
       VLOG(4) << "Reading config " << op_configs.size() << "...";
diff --git a/paddle/fluid/operators/benchmark/op_tester_config.cc b/paddle/fluid/operators/benchmark/op_tester_config.cc
index 818e5f64edc..e9477798858 100644
--- a/paddle/fluid/operators/benchmark/op_tester_config.cc
+++ b/paddle/fluid/operators/benchmark/op_tester_config.cc
@@ -78,7 +78,8 @@ void OpInputConfig::ParseDType(std::istream& is) {
   } else if (dtype_str == "fp64" || dtype_str == "double") {
     dtype = "fp64";
   } else {
-    PADDLE_THROW("Unsupported dtype %s", dtype_str.c_str());
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported dtype %s in OpInputConfig.", dtype_str.c_str()));
   }
   VLOG(4) << "dtype of input " << name << " is: " << dtype;
 }
@@ -91,7 +92,9 @@ void OpInputConfig::ParseInitializer(std::istream& is) {
   const std::vector<std::string> supported_initializers = {"random", "natural",
                                                            "zeros", "file"};
   if (!Has(supported_initializers, initializer_str)) {
-    PADDLE_THROW("Unsupported initializer %s", initializer_str.c_str());
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported initializer %s in OpInputConfig.",
+        initializer_str.c_str()));
   }
 
   initializer = initializer_str;
@@ -126,7 +129,12 @@ void OpInputConfig::ParseLoD(std::istream& is) {
     }
   }
   EraseEndSep(&lod_str);
-  PADDLE_ENFORCE_GE(lod_str.length(), 4U);
+  PADDLE_ENFORCE_GE(
+      lod_str.length(), 4U,
+      platform::errors::InvalidArgument(
+          "The length of lod string should be "
+          "equal to or larger than 4. But length of lod string is %zu.",
+          lod_str.length()));
   VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length();
 
   // Parse the lod_str
@@ -153,8 +161,10 @@ void OpInputConfig::ParseLoD(std::istream& is) {
 
 OpTesterConfig::OpTesterConfig(const std::string& filename) {
   std::ifstream fin(filename, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
-                 filename.c_str());
+  PADDLE_ENFORCE_EQ(
+      static_cast<bool>(fin), true,
+      platform::errors::InvalidArgument("OpTesterConfig cannot open file %s.",
+                                        filename.c_str()));
 
   Init(fin);
 }
diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc
index 898f27f9afe..d65cdc6c150 100644
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
@@ -136,7 +136,6 @@ void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) {
 }
 
 using Tensor = paddle::framework::Tensor;
-
 template <typename KernelTuple, typename PlaceType>
 void BenchKernelXYZN() {
   using T = typename KernelTuple::data_type;
@@ -320,8 +319,15 @@ void BenchKernelSgd() {
   const T lr = 0.1;
   auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
                                   const int64_t upper) -> std::vector<int64_t> {
-    PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1);
-    PADDLE_ENFORCE_GT(n, 0);
+    PADDLE_ENFORCE_LE(
+        static_cast<size_t>(upper - lower), n - 1,
+        paddle::platform::errors::InvalidArgument(
+            "The range of Sgd (upper - lower) should be equal to or lower "
+            "than n-1 (Sgd size -1). But upper - lower is %d and n-1 is %d.",
+            static_cast<size_t>(upper - lower), (n - 1)));
+    PADDLE_ENFORCE_GT(
+        n, 0, paddle::platform::errors::InvalidArgument(
+                  "The Sgd size should be larger than 0. But the n is %d.", n));
     std::vector<int64_t> all, out;
     for (int i = 0; i < n; ++i) {
       all.push_back(i);
diff --git a/paddle/fluid/operators/jit/gen/embseqpool.cc b/paddle/fluid/operators/jit/gen/embseqpool.cc
index b4e63d87eac..c549fec0970 100644
--- a/paddle/fluid/operators/jit/gen/embseqpool.cc
+++ b/paddle/fluid/operators/jit/gen/embseqpool.cc
@@ -132,11 +132,31 @@ class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
   }
   std::unique_ptr<GenBase> CreateJitCode(
       const emb_seq_pool_attr_t& attr) const override {
-    PADDLE_ENFORCE_GT(attr.table_height, 0);
-    PADDLE_ENFORCE_GT(attr.table_width, 0);
-    PADDLE_ENFORCE_GT(attr.index_height, 0);
-    PADDLE_ENFORCE_GT(attr.index_width, 0);
-    PADDLE_ENFORCE_GT(attr.out_width, 0);
+    PADDLE_ENFORCE_GT(attr.table_height, 0,
+                      platform::errors::InvalidArgument(
+                          "The attribute table_height of EmbSeqPool should "
+                          "be larger than 0. But it is %d.",
+                          attr.table_height));
+    PADDLE_ENFORCE_GT(attr.table_width, 0,
+                      platform::errors::InvalidArgument(
+                          "The attribute table_width of EmbSeqPool should "
+                          "be larger than 0. But it is %d.",
+                          attr.table_width));
+    PADDLE_ENFORCE_GT(attr.index_height, 0,
+                      platform::errors::InvalidArgument(
+                          "The attribute index_height of EmbSeqPool should "
+                          "be larger than 0. But it is %d.",
+                          attr.index_height));
+    PADDLE_ENFORCE_GT(attr.index_width, 0,
+                      platform::errors::InvalidArgument(
+                          "The attribute index_width of EmbSeqPool should "
+                          "be larger than 0. But it is %d.",
+                          attr.index_width));
+    PADDLE_ENFORCE_GT(attr.out_width, 0,
+                      platform::errors::InvalidArgument(
+                          "The attribute out_width of EmbSeqPool should be "
+                          "larger than 0. But it is %d.",
+                          attr.out_width));
     return make_unique<EmbSeqPoolJitCode>(attr, CodeSize(attr));
   }
 };
diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc
index 047d0d3e1ca..3139b252cad 100644
--- a/paddle/fluid/operators/jit/gen/matmul.cc
+++ b/paddle/fluid/operators/jit/gen/matmul.cc
@@ -29,7 +29,11 @@ void MatMulJitCode::genCode() {
   preCode();
   int block, rest;
   const auto groups = packed_groups(n_, k_, &block, &rest);
-  PADDLE_ENFORCE_GT(groups.front(), 0);
+  PADDLE_ENFORCE_GT(
+      groups.front(), 0,
+      platform::errors::InvalidArgument("The number of rest registers should "
+                                        "be larger than 0. But it is %d.",
+                                        groups.front()));
 
   const int block_len = sizeof(float) * block;
   const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1;
@@ -118,9 +122,21 @@ class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
   }
   std::unique_ptr<GenBase> CreateJitCode(
       const matmul_attr_t& attr) const override {
-    PADDLE_ENFORCE_GT(attr.m, 0);
-    PADDLE_ENFORCE_GT(attr.n, 0);
-    PADDLE_ENFORCE_GT(attr.k, 0);
+    PADDLE_ENFORCE_GT(
+        attr.m, 0, platform::errors::InvalidArgument(
+                       "The attribute m (first matrix's row) of MatMul should "
+                       "be larger than 0. But it is %d.",
+                       attr.m));
+    PADDLE_ENFORCE_GT(
+        attr.n, 0, platform::errors::InvalidArgument(
+                       "The attribute n (first matrix's col) of MatMul should "
+                       "be larger than 0. But it is %d.",
+                       attr.n));
+    PADDLE_ENFORCE_GT(
+        attr.k, 0, platform::errors::InvalidArgument(
+                       "The attribute k (second matrix's col) of MatMul should "
+                       "be larger than 0. But it is %d.",
+                       attr.k));
     return make_unique<MatMulJitCode>(attr, CodeSize(attr));
   }
 };
diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h
index 4f04f7606d2..eb7328d7e06 100644
--- a/paddle/fluid/operators/jit/gen/matmul.h
+++ b/paddle/fluid/operators/jit/gen/matmul.h
@@ -33,7 +33,10 @@ class MatMulJitCode : public JitCode {
                          size_t code_size = 256 * 1024,
                          void* code_ptr = nullptr)
       : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) {
-    PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet");
+    PADDLE_ENFORCE_EQ(m_, 1, platform::errors::Unimplemented(
+                                 "Jitcode of matmul only support m==1 (first "
+                                 "matrix's row) now. But m is %d.",
+                                 m_));
     this->genCode();
   }
 
diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc
index ec8e4e98274..d8c7b3cdb7b 100644
--- a/paddle/fluid/operators/jit/gen/seqpool.cc
+++ b/paddle/fluid/operators/jit/gen/seqpool.cc
@@ -70,8 +70,14 @@ class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
   }
   std::unique_ptr<GenBase> CreateJitCode(
       const seq_pool_attr_t& attr) const override {
-    PADDLE_ENFORCE_GT(attr.w, 0);
-    PADDLE_ENFORCE_GT(attr.h, 0);
+    PADDLE_ENFORCE_GT(attr.w, 0, platform::errors::InvalidArgument(
+                                     "The attribute width of SeqPool should "
+                                     "be larger than 0. But it is %d.",
+                                     attr.w));
+    PADDLE_ENFORCE_GT(attr.h, 0, platform::errors::InvalidArgument(
+                                     "The attribute height of SeqPool should "
+                                     "be larger than 0. But it is %d.",
+                                     attr.h));
     return make_unique<SeqPoolJitCode>(attr, CodeSize(attr));
   }
 };
diff --git a/paddle/fluid/operators/jit/gen/seqpool.h b/paddle/fluid/operators/jit/gen/seqpool.h
index cb562c4c9a6..d4e7b2e29ce 100644
--- a/paddle/fluid/operators/jit/gen/seqpool.h
+++ b/paddle/fluid/operators/jit/gen/seqpool.h
@@ -127,8 +127,13 @@ class SeqPoolJitCode : public JitCode {
         vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
         reg_idx++;
       }
-      PADDLE_ENFORCE_EQ(reg_idx, rest_used_num_regs,
-                        "All heights should use same regs");
+      PADDLE_ENFORCE_EQ(
+          reg_idx, rest_used_num_regs,
+          platform::errors::InvalidArgument(
+              "All heights of SeqPool should use the same number of registers."
+              "It equals to the numbr of rest registers. But use %d registers "
+              "and the numbr of rest registers is %d.",
+              reg_idx, rest_used_num_regs));
       for (int i = 0; i < reg_idx; ++i) {
         vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs));
       }
diff --git a/paddle/fluid/operators/jit/gen/sgd.cc b/paddle/fluid/operators/jit/gen/sgd.cc
index 1452d4139b0..7fe93fdb6a5 100644
--- a/paddle/fluid/operators/jit/gen/sgd.cc
+++ b/paddle/fluid/operators/jit/gen/sgd.cc
@@ -116,9 +116,24 @@ class SgdCreator : public JitCodeCreator<sgd_attr_t> {
   size_t CodeSize(const sgd_attr_t& attr) const override { return 96 + 32 * 8; }
   std::unique_ptr<GenBase> CreateJitCode(
       const sgd_attr_t& attr) const override {
-    PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width);
-    PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height);
-    PADDLE_ENFORCE_GE(attr.selected_rows_size, 0);
+    PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width,
+                      platform::errors::InvalidArgument(
+                          "The attribute param_width of Sgd should be "
+                          "equal to the attribute grad_width. But param_width "
+                          "is %d and grad_width is %d.",
+                          attr.param_width, attr.grad_width));
+    PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height,
+                      platform::errors::InvalidArgument(
+                          "The attribute selected_rows_size of Sgd should be "
+                          "equal to or less than the attribute grad_height. "
+                          "But selected_rows_size is %d and grad_height is %d.",
+                          attr.selected_rows_size, attr.grad_height));
+    PADDLE_ENFORCE_GE(
+        attr.selected_rows_size, 0,
+        platform::errors::InvalidArgument(
+            "The attribute selected_rows_size of Sgd should be "
+            "equal to or larger than 0. But selected_rows_size is %d.",
+            attr.selected_rows_size));
     return make_unique<SgdJitCode>(attr, CodeSize(attr));
   }
 };
diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.cc b/paddle/fluid/operators/jit/gen/vbroadcast.cc
index 66a8d75fd4d..4084d68c2a8 100644
--- a/paddle/fluid/operators/jit/gen/vbroadcast.cc
+++ b/paddle/fluid/operators/jit/gen/vbroadcast.cc
@@ -76,7 +76,11 @@ class VBroadcastCreator : public JitCodeCreator<int64_t> {
     return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8;
   }
   std::unique_ptr<GenBase> CreateJitCode(const int64_t& w) const override {
-    PADDLE_ENFORCE_GT(w, 0);
+    PADDLE_ENFORCE_GT(
+        w, 0,
+        platform::errors::InvalidArgument(
+            "The width of VBroadcast should be larger than 0. But w is %d.",
+            w));
     return make_unique<VBroadcastJitCode>(w, CodeSize(w));
   }
 };
diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc
index 4c49eff49e3..2ae71256cdd 100644
--- a/paddle/fluid/operators/jit/gen_base.cc
+++ b/paddle/fluid/operators/jit/gen_base.cc
@@ -49,9 +49,14 @@ void GenBase::dumpCode(const unsigned char* code) const {
 void* GenBase::operator new(size_t size) {
   void* ptr;
   constexpr size_t alignment = 32ul;
-  PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), 0,
-                    "GenBase Alloc %ld error!", size);
-  PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size);
+  PADDLE_ENFORCE_EQ(
+      posix_memalign(&ptr, alignment, size), 0,
+      platform::errors::InvalidArgument(
+          "Jitcode generator (GenBase) allocate %ld memory error!", size));
+  PADDLE_ENFORCE_NOT_NULL(ptr, platform::errors::InvalidArgument(
+                                   "Fail to allocate jitcode generator "
+                                   "(GenBase) CPU memory: size = %d .",
+                                   size));
   return ptr;
 }
 
diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc
index 2952cdb8714..c66e8092d5e 100644
--- a/paddle/fluid/operators/jit/helper.cc
+++ b/paddle/fluid/operators/jit/helper.cc
@@ -66,7 +66,8 @@ const char* to_string(KernelType kt) {
     ONE_CASE(kEmbSeqPool);
     ONE_CASE(kSgd);
     default:
-      PADDLE_THROW("Not support type: %d, or forget to add it.", kt);
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "JIT kernel do not support type: %d.", kt));
       return "NOT JITKernel";
   }
   return nullptr;
@@ -79,7 +80,8 @@ const char* to_string(SeqPoolType tp) {
     ONE_CASE(kAvg);
     ONE_CASE(kSqrt);
     default:
-      PADDLE_THROW("Not support type: %d, or forget to add it.", tp);
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "SeqPool JIT kernel do not support type: %d.", tp));
       return "NOT PoolType";
   }
   return nullptr;
@@ -100,7 +102,8 @@ KernelType to_kerneltype(const std::string& act) {
   } else if (lower == "tanh" || lower == "vtanh") {
     return kVTanh;
   }
-  PADDLE_THROW("Not support type: %s, or forget to add this case", act);
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "Act JIT kernel do not support type: %s.", act));
   return kNone;
 }
 
@@ -109,12 +112,19 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) {
   int block, rest;
   const auto groups = packed_groups(n, k, &block, &rest);
   std::for_each(groups.begin(), groups.end(), [&](int i) {
-    PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0.");
+    PADDLE_ENFORCE_GT(i, 0, platform::errors::InvalidArgument(
+                                "Each element of groups should be larger than "
+                                "0. However the element: %d doesn't satify.",
+                                i));
   });
   int sum = std::accumulate(groups.begin(), groups.end(), 0);
   std::memset(dst, 0, k * sum * block * sizeof(float));
   PADDLE_ENFORCE_GE(sum * block, n,
-                    "The packed n should be equal to or larger than n");
+                    platform::errors::InvalidArgument(
+                        "The packed n (sum * block) should be equal to or "
+                        "larger than n (matmul row size). "
+                        "However, the packed n is %d and n is %d.",
+                        sum * block, n));
 
   const int block_len = sizeof(float) * block;
   int n_offset = 0;
@@ -136,7 +146,8 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) {
 template <typename T>
 typename std::enable_if<!std::is_same<T, float>::value>::type pack_weights(
     const T* src, T* dst, int n, int k) {
-  PADDLE_THROW("Only support pack with float type.");
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "Only supports pack weights with float type."));
 }
 
 }  // namespace jit
diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h
index b6dd49b7772..0791bb58105 100644
--- a/paddle/fluid/operators/jit/helper.h
+++ b/paddle/fluid/operators/jit/helper.h
@@ -85,8 +85,10 @@ inline const Kernel* GetReferKernel() {
   auto& ref_pool = ReferKernelPool::Instance().AllKernels();
   KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace());
   auto ref_iter = ref_pool.find(kkey);
-  PADDLE_ENFORCE(ref_iter != ref_pool.end(),
-                 "Every Kernel should have reference function.");
+  PADDLE_ENFORCE_NE(
+      ref_iter, ref_pool.end(),
+      platform::errors::PreconditionNotMet(
+          "Every Refer Kernel of jitcode should have reference function."));
   auto& ref_impls = ref_iter->second;
   for (auto& impl : ref_impls) {
     auto i = dynamic_cast<const ReferKernel<KernelTuple>*>(impl.get());
@@ -101,7 +103,9 @@ template <typename KernelTuple>
 inline typename KernelTuple::func_type GetReferFunc() {
   auto ker = GetReferKernel<KernelTuple>();
   auto p = dynamic_cast<const ReferKernel<KernelTuple>*>(ker);
-  PADDLE_ENFORCE(p, "The Refer kernel should exsit");
+  PADDLE_ENFORCE_NOT_NULL(p, platform::errors::InvalidArgument(
+                                 "Get the reference code of kernel in CPU "
+                                 "failed. The Refer kernel should exsit."));
   return p->GetFunc();
 }
 
@@ -132,7 +136,9 @@ std::vector<const Kernel*> GetAllCandidateKernels(
 
   // The last implementation should be reference function on CPUPlace.
   auto ref = GetReferKernel<KernelTuple>();
-  PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty.");
+  PADDLE_ENFORCE_NOT_NULL(ref, platform::errors::InvalidArgument(
+                                   "Get all candicate kernel in CPU failed. "
+                                   "The Refer Kernel can not be empty."));
   res.emplace_back(ref);
   return res;
 }
@@ -147,11 +153,14 @@ GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) {
     std::string name = k->ImplType();
     if (name == "JitCode") {
       auto i = dynamic_cast<const GenBase*>(k);
-      PADDLE_ENFORCE(i, "jitcode kernel cast can not fail.");
+      PADDLE_ENFORCE_NOT_NULL(i,
+                              platform::errors::InvalidArgument(
+                                  "Generate jitcode kernel (GenBase) failed."));
       res.emplace_back(std::make_pair(name, i->template getCode<Func>()));
     } else {
       auto i = dynamic_cast<const KernelMore<KernelTuple>*>(k);
-      PADDLE_ENFORCE(i, "kernel cast can not fail.");
+      PADDLE_ENFORCE_NOT_NULL(i, platform::errors::InvalidArgument(
+                                     "Kernel cast (KernelMore) failed."));
       res.emplace_back(std::make_pair(name, i->GetFunc()));
     }
   }
@@ -173,7 +182,9 @@ template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
 typename KernelTuple::func_type GetDefaultBestFunc(
     const typename KernelTuple::attr_type& attr) {
   auto funcs = GetAllCandidateFuncs<KernelTuple, PlaceType>(attr);
-  PADDLE_ENFORCE_GE(funcs.size(), 1UL);
+  PADDLE_ENFORCE_GE(funcs.size(), 1UL,
+                    platform::errors::InvalidArgument(
+                        "The candicate jit kernel is at least one in CPU."));
   // Here could do some runtime benchmark of this attr and return the best one.
   // But yet just get the first one as the default best one,
   // which is searched in order and tuned by offline.
diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc
index f5b7bfff898..5d63f4848e6 100644
--- a/paddle/fluid/operators/jit/more/mix/mix.cc
+++ b/paddle/fluid/operators/jit/more/mix/mix.cc
@@ -95,7 +95,8 @@ void (*getActFunc(KernelType type, int d))(const T*, T*, int) {  // NOLINT
   } else if (type == kVIdentity) {
     return KernelFuncs<VIdentityTuple<T>, CPUPlace>::Cache().At(d);
   }
-  PADDLE_THROW("Not support type: %s", type);
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "Act JIT kernel do not support type: %s", type));
   return nullptr;
 }
 
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h
index ee31c8df2f8..5f3c29ad5ef 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.h
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.h
@@ -103,11 +103,24 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
 template <typename T>
 void EmbSeqPool(const T* table, const int64_t* idx, T* out,
                 const emb_seq_pool_attr_t* attr) {
-  PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width);
+  PADDLE_ENFORCE_EQ(
+      attr->table_width * attr->index_width, attr->out_width,
+      platform::errors::InvalidArgument(
+          "The attribute table_width * index_width of EmbSeqPool should "
+          "be equal to out_width. But table_width * index_width is %d, "
+          "out_width is %d.",
+          attr->table_width * attr->index_width, attr->out_width));
   auto check_idx_value_valid = [&](int64_t i) {
-    PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d",
-                      idx[i], i);
-    PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i);
+    PADDLE_ENFORCE_LT(
+        idx[i], attr->table_height,
+        platform::errors::InvalidArgument(
+            "The idx shoud be lower than the attribute table_height of "
+            "EmbSeqPool. But %dth of idx is %d and table_height is %d.",
+            i, idx[i], attr->table_height));
+    PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument(
+                                     "The idx shoud be equal to or larger than "
+                                     "the 0. But %dth of idx is %d.",
+                                     i, idx[i]));
   };
 
   for (int64_t w = 0; w != attr->index_width; ++w) {
@@ -168,22 +181,50 @@ void Softmax(const T* x, T* y, int n, int bs, int remain = 1) {
 template <typename T>
 void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
          T* out, const sgd_attr_t* attr) {
-  PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width);
-  PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height);
+  PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width,
+                    platform::errors::InvalidArgument(
+                        "The attribute param_width of Sgd should be "
+                        "equal to the attribute grad_width. But param_width "
+                        "is %d and grad_width is %d.",
+                        attr->param_width, attr->grad_width));
+  PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height,
+                    platform::errors::InvalidArgument(
+                        "The attribute selected_rows_size of Sgd should be "
+                        "equal to or less than the attribute grad_height. "
+                        "But selected_rows_size is %d and grad_height is %d.",
+                        attr->selected_rows_size, attr->grad_height));
   T scalar = -lr[0];
   int width = attr->grad_width;
   if (out == param) {
     for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
       auto h_idx = rows[i];
-      PADDLE_ENFORCE_LT(h_idx, attr->param_height);
-      PADDLE_ENFORCE_GE(h_idx, 0);
+      PADDLE_ENFORCE_LT(h_idx, attr->param_height,
+                        platform::errors::InvalidArgument(
+                            "The rows of Sgd should be "
+                            "less than the attribute. But %dth of rows "
+                            "is %d and grad_width is %d.",
+                            i, h_idx, attr->param_height));
+      PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument(
+                                      "The rows of Sgd should be "
+                                      "larger than 0. But %dth of rows "
+                                      "is %d.",
+                                      i, h_idx));
       VAXPY(scalar, grad + i * width, out + h_idx * width, width);
     }
   } else {
     for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
       auto h_idx = rows[i];
-      PADDLE_ENFORCE_LT(h_idx, attr->param_height);
-      PADDLE_ENFORCE_GE(h_idx, 0);
+      PADDLE_ENFORCE_LT(h_idx, attr->param_height,
+                        platform::errors::InvalidArgument(
+                            "The rows of Sgd should be "
+                            "less than the attribute. But %dth of rows "
+                            "is %d and grad_width is %d.",
+                            i, h_idx, attr->param_height));
+      PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument(
+                                      "The rows of Sgd should be "
+                                      "larger than 0. But %dth of rows "
+                                      "is %d.",
+                                      i, h_idx));
       VScal(&scalar, grad + i * width, out + h_idx * width, width);
       VAdd(param + h_idx * width, out + h_idx * width, out + h_idx * width,
            width);
diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h
index b8d5e2c2407..42fb7b4f279 100644
--- a/paddle/fluid/operators/jit/refer/refer.h
+++ b/paddle/fluid/operators/jit/refer/refer.h
@@ -147,7 +147,8 @@ void (*getActFunc(KernelType type))(const T*, T*, int) {  // NOLINT
   } else if (type == kVIdentity) {
     return VIdentity<T>;
   }
-  PADDLE_THROW("Not support type: %s", type);
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "Act JIT kernel do not support type: %s.", type));
   return nullptr;
 }
 
@@ -465,12 +466,25 @@ void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) {
 template <typename T>
 void EmbSeqPool(const T* table, const int64_t* idx, T* out,
                 const emb_seq_pool_attr_t* attr) {
-  PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width);
+  PADDLE_ENFORCE_EQ(
+      attr->table_width * attr->index_width, attr->out_width,
+      platform::errors::InvalidArgument(
+          "The attribute table_width * index_width of EmbSeqPool should "
+          "be equal to out_width. But table_width * index_width is %d and "
+          "out_width is %d.",
+          attr->table_width * attr->index_width, attr->out_width));
 
   auto check_idx_value_valid = [&](int64_t i) {
-    PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d",
-                      idx[i], i);
-    PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i);
+    PADDLE_ENFORCE_LT(
+        idx[i], attr->table_height,
+        platform::errors::InvalidArgument(
+            "The idx shoud be lower than the attribute table_height of "
+            "EmbSeqPool. But %dth of idx is %d and table_height is %d.",
+            i, idx[i], attr->table_height));
+    PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument(
+                                     "The idx shoud be equal to or larger than "
+                                     "the 0. But %dth of idx is %d.",
+                                     i, idx[i]));
   };
 
   for (int64_t w = 0; w != attr->index_width; ++w) {
@@ -505,12 +519,31 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out,
 template <typename T>
 void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
          T* out, const sgd_attr_t* attr) {
-  PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width);
-  PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height);
+  PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width,
+                    platform::errors::InvalidArgument(
+                        "The attribute param_width of Sgd should be "
+                        "equal to the attribute grad_width. But param_width "
+                        "is %d and grad_width is %d.",
+                        attr->param_width, attr->grad_width));
+  PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height,
+                    platform::errors::InvalidArgument(
+                        "The attribute selected_rows_size of Sgd should be "
+                        "equal to or less than the attribute grad_height. "
+                        "But selected_rows_size is %d and grad_height is %d.",
+                        attr->selected_rows_size, attr->grad_height));
   for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
     auto h_idx = rows[i];
-    PADDLE_ENFORCE_LT(h_idx, attr->param_height);
-    PADDLE_ENFORCE_GE(h_idx, 0);
+    PADDLE_ENFORCE_LT(h_idx, attr->param_height,
+                      platform::errors::InvalidArgument(
+                          "The rows of Sgd should be "
+                          "less than the attribute. But %dth of rows "
+                          "is %d and grad_width is %d.",
+                          i, h_idx, attr->param_height));
+    PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument(
+                                    "The rows of Sgd should be "
+                                    "larger than 0. But %dth of rows "
+                                    "is %d.",
+                                    i, h_idx));
     for (int64_t j = 0; j < attr->grad_width; ++j) {
       out[h_idx * attr->grad_width + j] =
           param[h_idx * attr->grad_width + j] -
diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc
index eb56f111f08..0cc62720b87 100644
--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
@@ -850,8 +850,15 @@ void TestKernelSgd() {
   const T lr = 0.1;
   auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
                                   const int64_t upper) -> std::vector<int64_t> {
-    PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1);
-    PADDLE_ENFORCE_GT(n, 0);
+    PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1,
+                      paddle::platform::errors::InvalidArgument(
+                          "The range of Sgd (upper - lower) should be lower "
+                          "than n-1 (Sgd size -1). But the upper - lower is %d "
+                          "and n-1 is %d.",
+                          static_cast<size_t>(upper - lower), n - 1));
+    PADDLE_ENFORCE_GT(
+        n, 0, paddle::platform::errors::InvalidArgument(
+                  "The Sgd size should be larger than 0. But the n is %d.", n));
     std::vector<int64_t> all, out;
     for (int i = 0; i < n; ++i) {
       all.push_back(i);
-- 
GitLab


From fab4e6d08f55ccbd07749d5df8eaf4d0d760e1b4 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Fri, 25 Sep 2020 14:49:48 +0800
Subject: [PATCH 229/261] add abs support double grad

add abs support double grad for the api 2.0
---
 paddle/fluid/operators/activation_op.cc       | 42 +++++++++++++++++--
 paddle/fluid/operators/activation_op.cu       | 14 ++++++-
 paddle/fluid/operators/activation_op.h        | 21 ++++++++++
 .../unittests/test_activation_nn_grad.py      | 24 +++++++++++
 4 files changed, 97 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 5a3660cee85..95214484dca 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -763,10 +763,28 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel {
   }
 };
 
-//
+// AbsGrad: dx=dy if x >=0 else -dy
+// AbsDoubleGrad: ddy = ddx if x >=0 else -ddx
+template <typename T>
+class AbsDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
+ public:
+  using ::paddle::framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("abs_grad_grad");
+    // input1: x
+    op->SetInput("X", this->Input("X"));
+    // input2: ddx
+    op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
+    op->SetAttrMap(this->Attrs());
+    // output: ddy
+    op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out")));
+  }
+};
+
 // ReluGrad: dx = dy if y >= 0 else 0
 // ReluGradGrad: ddy = ddx if y >= 0 else 0
-//
 template <typename T>
 class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
  public:
@@ -1214,7 +1232,13 @@ REGISTER_OPERATOR(
     std::conditional<ops::CanInplaceAct<ops::AbsGradFunctor<float>>(),
                      ops::ActFwdInplaceInferer, void>::type);
 REGISTER_OPERATOR(abs_grad, ops::ActivationOpGrad,
-                  ops::ActivationGradOpInplaceInferer);
+                  ops::ActivationGradOpInplaceInferer,
+                  ops::AbsDoubleGradMaker<paddle::framework::OpDesc>,
+                  ops::AbsDoubleGradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(
+    abs_grad_grad,
+    ops::ActivationOpDoubleGrad<ops::AbsGradGradFunctor<float>::FwdDeps()>,
+    ops::ActivationDoubleGradOpInplaceInferer);
 
 REGISTER_OP_CPU_KERNEL(abs,
                        ops::ActivationKernel<paddle::platform::CPUDeviceContext,
@@ -1234,6 +1258,18 @@ REGISTER_OP_CPU_KERNEL(
                               ops::AbsGradFunctor<int>>,
     ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
                               ops::AbsGradFunctor<int64_t>>);
+REGISTER_OP_CPU_KERNEL(
+    abs_grad_grad,
+    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
+                                    ops::AbsGradGradFunctor<float>>,
+    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
+                                    ops::AbsGradGradFunctor<double>>,
+    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
+                                    ops::AbsGradGradFunctor<plat::float16>>,
+    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
+                                    ops::AbsGradGradFunctor<int>>,
+    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
+                                    ops::AbsGradGradFunctor<int64_t>>);
 /* ========================================================================== */
 
 /* ==========================  register checkpoint ===========================*/
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 48ec90471f0..072d952d261 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -160,7 +160,7 @@ REGISTER_OP_CUDA_KERNEL(
                               ops::ExpGradFunctor<plat::float16>>);
 /* ========================================================================== */
 
-/* ==========================   exp register  ============================ */
+/* ==========================   abs register  ============================ */
 
 REGISTER_OP_CUDA_KERNEL(
     abs, ops::ActivationKernel<plat::CUDADeviceContext, ops::AbsFunctor<float>>,
@@ -180,4 +180,16 @@ REGISTER_OP_CUDA_KERNEL(
                               ops::AbsGradFunctor<int64_t>>,
     ops::ActivationGradKernel<plat::CUDADeviceContext,
                               ops::AbsGradFunctor<plat::float16>>);
+REGISTER_OP_CUDA_KERNEL(
+    abs_grad_grad,
+    ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
+                                    ops::AbsGradGradFunctor<float>>,
+    ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
+                                    ops::AbsGradGradFunctor<double>>,
+    ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
+                                    ops::AbsGradGradFunctor<plat::float16>>,
+    ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
+                                    ops::AbsGradGradFunctor<int>>,
+    ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
+                                    ops::AbsGradGradFunctor<int64_t>>);
 /* ========================================================================== */
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 00a7c063c91..646f546bffb 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -1430,6 +1430,27 @@ class ActivationDoubleGradKernel
   }
 };
 
+template <typename T>
+struct AbsGradGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device>
+  void operator()(const Device& dev, const framework::Tensor* X,
+                  const framework::Tensor* Out, const framework::Tensor* ddX,
+                  framework::Tensor* ddOut, framework::Tensor* dOut,
+                  framework::Tensor* dX) const {
+    auto* d = dev.eigen_device();
+    auto ddx = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "AbsGradGrad"));
+    auto x = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(X, "Input", "X", "AbsGradGrad"));
+    if (ddOut) {
+      auto ddout = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "AbsGradGrad"));
+      ddout.device(*d) = ddx * x.sign();
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+
 template <typename T>
 struct ReluGradGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device>
diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
index db9e8d2c6bd..e8b8a45fb67 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
@@ -147,5 +147,29 @@ class TestSquareDoubleGradCheck(unittest.TestCase):
             self.func(p)
 
 
+class TestAbsDoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        # the shape of input variable should be clearly specified, not inlcude -1.
+        shape = [2, 3, 7, 9]
+        eps = 1e-6
+        dtype = np.float64
+
+        x = layers.data('x', shape, False, dtype)
+        x.persistable = True
+        y = layers.abs(x)
+        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
+
+        gradient_checker.double_grad_check(
+            [x], y, x_init=x_arr, place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab


From 6fc74bbaf614cf8501a812b7044191df8f21117d Mon Sep 17 00:00:00 2001
From: ShenLiang <shenliang03@baidu.com>
Date: Fri, 25 Sep 2020 15:18:35 +0800
Subject: [PATCH 230/261] add fp16 for matmul (#27523)

* add fp16 for matmul
---
 paddle/fluid/operators/math/blas_impl.cu.h    | 29 ++++++
 paddle/fluid/operators/matmul_v2_op.cu        | 10 +-
 paddle/fluid/operators/matmul_v2_op.h         | 55 ++++++-----
 .../tests/unittests/test_matmul_v2_op.py      | 99 ++++++++++++++-----
 python/paddle/tensor/linalg.py                |  4 +-
 5 files changed, 142 insertions(+), 55 deletions(-)

diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h
index a0464cf70e2..aeafe22235c 100644
--- a/paddle/fluid/operators/math/blas_impl.cu.h
+++ b/paddle/fluid/operators/math/blas_impl.cu.h
@@ -420,6 +420,22 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
   });
 }
 
+template <>
+template <>
+inline void Blas<platform::CUDADeviceContext>::GEMV(
+    bool trans_a, int M, int N, platform::float16 alpha,
+    const platform::float16 *A, const platform::float16 *B,
+    platform::float16 beta, platform::float16 *C) const {
+  // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
+  if (trans_a) {
+    this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, 1, N, M,
+                                           alpha, B, A, beta, C);
+  } else {
+    this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, M, 1, N,
+                                           alpha, A, B, beta, C);
+  }
+}
+
 template <>
 template <typename T>
 void Blas<platform::CUDADeviceContext>::BatchedGEMM(
@@ -479,6 +495,19 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
   }
 }
 
+template <>
+template <>
+inline void Blas<platform::CUDADeviceContext>::BatchedGEMM(
+    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
+    platform::float16 alpha, const platform::float16 **A,
+    const platform::float16 **B, platform::float16 beta, platform::float16 **C,
+    int batchCount) const {
+  for (int k = 0; k < batchCount; ++k) {
+    this->template GEMM<platform::float16>(transA, transB, M, N, K, alpha, A[k],
+                                           B[k], beta, C[k]);
+  }
+}
+
 template <>
 template <typename T>
 void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
diff --git a/paddle/fluid/operators/matmul_v2_op.cu b/paddle/fluid/operators/matmul_v2_op.cu
index 64ec65a2341..91958513ddb 100644
--- a/paddle/fluid/operators/matmul_v2_op.cu
+++ b/paddle/fluid/operators/matmul_v2_op.cu
@@ -17,10 +17,12 @@ limitations under the License. */
 namespace ops = paddle::operators;
 namespace plf = paddle::platform;
 
-REGISTER_OP_CUDA_KERNEL(matmul_v2,
-                        ops::MatMulV2Kernel<plf::CUDADeviceContext, float>,
-                        ops::MatMulV2Kernel<plf::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    matmul_v2, ops::MatMulV2Kernel<plf::CUDADeviceContext, float>,
+    ops::MatMulV2Kernel<plf::CUDADeviceContext, double>,
+    ops::MatMulV2Kernel<plf::CUDADeviceContext, plf::float16>);
 
 REGISTER_OP_CUDA_KERNEL(
     matmul_v2_grad, ops::MatMulV2GradKernel<plf::CUDADeviceContext, float>,
-    ops::MatMulV2GradKernel<plf::CUDADeviceContext, double>);
+    ops::MatMulV2GradKernel<plf::CUDADeviceContext, double>,
+    ops::MatMulV2GradKernel<plf::CUDADeviceContext, plf::float16>);
diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h
index 8cd4fa12be4..ee485bd1711 100644
--- a/paddle/fluid/operators/matmul_v2_op.h
+++ b/paddle/fluid/operators/matmul_v2_op.h
@@ -163,17 +163,20 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
     if (trans_y) {
       const int M = Y->numel() / N;
       VLOG(3) << "MatMul's case 2";
-      blas.GEMV(false, M, N, 1., y_data, x_data, 0., Out->data<T>());
+      blas.GEMV(false, M, N, static_cast<T>(1), y_data, x_data,
+                static_cast<T>(0), Out->data<T>());
     } else {
       const int M = y_dims[y_ndim - 1];
       const int batch_size = Y->numel() / (M * N);
       if (batch_size == 1) {
         VLOG(3) << "MatMul's case 3";
-        blas.GEMV(true, N, M, 1., y_data, x_data, 0., Out->data<T>());
+        blas.GEMV(true, N, M, static_cast<T>(1), y_data, x_data,
+                  static_cast<T>(0), Out->data<T>());
       } else {
         VLOG(3) << "MatMul's case 4";
-        blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, y_data,
-                         x_data, 0, Out->data<T>(), batch_size, M * N, 0);
+        blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, static_cast<T>(1),
+                         y_data, x_data, static_cast<T>(0), Out->data<T>(),
+                         batch_size, M * N, 0);
       }
     }
     return;
@@ -205,16 +208,19 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
       const int batch_size = X->numel() / (M * N);
       if (batch_size == 1) {
         VLOG(3) << "MatMul's case 5";
-        blas.GEMV(true, N, M, 1.0f, x_data, y_data, 0.0f, Out->data<T>());
+        blas.GEMV(true, N, M, static_cast<T>(1), x_data, y_data,
+                  static_cast<T>(0), Out->data<T>());
       } else {
         VLOG(3) << "MatMul's case 6";
-        blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, x_data,
-                         y_data, 0, Out->data<T>(), batch_size, M * N, 0);
+        blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, static_cast<T>(1),
+                         x_data, y_data, static_cast<T>(0), Out->data<T>(),
+                         batch_size, M * N, 0);
       }
     } else {
       const int M = X->numel() / N;
       VLOG(3) << "MatMul's case 7";
-      blas.GEMV(false, M, N, 1.0f, x_data, y_data, 0.0f, Out->data<T>());
+      blas.GEMV(false, M, N, static_cast<T>(1), x_data, y_data,
+                static_cast<T>(0), Out->data<T>());
     }
     return;
   }
@@ -263,37 +269,38 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
   if (x_batch_size == 1 && y_batch_size == 1) {
     VLOG(3) << "MatMul's case 8";
     blas.GEMM(trans_x ? CblasTrans : CblasNoTrans,
-              trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data,
-              y_data, 0.0f, Out->data<T>());
+              trans_y ? CblasTrans : CblasNoTrans, M, N, K, static_cast<T>(1),
+              x_data, y_data, static_cast<T>(0), Out->data<T>());
   } else if (x_batch_size == 1) {
     if (M == 1 && trans_y) {
       VLOG(3) << "MatMul's case 9";
-      blas.GEMV(false, y_batch_size * N, K, 1.0f, y_data, x_data, 0.0f,
-                Out->data<T>());
+      blas.GEMV(false, y_batch_size * N, K, static_cast<T>(1), y_data, x_data,
+                static_cast<T>(0), Out->data<T>());
     } else {
       VLOG(3) << "MatMul's case 10";
       blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
-                       trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f,
-                       x_data, y_data, 0, Out->data<T>(), out_batch_size, 0,
-                       K * N);
+                       trans_y ? CblasTrans : CblasNoTrans, M, N, K,
+                       static_cast<T>(1), x_data, y_data, static_cast<T>(0),
+                       Out->data<T>(), out_batch_size, 0, K * N);
     }
   } else if (y_batch_size == 1) {
     if (!trans_x) {
       VLOG(3) << "MatMul's case 11";
       blas.GEMM(CblasNoTrans, trans_y ? CblasTrans : CblasNoTrans,
-                x_batch_size * M, N, K, 1.0f, x_data, y_data, 0.0f,
-                Out->data<T>());
+                x_batch_size * M, N, K, static_cast<T>(1), x_data, y_data,
+                static_cast<T>(0), Out->data<T>());
     } else {
       VLOG(3) << "MatMul's case 12";
       blas.BatchedGEMM(CblasTrans, trans_y ? CblasTrans : CblasNoTrans, M, N, K,
-                       1.0f, x_data, y_data, 0, Out->data<T>(), out_batch_size,
-                       M * K, 0);
+                       static_cast<T>(1), x_data, y_data, static_cast<T>(0),
+                       Out->data<T>(), out_batch_size, M * K, 0);
     }
   } else if (!is_broadcast_dims) {
     VLOG(3) << "MatMul's case 13";
     blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
-                     trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data,
-                     y_data, 0, Out->data<T>(), out_batch_size, M * K, K * N);
+                     trans_y ? CblasTrans : CblasNoTrans, M, N, K,
+                     static_cast<T>(1), x_data, y_data, static_cast<T>(0),
+                     Out->data<T>(), out_batch_size, M * K, K * N);
   } else {
     // in the case, can't use stridedgemm
     std::vector<const T*> x_ptr(out_batch_size);
@@ -314,9 +321,9 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
     }
     VLOG(3) << "MatMul's case 14";
     blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
-                     trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f,
-                     x_ptr.data(), y_ptr.data(), 0.0f, out_ptr.data(),
-                     out_batch_size);
+                     trans_y ? CblasTrans : CblasNoTrans, M, N, K,
+                     static_cast<T>(1), x_ptr.data(), y_ptr.data(),
+                     static_cast<T>(0), out_ptr.data(), out_batch_size);
   }
 }
 
diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
index 884139a23d5..640771df23b 100644
--- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
@@ -65,15 +65,21 @@ class TestMatMulV2Op(OpTest):
         self.y_shape = (100, )
         self.trans_x = False
         self.trans_y = False
+
+    def init_kernel_type(self):
         self.dtype = "float64"
 
     def setUp(self):
+        self.init_kernel_type()
         self.config()
         self.op_type = "matmul_v2"
         x = np.random.random(self.x_shape).astype(self.dtype)
         y = np.random.random(self.y_shape).astype(self.dtype)
+        # -0.1 ~ 0.1
+        x = -0.1 + 0.2 * x
+        y = -0.1 + 0.2 * y
         result = reference_matmul(x, y, self.trans_x, self.trans_y)
-
+        result = result.astype(self.dtype)
         self.inputs = {
             'X': x,
             'Y': y,
@@ -98,7 +104,6 @@ class TestMatMuklOp2(TestMatMulV2Op):
         self.y_shape = (1, 3, 2, 100)
         self.trans_x = False
         self.trans_y = True
-        self.dtype = "float64"
 
 
 class TestMatMuklOp3(TestMatMulV2Op):
@@ -111,7 +116,6 @@ class TestMatMuklOp3(TestMatMulV2Op):
         self.y_shape = (1, 1, 100, 2)
         self.trans_x = False
         self.trans_y = False
-        self.dtype = "float64"
 
 
 class TestMatMuklOp4(TestMatMulV2Op):
@@ -124,7 +128,6 @@ class TestMatMuklOp4(TestMatMulV2Op):
         self.y_shape = (1, 2, 100, 2)
         self.trans_x = False
         self.trans_y = False
-        self.dtype = "float64"
 
 
 class TestMatMuklOp5(TestMatMulV2Op):
@@ -133,11 +136,10 @@ class TestMatMuklOp5(TestMatMulV2Op):
     """
 
     def config(self):
-        self.x_shape = (1, 1, 100, 2)
+        self.x_shape = (1, 1, 100, 1)
         self.y_shape = (100, )
         self.trans_x = True
         self.trans_y = False
-        self.dtype = "float64"
 
 
 class TestMatMuklOp6(TestMatMulV2Op):
@@ -150,7 +152,6 @@ class TestMatMuklOp6(TestMatMulV2Op):
         self.y_shape = (100, )
         self.trans_x = True
         self.trans_y = False
-        self.dtype = "float64"
 
 
 class TestMatMuklOp7(TestMatMulV2Op):
@@ -163,7 +164,6 @@ class TestMatMuklOp7(TestMatMulV2Op):
         self.y_shape = (100, )
         self.trans_x = False
         self.trans_y = False
-        self.dtype = "float64"
 
 
 class TestMatMuklOp8(TestMatMulV2Op):
@@ -176,7 +176,6 @@ class TestMatMuklOp8(TestMatMulV2Op):
         self.y_shape = (1, 1, 100, 2)
         self.trans_x = False
         self.trans_y = False
-        self.dtype = "float64"
 
 
 class TestMatMuklOp9(TestMatMulV2Op):
@@ -189,7 +188,6 @@ class TestMatMuklOp9(TestMatMulV2Op):
         self.y_shape = (2, 1, 2, 100)
         self.trans_x = False
         self.trans_y = True
-        self.dtype = "float64"
 
 
 class TestMatMuklOp10(TestMatMulV2Op):
@@ -198,11 +196,10 @@ class TestMatMuklOp10(TestMatMulV2Op):
     """
 
     def config(self):
-        self.x_shape = (1, 1, 2, 100)
-        self.y_shape = (1, 2, 100, 2)
+        self.x_shape = (1, 1, 25, 4)
+        self.y_shape = (1, 2, 4, 25)
         self.trans_x = False
         self.trans_y = False
-        self.dtype = "float64"
 
 
 class TestMatMuklOp11(TestMatMulV2Op):
@@ -215,7 +212,6 @@ class TestMatMuklOp11(TestMatMulV2Op):
         self.y_shape = (1, 1, 100, 2)
         self.trans_x = False
         self.trans_y = False
-        self.dtype = "float64"
 
 
 class TestMatMuklOp12(TestMatMulV2Op):
@@ -224,11 +220,10 @@ class TestMatMuklOp12(TestMatMulV2Op):
     """
 
     def config(self):
-        self.x_shape = (2, 1, 100, 2)
-        self.y_shape = (1, 1, 100, 2)
+        self.x_shape = (2, 1, 4, 25)
+        self.y_shape = (1, 1, 4, 25)
         self.trans_x = True
         self.trans_y = False
-        self.dtype = "float64"
 
 
 class TestMatMuklOp13(TestMatMulV2Op):
@@ -237,11 +232,10 @@ class TestMatMuklOp13(TestMatMulV2Op):
     """
 
     def config(self):
-        self.x_shape = (2, 2, 100, 2)
-        self.y_shape = (2, 2, 100, 2)
+        self.x_shape = (2, 2, 2, 50)
+        self.y_shape = (2, 2, 2, 50)
         self.trans_x = True
         self.trans_y = False
-        self.dtype = "float64"
 
 
 class TestMatMuklOp14(TestMatMulV2Op):
@@ -254,7 +248,6 @@ class TestMatMuklOp14(TestMatMulV2Op):
         self.y_shape = (1, 2, 2, 100, 2)
         self.trans_x = True
         self.trans_y = False
-        self.dtype = "float64"
 
 
 class TestMatMuklOp15(TestMatMulV2Op):
@@ -267,7 +260,6 @@ class TestMatMuklOp15(TestMatMulV2Op):
         self.y_shape = (1, 2, 2, 100, 1)
         self.trans_x = False
         self.trans_y = False
-        self.dtype = "float64"
 
 
 class TestMatMuklOp16(TestMatMulV2Op):
@@ -277,10 +269,9 @@ class TestMatMuklOp16(TestMatMulV2Op):
 
     def config(self):
         self.x_shape = (100)
-        self.y_shape = (1, 2, 2, 100, 1)
+        self.y_shape = (1, 2, 2, 100, 2)
         self.trans_x = False
         self.trans_y = False
-        self.dtype = "float64"
 
 
 class TestMatMuklOp17(TestMatMulV2Op):
@@ -293,7 +284,54 @@ class TestMatMuklOp17(TestMatMulV2Op):
         self.y_shape = (100)
         self.trans_x = False
         self.trans_y = False
-        self.dtype = "float64"
+
+
+#--------------------test matmul fp16--------------------
+
+
+def create_test_fp16_class(parent, atol=0.001, max_relative_error=1.0):
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "core is not compiled with CUDA")
+    class TestMatMulOpFp16Case(parent):
+        def init_kernel_type(self):
+            self.dtype = np.float16
+
+        def test_check_output(self):
+            if core.is_compiled_with_cuda():
+                place = core.CUDAPlace(0)
+                if core.is_float16_supported(place):
+                    self.check_output_with_place(place, atol=atol)
+
+        def test_check_grad(self):
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_grad_with_place(
+                    place, ['X', 'Y'],
+                    'Out',
+                    max_relative_error=max_relative_error)
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Fp16")
+    TestMatMulOpFp16Case.__name__ = cls_name
+    globals()[cls_name] = TestMatMulOpFp16Case
+
+
+create_test_fp16_class(TestMatMulV2Op)
+create_test_fp16_class(TestMatMuklOp2)
+create_test_fp16_class(TestMatMuklOp3)
+create_test_fp16_class(TestMatMuklOp4)
+create_test_fp16_class(TestMatMuklOp5)
+create_test_fp16_class(TestMatMuklOp6)
+create_test_fp16_class(TestMatMuklOp7)
+create_test_fp16_class(TestMatMuklOp8)
+create_test_fp16_class(TestMatMuklOp9)
+create_test_fp16_class(TestMatMuklOp10)
+create_test_fp16_class(TestMatMuklOp11)
+create_test_fp16_class(TestMatMuklOp12)
+create_test_fp16_class(TestMatMuklOp13)
+create_test_fp16_class(TestMatMuklOp14)
+create_test_fp16_class(TestMatMuklOp15)
+create_test_fp16_class(TestMatMuklOp16)
+create_test_fp16_class(TestMatMuklOp17)
 
 
 class TestMatMulV2API(unittest.TestCase):
@@ -331,6 +369,17 @@ class TestMatMulV2API(unittest.TestCase):
                 y = paddle.to_tensor(input_y)
                 result = paddle.matmul(x, y)
 
+    def test_dygraph_fp16(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                with fluid.dygraph.guard(place):
+                    input_x = np.random.random([4, 3]).astype("float16")
+                    input_y = np.random.random([3, 4]).astype("float16")
+                    x = paddle.to_tensor(input_x)
+                    y = paddle.to_tensor(input_y)
+                    result = paddle.matmul(x, y)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index f27cfba487d..26624d3b5ff 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -156,8 +156,8 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
     def __check_input(x, y):
         var_names = {'x': x, 'y': y}
         for name, val in var_names.items():
-            check_variable_and_dtype(val, name, ['float32', 'float64'],
-                                     'matmul')
+            check_variable_and_dtype(
+                val, name, ['float16', 'float32', 'float64'], 'matmul')
 
     __check_input(x, y)
 
-- 
GitLab


From c143326df5cf397451f1f4b931c955742c3e7f16 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 25 Sep 2020 15:47:30 +0800
Subject: [PATCH 231/261] try to fix test_paddle_save_load unknown timeout
 (#27536)

* try to fix paddle save load test

* open paddle save load

* replace dataloader

* remove dataloader
---
 .../fluid/tests/unittests/CMakeLists.txt      |  3 --
 .../tests/unittests/test_paddle_save_load.py  | 49 +++++++++----------
 2 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 2f8952a4431..09797576801 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -335,9 +335,6 @@ list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op)
 # disable this unittest temporarily
 list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception)
 list(REMOVE_ITEM TEST_OPS test_sampling_id_op)
-list(REMOVE_ITEM TEST_OPS test_paddle_save_load)
-
-
 
 if (APPLE OR WIN32)
   list(REMOVE_ITEM TEST_OPS test_dataset)
diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py
index 74d44d0f8b6..fee34945586 100644
--- a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py
@@ -29,19 +29,23 @@ IMAGE_SIZE = 784
 CLASS_NUM = 10
 
 
-# define a random dataset
-class RandomDataset(paddle.io.Dataset):
-    def __init__(self, num_samples):
-        self.num_samples = num_samples
-
-    def __getitem__(self, idx):
+def random_batch_reader():
+    def _get_random_inputs_and_labels():
         np.random.seed(SEED)
-        image = np.random.random([IMAGE_SIZE]).astype('float32')
-        label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
+        image = np.random.random([BATCH_SIZE, IMAGE_SIZE]).astype('float32')
+        label = np.random.randint(0, CLASS_NUM - 1, (
+            BATCH_SIZE,
+            1, )).astype('int64')
         return image, label
 
-    def __len__(self):
-        return self.num_samples
+    def __reader__():
+        for _ in range(BATCH_NUM):
+            batch_image, batch_label = _get_random_inputs_and_labels()
+            batch_image = paddle.to_tensor(batch_image)
+            batch_label = paddle.to_tensor(batch_label)
+            yield batch_image, batch_label
+
+    return __reader__
 
 
 class LinearNet(nn.Layer):
@@ -66,8 +70,7 @@ def train(layer, loader, loss_fn, opt):
 class TestSaveLoad(unittest.TestCase):
     def setUp(self):
         # enable dygraph mode
-        self.place = paddle.CPUPlace()
-        paddle.disable_static(self.place)
+        paddle.disable_static()
 
         # config seed
         paddle.manual_seed(SEED)
@@ -81,14 +84,8 @@ class TestSaveLoad(unittest.TestCase):
         adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters())
 
         # create data loader
-        dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
-        loader = paddle.io.DataLoader(
-            dataset,
-            places=self.place,
-            batch_size=BATCH_SIZE,
-            shuffle=True,
-            drop_last=True,
-            num_workers=2)
+        # TODO: using new DataLoader cause unknown Timeout on windows, replace it
+        loader = random_batch_reader()
 
         # train
         train(layer, loader, loss_fn, adam)
@@ -103,8 +100,8 @@ class TestSaveLoad(unittest.TestCase):
         layer, opt = self.build_and_train_model()
 
         # save
-        layer_save_path = "linear.pdparams"
-        opt_save_path = "linear.pdopt"
+        layer_save_path = "test_paddle_save_load.linear.pdparams"
+        opt_save_path = "test_paddle_save_load.linear.pdopt"
         layer_state_dict = layer.state_dict()
         opt_state_dict = opt.state_dict()
 
@@ -120,7 +117,7 @@ class TestSaveLoad(unittest.TestCase):
 
         # test save load in static mode
         paddle.enable_static()
-        static_save_path = "static_mode_test/linear.pdparams"
+        static_save_path = "static_mode_test/test_paddle_save_load.linear.pdparams"
         paddle.save(layer_state_dict, static_save_path)
         load_static_state_dict = paddle.load(static_save_path)
         self.check_load_state_dict(layer_state_dict, load_static_state_dict)
@@ -133,15 +130,15 @@ class TestSaveLoad(unittest.TestCase):
 
         # 2. test save path format error
         with self.assertRaises(ValueError):
-            paddle.save(layer_state_dict, "linear.model/")
+            paddle.save(layer_state_dict, "test_paddle_save_load.linear.model/")
 
         # 3. test load path not exist error
         with self.assertRaises(ValueError):
-            paddle.load("linear.params")
+            paddle.load("test_paddle_save_load.linear.params")
 
         # 4. test load old save path error
         with self.assertRaises(ValueError):
-            paddle.load("linear")
+            paddle.load("test_paddle_save_load.linear")
 
 
 if __name__ == '__main__':
-- 
GitLab


From 8daccc9ea7dbabec034882575b3738cf5c4c1dcc Mon Sep 17 00:00:00 2001
From: ceci3 <ceci3@users.noreply.github.com>
Date: Fri, 25 Sep 2020 16:25:49 +0800
Subject: [PATCH 232/261] Fix batch norm double grad compute (#27549)

* fix bn double grad, test=develop

* update, test=develop
---
 paddle/fluid/operators/batch_norm_op.cc       | 55 ++++++++------
 paddle/fluid/operators/instance_norm_op.cc    |  6 +-
 paddle/fluid/operators/norm_utils.cu.h        | 75 ++++++++++++++-----
 .../tests/unittests/test_norm_nn_grad.py      | 36 +++++++++
 4 files changed, 131 insertions(+), 41 deletions(-)

diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index dcfe8bb1bb4..7a88403aa9d 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -839,6 +839,7 @@ void BatchNormDoubleGradMaker<T>::Apply(GradOpPtr<T> op) const {
   op->SetInput("SavedMean", this->Input("SavedMean"));
   op->SetInput("SavedVariance", this->Input("SavedVariance"));
   if (BOOST_GET_CONST(bool, this->GetAttr("use_global_stats"))) {
+    op->SetInput("Mean", this->Input("Mean"));
     op->SetInput("Variance", this->Input("Variance"));
   }
   op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
@@ -868,14 +869,19 @@ void BatchNormDoubleGradOp::InferShape(
                    "BatchNormDoubleGrad");
   }
 
-  OP_INOUT_CHECK(ctx->HasInput("DDX"), "Input", "DDX", "BatchNormDoubleGrad");
   OP_INOUT_CHECK(ctx->HasInput("DY"), "Input", "DY", "BatchNormDoubleGrad");
 
   // check output
   OP_INOUT_CHECK(ctx->HasOutput("DX"), "Output", "DX", "BatchNormDoubleGrad");
 
   const auto x_dims = ctx->GetInputDim("X");
-  const int C = x_dims[1];
+  const DataLayout data_layout = framework::StringToDataLayout(
+      ctx->Attrs().Get<std::string>("data_layout"));
+  const int C =
+      ((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW)
+           ? x_dims[1]
+           : x_dims[x_dims.size() - 1]);
+
   if (ctx->HasOutput("DX")) {
     ctx->SetOutputDim("DX", x_dims);
   }
@@ -957,7 +963,9 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
 
     Tensor inv_var_tensor;
     if (use_global_stats) {
+      const auto *running_mean = ctx.Input<Tensor>("Mean");
       const auto *running_variance = ctx.Input<Tensor>("Variance");
+      mean_data = running_mean->data<T>();
       inv_var_tensor.Resize({C});
 
       T *running_inv_var_data = inv_var_tensor.mutable_data<T>(ctx.GetPlace());
@@ -1077,12 +1085,12 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
         //          (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
         //          np.sum(dy,
         //          axis=(n,h,w)) * (x - mean) *
-        //          (np.mean(ddx, axis=(n,h,w)) - ddx) + ddr * (dy * inv_var -
+        //          (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var -
         //          inv_var
         //          *
         //          np.mean(dy, axis=(n,h,w)) -
         //          inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
-        //          axis=(n,h,w))))
+        //          axis=(n,h,w)))
 
         if (ddX) {
           dx_arr +=
@@ -1176,7 +1184,8 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
                                C, sample_size);
       ddy_arr.setZero();
       if (use_global_stats) {
-        // math: ddy = r * ddx * inv_var
+        // math: ddy = r * ddx * inv_var + ddbias +
+        //           ddscale * (x - mean) * inv_var
         if (ddX) {
           ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data;
         }
@@ -1196,25 +1205,29 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
                        .replicate(1, sample_size) /
                    sample_size);
         }
-        if (ddScale && ddBias) {
-          ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
-          Tensor ddscale_tile;
-          ddscale_tile.Resize({C, sample_size});
-          EigenArrayMap<T> ddscale_tile_data(
-              ddscale_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
-          ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
+      }
+      if (ddScale) {
+        ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
+        Tensor ddscale_tile;
+        ddscale_tile.Resize({C, sample_size});
+        EigenArrayMap<T> ddscale_tile_data(
+            ddscale_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
+        ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
+
+        ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data;
+      }
 
-          ConstEigenVectorArrayMap<T> ddbias_arr(ddBias->data<T>(), C);
-          Tensor ddbias_tile;
-          ddbias_tile.Resize({C, sample_size});
-          EigenArrayMap<T> ddbias_tile_data(
-              ddbias_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
-          ddbias_tile_data = ddbias_arr.replicate(1, sample_size);
+      if (ddBias) {
+        ConstEigenVectorArrayMap<T> ddbias_arr(ddBias->data<T>(), C);
+        Tensor ddbias_tile;
+        ddbias_tile.Resize({C, sample_size});
+        EigenArrayMap<T> ddbias_tile_data(
+            ddbias_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
+        ddbias_tile_data = ddbias_arr.replicate(1, sample_size);
 
-          ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data;
-          ddy_arr += ddbias_tile_data;
-        }
+        ddy_arr += ddbias_tile_data;
       }
+
       if (data_layout == DataLayout::kNCHW) {
         VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
         TransToChannelFirst<paddle::platform::CPUDeviceContext, T>(
diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc
index a5b270c1dfe..03279a9b2c1 100644
--- a/paddle/fluid/operators/instance_norm_op.cc
+++ b/paddle/fluid/operators/instance_norm_op.cc
@@ -520,11 +520,11 @@ class InstanceNormDoubleGradKernel<platform::CPUDeviceContext, T>
     //          (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW *
     //          np.sum(dy,
     //          axis=(h,w)) * (x - mean) *
-    //          (np.mean(ddx, axis=(h,w)) - ddx) + ddr * (dy * inv_var - inv_var
-    //          *
+    //          (np.mean(ddx, axis=(h,w)) - ddx)) + ddr * (dy * inv_var -
+    //          inv_var *
     //          np.mean(dy, axis=(h,w)) -
     //          inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
-    //          axis=(h,w))))
+    //          axis=(h,w)))
 
     Tensor x_sub_mean_mul_invstd;
     x_sub_mean_mul_invstd.Resize({sample_size, NxC});
diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h
index 07333f1ae11..02dcb4045f4 100644
--- a/paddle/fluid/operators/norm_utils.cu.h
+++ b/paddle/fluid/operators/norm_utils.cu.h
@@ -40,12 +40,12 @@ using DataLayout = framework::DataLayout;
 //          (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
 //          np.sum(dy,
 //          axis=(n,h,w)) * (x - mean) *
-//          (np.mean(ddx, axis=(n,h,w)) - ddx) + ddr * (dy * inv_var -
+//          (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var -
 //          inv_var
 //          *
 //          np.mean(dy, axis=(n,h,w)) -
 //          inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
-//          axis=(n,h,w))))
+//          axis=(n,h,w)))
 
 template <typename T, int BlockDim, framework::DataLayout layout>
 __global__ void DoubleGradComputeDX(const T *x, const T *mean,
@@ -138,7 +138,7 @@ __global__ void DoubleGradComputeDX(const T *x, const T *mean,
                 ? (j / sample_size * C + i) * sample_size + j % sample_size
                 : j * outer_size + i;
         dx[index] += (dy[index] * var_val - dy_sum_val / inner_size * var_val -
-                      (x[index] - mean_val) * var_val *
+                      (x[index] - mean_val) * var_val * var_val *
                           dy_mul_x_sub_mean_sum_val * var_val / inner_size) *
                      ddscale[i];
       }
@@ -326,19 +326,57 @@ __global__ void DoubleGradComputeDScaleWithGlobal(
 }
 
 // math: dx = ddscale * dy * inv_var
-// math: ddy = scale * ddx * inv_var
 template <typename T, framework::DataLayout layout>
-__global__ void DoubleGradComputeDataWithGlobal(
-    const T *dy, const T *scale, const T *variance, const double epsilon,
-    const int C, const int sample_size, const int num, T *dx) {
+__global__ void DoubleGradComputeDXWithGlobal(const T *dy, const T *ddscale,
+                                              const T *variance,
+                                              const double epsilon, const int C,
+                                              const int sample_size,
+                                              const int num, T *dx) {
   int gid = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
-  if (scale != nullptr) {
+  if (ddscale != nullptr) {
     for (int i = gid; i < num; i += stride) {
       const int c =
           layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C;
       T inv_var = 1.0 / sqrt(variance[c] + epsilon);
-      dx[i] = dy[i] * scale[c] * inv_var;
+      dx[i] = dy[i] * ddscale[c] * inv_var;
+    }
+  }
+}
+
+// math: ddy = scale * ddx * inv_var + ddbias +
+//             ddscale * (x - mean) * inv_var
+template <typename T, framework::DataLayout layout>
+__global__ void DoubleGradComputeDDYWithGlobal(
+    const T *ddx, const T *scale, const T *mean, const T *variance, const T *x,
+    const T *ddbias, const T *ddscale, const double epsilon, const int C,
+    const int sample_size, const int num, T *ddy) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+
+  if (ddx != nullptr) {
+    for (int i = gid; i < num; i += stride) {
+      const int c =
+          layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C;
+      T inv_var = 1.0 / sqrt(variance[c] + epsilon);
+      ddy[i] += ddx[i] * scale[c] * inv_var;
+    }
+  }
+  __syncthreads();
+  if (ddscale != nullptr) {
+    for (int i = gid; i < num; i += stride) {
+      const int c =
+          layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C;
+      T inv_var = 1.0 / sqrt(variance[c] + epsilon);
+      ddy[i] += (x[i] - mean[c]) * inv_var * ddscale[c];
+    }
+  }
+  __syncthreads();
+  if (ddbias != nullptr) {
+    for (int i = gid; i < num; i += stride) {
+      const int c =
+          layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C;
+      ddy[i] += ddbias[c];
     }
   }
 }
@@ -383,8 +421,11 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
 
   const T *mean_data, *variance_data;
   if (use_global_stats) {
+    const auto *running_mean = ctx.Input<Tensor>("Mean");
     const auto *running_var = ctx.Input<Tensor>("Variance");
+    const auto *running_mean_data = running_mean->template data<T>();
     const auto *running_var_data = running_var->template data<T>();
+    mean_data = running_mean_data;
     variance_data = running_var_data;
   } else {
     const T *smean_data = Saved_mean->data<T>();
@@ -398,12 +439,12 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
     set_constant(dev_ctx, dX, static_cast<T>(0));
     if (use_global_stats) {
       if (data_layout == DataLayout::kNHWC) {
-        DoubleGradComputeDataWithGlobal<
+        DoubleGradComputeDXWithGlobal<
             T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>(
             dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
             dx_data);
       } else {
-        DoubleGradComputeDataWithGlobal<
+        DoubleGradComputeDXWithGlobal<
             T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>(
             dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
             dx_data);
@@ -456,15 +497,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
     set_constant(dev_ctx, ddY, static_cast<T>(0));
     if (use_global_stats) {
       if (data_layout == DataLayout::kNHWC) {
-        DoubleGradComputeDataWithGlobal<
+        DoubleGradComputeDDYWithGlobal<
             T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>(
-            ddx_data, scale_data, variance_data, epsilon, C, sample_size, num,
-            ddy_data);
+            ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
+            ddscale_data, epsilon, C, sample_size, num, ddy_data);
       } else {
-        DoubleGradComputeDataWithGlobal<
+        DoubleGradComputeDDYWithGlobal<
             T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>(
-            ddx_data, scale_data, variance_data, epsilon, C, sample_size, num,
-            ddy_data);
+            ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
+            ddscale_data, epsilon, C, sample_size, num, ddy_data);
       }
     } else {
       if (data_layout == DataLayout::kNHWC) {
diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
index a89b9fde7f9..cb4bd16ce21 100644
--- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
@@ -130,5 +130,41 @@ class TestBatchNormDoubleGradCheckCase4(TestBatchNormDoubleGradCheck):
         self.shape = [2, 2, 3, 4, 5]
 
 
+class TestBatchNormDoubleGradCheckCase5(TestBatchNormDoubleGradCheck):
+    @prog_scope()
+    def func(self, place):
+        prog = fluid.Program()
+        with fluid.program_guard(prog):
+            np.random.seed()
+            dtype = "float32"
+            eps = 0.005
+            atol = 2e-4
+            chn = self.shape[1] if self.data_layout == 'NCHW' else self.shape[
+                -1]
+            x = layers.create_parameter(dtype=dtype, shape=self.shape, name='x')
+            z = fluid.layers.batch_norm(
+                input=x,
+                data_layout=self.data_layout,
+                use_global_stats=self.use_global_stats)
+            x_arr = np.random.uniform(-1, 1, self.shape).astype(dtype)
+            w, b = prog.global_block().all_parameters()[1:3]
+            w_arr = np.ones(chn).astype(dtype)
+            b_arr = np.zeros(chn).astype(dtype)
+            gradient_checker.double_grad_check(
+                [x, w, b],
+                z,
+                x_init=[x_arr, w_arr, b_arr],
+                atol=atol,
+                place=place,
+                eps=eps)
+
+
+class TestBatchNormDoubleGradCheckCase6(TestBatchNormDoubleGradCheckCase5):
+    def init_test(self):
+        self.data_layout = 'NCHW'
+        self.use_global_stats = True
+        self.shape = [2, 3, 4, 5]
+
+
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab


From 6f69a4cb059119176f556a0aac0253d2899c6b59 Mon Sep 17 00:00:00 2001
From: Thunderbrook <52529258+Thunderbrook@users.noreply.github.com>
Date: Fri, 25 Sep 2020 16:39:22 +0800
Subject: [PATCH 233/261] add xpu in heter mode (#27000)

* add xpu in heter mode
test=develop

* BOOST_CONST_GET; PADDLE_THROW
test=develop

* code style
test=develop

* code style
test=develop

* code style
test=develop

* refine
test=develop

* refine
test=develop

* refine
test=develop

* refine code
test=develop
---
 cmake/third_party.cmake                       |   8 +-
 paddle/fluid/framework/device_worker.h        |   5 +-
 paddle/fluid/framework/fleet/fleet_wrapper.cc |  50 +++++++
 paddle/fluid/framework/fleet/fleet_wrapper.h  |   8 ++
 paddle/fluid/framework/fleet/heter_wrapper.cc |  59 ++++++--
 paddle/fluid/framework/heterxpu_trainer.cc    | 127 ++++++++++++++++--
 paddle/fluid/framework/pull_dense_worker.cc   |  20 ++-
 paddle/fluid/framework/trainer.h              |  17 ++-
 paddle/fluid/framework/trainer_factory.cc     |   3 +-
 python/paddle/fluid/executor.py               |   2 +-
 10 files changed, 268 insertions(+), 31 deletions(-)

diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index ffd32cc78f0..1eb2096af91 100644
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -270,6 +270,10 @@ if(WITH_PSLIB)
     endif()
 endif(WITH_PSLIB)
 
+if(NOT WIN32 AND NOT APPLE)
+    include(external/gloo)
+    list(APPEND third_party_deps extern_gloo)
+endif()
 
 if(WITH_BOX_PS)
     include(external/box_ps)
@@ -277,10 +281,6 @@ if(WITH_BOX_PS)
 endif(WITH_BOX_PS)
 
 if(WITH_DISTRIBUTE)
-    if(WITH_GLOO)
-        include(external/gloo)
-        list(APPEND third_party_deps extern_gloo)
-    endif()
 
     if(WITH_GRPC)
         list(APPEND third_party_deps extern_grpc)
diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index ee2ef9a0c3d..f6f3098613b 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -74,7 +74,9 @@ class PullDenseWorker {
   virtual void Initialize(const TrainerDesc& param);
 #ifdef PADDLE_WITH_CUDA
   void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); }
+#endif
 
+#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
   void AddPlace(const paddle::platform::Place place) {
     places_.push_back(place);
   }
@@ -135,9 +137,9 @@ class PullDenseWorker {
 
 #ifdef PADDLE_WITH_CUDA
   std::vector<cudaStream_t> copy_streams_;
+#endif
   std::vector<paddle::platform::Place> places_;
   std::vector<Scope*> thread_scopes_;
-#endif
 };
 
 // should incorporate different type of device
@@ -161,6 +163,7 @@ class DeviceWorker {
   virtual void SetDataFeed(DataFeed* data_feed);
   virtual void SetWorkerNum(int num) {}
   virtual void CacheProgram(const ProgramDesc& main_program) {}
+  virtual void GetXpuOpIndex() {}
   virtual void SetNeedDumpField(bool need_dump_field) {
     need_dump_field_ = need_dump_field;
   }
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc
index 3c076805932..693073d1fc7 100644
--- a/paddle/fluid/framework/fleet/fleet_wrapper.cc
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc
@@ -745,7 +745,57 @@ void FleetWrapper::PushDenseVarsAsync(
     push_sparse_status->push_back(std::move(status));
   }
 }
+#endif
+
+#ifdef PADDLE_WITH_XPU
+void FleetWrapper::PushDenseVarsAsync(
+    const Scope& scope, const uint64_t table_id,
+    const std::vector<std::string>& var_names,
+    std::vector<::std::future<int32_t>>* push_sparse_status,
+    float scale_datanorm, int batch_size,
+    const paddle::platform::Place& place) {
+#ifdef PADDLE_WITH_PSLIB
+  std::vector<paddle::ps::Region> regions;
+  for (auto& t : var_names) {
+    Variable* var = scope.FindVar(t);
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int count = tensor->numel();
+    float* g_data = tensor->data<float>();
+
+    Variable* pin_var = scope.FindVar(t + "pin");
+    LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
+    float* pin_g =
+        pin_tensor->mutable_data<float>(tensor->dims(), platform::CPUPlace());
+    memory::Copy(platform::CPUPlace(), pin_g,
+                 BOOST_GET_CONST(platform::XPUPlace, place), g_data,
+                 sizeof(float) * count);
+
+    float* g = pin_g;
+    if (scale_datanorm >= 0) {
+      if (t.find(".batch_size@GRAD") != std::string::npos ||
+          t.find(".batch_sum@GRAD") != std::string::npos) {
+        Eigen::Map<Eigen::MatrixXf> mat(g, 1, count);
+        float scale = 1.0 / batch_size;
+        mat *= scale;
+      } else if (t.find(".batch_square_sum@GRAD") != std::string::npos) {
+        VLOG(3) << "epsilon: " << scale_datanorm;
+        for (int i = 0; i < count; ++i) {
+          g[i] = (g[i] - batch_size * scale_datanorm) / batch_size +
+                 batch_size * scale_datanorm;
+        }
+      }
+    }
+    paddle::ps::Region reg(g, count);
+    regions.emplace_back(std::move(reg));
+  }
 
+  auto status = pslib_ptr_->_worker_ptr->push_dense(regions.data(),
+                                                    regions.size(), table_id);
+  if (push_sparse_status) {
+    push_sparse_status->push_back(std::move(status));
+  }
+#endif
+}
 #endif
 void FleetWrapper::PushDenseVarsAsync(
     const Scope& scope, const uint64_t table_id,
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h
index be87bdf1e75..ae86835f38d 100644
--- a/paddle/fluid/framework/fleet/fleet_wrapper.h
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.h
@@ -160,6 +160,14 @@ class FleetWrapper {
       float scale_datanorm, int batch_size,
       const paddle::platform::Place& place, cudaStream_t stream,
       cudaEvent_t event);
+#endif
+#ifdef PADDLE_WITH_XPU
+  void PushDenseVarsAsync(
+      const Scope& scope, const uint64_t table_id,
+      const std::vector<std::string>& var_names,
+      std::vector<::std::future<int32_t>>* push_sparse_status,
+      float scale_datanorm, int batch_size,
+      const paddle::platform::Place& place);
 #endif
   void PushDenseVarsAsync(
       const Scope& scope, const uint64_t table_id,
diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc
index 7a27b6a9d7a..8e232560ab6 100644
--- a/paddle/fluid/framework/fleet/heter_wrapper.cc
+++ b/paddle/fluid/framework/fleet/heter_wrapper.cc
@@ -113,30 +113,66 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
   if (platform::is_cpu_place(tensor->place())) {
     memcpy(data_ptr, tensor->data<void>(),
            tensor->numel() * SizeOfType(tensor->type()));
-#ifdef PADDLE_WITH_CUDA
   } else {
+#ifdef PADDLE_WITH_CUDA
     memory::Copy(platform::CPUPlace(), data_ptr,
                  BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
                  tensor->data<void>(),
                  tensor->numel() * SizeOfType(tensor->type()), nullptr);
-  }
-#else
-  }
 #endif
+#ifdef PADDLE_WITH_XPU
+    memory::Copy(platform::CPUPlace(), data_ptr,
+                 BOOST_GET_CONST(platform::XPUPlace, tensor->place()),
+                 tensor->data<void>(),
+                 tensor->numel() * SizeOfType(tensor->type()));
+#endif
+  }
 }
 
-// void HeterWrapper::DeSerializeToTensor(Scope* scope,
-// const HeterRequest* request) {
 #ifdef PADDLE_WITH_CUDA
 void HeterWrapper::DeSerializeToTensor(Scope* scope,
                                        const VariableMessage& req_var,
                                        platform::Place place,
                                        cudaStream_t stream) {
+  // const VariableMessage& req_var = request->vars();
+  auto* var = scope->FindVar(req_var.varname());
+  auto* tensor = var->GetMutable<LoDTensor>();
+
+  std::vector<int> vec_dim;
+  for (auto& x : req_var.dims()) {
+    vec_dim.push_back(x);
+  }
+  tensor->Resize(make_ddim(vec_dim));
+
+  LoD lod;
+  for (int i = 0; i < req_var.lod_level(); ++i) {
+    framework::Vector<size_t> v;
+    for (int j = 0; j < req_var.lod(i).lod_data_size(); ++j) {
+      v.push_back(req_var.lod(i).lod_data(j));
+    }
+    lod.push_back(v);
+  }
+  tensor->set_lod(lod);
+
+  void* tensor_data =
+      tensor->mutable_data(place, ToVarType(req_var.data_type()));
+
+#ifdef PADDLE_WITH_CUDA
+  memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
+               platform::CPUPlace(), req_var.data().data(),
+               tensor->numel() * SizeOfType(tensor->type()), stream);
 #else
+  memcpy(tensor_data, req_var.data().data(),
+         tensor->numel() * SizeOfType(tensor->type()));
+#endif
+}
+#endif
+
+// void HeterWrapper::DeSerializeToTensor(Scope* scope,
+// const HeterRequest* request) {
 void HeterWrapper::DeSerializeToTensor(Scope* scope,
                                        const VariableMessage& req_var,
                                        platform::Place place) {
-#endif
   // const VariableMessage& req_var = request->vars();
   auto* var = scope->FindVar(req_var.varname());
   auto* tensor = var->GetMutable<LoDTensor>();
@@ -160,10 +196,10 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
   void* tensor_data =
       tensor->mutable_data(place, ToVarType(req_var.data_type()));
 
-#ifdef PADDLE_WITH_CUDA
-  memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
+#ifdef PADDLE_WITH_XPU
+  memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place), tensor_data,
                platform::CPUPlace(), req_var.data().data(),
-               tensor->numel() * SizeOfType(tensor->type()), stream);
+               tensor->numel() * SizeOfType(tensor->type()));
 #else
   memcpy(tensor_data, req_var.data().data(),
          tensor->numel() * SizeOfType(tensor->type()));
@@ -184,7 +220,8 @@ framework::proto::VarType::Type HeterWrapper::ToVarType(
     case VariableMessage::BOOL:
       return framework::proto::VarType::BOOL;  // NOLINT
     default:
-      VLOG(0) << "Not support type " << type;
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "ToVarType:Unsupported type %d", type));
   }
 }
 
diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc
index fbed74800b4..6bbbaacdde3 100644
--- a/paddle/fluid/framework/heterxpu_trainer.cc
+++ b/paddle/fluid/framework/heterxpu_trainer.cc
@@ -12,9 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
+#include <cstdlib>
+#include <ctime>
+#include <string>
+#include <vector>
+#include "io/fs.h"
+#include "paddle/fluid/framework/data_feed_factory.h"
+#include "paddle/fluid/framework/data_set.h"
+#include "paddle/fluid/framework/device_worker_factory.h"
+#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
+#include "paddle/fluid/framework/trainer.h"
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+    (defined PADDLE_WITH_PSLIB)
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
-
+#endif
 namespace paddle {
 namespace framework {
 
@@ -34,6 +46,7 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
   int place_num = trainer_desc.worker_places_size();
   for (int i = 0; i < place_num; ++i) {
     int num = trainer_desc.worker_places(i);
+#ifdef PADDLE_WITH_CUDA
     platform::CUDAPlace place = platform::CUDAPlace(num);
     platform::CUDADeviceGuard guard(place.device);
     cudaStream_t stream;
@@ -44,6 +57,11 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
     events_.push_back(event);
+#endif
+#ifdef PADDLE_WITH_XPU
+    platform::XPUPlace place = platform::XPUPlace(num);
+    places_.push_back(place);
+#endif
   }
   // thread_num_ = trainer_desc.thread_num();
   // SetDataset(dataset);
@@ -95,11 +113,17 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
 void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
   auto place = places_[num];
   Scope* scope = place_scopes_[num];
+#ifdef PADDLE_WITH_CUDA
   auto stream = copy_streams_[num];
   auto event = events_[num];
-
   auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
   platform::CUDADeviceGuard guard(dev_id);
+#endif
+
+#ifdef PADDLE_WITH_XPU
+  xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
+#endif
+
   auto& block = program.Block(0);
   for (auto& var : block.AllVars()) {
     if (var->Persistable()) {
@@ -116,13 +140,28 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
       HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place, stream); \
     }                                                                   \
   } while (0)
+
+#define HeterMemcpyXpuFunc(cpp_type, proto_type)                \
+  do {                                                          \
+    if (root_tensor->type() == proto_type) {                    \
+      HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place); \
+    }                                                           \
+  } while (0)
+#ifdef PADDLE_WITH_CUDA
       _ForEachDataType_(HeterMemcpyFunc);
+#endif
+#ifdef PADDLE_WITH_XPU
+      _ForEachDataType_(HeterMemcpyXpuFunc);
+#endif
     }
   }
+#ifdef PADDLE_WITH_CUDA
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
   cudaEventSynchronize(event);
+#endif
 }
 
+#ifdef PADDLE_WITH_CUDA
 template <typename T>
 void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
                                   LoDTensor* root_tensor,
@@ -141,6 +180,27 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
                  root_ptr, sizeof(T) * root_tensor->numel(), stream);
   }
 }
+#endif
+
+#ifdef PADDLE_WITH_XPU
+template <typename T>
+void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
+                                  LoDTensor* root_tensor,
+                                  const paddle::platform::Place& thread_place) {
+  T* thread_ptr =
+      thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place);
+  T* root_ptr = root_tensor->data<T>();
+  if (platform::is_cpu_place(root_tensor->place())) {
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr,
+                 platform::CPUPlace(), root_ptr,
+                 sizeof(T) * root_tensor->numel());
+  } else {
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr,
+                 BOOST_GET_CONST(platform::XPUPlace, root_tensor->place()),
+                 root_ptr, sizeof(T) * root_tensor->numel());
+  }
+}
+#endif
 
 void HeterXpuTrainer::DumpWork(int tid) {}
 
@@ -171,13 +231,16 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
     CreateThreadParam(main_program, i);
     pull_dense_worker_->AddThreadScope(scope);
     pull_dense_worker_->AddPlace(places_[i]);
+#ifdef PADDLE_WITH_CUDA
     pull_dense_worker_->AddStream(copy_streams_[i]);
+#endif
   }
-
   pull_dense_worker_->Start();
+#ifdef PADDLE_WITH_CUDA
   for (auto& stream : copy_streams_) {
     cudaStreamSynchronize(stream);
   }
+#endif
   op_names_.clear();
   for (auto& op_desc : block.AllOps()) {
     std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
@@ -220,10 +283,12 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
         OperatorBase* local_op_ptr = local_op.release();
         (context->ops_).push_back(local_op_ptr);
       }
+#ifdef PADDLE_WITH_CUDA
       auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
       platform::CUDADeviceGuard guard(dev_id);
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
+#endif
       object_pool_.Push(context);
     }
   }
@@ -267,12 +332,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
     }                                                                          \
   } while (0)
       _ForEachDataType_(MergeCallback);
-      if (platform::is_gpu_place(thread_tensor->place())) {
+      if (!platform::is_cpu_place(thread_tensor->place())) {
+#ifdef PADDLE_WITH_CUDA
         auto dev_id =
             BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device;
         platform::CUDADeviceGuard guard(dev_id);
         cudaMemset(thread_tensor->data<void>(), 0,
                    thread_tensor->numel() * SizeOfType(thread_tensor->type()));
+#endif
+#ifdef PADDLE_WITH_XPU
+        auto place = thread_tensor->place();
+        xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
+        platform::DeviceContextPool& pool =
+            platform::DeviceContextPool::Instance();
+        platform::DeviceContext* dev_ctx = pool.Get(place);
+        const platform::XPUDeviceContext* xpu_ctx =
+            reinterpret_cast<const platform::XPUDeviceContext*>(dev_ctx);
+        xpu::memset(xpu_ctx->x_context(), thread_tensor->data<void>(), 0,
+                    thread_tensor->numel() * SizeOfType(thread_tensor->type()));
+#endif
       } else {
         memset(thread_tensor->data<void>(), 0,
                thread_tensor->numel() * SizeOfType(thread_tensor->type()));
@@ -281,12 +359,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
     auto* merge_var = response->add_vars();
     heter_ptr_->SerializeToReq(need_merge_var_names_[i], root_scope_,
                                merge_var);
-    if (platform::is_gpu_place(root_tensor->place())) {
+    if (!platform::is_cpu_place(root_tensor->place())) {
+#ifdef PADDLE_WITH_CUDA
       auto dev_id =
           BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device;
       platform::CUDADeviceGuard guard(dev_id);
       cudaMemset(root_tensor->data<void>(), 0,
                  root_tensor->numel() * SizeOfType(root_tensor->type()));
+#endif
+#ifdef PADDLE_WITH_XPU
+      auto place = root_tensor->place();
+      xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      platform::DeviceContext* dev_ctx = pool.Get(place);
+      const platform::XPUDeviceContext* xpu_ctx =
+          reinterpret_cast<const platform::XPUDeviceContext*>(dev_ctx);
+      xpu::memset(xpu_ctx->x_context(), root_tensor->data<void>(), 0,
+                  root_tensor->numel() * SizeOfType(root_tensor->type()));
+#endif
     } else {
       memset(root_tensor->data<void>(), 0,
              root_tensor->numel() * SizeOfType(root_tensor->type()));
@@ -346,11 +437,12 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
       OperatorBase* local_op_ptr = local_op.release();
       (context->ops_).push_back(local_op_ptr);
     }
-
+#ifdef PADDLE_WITH_CUDA
     auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
     platform::CUDADeviceGuard guard(dev_id);
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
+#endif
   }
 
   context->Reset();
@@ -359,15 +451,22 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
     auto deserial_timer =
         std::make_shared<paddle::ps::CostTimer>("xpu_service_deserial");
     for (int i = 0; i < request->vars_size(); ++i) {
+#ifdef PADDLE_WITH_CUDA
       heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place,
                                       copy_streams_[context->place_num_]);
+#endif
+#ifdef PADDLE_WITH_XPU
+      heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place);
+#endif
     }
+#ifdef PADDLE_WITH_CUDA
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaEventRecord(context->event_, copy_streams_[context->place_num_]));
     while (cudaEventQuery(context->event_) != cudaSuccess) {
       VLOG(3) << "wait for kernel";
       bthread_yield();
     }
+#endif
   }
 
   {
@@ -378,6 +477,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
       op->Run(*(context->scope_), place);
     }
   }
+#ifdef PADDLE_WITH_CUDA
   auto* dev_ctx = static_cast<platform::CUDADeviceContext*>(
       platform::DeviceContextPool::Instance().Get(place));
   PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -391,6 +491,10 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
       bthread_yield();
     }
   }
+#endif
+#ifdef PADDLE_WITH_XPU
+  xpu_wait();
+#endif
 
   for (int i = 0; i < trainer_desc_.xpu_send_list_size(); ++i) {
     const std::string& varname = trainer_desc_.xpu_send_list(i);
@@ -407,11 +511,19 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
        ++i) {
     uint64_t tid =
         static_cast<uint64_t>(param_.program_config(0).push_dense_table_id(i));
+#ifdef PADDLE_WITH_CUDA
     fleet_ptr_->PushDenseVarsAsync(
         *(context->scope_), tid, dense_grad_names_[tid],
         &(context->push_dense_status_), scale_datanorm_, request->cur_batch(),
         places_[context->place_num_], copy_streams_[context->place_num_],
         context->event_);
+#endif
+#ifdef PADDLE_WITH_XPU
+    fleet_ptr_->PushDenseVarsAsync(
+        *(context->scope_), tid, dense_grad_names_[tid],
+        &(context->push_dense_status_), scale_datanorm_, request->cur_batch(),
+        places_[context->place_num_]);
+#endif
   }
   for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
        ++i) {
@@ -453,7 +565,6 @@ void HeterXpuTrainer::Finalize() {
   pull_dense_worker_->Stop();
   root_scope_->DropKids();
 }
-
 }  // namespace framework
 }  // namespace paddle
 #endif
diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc
index c399c5d02eb..6aeef8a39b5 100644
--- a/paddle/fluid/framework/pull_dense_worker.cc
+++ b/paddle/fluid/framework/pull_dense_worker.cc
@@ -62,13 +62,15 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) {
   fleet_ptr_ = FleetWrapper::GetInstance();
 #ifdef PADDLE_WITH_CUDA
   copy_streams_.clear();
+#endif
+#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
   places_.clear();
   thread_scopes_.clear();
 #endif
 }
 
 void PullDenseWorker::CreatePinVar() {
-#ifdef PADDLE_WITH_CUDA
+#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_PSLIB)
   // for (auto& v : dense_value_names_) {
   //  for (auto& name : v.second) {
   for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size();
@@ -83,8 +85,13 @@ void PullDenseWorker::CreatePinVar() {
       auto* ptr = root_scope_->Var(name + "pin");
       InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
       LoDTensor* pin_tensor = ptr->GetMutable<LoDTensor>();
+#ifdef PADDLE_WITH_CUDA
       pin_tensor->mutable_data<float>(tensor->dims(),
                                       platform::CUDAPinnedPlace());
+#endif
+#ifdef PADDLE_WITH_XPU
+      pin_tensor->mutable_data<float>(tensor->dims(), platform::CPUPlace());
+#endif
     }
   }
 #endif
@@ -107,7 +114,7 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
     exit(-1);
   }
   status_vec->resize(0);
-#ifdef PADDLE_WITH_CUDA
+#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
 
   for (size_t i = 0; i < places_.size(); ++i) {
     // for (auto& v : dense_value_names_) {
@@ -125,9 +132,16 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
         Variable* var = thread_scopes_[i]->FindVar(name);
         LoDTensor* tensor = var->GetMutable<LoDTensor>();
         float* w = tensor->data<float>();
+#ifdef PADDLE_WITH_CUDA
         memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w,
                      platform::CUDAPinnedPlace(), pin_w,
                      sizeof(float) * tensor->numel(), copy_streams_[i]);
+#endif
+#ifdef PADDLE_WITH_XPU
+        memory::Copy(BOOST_GET_CONST(platform::XPUPlace, places_[i]), w,
+                     platform::CPUPlace(), pin_w,
+                     sizeof(float) * tensor->numel());
+#endif
       }
     }
   }
@@ -148,7 +162,7 @@ void PullDenseWorker::PullDense(bool force_update) {
     uint64_t tid = static_cast<uint64_t>(
         dwp_param_.program_config(0).pull_dense_table_id(i));
     if (force_update || CheckUpdateParam(tid)) {
-#ifdef PADDLE_WITH_CUDA
+#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
       VLOG(3) << "pull dense " << force_update << " " << tid;
       fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
                                      &pull_dense_status_, false);
diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h
index d041ef48e2c..ecaec49aa46 100644
--- a/paddle/fluid/framework/trainer.h
+++ b/paddle/fluid/framework/trainer.h
@@ -138,7 +138,8 @@ class DistMultiTrainer : public MultiTrainer {
   std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
 };
 
-#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+    (defined PADDLE_WITH_PSLIB)
 class HeterServiceContext {
  public:
   HeterServiceContext() {}
@@ -151,7 +152,9 @@ class HeterServiceContext {
   void Reset() { push_dense_status_.clear(); }
   int place_num_;
   Scope* scope_{nullptr};
+#ifdef PADDLE_WITH_CUDA
   cudaEvent_t event_;
+#endif
   std::vector<OperatorBase*> ops_;
   std::vector<::std::future<int32_t>> push_dense_status_;
 };
@@ -178,10 +181,18 @@ class HeterXpuTrainer : public TrainerBase {
   virtual void CacheProgram(const ProgramDesc& main_program) {
     new (&program_) ProgramDesc(main_program);
   }
+  virtual std::string GetDumpPath(int tid) { return ""; }
+  virtual void InitDumpEnv() {}
   template <typename T>
+#ifdef PADDLE_WITH_CUDA
   void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
                    const paddle::platform::Place& thread_place,
                    cudaStream_t stream);
+#endif
+#ifdef PADDLE_WITH_XPU
+  void HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor,
+                   const paddle::platform::Place& thread_place);
+#endif
   void CreateThreadParam(const ProgramDesc& program, int num);
   template <typename T>
   void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
@@ -207,9 +218,11 @@ class HeterXpuTrainer : public TrainerBase {
   std::vector<std::string> op_names_;
   std::vector<Scope*> place_scopes_;
   BtObjectPool<HeterServiceContext> object_pool_;
-  std::vector<cudaStream_t> copy_streams_;
   std::vector<platform::Place> places_;
+#ifdef PADDLE_WITH_CUDA
+  std::vector<cudaStream_t> copy_streams_;
   std::vector<cudaEvent_t> events_;
+#endif
 };
 #endif
 
diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc
index 15584620d86..cc92c50cc42 100644
--- a/paddle/fluid/framework/trainer_factory.cc
+++ b/paddle/fluid/framework/trainer_factory.cc
@@ -63,7 +63,8 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
 
 REGISTER_TRAINER_CLASS(MultiTrainer);
 REGISTER_TRAINER_CLASS(DistMultiTrainer);
-#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+    (defined PADDLE_WITH_PSLIB)
 REGISTER_TRAINER_CLASS(HeterXpuTrainer);
 #endif
 #if defined(PADDLE_WITH_NCCL)
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 2e3f34f4164..3dc30767e5a 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -1355,7 +1355,7 @@ class Executor(object):
         if not program._fleet_opt is None:
             if program._fleet_opt.get("worker_class", "") == "HeterCpuWorker":
                 is_heter = 1
-            if program._fleet_opt("trainer", "") == "HeterXpuTrainer":
+            if program._fleet_opt.get("trainer", "") == "HeterXpuTrainer":
                 is_heter = 1
         if scope is None:
             scope = global_scope()
-- 
GitLab


From 09f1953296232d4b2f1ad823fb060a8ed3b2eaa9 Mon Sep 17 00:00:00 2001
From: chalsliu <45041955+chalsliu@users.noreply.github.com>
Date: Fri, 25 Sep 2020 16:45:02 +0800
Subject: [PATCH 234/261] Revert "Disable ut quickly."

This reverts commit 29f1560d8fbb1e516dfac5c609e6e869196475a5.
---
 paddle/scripts/paddle_build.sh     |  5 ----
 tools/check_file_diff_approvals.sh |  2 +-
 tools/is_ut_disabled.py            | 40 ------------------------------
 3 files changed, 1 insertion(+), 46 deletions(-)
 delete mode 100644 tools/is_ut_disabled.py

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index ac6531a2cc5..69303013d2a 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -988,11 +988,6 @@ set +x
                 fi
                 read testcase <<< $(echo "$line"|grep -oEi "\w+$")
 
-                if python $PADDLE_ROOT/tools/is_ut_disabled.py $testcase; then
-                    echo $testcase" is disabled."
-                    continue
-                fi
-
                 if [[ "$is_nightly" != "" ]] && [ ${NIGHTLY_MODE:-OFF} == "OFF" ]; then
                     echo $testcase" will only run at night."
                     continue
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index 16e61d7c77a..84254cc89bb 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -286,7 +286,7 @@ fi
 # Get the list of PR authors with unresolved unit test issues
 pip install PyGithub
 # For getting PR related data
-wget https://sys-p0.bj.bcebos.com/blk/block.txt --no-check-certificate
+wget https://paddle-ci.gz.bcebos.com/blk/block.txt --no-check-certificate
 wget https://sys-p0.bj.bcebos.com/bk-ci/bk.txt --no-check-certificate
 HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true`
 if [ "${HASUTFIXED}" != "" ]; then
diff --git a/tools/is_ut_disabled.py b/tools/is_ut_disabled.py
deleted file mode 100644
index a21fe39e71e..00000000000
--- a/tools/is_ut_disabled.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Check whether ut is disabled. """
-
-import os
-import sys
-
-
-def check_ut():
-    """ Get disabled unit tests. """
-    disable_ut_file = 'disable_ut'
-    cmd = 'wget -q --no-check-certificate https://sys-p0.bj.bcebos.com/prec/{}'.format(
-        disable_ut_file)
-    os.system(cmd)
-    with open(disable_ut_file) as utfile:
-        for u in utfile:
-            if u.rstrip('\r\n') == sys.argv[1]:
-                exit(0)
-    exit(1)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) != 2:
-        exit(1)
-    try:
-        check_ut()
-    except Exception as e:
-        print(e)
-        exit(1)
-- 
GitLab


From a5b32637825e19f7527c09878ba2994314929d54 Mon Sep 17 00:00:00 2001
From: Leo Chen <chenqiuliang@baidu.com>
Date: Fri, 25 Sep 2020 20:51:21 +0800
Subject: [PATCH 235/261] Refine error msg in paddle/fluid/imperative  (#27521)

* refine err msg

* follow comments
---
 .../fluid/imperative/gradient_accumulator.cc  | 12 +++-
 .../imperative/jit/program_desc_tracer.cc     | 13 ++--
 paddle/fluid/imperative/nccl_context.cc       | 59 +++++++++++++------
 3 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc
index 7caeb4378ce..07f1868b7fa 100644
--- a/paddle/fluid/imperative/gradient_accumulator.cc
+++ b/paddle/fluid/imperative/gradient_accumulator.cc
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/imperative/gradient_accumulator.h"
+
 #include <algorithm>
 #include <memory>
 #include <utility>
+
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows.h"
@@ -136,9 +138,13 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
     return;
   }
 
-  PADDLE_ENFORCE_EQ(dst_tensor->numel() == numel, true,
-                    "dst_numel %d vs. src_numel %d", dst_tensor->numel(),
-                    numel);
+  PADDLE_ENFORCE_EQ(
+      dst_tensor->numel(), numel,
+      platform::errors::PreconditionNotMet(
+          "The number of elements of source tensor and destination tensor "
+          "should be equal, but got the number of elements of source tensor is "
+          "%zu and the number of elements of destination tensor is %zu.",
+          numel, dst_tensor->numel()));
 
   auto data_type = src_tensor.type();
   auto place = src_tensor.place();
diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.cc b/paddle/fluid/imperative/jit/program_desc_tracer.cc
index 9f4cf713f7c..59ff5b4eae4 100644
--- a/paddle/fluid/imperative/jit/program_desc_tracer.cc
+++ b/paddle/fluid/imperative/jit/program_desc_tracer.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/imperative/jit/program_desc_tracer.h"
+
 #include <unordered_map>
 #include <unordered_set>
 
@@ -203,7 +204,8 @@ TracedProgramTuple ProgramDescTracer::CreateProgramDesc(
 
 void ProgramDescTracer::InsertVarIfNotExist(
     const std::shared_ptr<VarBase> &new_var, bool is_input) {
-  PADDLE_ENFORCE_NOT_NULL(new_var);
+  PADDLE_ENFORCE_NOT_NULL(new_var, platform::errors::InvalidArgument(
+                                       "The variable to insert is NULL."));
   if (vars_.count(new_var) != 0) return;
 
   auto new_var_desc = new framework::VarDesc("");
@@ -220,7 +222,9 @@ void ProgramDescTracer::InsertVarIfNotExist(
   }
 
   const auto &inner_var = new_var->Var();
-  PADDLE_ENFORCE_EQ(inner_var.IsInitialized(), true);
+  PADDLE_ENFORCE_EQ(inner_var.IsInitialized(), true,
+                    platform::errors::InvalidArgument(
+                        "The variable to insert is not initialized."));
   if (inner_var.IsType<framework::LoDTensor>()) {
     const auto &tensor = inner_var.Get<framework::LoDTensor>();
     new_var_desc->SetType(framework::proto::VarType::LOD_TENSOR);
@@ -232,8 +236,9 @@ void ProgramDescTracer::InsertVarIfNotExist(
       new_var_desc->SetDataType(framework::proto::VarType::FP32);
     }
   } else {
-    PADDLE_THROW("Not support variable type %s",
-                 framework::ToTypeName(inner_var.Type()));
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Not support variable type %s.",
+        framework::ToTypeName(inner_var.Type())));
   }
 }
 
diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc
index 115078e7ead..c8fd31fcbff 100644
--- a/paddle/fluid/imperative/nccl_context.cc
+++ b/paddle/fluid/imperative/nccl_context.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/imperative/nccl_context.h"
+
 #include "paddle/fluid/platform/collective_helper.h"
 
 namespace paddle {
@@ -21,8 +22,10 @@ namespace imperative {
 void NCCLParallelContext::RecvNCCLID(const std::string &ep,
                                      ncclUniqueId *nccl_id) {
   auto addr = paddle::string::Split(ep, ':');
-  PADDLE_ENFORCE_EQ(addr.size(), 2UL,
-                    "The endpoint should contain host and port: %s", ep);
+  PADDLE_ENFORCE_EQ(
+      addr.size(), 2UL,
+      platform::errors::InvalidArgument(
+          "The endpoint should contain host and port, but got %s.", ep));
   std::string host = addr[0];
   int port = std::stoi(addr[1]);
 
@@ -32,27 +35,41 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep,
   char buffer[1024] = {0};
   int opt = 0;
   // creating socket fd
-  if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0)
-    PADDLE_THROW("create server fd failed");
-  if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)))
-    PADDLE_THROW("set socket opt failed");
+  if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0) {
+    PADDLE_THROW(
+        platform::errors::Unavailable("Create server file descriptor failed."));
+  }
+
+  if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt))) {
+    PADDLE_THROW(platform::errors::Unavailable("Set socket options failed."));
+  }
 
   address.sin_family = AF_INET;
   address.sin_addr.s_addr = INADDR_ANY;
   address.sin_port = htons(port);
 
-  if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0)
-    PADDLE_THROW("binding failed on ep: %s", ep);
+  if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) {
+    PADDLE_THROW(
+        platform::errors::Unavailable("Bind on endpoint %s failed.", ep));
+  }
+
   VLOG(3) << "listening on: " << ep;
-  if (listen(server_fd, 3) < 0) PADDLE_THROW("listen on server fd failed");
+  if (listen(server_fd, 3) < 0) {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Listen on server file descriptor failed."));
+  }
 
   if ((new_socket =
            accept(server_fd, reinterpret_cast<struct sockaddr *>(&address),
-                  reinterpret_cast<socklen_t *>(&addrlen))) < 0)
-    PADDLE_THROW("accept the new socket fd failed");
+                  reinterpret_cast<socklen_t *>(&addrlen))) < 0) {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Accept the new socket file descriptor failed."));
+  }
+
+  if (read(new_socket, buffer, 1024) < 0) {
+    PADDLE_THROW(platform::errors::Unavailable("Read from socket failed."));
+  }
 
-  if (read(new_socket, buffer, 1024) < 0)
-    PADDLE_THROW("reading the ncclUniqueId from socket failed");
   VLOG(3) << "recevived the ncclUniqueId";
   memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES);
 
@@ -63,8 +80,10 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep,
 void NCCLParallelContext::SendNCCLID(const std::string &ep,
                                      ncclUniqueId *nccl_id) {
   auto addr = paddle::string::Split(ep, ':');
-  PADDLE_ENFORCE_EQ(addr.size(), 2UL,
-                    "The endpoint should contain host and port: %s", ep);
+  PADDLE_ENFORCE_EQ(
+      addr.size(), 2UL,
+      platform::errors::InvalidArgument(
+          "The endpoint should contain host and port, but got %s.", ep));
   std::string host = addr[0];
   int port = std::stoi(addr[1]);
   // struct sockaddr_in address;
@@ -73,15 +92,17 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep,
   char buffer[1024] = {0};
 
   memcpy(buffer, nccl_id, NCCL_UNIQUE_ID_BYTES);
-  if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0)
-    PADDLE_THROW("create socket failed");
+  if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+    PADDLE_THROW(platform::errors::Unavailable("Create socket failed."));
+  }
 
   memset(&serv_addr, '0', sizeof(serv_addr));
   serv_addr.sin_family = AF_INET;
   serv_addr.sin_port = htons(port);
 
-  if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0)
-    PADDLE_THROW("invalied address: %s", ep);
+  if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0) {
+    PADDLE_THROW(platform::errors::Unavailable("Open address %s failed.", ep));
+  }
 
   int try_times = 0;
   while (true) {
-- 
GitLab


From 0b4bb023a7ef93669e9007f7e6241f24c6e98cb6 Mon Sep 17 00:00:00 2001
From: Leo Chen <chenqiuliang@baidu.com>
Date: Fri, 25 Sep 2020 21:35:40 +0800
Subject: [PATCH 236/261] Add static mode check on data() (#27495)

* add static check on data()

* follow comments

* fix ut
---
 python/paddle/fluid/data.py                          |  2 ++
 python/paddle/fluid/framework.py                     | 12 +++++++++++-
 python/paddle/fluid/layers/io.py                     |  2 ++
 python/paddle/fluid/tests/unittests/test_data.py     | 12 ++++++++++++
 .../tests/unittests/test_deprecated_decorator.py     |  2 ++
 python/paddle/static/input.py                        |  2 ++
 6 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/data.py b/python/paddle/fluid/data.py
index dc57e9f71ed..05ea66f5445 100644
--- a/python/paddle/fluid/data.py
+++ b/python/paddle/fluid/data.py
@@ -19,10 +19,12 @@ from paddle.fluid import core
 from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.data_feeder import check_dtype, check_type
 from ..utils import deprecated
+from paddle.fluid.framework import static_only
 
 __all__ = ['data']
 
 
+@static_only
 @deprecated(since="2.0.0", update_to="paddle.static.data")
 def data(name, shape, dtype='float32', lod_level=0):
     """
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 797b32f5d47..c7e66bb2877 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -217,7 +217,16 @@ def _dygraph_not_support_(func):
 def _dygraph_only_(func):
     def __impl__(*args, **kwargs):
         assert in_dygraph_mode(
-        ), "We Only support %s in dynamic mode, please call 'paddle.disable_static()' to enter dynamic mode." % func.__name__
+        ), "We only support '%s()' in dynamic graph mode, please call 'paddle.disable_static()' to enter dynamic graph mode." % func.__name__
+        return func(*args, **kwargs)
+
+    return __impl__
+
+
+def _static_only_(func):
+    def __impl__(*args, **kwargs):
+        assert not in_dygraph_mode(
+        ), "We only support '%s()' in static graph mode, please call 'paddle.enable_static()' to enter static graph mode." % func.__name__
         return func(*args, **kwargs)
 
     return __impl__
@@ -260,6 +269,7 @@ def deprecate_stat_dict(func):
 
 dygraph_not_support = wrap_decorator(_dygraph_not_support_)
 dygraph_only = wrap_decorator(_dygraph_only_)
+static_only = wrap_decorator(_static_only_)
 fake_interface_only = wrap_decorator(_fake_interface_only_)
 
 
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index d513d44acff..6b98dea4290 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -31,6 +31,7 @@ from ..unique_name import generate as unique_name
 
 import logging
 from ..data_feeder import check_dtype, check_type
+from paddle.fluid.framework import static_only
 
 __all__ = [
     'data', 'read_file', 'double_buffer', 'py_reader',
@@ -38,6 +39,7 @@ __all__ = [
 ]
 
 
+@static_only
 def data(name,
          shape,
          append_batch_size=True,
diff --git a/python/paddle/fluid/tests/unittests/test_data.py b/python/paddle/fluid/tests/unittests/test_data.py
index 8070148f8b3..98739f6e163 100644
--- a/python/paddle/fluid/tests/unittests/test_data.py
+++ b/python/paddle/fluid/tests/unittests/test_data.py
@@ -99,5 +99,17 @@ class TestApiStaticDataError(unittest.TestCase):
             self.assertRaises(TypeError, test_shape_type)
 
 
+class TestApiErrorWithDynamicMode(unittest.TestCase):
+    def test_error(self):
+        with program_guard(Program(), Program()):
+            paddle.disable_static()
+            self.assertRaises(AssertionError, fluid.data, 'a', [2, 25])
+            self.assertRaises(
+                AssertionError, fluid.layers.data, 'b', shape=[2, 25])
+            self.assertRaises(
+                AssertionError, paddle.static.data, 'c', shape=[2, 25])
+            paddle.enable_static()
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py
index 2a80e20d692..97b6594eb38 100755
--- a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py
+++ b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py
@@ -72,6 +72,7 @@ class TestDeprecatedDocorator(unittest.TestCase):
         test old fluid elementwise_mul api, it should fire Warinng function, 
         which insert the Warinng info on top of API's doc string.
         """
+        paddle.enable_static()
         # Initialization
         x = fluid.data(name='x', shape=[3, 2, 1], dtype='float32')
 
@@ -80,6 +81,7 @@ class TestDeprecatedDocorator(unittest.TestCase):
 
         # captured        
         captured = get_warning_index(fluid.data)
+        paddle.disable_static()
 
         # testting
         self.assertGreater(expected, captured)
diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py
index eb70320ea75..d7a3cfcdb92 100644
--- a/python/paddle/static/input.py
+++ b/python/paddle/static/input.py
@@ -19,10 +19,12 @@ from paddle.fluid import core, Variable
 from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.data_feeder import check_type
 from paddle.fluid.framework import convert_np_dtype_to_dtype_
+from paddle.fluid.framework import static_only
 
 __all__ = ['data', 'InputSpec']
 
 
+@static_only
 def data(name, shape, dtype=None, lod_level=0):
     """
     **Data Layer**
-- 
GitLab


From b38e4f2840ebc4ee0195ab8de789bf5b8d54ef37 Mon Sep 17 00:00:00 2001
From: LielinJiang <50691816+LielinJiang@users.noreply.github.com>
Date: Fri, 25 Sep 2020 22:08:40 +0800
Subject: [PATCH 237/261] Refine vision models (#27476)

* refine vision models
---
 python/paddle/hapi/callbacks.py               |  10 +-
 python/paddle/hapi/model.py                   |  60 +--
 python/paddle/metric/metrics.py               |   9 +-
 python/paddle/tests/CMakeLists.txt            |   4 -
 .../paddle/tests/dist_hapi_mnist_dynamic.py   |   2 +-
 python/paddle/tests/dist_hapi_mnist_static.py |   2 +-
 python/paddle/tests/test_model.py             |  57 ++-
 python/paddle/tests/test_pretrained_model.py  |  45 ++-
 python/paddle/tests/test_vision_models.py     |   2 +-
 python/paddle/vision/models/lenet.py          |  30 +-
 python/paddle/vision/models/mobilenetv1.py    | 189 ++++------
 python/paddle/vision/models/mobilenetv2.py    | 347 ++++++++----------
 python/paddle/vision/models/resnet.py         | 337 ++++++++---------
 python/paddle/vision/models/vgg.py            |  73 ++--
 14 files changed, 539 insertions(+), 628 deletions(-)

diff --git a/python/paddle/hapi/callbacks.py b/python/paddle/hapi/callbacks.py
index 7ed571fa9c6..69b7fedd72e 100644
--- a/python/paddle/hapi/callbacks.py
+++ b/python/paddle/hapi/callbacks.py
@@ -301,10 +301,11 @@ class ProgBarLogger(Callback):
 
             train_dataset = paddle.vision.datasets.MNIST(mode='train')
 
-            model = paddle.Model(paddle.vision.LeNet(classifier_activation=None),
+            lenet = paddle.vision.LeNet()
+            model = paddle.Model(lenet,
                 inputs, labels)
 
-            optim = paddle.optimizer.Adam(0.001)
+            optim = paddle.optimizer.Adam(0.001, parameters=lenet.parameters())
             model.prepare(optimizer=optim,
                         loss=paddle.nn.CrossEntropyLoss(),
                         metrics=paddle.metric.Accuracy())
@@ -436,10 +437,11 @@ class ModelCheckpoint(Callback):
 
             train_dataset = paddle.vision.datasets.MNIST(mode='train')
 
-            model = paddle.Model(paddle.vision.LeNet(classifier_activation=None),
+            lenet = paddle.vision.LeNet()
+            model = paddle.Model(lenet,
                 inputs, labels)
 
-            optim = paddle.optimizer.Adam(0.001)
+            optim = paddle.optimizer.Adam(0.001, parameters=lenet.parameters())
             model.prepare(optimizer=optim,
                         loss=paddle.nn.CrossEntropyLoss(),
                         metrics=paddle.metric.Accuracy())
diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py
index 53928ebed1b..1bfe8f07a2f 100644
--- a/python/paddle/hapi/model.py
+++ b/python/paddle/hapi/model.py
@@ -814,10 +814,9 @@ class Model(object):
         from paddle.static import InputSpec
 
         device = paddle.set_device('cpu') # or 'gpu'
-        # if use static graph, do not set
-        paddle.disable_static(device)
 
         net = nn.Sequential(
+            nn.Flatten(1),
             nn.Linear(784, 200),
             nn.Tanh(),
             nn.Linear(200, 10))
@@ -833,7 +832,7 @@ class Model(object):
                       paddle.nn.CrossEntropyLoss(),
                       paddle.metric.Accuracy())
         
-        data = paddle.vision.datasets.MNIST(mode='train', chw_format=False)
+        data = paddle.vision.datasets.MNIST(mode='train')
         model.fit(data, epochs=2, batch_size=32, verbose=1)
     """
 
@@ -850,7 +849,8 @@ class Model(object):
 
         if not isinstance(inputs, (list, dict, Input)):
             raise TypeError(
-                "'inputs' must be list or dict, and couldn't be None.")
+                "'inputs' must be list or dict in static graph mode")
+
         self._inputs = self._verify_spec(inputs, True)
         self._labels = self._verify_spec(labels)
 
@@ -885,7 +885,6 @@ class Model(object):
               from paddle.static import InputSpec
 
               device = paddle.set_device('cpu') # or 'gpu'
-              paddle.disable_static(device)
 
               net = nn.Sequential(
                   nn.Linear(784, 200),
@@ -930,7 +929,6 @@ class Model(object):
               from paddle.static import InputSpec
 
               device = paddle.set_device('cpu') # or 'gpu'
-              paddle.disable_static(device)
 
               net = nn.Sequential(
                   nn.Linear(784, 200),
@@ -970,9 +968,12 @@ class Model(object):
               import numpy as np
               import paddle
               import paddle.nn as nn
+              from paddle.static import InputSpec
 
               device = paddle.set_device('cpu') # or 'gpu'
-              paddle.disable_static(device)
+              
+              input = InputSpec([None, 784], 'float32', 'x')
+              label = InputSpec([None, 1], 'int64', 'label')
 
               net = nn.Sequential(
                   nn.Linear(784, 200),
@@ -980,7 +981,7 @@ class Model(object):
                   nn.Linear(200, 10),
                   nn.Softmax())
 
-              model = paddle.Model(net)
+              model = paddle.Model(net, input, label)
               model.prepare()
               data = np.random.random(size=(4,784)).astype(np.float32)
               out = model.test_batch([data])
@@ -1026,6 +1027,7 @@ class Model(object):
                     def __init__(self):
                         super(Mnist, self).__init__()
                         self.net = nn.Sequential(
+                            nn.Flatten(1),
                             nn.Linear(784, 200),
                             nn.Tanh(),
                             nn.Linear(200, 10),
@@ -1045,7 +1047,7 @@ class Model(object):
                 optim = paddle.optimizer.SGD(learning_rate=1e-3,
                     parameters=model.parameters())
                 model.prepare(optim, paddle.nn.CrossEntropyLoss())
-                data = paddle.vision.datasets.MNIST(mode='train', chw_format=False)
+                data = paddle.vision.datasets.MNIST(mode='train')
                 model.fit(data, epochs=1, batch_size=32, verbose=0)
                 model.save('checkpoint/test')  # save for training
                 model.save('inference_model', False)  # save for inference
@@ -1092,15 +1094,18 @@ class Model(object):
             
               import paddle
               import paddle.nn as nn
-              
+              from paddle.static import InputSpec
+
               device = paddle.set_device('cpu')
-              paddle.disable_static(device)
+
+              input = InputSpec([None, 784], 'float32', 'x')
 
               model = paddle.Model(nn.Sequential(
                   nn.Linear(784, 200),
                   nn.Tanh(),
                   nn.Linear(200, 10),
-                  nn.Softmax()))
+                  nn.Softmax()), input)
+
               model.save('checkpoint/test')
               model.load('checkpoint/test')
         """
@@ -1165,13 +1170,15 @@ class Model(object):
 
               import paddle
               import paddle.nn as nn
+              from paddle.static import InputSpec
 
-              paddle.disable_static()
-
+              input = InputSpec([None, 784], 'float32', 'x')
+              
               model = paddle.Model(nn.Sequential(
                   nn.Linear(784, 200),
                   nn.Tanh(),
-                  nn.Linear(200, 10)))
+                  nn.Linear(200, 10)), input)
+
               params = model.parameters()
         """
         return self._adapter.parameters()
@@ -1313,7 +1320,7 @@ class Model(object):
               label = InputSpec([None, 1], 'int64', 'label')
            
               model = paddle.Model(
-                  paddle.vision.models.LeNet(classifier_activation=None),
+                  paddle.vision.models.LeNet(),
                   input, label)
               optim = paddle.optimizer.Adam(
                   learning_rate=0.001, parameters=model.parameters())
@@ -1350,7 +1357,7 @@ class Model(object):
               label = InputSpec([None, 1], 'int64', 'label')
            
               model = paddle.Model(
-                  paddle.vision.models.LeNet(classifier_activation=None), input, label)
+                  paddle.vision.models.LeNet(), input, label)
               optim = paddle.optimizer.Adam(
                   learning_rate=0.001, parameters=model.parameters())
               model.prepare(
@@ -1483,7 +1490,7 @@ class Model(object):
 
             # imperative mode
             paddle.disable_static()
-            model = paddle.Model(paddle.vision.models.LeNet())
+            model = paddle.Model(paddle.vision.models.LeNet(), input, label)
             model.prepare(metrics=paddle.metric.Accuracy())
             result = model.evaluate(val_dataset, batch_size=64)
             print(result)
@@ -1580,19 +1587,20 @@ class Model(object):
 
             test_dataset = MnistDataset(mode='test', return_label=False)
 
-            # declarative mode
+            # imperative mode
             input = InputSpec([-1, 1, 28, 28], 'float32', 'image')
             model = paddle.Model(paddle.vision.models.LeNet(), input)
             model.prepare()
-
             result = model.predict(test_dataset, batch_size=64)
             print(len(result[0]), result[0][0].shape)
 
-            # imperative mode
+            # declarative mode
             device = paddle.set_device('cpu')
-            paddle.disable_static(device)
-            model = paddle.Model(paddle.vision.models.LeNet())
+            paddle.enable_static()
+            input = InputSpec([-1, 1, 28, 28], 'float32', 'image')
+            model = paddle.Model(paddle.vision.models.LeNet(), input)
             model.prepare()
+
             result = model.predict(test_dataset, batch_size=64)
             print(len(result[0]), result[0][0].shape)
         """
@@ -1832,15 +1840,11 @@ class Model(object):
 
               import paddle
               from paddle.static import InputSpec
-
-              dynamic = True
-              device = paddle.set_device('cpu')
-              paddle.disable_static(device) if dynamic else None
            
               input = InputSpec([None, 1, 28, 28], 'float32', 'image')
               label = InputSpec([None, 1], 'int64', 'label')
            
-              model = paddle.Model(paddle.vision.LeNet(classifier_activation=None),
+              model = paddle.Model(paddle.vision.LeNet(),
                   input, label)
               optim = paddle.optimizer.Adam(
                   learning_rate=0.001, parameters=model.parameters())
diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py
index 1cd65171ff0..f4a9b8c01d0 100644
--- a/python/paddle/metric/metrics.py
+++ b/python/paddle/metric/metrics.py
@@ -182,7 +182,6 @@ class Accuracy(Metric):
         import numpy as np
         import paddle
 
-        paddle.disable_static()
         x = paddle.to_tensor(np.array([
             [0.1, 0.2, 0.3, 0.4],
             [0.1, 0.4, 0.3, 0.2],
@@ -202,11 +201,13 @@ class Accuracy(Metric):
         .. code-block:: python
 
         import paddle
-
-        paddle.disable_static()
+        from paddle.static import InputSpec
+           
+        input = InputSpec([None, 1, 28, 28], 'float32', 'image')
+        label = InputSpec([None, 1], 'int64', 'label')
         train_dataset = paddle.vision.datasets.MNIST(mode='train')
 
-        model = paddle.Model(paddle.vision.LeNet(classifier_activation=None))
+        model = paddle.Model(paddle.vision.LeNet(), input, label)
         optim = paddle.optimizer.Adam(
             learning_rate=0.001, parameters=model.parameters())
         model.prepare(
diff --git a/python/paddle/tests/CMakeLists.txt b/python/paddle/tests/CMakeLists.txt
index 6fb73b08c11..e1bc65a5d15 100644
--- a/python/paddle/tests/CMakeLists.txt
+++ b/python/paddle/tests/CMakeLists.txt
@@ -8,10 +8,6 @@ foreach(TEST_OP ${DIST_TEST_OPS})
     list(REMOVE_ITEM TEST_OPS ${TEST_OP})
 endforeach()
 
-# disable test_pretrained_model and test_vision_models
-list(REMOVE_ITEM TEST_OPS test_pretrained_model)
-list(REMOVE_ITEM TEST_OPS test_vision_models)
-
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
diff --git a/python/paddle/tests/dist_hapi_mnist_dynamic.py b/python/paddle/tests/dist_hapi_mnist_dynamic.py
index 13d966bf38f..46d02789402 100644
--- a/python/paddle/tests/dist_hapi_mnist_dynamic.py
+++ b/python/paddle/tests/dist_hapi_mnist_dynamic.py
@@ -68,7 +68,7 @@ class TestDistTraning(unittest.TestCase):
         inputs = [Input(im_shape, 'float32', 'image')]
         labels = [Input([None, 1], 'int64', 'label')]
 
-        model = Model(LeNet(classifier_activation=None), inputs, labels)
+        model = Model(LeNet(), inputs, labels)
         optim = fluid.optimizer.Momentum(
             learning_rate=0.001, momentum=.9, parameter_list=model.parameters())
         model.prepare(optim, CrossEntropyLoss(), Accuracy())
diff --git a/python/paddle/tests/dist_hapi_mnist_static.py b/python/paddle/tests/dist_hapi_mnist_static.py
index 9d8e5f3652c..eab34a6dafb 100644
--- a/python/paddle/tests/dist_hapi_mnist_static.py
+++ b/python/paddle/tests/dist_hapi_mnist_static.py
@@ -67,7 +67,7 @@ class TestDistTraning(unittest.TestCase):
         inputs = [Input(im_shape, 'float32', 'image')]
         labels = [Input([None, 1], 'int64', 'label')]
 
-        model = Model(LeNet(classifier_activation=None), inputs, labels)
+        model = Model(LeNet(), inputs, labels)
         optim = fluid.optimizer.Momentum(
             learning_rate=0.001, momentum=.9, parameter_list=model.parameters())
         model.prepare(optim, CrossEntropyLoss(), Accuracy())
diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py
index c89cbbbfbda..5a3d837407b 100644
--- a/python/paddle/tests/test_model.py
+++ b/python/paddle/tests/test_model.py
@@ -40,7 +40,7 @@ from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTra
 
 
 class LeNetDygraph(paddle.nn.Layer):
-    def __init__(self, num_classes=10, classifier_activation=None):
+    def __init__(self, num_classes=10):
         super(LeNetDygraph, self).__init__()
         self.num_classes = num_classes
         self.features = Sequential(
@@ -55,8 +55,7 @@ class LeNetDygraph(paddle.nn.Layer):
 
         if num_classes > 0:
             self.fc = Sequential(
-                Linear(400, 120), Linear(120, 84), Linear(84, 10),
-                Softmax())  #Todo: accept any activation
+                Linear(400, 120), Linear(120, 84), Linear(84, 10))
 
     def forward(self, inputs):
         x = self.features(inputs)
@@ -67,6 +66,34 @@ class LeNetDygraph(paddle.nn.Layer):
         return x
 
 
+class LeNetDeclarative(fluid.dygraph.Layer):
+    def __init__(self, num_classes=10):
+        super(LeNetDeclarative, self).__init__()
+        self.num_classes = num_classes
+        self.features = Sequential(
+            Conv2d(
+                1, 6, 3, stride=1, padding=1),
+            ReLU(),
+            Pool2D(2, 'max', 2),
+            Conv2d(
+                6, 16, 5, stride=1, padding=0),
+            ReLU(),
+            Pool2D(2, 'max', 2))
+
+        if num_classes > 0:
+            self.fc = Sequential(
+                Linear(400, 120), Linear(120, 84), Linear(84, 10))
+
+    @declarative
+    def forward(self, inputs):
+        x = self.features(inputs)
+
+        if self.num_classes > 0:
+            x = fluid.layers.flatten(x, 1)
+            x = self.fc(x)
+        return x
+
+
 class MnistDataset(MNIST):
     def __init__(self, mode, return_label=True, sample_num=None):
         super(MnistDataset, self).__init__(mode=mode)
@@ -198,7 +225,7 @@ class TestModel(unittest.TestCase):
         paddle.manual_seed(seed)
         paddle.framework.random._manual_program_seed(seed)
 
-        net = LeNet(classifier_activation=None)
+        net = LeNet()
         optim_new = fluid.optimizer.Adam(
             learning_rate=0.001, parameter_list=net.parameters())
         model = Model(net, inputs=self.inputs, labels=self.labels)
@@ -287,14 +314,12 @@ class TestModel(unittest.TestCase):
 
 
 class MyModel(paddle.nn.Layer):
-    def __init__(self, classifier_activation='softmax'):
+    def __init__(self):
         super(MyModel, self).__init__()
         self._fc = Linear(20, 10)
-        self._act = Softmax()  #Todo: accept any activation
 
     def forward(self, x):
         y = self._fc(x)
-        y = self._act(y)
         return y
 
 
@@ -311,7 +336,7 @@ class TestModelFunction(unittest.TestCase):
         def get_expect():
             fluid.enable_dygraph(fluid.CPUPlace())
             self.set_seed()
-            m = MyModel(classifier_activation=None)
+            m = MyModel()
             optim = fluid.optimizer.SGD(learning_rate=0.001,
                                         parameter_list=m.parameters())
             m.train()
@@ -330,7 +355,7 @@ class TestModelFunction(unittest.TestCase):
             fluid.enable_dygraph(device) if dynamic else None
             self.set_seed()
 
-            net = MyModel(classifier_activation=None)
+            net = MyModel()
             optim2 = fluid.optimizer.SGD(learning_rate=0.001,
                                          parameter_list=net.parameters())
 
@@ -374,7 +399,7 @@ class TestModelFunction(unittest.TestCase):
         for dynamic in [True, False]:
             device = paddle.set_device('cpu')
             fluid.enable_dygraph(device) if dynamic else None
-            net = MyModel(classifier_activation=None)
+            net = MyModel()
             inputs = [InputSpec([None, 20], 'float32', 'x')]
             labels = [InputSpec([None, 1], 'int64', 'label')]
             optim = fluid.optimizer.SGD(learning_rate=0.001,
@@ -417,7 +442,7 @@ class TestModelFunction(unittest.TestCase):
         fluid.enable_dygraph(device)
         inputs = [InputSpec([None, 20], 'float32', 'x')]
         labels = [InputSpec([None, 1], 'int64', 'label')]
-        model = Model(MyModel(classifier_activation=None), inputs, labels)
+        model = Model(MyModel(), inputs, labels)
         optim = fluid.optimizer.SGD(learning_rate=0.001,
                                     parameter_list=model.parameters())
         model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))
@@ -426,7 +451,7 @@ class TestModelFunction(unittest.TestCase):
 
         inputs = [InputSpec([None, 20], 'float32', 'x')]
         labels = [InputSpec([None, 1], 'int64', 'label')]
-        model = Model(MyModel(classifier_activation=None), inputs, labels)
+        model = Model(MyModel(), inputs, labels)
         optim = fluid.optimizer.SGD(learning_rate=0.001,
                                     parameter_list=model.parameters())
         model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))
@@ -436,7 +461,7 @@ class TestModelFunction(unittest.TestCase):
     def test_static_save_dynamic_load(self):
         path = tempfile.mkdtemp()
 
-        net = MyModel(classifier_activation=None)
+        net = MyModel()
         inputs = [InputSpec([None, 20], 'float32', 'x')]
         labels = [InputSpec([None, 1], 'int64', 'label')]
         optim = fluid.optimizer.SGD(learning_rate=0.001,
@@ -448,7 +473,7 @@ class TestModelFunction(unittest.TestCase):
         device = paddle.set_device('cpu')
         fluid.enable_dygraph(device)  #if dynamic else None
 
-        net = MyModel(classifier_activation=None)
+        net = MyModel()
         inputs = [InputSpec([None, 20], 'float32', 'x')]
         labels = [InputSpec([None, 1], 'int64', 'label')]
         optim = fluid.optimizer.SGD(learning_rate=0.001,
@@ -557,7 +582,7 @@ class TestModelFunction(unittest.TestCase):
 
 class TestRaiseError(unittest.TestCase):
     def test_input_without_name(self):
-        net = MyModel(classifier_activation=None)
+        net = MyModel()
 
         inputs = [InputSpec([None, 10], 'float32')]
         labels = [InputSpec([None, 1], 'int64', 'label')]
@@ -567,7 +592,7 @@ class TestRaiseError(unittest.TestCase):
     def test_input_without_input_spec(self):
         for dynamic in [True, False]:
             paddle.disable_static() if dynamic else None
-            net = MyModel(classifier_activation=None)
+            net = MyModel()
             with self.assertRaises(TypeError):
                 model = Model(net)
             paddle.enable_static()
diff --git a/python/paddle/tests/test_pretrained_model.py b/python/paddle/tests/test_pretrained_model.py
index 641147d39e9..bf9c2a2ae06 100644
--- a/python/paddle/tests/test_pretrained_model.py
+++ b/python/paddle/tests/test_pretrained_model.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 import unittest
+import tempfile
+import shutil
 import numpy as np
 
 import paddle
@@ -23,27 +25,36 @@ import paddle.vision.models as models
 # test the predicted resutls of static graph and dynamic graph are equal
 # when used pretrained model
 class TestPretrainedModel(unittest.TestCase):
-    def infer(self, x, arch, dygraph=True):
-        if dygraph:
-            paddle.disable_static()
-
-        net = models.__dict__[arch](pretrained=True, classifier_activation=None)
-        inputs = [InputSpec([None, 3, 224, 224], 'float32', 'image')]
-        model = paddle.Model(network=net, inputs=inputs)
-        model.prepare()
-        res = model.test_batch(x)
-
-        if dygraph:
-            paddle.enable_static()
-        return res
+    def infer(self, arch):
+        path = tempfile.mkdtemp()
+        x = np.array(np.random.random((2, 3, 224, 224)), dtype=np.float32)
+        res = {}
+        for dygraph in [True, False]:
+            if not dygraph:
+                paddle.enable_static()
+
+            net = models.__dict__[arch]()
+            inputs = [InputSpec([None, 3, 224, 224], 'float32', 'image')]
+            model = paddle.Model(network=net, inputs=inputs)
+            model.prepare()
+
+            if dygraph:
+                model.save(path)
+                res['dygraph'] = model.test_batch(x)
+            else:
+                model.load(path)
+                res['static'] = model.test_batch(x)
+
+            if not dygraph:
+                paddle.disable_static()
+
+        shutil.rmtree(path)
+        np.testing.assert_allclose(res['dygraph'], res['static'])
 
     def test_models(self):
         arches = ['mobilenet_v1', 'mobilenet_v2', 'resnet18']
         for arch in arches:
-            x = np.array(np.random.random((2, 3, 224, 224)), dtype=np.float32)
-            y_dygraph = self.infer(x, arch)
-            y_static = self.infer(x, arch, dygraph=False)
-            np.testing.assert_allclose(y_dygraph, y_static)
+            self.infer(arch)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/tests/test_vision_models.py b/python/paddle/tests/test_vision_models.py
index 44f9ab53901..6489b02615b 100644
--- a/python/paddle/tests/test_vision_models.py
+++ b/python/paddle/tests/test_vision_models.py
@@ -36,7 +36,7 @@ class TestVisonModels(unittest.TestCase):
         model.test_batch(x)
 
     def test_mobilenetv2_pretrained(self):
-        self.models_infer('mobilenet_v2', pretrained=True)
+        self.models_infer('mobilenet_v2', pretrained=False)
 
     def test_mobilenetv1(self):
         self.models_infer('mobilenet_v1')
diff --git a/python/paddle/vision/models/lenet.py b/python/paddle/vision/models/lenet.py
index c2d4be7cda1..b30d5992f9a 100644
--- a/python/paddle/vision/models/lenet.py
+++ b/python/paddle/vision/models/lenet.py
@@ -12,20 +12,19 @@
 #See the License for the specific language governing permissions and
 #limitations under the License.
 
-import paddle.fluid as fluid
-from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential, Softmax
+import paddle
+import paddle.nn as nn
 
 __all__ = ['LeNet']
 
 
-class LeNet(fluid.dygraph.Layer):
+class LeNet(nn.Layer):
     """LeNet model from
     `"LeCun Y, Bottou L, Bengio Y, et al. Gradient-based learning applied to document recognition[J]. Proceedings of the IEEE, 1998, 86(11): 2278-2324.`_
 
     Args:
         num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer 
                             will not be defined. Default: 10.
-        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
 
     Examples:
         .. code-block:: python
@@ -35,28 +34,27 @@ class LeNet(fluid.dygraph.Layer):
             model = LeNet()
     """
 
-    def __init__(self, num_classes=10, classifier_activation='softmax'):
+    def __init__(self, num_classes=10):
         super(LeNet, self).__init__()
         self.num_classes = num_classes
-        self.features = Sequential(
-            Conv2d(
+        self.features = nn.Sequential(
+            nn.Conv2d(
                 1, 6, 3, stride=1, padding=1),
-            ReLU(),
-            Pool2D(2, 'max', 2),
-            Conv2d(
+            nn.ReLU(),
+            nn.MaxPool2d(2, 2),
+            nn.Conv2d(
                 6, 16, 5, stride=1, padding=0),
-            ReLU(),
-            Pool2D(2, 'max', 2))
+            nn.ReLU(),
+            nn.MaxPool2d(2, 2))
 
         if num_classes > 0:
-            self.fc = Sequential(
-                Linear(400, 120), Linear(120, 84), Linear(84, 10),
-                Softmax())  #Todo: accept any activation
+            self.fc = nn.Sequential(
+                nn.Linear(400, 120), nn.Linear(120, 84), nn.Linear(84, 10))
 
     def forward(self, inputs):
         x = self.features(inputs)
 
         if self.num_classes > 0:
-            x = fluid.layers.flatten(x, 1)
+            x = paddle.flatten(x, 1)
             x = self.fc(x)
         return x
diff --git a/python/paddle/vision/models/mobilenetv1.py b/python/paddle/vision/models/mobilenetv1.py
index 10defbf593d..39654122e3b 100644
--- a/python/paddle/vision/models/mobilenetv1.py
+++ b/python/paddle/vision/models/mobilenetv1.py
@@ -12,10 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.fluid as fluid
-from paddle.fluid.initializer import MSRA
-from paddle.fluid.param_attr import ParamAttr
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
+import paddle
+import paddle.nn as nn
 
 from paddle.utils.download import get_weights_path_from_url
 
@@ -24,85 +22,66 @@ __all__ = ['MobileNetV1', 'mobilenet_v1']
 model_urls = {
     'mobilenetv1_1.0':
     ('https://paddle-hapi.bj.bcebos.com/models/mobilenet_v1_x1.0.pdparams',
-     'bf0d25cb0bed1114d9dac9384ce2b4a6')
+     '42a154c2f26f86e7457d6daded114e8c')
 }
 
 
-class ConvBNLayer(fluid.dygraph.Layer):
+class ConvBNLayer(nn.Layer):
     def __init__(self,
-                 num_channels,
-                 filter_size,
-                 num_filters,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
                  stride,
                  padding,
-                 channels=None,
-                 num_groups=1,
-                 act='relu',
-                 use_cudnn=True,
-                 name=None):
+                 num_groups=1):
         super(ConvBNLayer, self).__init__()
 
-        self._conv = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
+        self._conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size,
             stride=stride,
             padding=padding,
             groups=num_groups,
-            act=None,
-            use_cudnn=use_cudnn,
-            param_attr=ParamAttr(
-                initializer=MSRA(), name=self.full_name() + "_weights"),
             bias_attr=False)
 
-        self._batch_norm = BatchNorm(
-            num_filters,
-            act=act,
-            param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"),
-            bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"),
-            moving_mean_name=self.full_name() + "_bn" + '_mean',
-            moving_variance_name=self.full_name() + "_bn" + '_variance')
+        self._norm_layer = nn.BatchNorm2d(out_channels)
+        self._act = nn.ReLU()
 
-    def forward(self, inputs):
-        y = self._conv(inputs)
-        y = self._batch_norm(y)
-        return y
+    def forward(self, x):
+        x = self._conv(x)
+        x = self._norm_layer(x)
+        x = self._act(x)
+        return x
 
 
-class DepthwiseSeparable(fluid.dygraph.Layer):
-    def __init__(self,
-                 num_channels,
-                 num_filters1,
-                 num_filters2,
-                 num_groups,
-                 stride,
-                 scale,
-                 name=None):
+class DepthwiseSeparable(nn.Layer):
+    def __init__(self, in_channels, out_channels1, out_channels2, num_groups,
+                 stride, scale):
         super(DepthwiseSeparable, self).__init__()
 
         self._depthwise_conv = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=int(num_filters1 * scale),
-            filter_size=3,
+            in_channels,
+            int(out_channels1 * scale),
+            kernel_size=3,
             stride=stride,
             padding=1,
-            num_groups=int(num_groups * scale),
-            use_cudnn=False)
+            num_groups=int(num_groups * scale))
 
         self._pointwise_conv = ConvBNLayer(
-            num_channels=int(num_filters1 * scale),
-            filter_size=1,
-            num_filters=int(num_filters2 * scale),
+            int(out_channels1 * scale),
+            int(out_channels2 * scale),
+            kernel_size=1,
             stride=1,
             padding=0)
 
-    def forward(self, inputs):
-        y = self._depthwise_conv(inputs)
-        y = self._pointwise_conv(y)
-        return y
+    def forward(self, x):
+        x = self._depthwise_conv(x)
+        x = self._pointwise_conv(x)
+        return x
 
 
-class MobileNetV1(fluid.dygraph.Layer):
+class MobileNetV1(nn.Layer):
     """MobileNetV1 model from
     `"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" <https://arxiv.org/abs/1704.04861>`_.
 
@@ -111,7 +90,6 @@ class MobileNetV1(fluid.dygraph.Layer):
         num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer 
                             will not be defined. Default: 1000.
         with_pool (bool): use pool before the last fc layer or not. Default: True.
-        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
 
     Examples:
         .. code-block:: python
@@ -121,11 +99,7 @@ class MobileNetV1(fluid.dygraph.Layer):
             model = MobileNetV1()
     """
 
-    def __init__(self,
-                 scale=1.0,
-                 num_classes=1000,
-                 with_pool=True,
-                 classifier_activation='softmax'):
+    def __init__(self, scale=1.0, num_classes=1000, with_pool=True):
         super(MobileNetV1, self).__init__()
         self.scale = scale
         self.dwsl = []
@@ -133,18 +107,17 @@ class MobileNetV1(fluid.dygraph.Layer):
         self.with_pool = with_pool
 
         self.conv1 = ConvBNLayer(
-            num_channels=3,
-            filter_size=3,
-            channels=3,
-            num_filters=int(32 * scale),
+            in_channels=3,
+            out_channels=int(32 * scale),
+            kernel_size=3,
             stride=2,
             padding=1)
 
         dws21 = self.add_sublayer(
             sublayer=DepthwiseSeparable(
-                num_channels=int(32 * scale),
-                num_filters1=32,
-                num_filters2=64,
+                in_channels=int(32 * scale),
+                out_channels1=32,
+                out_channels2=64,
                 num_groups=32,
                 stride=1,
                 scale=scale),
@@ -153,9 +126,9 @@ class MobileNetV1(fluid.dygraph.Layer):
 
         dws22 = self.add_sublayer(
             sublayer=DepthwiseSeparable(
-                num_channels=int(64 * scale),
-                num_filters1=64,
-                num_filters2=128,
+                in_channels=int(64 * scale),
+                out_channels1=64,
+                out_channels2=128,
                 num_groups=64,
                 stride=2,
                 scale=scale),
@@ -164,9 +137,9 @@ class MobileNetV1(fluid.dygraph.Layer):
 
         dws31 = self.add_sublayer(
             sublayer=DepthwiseSeparable(
-                num_channels=int(128 * scale),
-                num_filters1=128,
-                num_filters2=128,
+                in_channels=int(128 * scale),
+                out_channels1=128,
+                out_channels2=128,
                 num_groups=128,
                 stride=1,
                 scale=scale),
@@ -175,9 +148,9 @@ class MobileNetV1(fluid.dygraph.Layer):
 
         dws32 = self.add_sublayer(
             sublayer=DepthwiseSeparable(
-                num_channels=int(128 * scale),
-                num_filters1=128,
-                num_filters2=256,
+                in_channels=int(128 * scale),
+                out_channels1=128,
+                out_channels2=256,
                 num_groups=128,
                 stride=2,
                 scale=scale),
@@ -186,9 +159,9 @@ class MobileNetV1(fluid.dygraph.Layer):
 
         dws41 = self.add_sublayer(
             sublayer=DepthwiseSeparable(
-                num_channels=int(256 * scale),
-                num_filters1=256,
-                num_filters2=256,
+                in_channels=int(256 * scale),
+                out_channels1=256,
+                out_channels2=256,
                 num_groups=256,
                 stride=1,
                 scale=scale),
@@ -197,9 +170,9 @@ class MobileNetV1(fluid.dygraph.Layer):
 
         dws42 = self.add_sublayer(
             sublayer=DepthwiseSeparable(
-                num_channels=int(256 * scale),
-                num_filters1=256,
-                num_filters2=512,
+                in_channels=int(256 * scale),
+                out_channels1=256,
+                out_channels2=512,
                 num_groups=256,
                 stride=2,
                 scale=scale),
@@ -209,9 +182,9 @@ class MobileNetV1(fluid.dygraph.Layer):
         for i in range(5):
             tmp = self.add_sublayer(
                 sublayer=DepthwiseSeparable(
-                    num_channels=int(512 * scale),
-                    num_filters1=512,
-                    num_filters2=512,
+                    in_channels=int(512 * scale),
+                    out_channels1=512,
+                    out_channels2=512,
                     num_groups=512,
                     stride=1,
                     scale=scale),
@@ -220,9 +193,9 @@ class MobileNetV1(fluid.dygraph.Layer):
 
         dws56 = self.add_sublayer(
             sublayer=DepthwiseSeparable(
-                num_channels=int(512 * scale),
-                num_filters1=512,
-                num_filters2=1024,
+                in_channels=int(512 * scale),
+                out_channels1=512,
+                out_channels2=1024,
                 num_groups=512,
                 stride=2,
                 scale=scale),
@@ -231,9 +204,9 @@ class MobileNetV1(fluid.dygraph.Layer):
 
         dws6 = self.add_sublayer(
             sublayer=DepthwiseSeparable(
-                num_channels=int(1024 * scale),
-                num_filters1=1024,
-                num_filters2=1024,
+                in_channels=int(1024 * scale),
+                out_channels1=1024,
+                out_channels2=1024,
                 num_groups=1024,
                 stride=1,
                 scale=scale),
@@ -241,29 +214,23 @@ class MobileNetV1(fluid.dygraph.Layer):
         self.dwsl.append(dws6)
 
         if with_pool:
-            self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
-
-        if num_classes > -1:
-            self.out = Linear(
-                int(1024 * scale),
-                num_classes,
-                act=classifier_activation,
-                param_attr=ParamAttr(
-                    initializer=MSRA(), name=self.full_name() + "fc7_weights"),
-                bias_attr=ParamAttr(name="fc7_offset"))
-
-    def forward(self, inputs):
-        y = self.conv1(inputs)
+            self.pool2d_avg = nn.AdaptiveAvgPool2d(1)
+
+        if num_classes > 0:
+            self.fc = nn.Linear(int(1024 * scale), num_classes)
+
+    def forward(self, x):
+        x = self.conv1(x)
         for dws in self.dwsl:
-            y = dws(y)
+            x = dws(x)
 
         if self.with_pool:
-            y = self.pool2d_avg(y)
+            x = self.pool2d_avg(x)
 
         if self.num_classes > 0:
-            y = fluid.layers.reshape(y, shape=[-1, 1024])
-            y = self.out(y)
-        return y
+            x = paddle.flatten(x, 1)
+            x = self.fc(x)
+        return x
 
 
 def _mobilenet(arch, pretrained=False, **kwargs):
@@ -275,7 +242,7 @@ def _mobilenet(arch, pretrained=False, **kwargs):
                                                 model_urls[arch][1])
         assert weight_path.endswith(
             '.pdparams'), "suffix of weight must be .pdparams"
-        param, _ = fluid.load_dygraph(weight_path)
+        param, _ = paddle.load(weight_path)
         model.load_dict(param)
 
     return model
diff --git a/python/paddle/vision/models/mobilenetv2.py b/python/paddle/vision/models/mobilenetv2.py
index c08fb88f8bd..bab8b7b2b1b 100644
--- a/python/paddle/vision/models/mobilenetv2.py
+++ b/python/paddle/vision/models/mobilenetv2.py
@@ -14,9 +14,9 @@
 
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-from paddle.fluid.param_attr import ParamAttr
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
+
+import paddle.nn as nn
+import paddle.nn.functional as F
 
 from paddle.utils.download import get_weights_path_from_url
 
@@ -25,221 +25,166 @@ __all__ = ['MobileNetV2', 'mobilenet_v2']
 model_urls = {
     'mobilenetv2_1.0':
     ('https://paddle-hapi.bj.bcebos.com/models/mobilenet_v2_x1.0.pdparams',
-     '8ff74f291f72533f2a7956a4efff9d88')
+     '0340af0a901346c8d46f4529882fb63d')
 }
 
 
-class ConvBNLayer(fluid.dygraph.Layer):
-    def __init__(self,
-                 num_channels,
-                 filter_size,
-                 num_filters,
-                 stride,
-                 padding,
-                 channels=None,
-                 num_groups=1,
-                 use_cudnn=True):
-        super(ConvBNLayer, self).__init__()
-
-        tmp_param = ParamAttr(name=self.full_name() + "_weights")
-        self._conv = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=padding,
-            groups=num_groups,
-            act=None,
-            use_cudnn=use_cudnn,
-            param_attr=tmp_param,
-            bias_attr=False)
-
-        self._batch_norm = BatchNorm(
-            num_filters,
-            param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"),
-            bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"),
-            moving_mean_name=self.full_name() + "_bn" + '_mean',
-            moving_variance_name=self.full_name() + "_bn" + '_variance')
-
-    def forward(self, inputs, if_act=True):
-        y = self._conv(inputs)
-        y = self._batch_norm(y)
-        if if_act:
-            y = fluid.layers.relu6(y)
-        return y
-
-
-class InvertedResidualUnit(fluid.dygraph.Layer):
-    def __init__(
-            self,
-            num_channels,
-            num_in_filter,
-            num_filters,
-            stride,
-            filter_size,
-            padding,
-            expansion_factor, ):
-        super(InvertedResidualUnit, self).__init__()
-        num_expfilter = int(round(num_in_filter * expansion_factor))
-        self._expand_conv = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=num_expfilter,
-            filter_size=1,
-            stride=1,
-            padding=0,
-            num_groups=1)
-
-        self._bottleneck_conv = ConvBNLayer(
-            num_channels=num_expfilter,
-            num_filters=num_expfilter,
-            filter_size=filter_size,
-            stride=stride,
-            padding=padding,
-            num_groups=num_expfilter,
-            use_cudnn=False)
-
-        self._linear_conv = ConvBNLayer(
-            num_channels=num_expfilter,
-            num_filters=num_filters,
-            filter_size=1,
-            stride=1,
-            padding=0,
-            num_groups=1)
-
-    def forward(self, inputs, ifshortcut):
-        y = self._expand_conv(inputs, if_act=True)
-        y = self._bottleneck_conv(y, if_act=True)
-        y = self._linear_conv(y, if_act=False)
-        if ifshortcut:
-            y = fluid.layers.elementwise_add(inputs, y)
-        return y
-
-
-class InvresiBlocks(fluid.dygraph.Layer):
-    def __init__(self, in_c, t, c, n, s):
-        super(InvresiBlocks, self).__init__()
-
-        self._first_block = InvertedResidualUnit(
-            num_channels=in_c,
-            num_in_filter=in_c,
-            num_filters=c,
-            stride=s,
-            filter_size=3,
-            padding=1,
-            expansion_factor=t)
-
-        self._inv_blocks = []
-        for i in range(1, n):
-            tmp = self.add_sublayer(
-                sublayer=InvertedResidualUnit(
-                    num_channels=c,
-                    num_in_filter=c,
-                    num_filters=c,
-                    stride=1,
-                    filter_size=3,
-                    padding=1,
-                    expansion_factor=t),
-                name=self.full_name() + "_" + str(i + 1))
-            self._inv_blocks.append(tmp)
-
-    def forward(self, inputs):
-        y = self._first_block(inputs, ifshortcut=False)
-        for inv_block in self._inv_blocks:
-            y = inv_block(y, ifshortcut=True)
-        return y
-
-
-class MobileNetV2(fluid.dygraph.Layer):
-    """MobileNetV2 model from
-    `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" <https://arxiv.org/abs/1801.04381>`_.
-
-    Args:
-        scale (float): scale of channels in each layer. Default: 1.0.
-        num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer 
-                            will not be defined. Default: 1000.
-        with_pool (bool): use pool before the last fc layer or not. Default: True.
-        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
-
-    Examples:
-        .. code-block:: python
+def _make_divisible(v, divisor, min_value=None):
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
 
-            from paddle.vision.models import MobileNetV2
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
 
-            model = MobileNetV2()
-    """
 
+class ConvBNReLU(nn.Sequential):
+    def __init__(self,
+                 in_planes,
+                 out_planes,
+                 kernel_size=3,
+                 stride=1,
+                 groups=1,
+                 norm_layer=nn.BatchNorm2d):
+        padding = (kernel_size - 1) // 2
+
+        super(ConvBNReLU, self).__init__(
+            nn.Conv2d(
+                in_planes,
+                out_planes,
+                kernel_size,
+                stride,
+                padding,
+                groups=groups,
+                bias_attr=False),
+            norm_layer(out_planes),
+            nn.ReLU6())
+
+
+class InvertedResidual(nn.Layer):
     def __init__(self,
-                 scale=1.0,
-                 num_classes=1000,
-                 with_pool=True,
-                 classifier_activation='softmax'):
+                 inp,
+                 oup,
+                 stride,
+                 expand_ratio,
+                 norm_layer=nn.BatchNorm2d):
+        super(InvertedResidual, self).__init__()
+        self.stride = stride
+        assert stride in [1, 2]
+
+        hidden_dim = int(round(inp * expand_ratio))
+        self.use_res_connect = self.stride == 1 and inp == oup
+
+        layers = []
+        if expand_ratio != 1:
+            layers.append(
+                ConvBNReLU(
+                    inp, hidden_dim, kernel_size=1, norm_layer=norm_layer))
+        layers.extend([
+            ConvBNReLU(
+                hidden_dim,
+                hidden_dim,
+                stride=stride,
+                groups=hidden_dim,
+                norm_layer=norm_layer),
+            nn.Conv2d(
+                hidden_dim, oup, 1, 1, 0, bias_attr=False),
+            norm_layer(oup),
+        ])
+        self.conv = nn.Sequential(*layers)
+
+    def forward(self, x):
+        if self.use_res_connect:
+            return x + self.conv(x)
+        else:
+            return self.conv(x)
+
+
+class MobileNetV2(nn.Layer):
+    def __init__(self, scale=1.0, num_classes=1000, with_pool=True):
+        """MobileNetV2 model from
+        `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" <https://arxiv.org/abs/1801.04381>`_.
+
+        Args:
+            scale (float): scale of channels in each layer. Default: 1.0.
+            num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer 
+                                will not be defined. Default: 1000.
+            with_pool (bool): use pool before the last fc layer or not. Default: True.
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.vision.models import MobileNetV2
+
+                model = MobileNetV2()
+        """
         super(MobileNetV2, self).__init__()
-        self.scale = scale
         self.num_classes = num_classes
         self.with_pool = with_pool
+        input_channel = 32
+        last_channel = 1280
+
+        block = InvertedResidual
+        round_nearest = 8
+        norm_layer = nn.BatchNorm2d
+        inverted_residual_setting = [
+            [1, 16, 1, 1],
+            [6, 24, 2, 2],
+            [6, 32, 3, 2],
+            [6, 64, 4, 2],
+            [6, 96, 3, 1],
+            [6, 160, 3, 2],
+            [6, 320, 1, 1],
+        ]
 
-        bottleneck_params_list = [
-            (1, 16, 1, 1),
-            (6, 24, 2, 2),
-            (6, 32, 3, 2),
-            (6, 64, 4, 2),
-            (6, 96, 3, 1),
-            (6, 160, 3, 2),
-            (6, 320, 1, 1),
+        input_channel = _make_divisible(input_channel * scale, round_nearest)
+        self.last_channel = _make_divisible(last_channel * max(1.0, scale),
+                                            round_nearest)
+        features = [
+            ConvBNReLU(
+                3, input_channel, stride=2, norm_layer=norm_layer)
         ]
 
-        self._conv1 = ConvBNLayer(
-            num_channels=3,
-            num_filters=int(32 * scale),
-            filter_size=3,
-            stride=2,
-            padding=1)
-
-        self._invl = []
-        i = 1
-        in_c = int(32 * scale)
-        for layer_setting in bottleneck_params_list:
-            t, c, n, s = layer_setting
-            i += 1
-            tmp = self.add_sublayer(
-                sublayer=InvresiBlocks(
-                    in_c=in_c, t=t, c=int(c * scale), n=n, s=s),
-                name='conv' + str(i))
-            self._invl.append(tmp)
-            in_c = int(c * scale)
-
-        self._out_c = int(1280 * scale) if scale > 1.0 else 1280
-        self._conv9 = ConvBNLayer(
-            num_channels=in_c,
-            num_filters=self._out_c,
-            filter_size=1,
-            stride=1,
-            padding=0)
+        for t, c, n, s in inverted_residual_setting:
+            output_channel = _make_divisible(c * scale, round_nearest)
+            for i in range(n):
+                stride = s if i == 0 else 1
+                features.append(
+                    block(
+                        input_channel,
+                        output_channel,
+                        stride,
+                        expand_ratio=t,
+                        norm_layer=norm_layer))
+                input_channel = output_channel
+
+        features.append(
+            ConvBNReLU(
+                input_channel,
+                self.last_channel,
+                kernel_size=1,
+                norm_layer=norm_layer))
+
+        self.features = nn.Sequential(*features)
 
         if with_pool:
-            self._pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
-
-        if num_classes > 0:
-            tmp_param = ParamAttr(name=self.full_name() + "fc10_weights")
-            self._fc = Linear(
-                self._out_c,
-                num_classes,
-                act=classifier_activation,
-                param_attr=tmp_param,
-                bias_attr=ParamAttr(name="fc10_offset"))
-
-    def forward(self, inputs):
-        y = self._conv1(inputs, if_act=True)
-        for inv in self._invl:
-            y = inv(y)
-        y = self._conv9(y, if_act=True)
+            self.pool2d_avg = nn.AdaptiveAvgPool2d(1)
+
+        if self.num_classes > 0:
+            self.classifier = nn.Sequential(
+                nn.Dropout(0.2), nn.Linear(self.last_channel, num_classes))
+
+    def forward(self, x):
+        x = self.features(x)
 
         if self.with_pool:
-            y = self._pool2d_avg(y)
+            x = self.pool2d_avg(x)
+
         if self.num_classes > 0:
-            y = fluid.layers.reshape(y, shape=[-1, self._out_c])
-            y = self._fc(y)
-        return y
+            x = paddle.flatten(x, 1)
+            x = self.classifier(x)
+        return x
 
 
 def _mobilenet(arch, pretrained=False, **kwargs):
@@ -251,7 +196,7 @@ def _mobilenet(arch, pretrained=False, **kwargs):
                                                 model_urls[arch][1])
         assert weight_path.endswith(
             '.pdparams'), "suffix of weight must be .pdparams"
-        param, _ = fluid.load_dygraph(weight_path)
+        param, _ = paddle.load(weight_path)
         model.load_dict(param)
 
     return model
diff --git a/python/paddle/vision/models/resnet.py b/python/paddle/vision/models/resnet.py
index da0c3e9eb3f..f9e00aefd6b 100644
--- a/python/paddle/vision/models/resnet.py
+++ b/python/paddle/vision/models/resnet.py
@@ -15,11 +15,8 @@
 from __future__ import division
 from __future__ import print_function
 
-import math
-import paddle.fluid as fluid
-
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
-from paddle.fluid.dygraph.container import Sequential
+import paddle
+import paddle.nn as nn
 
 from paddle.utils.download import get_weights_path_from_url
 
@@ -29,143 +26,129 @@ __all__ = [
 
 model_urls = {
     'resnet18': ('https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams',
-                 '0ba53eea9bc970962d0ef96f7b94057e'),
+                 'cf548f46534aa3560945be4b95cd11c4'),
     'resnet34': ('https://paddle-hapi.bj.bcebos.com/models/resnet34.pdparams',
-                 '46bc9f7c3dd2e55b7866285bee91eff3'),
+                 '8d2275cf8706028345f78ac0e1d31969'),
     'resnet50': ('https://paddle-hapi.bj.bcebos.com/models/resnet50.pdparams',
-                 '5ce890a9ad386df17cf7fe2313dca0a1'),
+                 'ca6f485ee1ab0492d38f323885b0ad80'),
     'resnet101': ('https://paddle-hapi.bj.bcebos.com/models/resnet101.pdparams',
-                  'fb07a451df331e4b0bb861ed97c3a9b9'),
+                  '02f35f034ca3858e1e54d4036443c92d'),
     'resnet152': ('https://paddle-hapi.bj.bcebos.com/models/resnet152.pdparams',
-                  'f9c700f26d3644bb76ad2226ed5f5713'),
+                  '7ad16a2f1e7333859ff986138630fd7a'),
 }
 
 
-class ConvBNLayer(fluid.dygraph.Layer):
+class BasicBlock(nn.Layer):
+    expansion = 1
+
     def __init__(self,
-                 num_channels,
-                 num_filters,
-                 filter_size,
+                 inplanes,
+                 planes,
                  stride=1,
+                 downsample=None,
                  groups=1,
-                 act=None):
-        super(ConvBNLayer, self).__init__()
-
-        self._conv = Conv2D(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=filter_size,
-            stride=stride,
-            padding=(filter_size - 1) // 2,
-            groups=groups,
-            act=None,
-            bias_attr=False)
-
-        self._batch_norm = BatchNorm(num_filters, act=act)
-
-    def forward(self, inputs):
-        x = self._conv(inputs)
-        x = self._batch_norm(x)
-
-        return x
-
-
-class BasicBlock(fluid.dygraph.Layer):
-    """residual block of resnet18 and resnet34
-    """
-    expansion = 1
-
-    def __init__(self, num_channels, num_filters, stride, shortcut=True):
+                 base_width=64,
+                 dilation=1,
+                 norm_layer=None):
         super(BasicBlock, self).__init__()
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
 
-        self.conv0 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=3,
-            act='relu')
-        self.conv1 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters,
-            filter_size=3,
-            stride=stride,
-            act='relu')
+        if dilation > 1:
+            raise NotImplementedError(
+                "Dilation > 1 not supported in BasicBlock")
 
-        if not shortcut:
-            self.short = ConvBNLayer(
-                num_channels=num_channels,
-                num_filters=num_filters,
-                filter_size=1,
-                stride=stride)
+        self.conv1 = nn.Conv2d(
+            inplanes, planes, 3, padding=1, stride=stride, bias_attr=False)
+        self.bn1 = norm_layer(planes)
+        self.relu = nn.ReLU()
+        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias_attr=False)
+        self.bn2 = norm_layer(planes)
+        self.downsample = downsample
+        self.stride = stride
 
-        self.shortcut = shortcut
+    def forward(self, x):
+        identity = x
 
-    def forward(self, inputs):
-        y = self.conv0(inputs)
-        conv1 = self.conv1(y)
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
 
-        if self.shortcut:
-            short = inputs
-        else:
-            short = self.short(inputs)
+        out = self.conv2(out)
+        out = self.bn2(out)
 
-        y = short + conv1
+        if self.downsample is not None:
+            identity = self.downsample(x)
 
-        return fluid.layers.relu(y)
+        out += identity
+        out = self.relu(out)
 
+        return out
 
-class BottleneckBlock(fluid.dygraph.Layer):
-    """residual block of resnet50, resnet101 amd resnet152
-    """
+
+class BottleneckBlock(nn.Layer):
 
     expansion = 4
 
-    def __init__(self, num_channels, num_filters, stride, shortcut=True):
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 downsample=None,
+                 groups=1,
+                 base_width=64,
+                 dilation=1,
+                 norm_layer=None):
         super(BottleneckBlock, self).__init__()
-
-        self.conv0 = ConvBNLayer(
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=1,
-            act='relu')
-        self.conv1 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters,
-            filter_size=3,
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        width = int(planes * (base_width / 64.)) * groups
+
+        self.conv1 = nn.Conv2d(inplanes, width, 1, bias_attr=False)
+        self.bn1 = norm_layer(width)
+
+        self.conv2 = nn.Conv2d(
+            width,
+            width,
+            3,
+            padding=dilation,
             stride=stride,
-            act='relu')
-        self.conv2 = ConvBNLayer(
-            num_channels=num_filters,
-            num_filters=num_filters * self.expansion,
-            filter_size=1,
-            act=None)
+            groups=groups,
+            dilation=dilation,
+            bias_attr=False)
+        self.bn2 = norm_layer(width)
 
-        if not shortcut:
-            self.short = ConvBNLayer(
-                num_channels=num_channels,
-                num_filters=num_filters * self.expansion,
-                filter_size=1,
-                stride=stride)
+        self.conv3 = nn.Conv2d(
+            width, planes * self.expansion, 1, bias_attr=False)
+        self.bn3 = norm_layer(planes * self.expansion)
+        self.relu = nn.ReLU()
+        self.downsample = downsample
+        self.stride = stride
 
-        self.shortcut = shortcut
+    def forward(self, x):
+        identity = x
 
-        self._num_channels_out = num_filters * self.expansion
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
 
-    def forward(self, inputs):
-        x = self.conv0(inputs)
-        conv1 = self.conv1(x)
-        conv2 = self.conv2(conv1)
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
 
-        if self.shortcut:
-            short = inputs
-        else:
-            short = self.short(inputs)
+        out = self.conv3(out)
+        out = self.bn3(out)
 
-        x = fluid.layers.elementwise_add(x=short, y=conv2)
+        if self.downsample is not None:
+            identity = self.downsample(x)
 
-        return fluid.layers.relu(x)
+        out += identity
+        out = self.relu(out)
 
+        return out
 
-class ResNet(fluid.dygraph.Layer):
+
+class ResNet(nn.Layer):
     """ResNet model from
     `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
 
@@ -175,7 +158,6 @@ class ResNet(fluid.dygraph.Layer):
         num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer 
                             will not be defined. Default: 1000.
         with_pool (bool): use pool before the last fc layer or not. Default: True.
-        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
 
     Examples:
         .. code-block:: python
@@ -189,82 +171,87 @@ class ResNet(fluid.dygraph.Layer):
 
     """
 
-    def __init__(self,
-                 Block,
-                 depth=50,
-                 num_classes=1000,
-                 with_pool=True,
-                 classifier_activation='softmax'):
+    def __init__(self, block, depth, num_classes=1000, with_pool=True):
         super(ResNet, self).__init__()
-
-        self.num_classes = num_classes
-        self.with_pool = with_pool
-
-        layer_config = {
+        layer_cfg = {
             18: [2, 2, 2, 2],
             34: [3, 4, 6, 3],
             50: [3, 4, 6, 3],
             101: [3, 4, 23, 3],
-            152: [3, 8, 36, 3],
+            152: [3, 8, 36, 3]
         }
-        assert depth in layer_config.keys(), \
-            "supported depth are {} but input layer is {}".format(
-                layer_config.keys(), depth)
-
-        layers = layer_config[depth]
-
-        in_channels = 64
-        out_channels = [64, 128, 256, 512]
-
-        self.conv = ConvBNLayer(
-            num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
-        self.pool = Pool2D(
-            pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
-
-        self.layers = []
-        for idx, num_blocks in enumerate(layers):
-            blocks = []
-            shortcut = False
-            for b in range(num_blocks):
-                if b == 1:
-                    in_channels = out_channels[idx] * Block.expansion
-                block = Block(
-                    num_channels=in_channels,
-                    num_filters=out_channels[idx],
-                    stride=2 if b == 0 and idx != 0 else 1,
-                    shortcut=shortcut)
-                blocks.append(block)
-                shortcut = True
-            layer = self.add_sublayer("layer_{}".format(idx),
-                                      Sequential(*blocks))
-            self.layers.append(layer)
+        layers = layer_cfg[depth]
+        self.num_classes = num_classes
+        self.with_pool = with_pool
+        self._norm_layer = nn.BatchNorm2d
+
+        self.inplanes = 64
+        self.dilation = 1
 
+        self.conv1 = nn.Conv2d(
+            3,
+            self.inplanes,
+            kernel_size=7,
+            stride=2,
+            padding=3,
+            bias_attr=False)
+        self.bn1 = self._norm_layer(self.inplanes)
+        self.relu = nn.ReLU()
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
+        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
         if with_pool:
-            self.global_pool = Pool2D(
-                pool_size=7, pool_type='avg', global_pooling=True)
+            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
 
         if num_classes > 0:
-            stdv = 1.0 / math.sqrt(out_channels[-1] * Block.expansion * 1.0)
-            self.fc_input_dim = out_channels[-1] * Block.expansion * 1 * 1
-            self.fc = Linear(
-                self.fc_input_dim,
-                num_classes,
-                act=classifier_activation,
-                param_attr=fluid.param_attr.ParamAttr(
-                    initializer=fluid.initializer.Uniform(-stdv, stdv)))
-
-    def forward(self, inputs):
-        x = self.conv(inputs)
-        x = self.pool(x)
-        for layer in self.layers:
-            x = layer(x)
-
-        if self.with_pool:
-            x = self.global_pool(x)
-
-        if self.num_classes > -1:
-            x = fluid.layers.reshape(x, shape=[-1, self.fc_input_dim])
+            self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
+        norm_layer = self._norm_layer
+        downsample = None
+        previous_dilation = self.dilation
+        if dilate:
+            self.dilation *= stride
+            stride = 1
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(
+                    self.inplanes,
+                    planes * block.expansion,
+                    1,
+                    stride=stride,
+                    bias_attr=False),
+                norm_layer(planes * block.expansion), )
+
+        layers = []
+        layers.append(
+            block(self.inplanes, planes, stride, downsample, 1, 64,
+                  previous_dilation, norm_layer))
+        self.inplanes = planes * block.expansion
+        for _ in range(1, blocks):
+            layers.append(block(self.inplanes, planes, norm_layer=norm_layer))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+
+        if self.with_pool > 0:
+            x = self.avgpool(x)
+
+        if self.num_classes > 0:
+            x = paddle.flatten(x, 1)
             x = self.fc(x)
+
         return x
 
 
@@ -277,7 +264,7 @@ def _resnet(arch, Block, depth, pretrained, **kwargs):
                                                 model_urls[arch][1])
         assert weight_path.endswith(
             '.pdparams'), "suffix of weight must be .pdparams"
-        param, _ = fluid.load_dygraph(weight_path)
+        param, _ = paddle.load(weight_path)
         model.set_dict(param)
 
     return model
diff --git a/python/paddle/vision/models/vgg.py b/python/paddle/vision/models/vgg.py
index 8bfacda2476..d11845b6616 100644
--- a/python/paddle/vision/models/vgg.py
+++ b/python/paddle/vision/models/vgg.py
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.fluid as fluid
-from paddle.nn import Conv2d, Pool2D, BatchNorm, Linear, ReLU, Softmax
-from paddle.fluid.dygraph.container import Sequential
+import paddle
+import paddle.nn as nn
 
 from paddle.utils.download import get_weights_path_from_url
 
@@ -28,39 +27,18 @@ __all__ = [
 
 model_urls = {
     'vgg16': ('https://paddle-hapi.bj.bcebos.com/models/vgg16.pdparams',
-              'c788f453a3b999063e8da043456281ee')
+              '89bbffc0f87d260be9b8cdc169c991c4')
 }
 
 
-class Classifier(fluid.dygraph.Layer):
-    def __init__(self, num_classes, classifier_activation='softmax'):
-        super(Classifier, self).__init__()
-        self.linear1 = Linear(512 * 7 * 7, 4096)
-        self.linear2 = Linear(4096, 4096)
-        self.linear3 = Linear(4096, num_classes)
-        self.act = Softmax()  #Todo: accept any activation
-
-    def forward(self, x):
-        x = self.linear1(x)
-        x = fluid.layers.relu(x)
-        x = fluid.layers.dropout(x, 0.5)
-        x = self.linear2(x)
-        x = fluid.layers.relu(x)
-        x = fluid.layers.dropout(x, 0.5)
-        x = self.linear3(x)
-        out = self.act(x)
-        return out
-
-
-class VGG(fluid.dygraph.Layer):
+class VGG(nn.Layer):
     """VGG model from
     `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_
 
     Args:
-        features (fluid.dygraph.Layer): vgg features create by function make_layers.
+        features (nn.Layer): vgg features create by function make_layers.
         num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer 
                             will not be defined. Default: 1000.
-        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
 
     Examples:
         .. code-block:: python
@@ -76,44 +54,41 @@ class VGG(fluid.dygraph.Layer):
 
     """
 
-    def __init__(self,
-                 features,
-                 num_classes=1000,
-                 classifier_activation='softmax'):
+    def __init__(self, features, num_classes=1000):
         super(VGG, self).__init__()
         self.features = features
-        self.num_classes = num_classes
-
-        if num_classes > 0:
-            classifier = Classifier(num_classes, classifier_activation)
-            self.classifier = self.add_sublayer("classifier",
-                                                Sequential(classifier))
+        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
+        self.classifier = nn.Sequential(
+            nn.Linear(512 * 7 * 7, 4096),
+            nn.ReLU(),
+            nn.Dropout(),
+            nn.Linear(4096, 4096),
+            nn.ReLU(),
+            nn.Dropout(),
+            nn.Linear(4096, num_classes), )
 
     def forward(self, x):
         x = self.features(x)
-
-        if self.num_classes > 0:
-            x = fluid.layers.flatten(x, 1)
-            x = self.classifier(x)
+        x = self.avgpool(x)
+        x = paddle.flatten(x, 1)
+        x = self.classifier(x)
         return x
 
 
 def make_layers(cfg, batch_norm=False):
     layers = []
     in_channels = 3
-
     for v in cfg:
         if v == 'M':
-            layers += [Pool2D(pool_size=2, pool_stride=2)]
+            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
         else:
+            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
             if batch_norm:
-                conv2d = Conv2d(in_channels, v, kernel_size=3, padding=1)
-                layers += [conv2d, BatchNorm(v), ReLU()]
+                layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU()]
             else:
-                conv2d = Conv2d(in_channels, v, kernel_size=3, padding=1)
-                layers += [conv2d, ReLU()]
+                layers += [conv2d, nn.ReLU()]
             in_channels = v
-    return Sequential(*layers)
+    return nn.Sequential(*layers)
 
 
 cfgs = {
@@ -144,7 +119,7 @@ def _vgg(arch, cfg, batch_norm, pretrained, **kwargs):
                                                 model_urls[arch][1])
         assert weight_path.endswith(
             '.pdparams'), "suffix of weight must be .pdparams"
-        param, _ = fluid.load_dygraph(weight_path)
+        param, _ = paddle.load(weight_path)
         model.load_dict(param)
 
     return model
-- 
GitLab


From b0ee1405f74e0c598f84694f91cfd331a1ab10ca Mon Sep 17 00:00:00 2001
From: "joanna.wozna.intel" <joanna.wozna@intel.com>
Date: Sat, 26 Sep 2020 07:32:39 +0200
Subject: [PATCH 238/261] Add conv2d bfloat16 support (#27325)

---
 .../framework/ir/graph_pattern_detector.cc    |   3 +-
 paddle/fluid/operators/conv_op.cc             |   8 +-
 .../fluid/operators/mkldnn/conv_mkldnn_op.cc  |  54 +++--
 .../operators/mkldnn/dequantize_mkldnn_op.cc  |   3 +-
 paddle/fluid/pybind/tensor_py.h               |  18 ++
 python/paddle/fluid/framework.py              |   4 +-
 .../mkldnn/test_conv2d_bf16_mkldnn_op.py      | 208 ++++++++++++++++++
 .../mkldnn/test_conv2d_int8_mkldnn_op.py      |   4 +-
 .../mkldnn/test_dequantize_mkldnn_op.py       |  13 +-
 .../paddle/fluid/tests/unittests/op_test.py   |  36 ++-
 10 files changed, 323 insertions(+), 28 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py

diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 9c1eaa99a3c..96952e20c21 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1894,8 +1894,7 @@ PDNode *patterns::QuantizePlacement::operator()(
 
 PDNode *patterns::Bfloat16Placement::operator()(
     const std::unordered_set<std::string> &bfloat16_enabled_op_types) {
-  std::unordered_set<std::string> supported_op_types =
-      std::unordered_set<std::string>();
+  std::unordered_set<std::string> supported_op_types{"conv2d"};
   if (!bfloat16_enabled_op_types.empty()) {
     supported_op_types = bfloat16_enabled_op_types;
   }
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index bf97b9d03c4..ef8a2b38f20 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -166,7 +166,8 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
 #endif
 
   if (input_data_type != framework::proto::VarType::INT8 &&
-      input_data_type != framework::proto::VarType::UINT8) {
+      input_data_type != framework::proto::VarType::UINT8 &&
+      input_data_type != framework::proto::VarType::BF16) {
     auto filter_data_type = ctx.Input<Tensor>("Filter")->type();
     PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
                       platform::errors::InvalidArgument(
@@ -455,6 +456,11 @@ void Conv3DOpMaker::Make() {
   AddAttr<bool>("use_mkldnn",
                 "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
+  AddAttr<std::string>(
+      "mkldnn_data_type",
+      "(string, default \"float32\"). Data type of mkldnn kernel")
+      .SetDefault("float32")
+      .InEnum({"float32", "int8", "bfloat16"});
   AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
   AddAttr<std::string>("fuse_activation",
diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index a6cda154e55..7a4e11091fd 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -55,12 +55,12 @@ inline MKLDNNMemoryFormat GetWeightsFormat(const MKLDNNMemoryFormat format,
   }
 }
 
-static mkldnn::memory::data_type GetDstType(bool is_int8,
+static mkldnn::memory::data_type GetDstType(bool is_int8, bool is_bfloat16,
                                             bool force_fp32_output,
                                             std::string fuse_activation,
                                             bool fuse_residual_conn,
                                             const Tensor* residual_param) {
-  auto dst_dt = mkldnn::memory::data_type::f32;  // uint8_t, int8_t, float
+  auto dst_dt = mkldnn::memory::data_type::f32;
   if (is_int8) {
     dst_dt = (fuse_activation == "relu" || fuse_activation == "relu6")
                  ? mkldnn::memory::data_type::u8
@@ -72,6 +72,13 @@ static mkldnn::memory::data_type GetDstType(bool is_int8,
       auto residual_dt = framework::ToMKLDNNDataType(residual_param->type());
       if (dst_dt != residual_dt) dst_dt = residual_dt;
     }
+  } else {
+    if (!force_fp32_output && is_bfloat16) {
+      dst_dt = mkldnn::memory::data_type::bf16;
+      if (fuse_residual_conn && residual_param) {
+        dst_dt = framework::ToMKLDNNDataType(residual_param->type());
+      }
+    }
   }
   return dst_dt;
 }
@@ -224,12 +231,15 @@ class ConvMKLDNNHandlerT
               src_tz.size(), chosen_memory_format);
         }
       }
-
-      const auto src_md = platform::MKLDNNMemDesc(
-          src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
-      const auto weights_md =
-          platform::MKLDNNMemDesc(weights_tz, platform::MKLDNNGetDataType<T>(),
-                                  MKLDNNMemoryFormat::any);
+      auto data_type = mkldnn::memory::data_type::f32;
+      if (ctx.Attr<std::string>("mkldnn_data_type") == "bfloat16" ||
+          std::is_same<T_out, platform::bfloat16>::value)
+        data_type = mkldnn::memory::data_type::bf16;
+
+      const auto src_md =
+          platform::MKLDNNMemDesc(src_tz, data_type, chosen_memory_format);
+      const auto weights_md = platform::MKLDNNMemDesc(weights_tz, data_type,
+                                                      MKLDNNMemoryFormat::any);
       const auto dst_md = platform::MKLDNNMemDesc(
           dst_tz, platform::MKLDNNGetDataType<T_out>(), chosen_memory_format);
 
@@ -241,8 +251,8 @@ class ConvMKLDNNHandlerT
 
       if (bias) {
         auto bias_tz = framework::vectorize(bias->dims());
-        auto bias_md = platform::MKLDNNMemDesc(
-            bias_tz, platform::MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::x);
+        auto bias_md =
+            platform::MKLDNNMemDesc(bias_tz, data_type, MKLDNNMemoryFormat::x);
 
         this->AcquireForwardPrimitiveDescriptor(
             conv_attr, fwd_prop_kind, dnnl::algorithm::convolution_direct,
@@ -384,15 +394,21 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                           "Operator DNNL Conv must use CPUPlace"));
     bool is_INT8 =
         std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value;
+    bool is_BFLOAT16 = ctx.Attr<std::string>("mkldnn_data_type") == "bfloat16";
+    auto residual_param = ctx.Input<Tensor>("ResidualData");
+    bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
+    std::string fuse_activation = ctx.Attr<std::string>("fuse_activation");
+    bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
+    auto dst_dt =
+        GetDstType(is_INT8, is_BFLOAT16, force_fp32_output, fuse_activation,
+                   fuse_residual_conn, residual_param);
     if (!is_INT8) {
-      ComputeFP32<float>(ctx);
+      if (dst_dt == mkldnn::memory::data_type::f32) {
+        ComputeFP32<float>(ctx);
+      } else if (dst_dt == mkldnn::memory::data_type::bf16) {
+        ComputeFP32<platform::bfloat16>(ctx);
+      }
     } else {
-      std::string fuse_activation = ctx.Attr<std::string>("fuse_activation");
-      bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
-      bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
-      auto residual_param = ctx.Input<Tensor>("ResidualData");
-      auto dst_dt = GetDstType(true, force_fp32_output, fuse_activation,
-                               fuse_residual_conn, residual_param);
       if (dst_dt == mkldnn::memory::data_type::f32) {
         ComputeINT8<float>(ctx);
       } else if (dst_dt == mkldnn::memory::data_type::u8) {
@@ -1103,6 +1119,10 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
                                     ops::kConvMKLDNNFP32,
                                     ops::ConvMKLDNNOpKernel<float, float>);
 
+REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(
+    conv2d, MKLDNN, ::paddle::platform::CPUPlace, BF16, ops::kConvMKLDNNFP32,
+    ops::ConvMKLDNNOpKernel<paddle::platform::bfloat16, float>);
+
 REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
                                     ::paddle::platform::CPUPlace, U8,
                                     ops::kConvMKLDNNINT8,
diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
index 540642c7140..70d4c34d9c5 100644
--- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
@@ -110,4 +110,5 @@ class DeQuantOpKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 
 REGISTER_OP_KERNEL(dequantize, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::DeQuantOpKernel<uint8_t>, ops::DeQuantOpKernel<int8_t>);
+                   ops::DeQuantOpKernel<uint8_t>, ops::DeQuantOpKernel<int8_t>,
+                   ops::DeQuantOpKernel<paddle::platform::bfloat16>);
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index 5ee15073267..142ab2bb9d7 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -41,6 +41,7 @@ namespace detail {
 // import numpy as np
 // print np.dtype(np.float16).num  # 23
 constexpr int NPY_FLOAT16_ = 23;
+constexpr int NPY_UINT16_ = 4;
 
 // Note: Since float16 is not a builtin type in C++, we register
 // paddle::platform::float16 as numpy.float16.
@@ -60,6 +61,23 @@ struct npy_format_descriptor<paddle::platform::float16> {
   static PYBIND11_DESCR name() { return _("float16"); }
 };
 
+// Note: Since bfloat16 is not a builtin type in C++ and in numpy,
+// we register paddle::platform::bfloat16 as numpy.uint16.
+template <>
+struct npy_format_descriptor<paddle::platform::bfloat16> {
+  static py::dtype dtype() {
+    handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_UINT16_);
+    return reinterpret_borrow<py::dtype>(ptr);
+  }
+  static std::string format() {
+    // Note: "H" represents UINT16.
+    // Details at:
+    // https://docs.python.org/3/library/struct.html#format-characters.
+    return "H";
+  }
+  static PYBIND11_DESCR name() { return _("bfloat16"); }
+};
+
 }  // namespace detail
 }  // namespace pybind11
 
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index c7e66bb2877..b4cea6761dc 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -613,7 +613,9 @@ def convert_np_dtype_to_dtype_(np_dtype):
     elif dtype == np.bool:
         return core.VarDesc.VarType.BOOL
     elif dtype == np.uint16:
-        return core.VarDesc.VarType.INT16
+        # since there is still no support for bfloat16 in NumPy,
+        # uint16 is used for casting bfloat16
+        return core.VarDesc.VarType.BF16
     elif dtype == np.uint8:
         return core.VarDesc.VarType.UINT8
     elif dtype == np.int8:
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py
new file mode 100644
index 00000000000..0ac33383fb2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py
@@ -0,0 +1,208 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import struct
+
+import paddle.fluid.core as core
+from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
+from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2dOp
+
+
+def conv2d_forward_refer(input, filter, group, conv_param):
+    out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group,
+                                                          conv_param)
+    return out
+
+
+def conv2d_residual_naive(out, residual):
+    assert out.shape == residual.shape
+    out = np.add(out, residual)
+    return out
+
+
+class TestConv2dBf16Op(TestConv2dOp):
+    def setUp(self):
+        self.op_type = "conv2d"
+        self.use_cudnn = False
+        self.exhaustive_search = False
+        self.use_cuda = False
+        self.use_mkldnn = True
+        self.weight_type = np.float32
+        self.input_type = np.float32
+        self.use_mkldnn = True
+        self.mkldnn_data_type = "bfloat16"
+        self.force_fp32_output = False
+        self.init_group()
+        self.init_dilation()
+        self.init_test_case()
+        self.init_fuse_relu()
+        self.init_fuse_residual()
+        self.init_data_type()
+        self.init_force_fp32_output()
+
+        conv2d_param = {
+            'stride': self.stride,
+            'pad': self.pad,
+            'dilation': self.dilations
+        }
+        self.input = np.random.random(self.input_size).astype(np.float32)
+        self.filter = np.random.random(self.filter_size).astype(np.float32)
+        conv_out, _, _, _, _ = conv2d_forward_naive(self.input, self.filter,
+                                                    self.groups, conv2d_param)
+        self.conv_output_float = conv_out
+
+        if self.fuse_residual:
+            self.input_residual = np.random.random(
+                self.input_residual_size).astype(np.float32)
+            self.conv_output_float = conv2d_residual_naive(
+                self.conv_output_float, self.input_residual)
+            self.conv_output = convert_float_to_uint16(self.conv_output_float)
+            self.outputs = {'Output': self.conv_output}
+        elif self.force_fp32_output:
+            self.outputs = {'Output': self.conv_output_float.astype(np.float32)}
+
+        if self.input_type is not np.float32:
+            self.input = convert_float_to_uint16(self.input)
+
+        self.inputs = {
+            'Input': self.input.view(self.input_type),
+            'Filter': OpTest.np_dtype_to_fluid_dtype(
+                self.filter.astype(self.weight_type))
+        }
+
+        if self.fuse_residual:
+            self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype(
+                convert_float_to_uint16(self.input_residual))
+
+        self.attrs = {
+            'strides': self.stride,
+            'paddings': self.pad,
+            'groups': self.groups,
+            'dilations': self.dilations,
+            'use_cudnn': self.use_cudnn,
+            'use_mkldnn': self.use_mkldnn,
+            'mkldnn_data_type': self.mkldnn_data_type,
+            'force_fp32_output': self.force_fp32_output,
+            'fuse_residual_connection': self.fuse_residual
+        }
+
+    def test_check_output(self):
+        self.check_output_with_place(core.CPUPlace())
+
+    def test_check_grad(self):
+        pass
+
+    def test_check_grad_no_filter(self):
+        pass
+
+    def test_check_grad_no_input(self):
+        pass
+
+    def init_test_case(self):
+        TestConv2dOp.init_test_case(self)
+        self.input_size = [1, 1, 5, 5]  # NCHW
+        f_c = self.input_size[1] // self.groups
+        self.input_residual_size = [1, 2, 3, 3]
+        self.filter_size = [2, f_c, 3, 3]
+
+    def init_data_type(self):
+        self.weight_type = np.float32
+        self.input_type = np.float32
+
+    def init_force_fp32_output(self):
+        self.force_fp32_output = False
+
+    def init_fuse_relu(self):
+        self.fuse_activation = "relu"
+
+    def init_fuse_residual(self):
+        self.fuse_residual = True
+
+
+class TestConv2d(TestConv2dBf16Op):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.input_residual_size = [2, 6, 3, 3]
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+    def init_data_type(self):
+        self.input_type = np.uint16
+
+
+class TestWithPad(TestConv2d):
+    def init_test_case(self):
+        TestConv2d.init_test_case(self)
+        self.pad = [1, 1]
+        self.input_residual_size = [2, 6, 5, 5]
+
+
+class TestWithGroup(TestConv2d):
+    def init_group(self):
+        self.groups = 3
+
+
+class TestWithStride(TestConv2dBf16Op):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 6, 6]
+        self.input_residual_size = [2, 6, 3, 3]
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+    def init_data_type(self):
+        self.input_type = np.uint16
+
+
+class TestWith1x1ForceFP32Output(TestConv2dBf16Op):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [1, 3, 5, 5]
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 1, 1]
+
+    def init_force_fp32_output(self):
+        self.force_fp32_output = True
+
+    def init_fuse_residual(self):
+        self.fuse_residual = False
+
+
+class TestWithInput1x1Filter1x1(TestConv2dBf16Op):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 1, 1]
+        self.input_residual_size = [2, 6, 1, 1]
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 1, 1]
+
+    def init_group(self):
+        self.groups = 3
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
index 7a494e3c2c3..9731efced69 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
@@ -36,6 +36,7 @@ class TestConv2dInt8Op(TestConv2dOp):
         self.use_cuda = False
         self.use_mkldnn = False
         self.data_format = "NCHW"
+        self.mkldnn_data_type = "int8"
         self.weighttype = np.float32
         self.use_mkldnn = True
         self.init_group()
@@ -141,7 +142,8 @@ class TestConv2dInt8Op(TestConv2dOp):
             'Scale_weights': self.scale_weights,
             'Scale_in_eltwise': self.scale_in_eltwise,
             'fuse_activation': self.fuse_activation,
-            'fuse_residual_connection': self.fuse_residual
+            'fuse_residual_connection': self.fuse_residual,
+            'mkldnn_data_type': self.mkldnn_data_type
         }
         self.outputs = {'Output': output}
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py
index 35419462909..70c76f1fb71 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
-from paddle.fluid.tests.unittests.op_test import OpTest
+from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16
 
 
 class TestDeQuantizeOp(OpTest):
@@ -32,6 +32,9 @@ class TestDeQuantizeOp(OpTest):
             input = (np.random.randint(0, 100, self.input_size) - 50
                      ).astype(self.data_type)
             output = (input * (1 / self.scale)).astype('float')
+        elif self.data_type == 'uint16':
+            output = np.random.random(self.input_size).astype(np.float32)
+            input = convert_float_to_uint16(output)
         else:
             input = (np.random.randint(0, 100,
                                        self.input_size)).astype(self.data_type)
@@ -70,5 +73,13 @@ class TestDeQuantizeOp2(TestDeQuantizeOp):
         self.data_type = 'uint8'
 
 
+class TestDeQuantizeOpBf16(TestDeQuantizeOp):
+    def set_scale(self):
+        self.scale = 1.0
+
+    def set_data_type(self):
+        self.data_type = 'uint16'
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index a6a4b9574c5..d02fdafe995 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -20,6 +20,7 @@ import warnings
 import numpy as np
 import random
 import six
+import struct
 import time
 import itertools
 import collections
@@ -167,6 +168,18 @@ def skip_check_grad_ci(reason=None):
     return wrapper
 
 
+def copy_bits_from_float_to_uint16(f):
+    return struct.unpack('<I', struct.pack('<f', f))[0] >> 16
+
+
+def convert_float_to_uint16(float_list):
+    new_output = []
+    for x in np.nditer(float_list):
+        new_output.append(np.uint16(copy_bits_from_float_to_uint16(x)))
+
+    return np.reshape(new_output, float_list.shape).view(np.uint16)
+
+
 class OpTest(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -242,6 +255,11 @@ class OpTest(unittest.TestCase):
             self.call_once = True
             self.dtype = data_type
 
+    def is_bfloat16_op(self):
+        return self.dtype == np.uint16 or (
+            hasattr(self, 'mkldnn_data_type') and
+            getattr(self, 'mkldnn_data_type') is "bfloat16")
+
     def infer_dtype_from_inputs_outputs(self, inputs, outputs):
         def is_np_data(input):
             return isinstance(input, (np.ndarray, np.generic))
@@ -276,8 +294,9 @@ class OpTest(unittest.TestCase):
         infer_dtype(inputs, dtype_set)
         dtype_list = [
             np.dtype(np.float64), np.dtype(np.float32), np.dtype(np.float16),
-            np.dtype(np.int64), np.dtype(np.int32), np.dtype(np.int16),
-            np.dtype(np.int8), np.dtype(np.uint8), np.dtype(np.bool)
+            np.dtype(np.int64), np.dtype(np.int32), np.dtype(np.uint16),
+            np.dtype(np.int16), np.dtype(np.int8), np.dtype(np.uint8),
+            np.dtype(np.bool)
         ]
         # check the dtype in dtype_list in order, select the first dtype that in dtype_set
         for dtype in dtype_list:
@@ -957,6 +976,14 @@ class OpTest(unittest.TestCase):
             self.op_type not in op_threshold_white_list.NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST:
             atol = 0
 
+        if self.is_bfloat16_op():
+            check_dygraph = False
+            if hasattr(self, 'force_fp32_output') and getattr(
+                    self, 'force_fp32_output'):
+                atol = 1e-2
+            else:
+                atol = 2
+
         if no_check_set is not None:
             if self.op_type not in no_check_set_white_list.no_check_set_white_list:
                 raise AssertionError(
@@ -1286,8 +1313,9 @@ class OpTest(unittest.TestCase):
             no_grad_set = set()
         else:
             if (self.op_type not in no_grad_set_white_list.NEED_TO_FIX_OP_LIST
-                ) and (self.op_type not in
-                       no_grad_set_white_list.NOT_CHECK_OP_LIST):
+                ) and (
+                    self.op_type not in no_grad_set_white_list.NOT_CHECK_OP_LIST
+                ) and (not self.is_bfloat16_op()):
                 raise AssertionError("no_grad_set must be None, op_type is " +
                                      self.op_type + " Op.")
 
-- 
GitLab


From ecfdfc9c58d7bcd6e70d0964a469250724359c0b Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Sat, 26 Sep 2020 18:04:35 +0800
Subject: [PATCH 239/261] fix guard place set error (#27573)

---
 python/paddle/fluid/dygraph/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py
index 01c2f0fed49..69fb23383e5 100644
--- a/python/paddle/fluid/dygraph/base.py
+++ b/python/paddle/fluid/dygraph/base.py
@@ -363,7 +363,7 @@ def guard(place=None):
     with framework.program_guard(train, startup):
         with framework.unique_name.guard():
             with framework._dygraph_guard(tracer):
-                with framework._dygraph_place_guard(place):
+                with framework._dygraph_place_guard(expected_place):
                     yield
 
 
-- 
GitLab


From a85592bcbf837c6d33c528e1dfea380ed6912d42 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Sat, 26 Sep 2020 19:43:52 +0800
Subject: [PATCH 240/261] fix cpplint error for the autmic max/min

fix cpplint error for the autmic max/min
---
 .../fluid/operators/math/segment_pooling.cu   | 17 +++++++------
 paddle/fluid/platform/cuda_primitives.h       | 24 +++++++++----------
 2 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu
index bb2b6db100b..37155fa184e 100644
--- a/paddle/fluid/operators/math/segment_pooling.cu
+++ b/paddle/fluid/operators/math/segment_pooling.cu
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
+#include <algorithm>
 #include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/segment_pooling.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/gpu_launch_param_config.h"
-#include "paddle/fluid/platform/macros.h"
 
 namespace paddle {
 namespace operators {
@@ -100,7 +99,7 @@ __global__ void SegmentOpsKernel(const Index* segment_ids, const T* input,
   CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) {
     Index segment_offset, dim_index_base, actual_height;
     Index inner_dim_size = h.inner_dim_size;
-    h.calculate(stripe_index, segment_offset, dim_index_base, actual_height);
+    h.calculate(stripe_index, &segment_offset, &dim_index_base, &actual_height);
 
     T minmax = pool.initial();
     Index first_segment_id = segment_ids[dim_index_base];
@@ -154,7 +153,7 @@ __global__ void SegmentIndexGradKernel(const Index* segment_ids, const T* input,
                                        T* in_grad, Helper h) {
   CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) {
     Index segment_offset, dim_index_base, actual_height;
-    h.calculate(stripe_index, segment_offset, dim_index_base, actual_height);
+    h.calculate(stripe_index, &segment_offset, &dim_index_base, &actual_height);
 
     for (Index j = 0; j < actual_height; j++) {
       Index current_segment_id = segment_ids[dim_index_base + j];
@@ -217,11 +216,11 @@ class ArrangeHelper {
     total_stripe_count = inner_dim_size * input_outer_dim_num_stripe;
   }
 
-  DEVICE inline void calculate(T stripe_index, T& segment_offset,
-                               T& dim_index_base, T& actual_height) {
-    segment_offset = stripe_index % inner_dim_size;
-    dim_index_base = stripe_index / inner_dim_size * DimTileSize;
-    actual_height = min(DimTileSize, input_length_size - dim_index_base);
+  DEVICE inline void calculate(T stripe_index, T* segment_offset,
+                               T* dim_index_base, T* actual_height) {
+    *segment_offset = stripe_index % inner_dim_size;
+    *dim_index_base = stripe_index / inner_dim_size * DimTileSize;
+    *actual_height = min(DimTileSize, input_length_size - *dim_index_base);
   }
 };
 
diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h
index a5dd19d4363..4d9673e9646 100644
--- a/paddle/fluid/platform/cuda_primitives.h
+++ b/paddle/fluid/platform/cuda_primitives.h
@@ -137,12 +137,12 @@ USE_CUDA_ATOMIC(Max, unsigned int);
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
 USE_CUDA_ATOMIC(Max, unsigned long long int);  // NOLINT
 #else
-CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) {
+CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) {  // NOLINT
   if (*address >= val) {
     return;
   }
 
-  unsigned long long int old = *address, assumed;
+  unsigned long long int old = *address, assumed;  // NOLINT
 
   do {
     assumed = old;
@@ -169,7 +169,7 @@ CUDA_ATOMIC_WRAPPER(Max, float) {
     return;
   }
 
-  int *const address_as_i = (int *)address;
+  int *const address_as_i = reinterpret_cast<int *>(address);
   int old = *address_as_i, assumed;
 
   do {
@@ -187,9 +187,9 @@ CUDA_ATOMIC_WRAPPER(Max, double) {
     return;
   }
 
-  unsigned long long int *const address_as_ull =
-      (unsigned long long int *)address;
-  unsigned long long int old = *address_as_ull, assumed;
+  unsigned long long int *const address_as_ull =            // NOLINT
+      reinterpret_cast<unsigned long long int *>(address);  // NOLINT
+  unsigned long long int old = *address_as_ull, assumed;    // NOLINT
 
   do {
     assumed = old;
@@ -209,12 +209,12 @@ USE_CUDA_ATOMIC(Min, unsigned int);
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
 USE_CUDA_ATOMIC(Min, unsigned long long int);  // NOLINT
 #else
-CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) {
+CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) {  // NOLINT
   if (*address <= val) {
     return;
   }
 
-  unsigned long long int old = *address, assumed;
+  unsigned long long int old = *address, assumed;  // NOLINT
 
   do {
     assumed = old;
@@ -241,7 +241,7 @@ CUDA_ATOMIC_WRAPPER(Min, float) {
     return;
   }
 
-  int *const address_as_i = (int *)address;
+  int *const address_as_i = reinterpret_cast<int *>(address);
   int old = *address_as_i, assumed;
 
   do {
@@ -259,9 +259,9 @@ CUDA_ATOMIC_WRAPPER(Min, double) {
     return;
   }
 
-  unsigned long long int *const address_as_ull =
-      (unsigned long long int *)address;
-  unsigned long long int old = *address_as_ull, assumed;
+  unsigned long long int *const address_as_ull =            // NOLINT
+      reinterpret_cast<unsigned long long int *>(address);  // NOLINT
+  unsigned long long int old = *address_as_ull, assumed;    // NOLINT
 
   do {
     assumed = old;
-- 
GitLab


From 86fa0432050831c562c5a170f3db8fb0477aeda8 Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Sun, 27 Sep 2020 10:27:14 +0800
Subject: [PATCH 241/261] init test=develop (#27554)

---
 .../fluid/tests/unittests/CMakeLists.txt       | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 09797576801..97a3ebc2135 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -531,15 +531,15 @@ if(NOT WIN32)
 endif()
 
 if(NOT APPLE AND NOT WIN32)
-    bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 140)
-    bash_test_modules(test_auto_checkpoint1 START_BASH dist_test.sh TIMEOUT 140)
-    bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 140)
-    bash_test_modules(test_auto_checkpoint3 START_BASH dist_test.sh TIMEOUT 140)
-    bash_test_modules(test_auto_checkpoint_multiple START_BASH dist_test.sh TIMEOUT 140)
-    bash_test_modules(test_auto_checkpoint_dist_basic START_BASH dist_test.sh TIMEOUT 140)
-    bash_test_modules(test_hdfs1 START_BASH dist_test.sh TIMEOUT 140)
-    bash_test_modules(test_hdfs2 START_BASH dist_test.sh TIMEOUT 140)
-    bash_test_modules(test_hdfs3 START_BASH dist_test.sh TIMEOUT 140)
+    bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+    bash_test_modules(test_auto_checkpoint1 START_BASH dist_test.sh TIMEOUT 140  LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+    bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 140  LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+    bash_test_modules(test_auto_checkpoint3 START_BASH dist_test.sh TIMEOUT 140  LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+    bash_test_modules(test_auto_checkpoint_multiple START_BASH dist_test.sh TIMEOUT 140  LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+    bash_test_modules(test_auto_checkpoint_dist_basic START_BASH dist_test.sh TIMEOUT 140  LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+    bash_test_modules(test_hdfs1 START_BASH dist_test.sh TIMEOUT 140  LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+    bash_test_modules(test_hdfs2 START_BASH dist_test.sh TIMEOUT 140   LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
+    bash_test_modules(test_hdfs3 START_BASH dist_test.sh TIMEOUT 140  LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
 endif()
 
 add_subdirectory(sequence)
-- 
GitLab


From b9d739a7eaf3d75002d5eb233b75b7aa8affd1f1 Mon Sep 17 00:00:00 2001
From: Double_V <liuvv0203@163.com>
Date: Sun, 27 Sep 2020 10:32:50 +0800
Subject: [PATCH 242/261] fix pool bug, test=develop (#27537)

* fix pool bug, test=develop

* fix coverage,test=develop

* fix bug, test=develop
---
 .../fluid/tests/unittests/test_pool1d_api.py  | 18 +++++++++
 python/paddle/nn/functional/pooling.py        | 39 ++++++++++++-------
 2 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_pool1d_api.py b/python/paddle/fluid/tests/unittests/test_pool1d_api.py
index 25216175d59..c1169dfc521 100644
--- a/python/paddle/fluid/tests/unittests/test_pool1d_api.py
+++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py
@@ -195,6 +195,23 @@ class TestPool1d_API(unittest.TestCase):
             result = max_pool1d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
+    def check_max_dygraph_return_index_results(self, place):
+        with fluid.dygraph.guard(place):
+            input_np = np.random.random([2, 3, 32]).astype("float32")
+            input = fluid.dygraph.to_variable(input_np)
+            result, index = F.max_pool1d(
+                input, kernel_size=2, stride=2, padding=0, return_indices=True)
+
+            result_np = max_pool1D_forward_naive(
+                input_np, ksize=[2], strides=[2], paddings=[0])
+
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
+            max_pool1d_dg = paddle.nn.layer.MaxPool1d(
+                kernel_size=2, stride=None, padding=0)
+            result = max_pool1d_dg(input)
+            self.assertTrue(np.allclose(result.numpy(), result_np))
+
     def check_max_dygraph_padding_same(self, place):
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32]).astype("float32")
@@ -228,6 +245,7 @@ class TestPool1d_API(unittest.TestCase):
             self.check_avg_static_results(place)
             self.check_max_dygraph_padding_same(place)
             self.check_avg_dygraph_padding_same(place)
+            self.check_max_dygraph_return_index_results(place)
 
 
 class TestPool2dError_API(unittest.TestCase):
diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py
index 1eb9167d035..bed5df8fa78 100755
--- a/python/paddle/nn/functional/pooling.py
+++ b/python/paddle/nn/functional/pooling.py
@@ -571,15 +571,26 @@ def max_pool1d(x,
     padding = _expand_low_nd_padding(padding)
 
     if in_dygraph_mode():
-        pool_out = core.ops.max_pool2d_with_index(
-            x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride,
-            'paddings', padding, 'padding_algorithm', padding_algorithm,
-            'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False,
-            'exclusive', True, 'data_format', data_format)
-        return (squeeze(pool_out[0], [2]), squeeze(
-            pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2])
+        if return_indices:
+            pool_out = core.ops.max_pool2d_with_index(
+                x, 'ksize', kernel_size, 'global_pooling', False, 'strides',
+                stride, 'paddings', padding, 'padding_algorithm',
+                padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
+                'use_mkldnn', False, 'exclusive', True, 'data_format',
+                data_format)
+            return (squeeze(pool_out[0], [2]), squeeze(
+                pool_out[1],
+                [2])) if return_indices else squeeze(pool_out[0], [2])
+        else:
+            pool_out = core.ops.pool2d(
+                x, 'pooling_type', 'max', 'ksize', kernel_size,
+                'global_pooling', False, 'padding_algorithm', padding_algorithm,
+                'strides', stride, 'paddings', padding, 'use_cudnn', True,
+                'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True,
+                'data_format', data_format)
+            return squeeze(pool_out, [2])
 
-    op_type = 'max_pool2d_with_index'
+    op_type = 'max_pool2d_with_index' if return_indices else "pool2d"
     helper = LayerHelper(op_type, **locals())
     dtype = helper.input_dtype()
     pool_out = helper.create_variable_for_type_inference(dtype)
@@ -696,7 +707,7 @@ def max_pool2d(x,
         )
 
     if in_dygraph_mode():
-        if data_format == "NCHW":
+        if return_indices:
             output = core.ops.max_pool2d_with_index(
                 x, 'ksize', kernel_size, 'global_pooling', False, 'strides',
                 stride, 'paddings', padding, 'padding_algorithm',
@@ -704,7 +715,7 @@ def max_pool2d(x,
                 'use_mkldnn', False, 'exclusive', True, 'data_format',
                 data_format)
             return output if return_indices else output[0]
-        elif data_format == "NHWC" and not return_indices:
+        else:
             output = core.ops.pool2d(
                 x, 'pooling_type', 'max', 'ksize', kernel_size,
                 'global_pooling', False, 'padding_algorithm', padding_algorithm,
@@ -713,7 +724,7 @@ def max_pool2d(x,
                 'data_format', data_format)
             return output
 
-    op_type = 'max_pool2d_with_index' if data_format == "NCHW" else "pool2d"
+    op_type = 'max_pool2d_with_index' if return_indices else "pool2d"
     helper = LayerHelper(op_type, **locals())
     dtype = helper.input_dtype()
     pool_out = helper.create_variable_for_type_inference(dtype)
@@ -822,7 +833,7 @@ def max_pool3d(x,
         )
 
     if in_dygraph_mode():
-        if data_format == "NCDHW":
+        if return_indices:
             output = core.ops.max_pool3d_with_index(
                 x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides',
                 stride, 'paddings', padding, 'global_pooling', False,
@@ -830,7 +841,7 @@ def max_pool3d(x,
                 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True,
                 'data_format', data_format)
             return output if return_indices else output[0]
-        elif data_format == "NDHWC" and not return_indices:
+        else:
             output = core.ops.pool3d(
                 x, 'pooling_type', 'max', 'ksize', kernel_size,
                 'global_pooling', False, 'padding_algorithm', padding_algorithm,
@@ -839,7 +850,7 @@ def max_pool3d(x,
                 'data_format', data_format)
             return output
 
-    op_type = "max_pool3d_with_index" if data_format == "NCDHW" else "pool3d"
+    op_type = "max_pool3d_with_index" if return_indices else "pool3d"
     helper = LayerHelper(op_type, **locals())
     dtype = helper.input_dtype()
     pool_out = helper.create_variable_for_type_inference(dtype)
-- 
GitLab


From 42065ba37af9121188a831848234a7dd879d0d2c Mon Sep 17 00:00:00 2001
From: Double_V <liuvv0203@163.com>
Date: Sun, 27 Sep 2020 10:33:33 +0800
Subject: [PATCH 243/261] fix activate_nn_grad, test=develop (#27555)

---
 .../paddle/fluid/tests/unittests/test_activation_nn_grad.py   | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
index e8b8a45fb67..c97cca654a7 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
@@ -78,15 +78,17 @@ class TestLeakyReluDoubleGradCheck(unittest.TestCase):
 class TestELUDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
-        shape = [2, 3, 7, 9]
+        shape = [2, 3, 6, 6]
         eps = 1e-6
         alpha = 1.1
         dtype = np.float64
+        SEED = 0
 
         x = layers.data('x', shape, False, dtype)
         x.persistable = True
 
         y = layers.elu(x, alpha=alpha)
+        np.random.RandomState(SEED)
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         gradient_checker.double_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
-- 
GitLab


From 0e101c4f6fd4f2d038d725ecae5729d904ef694e Mon Sep 17 00:00:00 2001
From: Chengmo <cmchengmo@163.com>
Date: Sun, 27 Sep 2020 10:36:15 +0800
Subject: [PATCH 244/261] Fix test dist fleet heter ctr (#27513)

* fix test_dist_fleet_heter_ctr & peformance update
---
 .../framework/distributed_strategy.proto      |  1 +
 .../operators/distributed/parameter_recv.cc   | 13 ++++----
 .../distributed/fleet/base/role_maker.py      |  8 ++---
 .../fleet/runtime/parameter_server_runtime.py | 18 +++++++++--
 .../tests/unittests/ctr_dataset_reader.py     |  2 +-
 .../tests/unittests/dist_fleet_heter_ctr.py   |  7 ----
 .../tests/unittests/test_communicator_geo.py  |  1 +
 .../tests/unittests/test_communicator_sync.py |  1 +
 .../test_dist_fleet_a_sync_optimizer_async.py |  2 ++
 .../test_dist_fleet_a_sync_optimizer_sync.py  |  1 +
 .../tests/unittests/test_dist_fleet_base.py   |  5 ++-
 .../unittests/test_dist_fleet_heter_base.py   | 31 ++++--------------
 .../unittests/test_dist_fleet_heter_ctr.py    | 32 -------------------
 13 files changed, 42 insertions(+), 80 deletions(-)

diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
index c9ae5a67950..21e28d7ac86 100644
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -97,6 +97,7 @@ message AsyncConfig {
   optional int32 thread_pool_size = 6 [ default = 1 ];
   optional int32 send_wait_times = 7 [ default = 1 ];
   optional bool runtime_split_send_recv = 8 [ default = false ];
+  optional bool launch_barrier = 9 [ default = true ];
 }
 
 message PipelineConfig { optional int32 micro_batch = 1 [ default = 1 ]; }
diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc
index a91df5b3c47..51b13bc2c56 100644
--- a/paddle/fluid/operators/distributed/parameter_recv.cc
+++ b/paddle/fluid/operators/distributed/parameter_recv.cc
@@ -175,10 +175,6 @@ void RecvGeoSparseRecords(const CommContext &rpc_ctx,
 
 template <typename T>
 void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) {
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto cpu_place = platform::CPUPlace();
-  auto &cpu_ctx = *pool.Get(cpu_place);
-
   distributed::RPCClient *rpc_client =
       distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id);
 
@@ -188,8 +184,13 @@ void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) {
   if (rpc_ctx.origin_varnames.size() == 1 &&
       rpc_ctx.splited_varnames.size() == 1) {
     auto varname = rpc_ctx.origin_varnames[0];
-    VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0];
-    rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], cpu_ctx,
+    const auto place =
+        scope.FindVar(varname)->Get<framework::LoDTensor>().place();
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &ctx = *pool.Get(place);
+    VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0] << " in gpu? "
+            << platform::is_gpu_place(place);
+    rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], ctx,
                                                     scope, varname, varname));
 
     for (size_t i = 0; i < rets.size(); i++) {
diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py
index f66f013e4db..36da7264efe 100644
--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -495,7 +495,7 @@ class RoleMakerBase(object):
         Returns:
             string: all heter_trainers'endpoints
         """
-        assert self._heter_trainer_endpoints != []
+        assert self._heter_trainer_endpoints != [], "Heter Worker Endpoints Not initialized"
         return self._heter_trainer_endpoints
 
     def _get_heter_worker_endpoint(self):
@@ -505,10 +505,10 @@ class RoleMakerBase(object):
 
         e.g: if we have 4 cpu-trainer(default), 2 gpu-trainer(heter)
              then No.0 and No.2 cpu-trainer will work with No.0 gpu-trainer
-             and No.1 and No.3 cpu-trainer will work with No.1 gpu-trainerr
+             and No.1 and No.3 cpu-trainer will work with No.1 gpu-trainer
         """
-        assert self._heter_trainer_endpoints != []
-        return self._heter_trainer_endpoints[(self._current_id + 1) %
+        assert self._heter_trainer_endpoints != [], "Heter Worker Endpoints Not initialized"
+        return self._heter_trainer_endpoints[(self._current_id) %
                                              self._heter_worker_num()]
 
     def _get_heter_worker_device(self):
diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
index 6dd4661f000..42be7e869d9 100644
--- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
+++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
@@ -23,6 +23,7 @@ from paddle.fluid.executor import Executor
 from paddle.fluid.parallel_executor import ParallelExecutor
 
 from .runtime_base import RuntimeBase
+from ..base.private_helper_function import wait_server_ready
 
 
 class ParameterServerRuntime(RuntimeBase):
@@ -94,8 +95,8 @@ class ParameterServerRuntime(RuntimeBase):
                 return False
 
             if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
-                            var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
-                            var.desc.type() == core.VarDesc.VarType.READER:
+                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+                    var.desc.type() == core.VarDesc.VarType.READER:
                 return False
             return var.persistable
 
@@ -161,6 +162,17 @@ class ParameterServerRuntime(RuntimeBase):
 
         trainer_config = self.async_strategy.get_trainer_runtime_config()
 
+        dist_strategy = self.context["valid_strategy"]
+        launch_barrier = dist_strategy.a_sync_configs["launch_barrier"]
+        if launch_barrier:
+            # for trainer wait server ready
+            wait_server_ready(self.role_maker._get_pserver_endpoints())
+
+            # for ps-heter mode, wait heter worker ready
+            if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
+            ):
+                wait_server_ready(self.role_maker._get_heter_worker_endpoints())
+
         lrs = _has_global_step(_get_lr_ops(self.origin_main_program))
 
         if lrs:
@@ -312,7 +324,7 @@ class ParameterServerRuntime(RuntimeBase):
         opts = _get_optimize_ops(self.origin_main_program)
         for op in opts:
             if "Param" in op.input_names and \
-                            "LearningRate" in op.input_names and op.input("Param")[0] == param_name:
+                    "LearningRate" in op.input_names and op.input("Param")[0] == param_name:
                 return op
 
     def _save_dense_params(self, executor, dirname, context, main_program):
diff --git a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py
index 15e98481c26..92d84b8b3f3 100644
--- a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py
+++ b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py
@@ -153,7 +153,7 @@ def gen_fake_line(dnn_data_num=7,
     return line
 
 
-def prepare_fake_data(file_nums=9, file_lines=1000):
+def prepare_fake_data(file_nums=6, file_lines=1000):
     """
     Create fake data with same type as avazu_ctr_data
     """
diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
index f62ad66e462..fefaecd3b89 100644
--- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
@@ -206,13 +206,6 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
                 debug=int(os.getenv("Debug", "0")))
             pass_time = time.time() - pass_start
             print("do_dataset_training done. using time {}".format(pass_time))
-        if os.getenv("SAVE_MODEL") == "1":
-            model_dir = tempfile.mkdtemp()
-            fleet.save_inference_model(exe, model_dir,
-                                       [feed.name for feed in self.feeds],
-                                       self.avg_cost)
-            self.check_model_right(model_dir)
-            shutil.rmtree(model_dir)
 
         fleet.stop_worker()
         print("do_dataset_training stop worker.")
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
index 5916000fba7..f625e1de4a3 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
@@ -113,6 +113,7 @@ class TestCommunicatorGeoEnd2End(unittest.TestCase):
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = True
         strategy.a_sync_configs = {"k_steps": 100}
+        strategy.a_sync_configs = {"launch_barrier": False}
 
         if training_role == "TRAINER":
             self.run_trainer(role, strategy)
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_sync.py b/python/paddle/fluid/tests/unittests/test_communicator_sync.py
index 95b209b1460..78e2050d3b4 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_sync.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_sync.py
@@ -51,6 +51,7 @@ class TestCommunicator(unittest.TestCase):
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = False
+        strategy.a_sync_configs = {"launch_barrier": False}
 
         optimizer = fleet.distributed_optimizer(optimizer, strategy)
         optimizer.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
index 7f55e956a94..845be6eda6e 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
@@ -52,6 +52,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = True
+        strategy.a_sync_configs = {"launch_barrier": False}
         optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
@@ -92,6 +93,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = True
+        strategy.a_sync_configs = {"launch_barrier": False}
         optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
index db3f2afb366..668b4ad872f 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
@@ -44,6 +44,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
 
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = False
+        strategy.a_sync_configs = {"launch_barrier": False}
         optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
index c46d1dc5b0f..195b3f8de0a 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
@@ -312,9 +312,6 @@ class TestFleetBase(unittest.TestCase):
                 "========================Error tr1_err end==========================="
             )
 
-        self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check")
-        self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check")
-
         # close trainer file
         tr0_pipe.close()
         tr1_pipe.close()
@@ -325,6 +322,8 @@ class TestFleetBase(unittest.TestCase):
         ps1.terminate()
 
         shutil.rmtree(gloo_path)
+        self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check")
+        self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check")
         return 0, 0
 
     def check_with_place(self,
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py
index ba97c5079bd..6c5a1d6e36c 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py
@@ -81,7 +81,7 @@ class FleetDistHeterRunnerBase(object):
     def build_strategy(self, args):
         self.strategy = paddle.distributed.fleet.DistributedStrategy()
         self.strategy.a_sync = True
-
+        self.strategy.a_sync_configs = {"launch_barrier": True}
         return self.strategy
 
     def build_optimizer(self, avg_cost, strategy):
@@ -237,7 +237,10 @@ class TestFleetHeterBase(unittest.TestCase):
         return heter0_proc, heter1_proc, heter0_pipe, heter1_pipe
 
     def _run_cluster(self, model, envs):
-        env = {'GRAD_CLIP': str(self._grad_clip_mode)}
+        env = {
+            'GRAD_CLIP': str(self._grad_clip_mode),
+            'FLAGS_eager_delete_tensor_gb': str(-1)
+        }
         python_path = self._python_interp
         gloo_path = tempfile.mkdtemp()
 
@@ -286,27 +289,6 @@ class TestFleetHeterBase(unittest.TestCase):
 
         tr0_ret = tr0.returncode
         tr1_ret = tr0.returncode
-        print("tr get returncode: {}".format(tr0_ret))
-        if tr0_ret != 0:
-            print(
-                "========================Error tr0_err begin==========================="
-            )
-            os.system("cat {}".format(tempfile.gettempdir() + "/tr0_err.log"))
-            print(
-                "========================Error tr0_err end==========================="
-            )
-
-        if tr1_ret != 0:
-            print(
-                "========================Error tr1_err begin==========================="
-            )
-            os.system("cat {}".format(tempfile.gettempdir() + "/tr1_err.log"))
-            print(
-                "========================Error tr1_err end==========================="
-            )
-
-        self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check")
-        self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check")
 
         # close trainer file
         tr0_pipe.close()
@@ -320,7 +302,8 @@ class TestFleetHeterBase(unittest.TestCase):
         ps1.terminate()
         heter0.terminate()
         heter1.terminate()
-
+        self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check")
+        self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check")
         shutil.rmtree(gloo_path)
         return 0, 0
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py
index b3e38a42128..5f7d7b21d7f 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py
@@ -23,38 +23,6 @@ import paddle
 paddle.enable_static()
 
 
-class TestDistHeterDatasetAsync2x2(TestFleetHeterBase):
-    def _setup_config(self):
-        self._mode = "async"
-        self._reader = "dataset"
-
-    def check_with_place(self,
-                         model_file,
-                         delta=1e-3,
-                         check_error_log=False,
-                         need_envs={}):
-        required_envs = {
-            "PATH": os.getenv("PATH", ""),
-            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
-            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
-            "FLAGS_rpc_deadline": "5000",  # 5sec to fail fast
-            "http_proxy": "",
-            "CPU_NUM": "3"
-        }
-
-        required_envs.update(need_envs)
-
-        if check_error_log:
-            required_envs["GLOG_v"] = "3"
-            required_envs["GLOG_logtostderr"] = "1"
-
-        tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
-
-    def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_heter_ctr.py", delta=1e-5, check_error_log=True)
-
-
 class TestDistHeterPyreaderAsync2x2(TestFleetHeterBase):
     def _setup_config(self):
         self._mode = "async"
-- 
GitLab


From 9b124014343cf07a3a2c88006a66f5b3de6af8aa Mon Sep 17 00:00:00 2001
From: YUNSHEN XIE <1084314248@qq.com>
Date: Sun, 27 Sep 2020 10:47:04 +0800
Subject: [PATCH 245/261] modified storage address of block file (#27576)

---
 tools/check_file_diff_approvals.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index 84254cc89bb..16e61d7c77a 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -286,7 +286,7 @@ fi
 # Get the list of PR authors with unresolved unit test issues
 pip install PyGithub
 # For getting PR related data
-wget https://paddle-ci.gz.bcebos.com/blk/block.txt --no-check-certificate
+wget https://sys-p0.bj.bcebos.com/blk/block.txt --no-check-certificate
 wget https://sys-p0.bj.bcebos.com/bk-ci/bk.txt --no-check-certificate
 HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true`
 if [ "${HASUTFIXED}" != "" ]; then
-- 
GitLab


From 162b4d6c13f6f38a234423bc984fb41710796475 Mon Sep 17 00:00:00 2001
From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Date: Sun, 27 Sep 2020 11:47:36 +0800
Subject: [PATCH 246/261] remove to_variable from 2.0 (#27528)

---
 python/paddle/__init__.py                     |  1 -
 python/paddle/fluid/dygraph/nn.py             |  7 +---
 .../tests/unittests/test_activation_op.py     |  8 ++--
 .../fluid/tests/unittests/test_adamax_api.py  |  2 +-
 .../fluid/tests/unittests/test_adamw_op.py    |  4 +-
 .../unittests/test_adaptive_avg_pool2d.py     |  4 +-
 .../unittests/test_adaptive_avg_pool3d.py     |  4 +-
 .../unittests/test_adaptive_max_pool2d.py     |  4 +-
 .../unittests/test_adaptive_max_pool3d.py     |  4 +-
 .../fluid/tests/unittests/test_addmm_op.py    |  6 +--
 .../fluid/tests/unittests/test_arange.py      |  6 +--
 .../fluid/tests/unittests/test_cholesky_op.py |  2 +-
 .../fluid/tests/unittests/test_clip_op.py     |  6 +--
 .../fluid/tests/unittests/test_concat_op.py   |  6 +--
 .../unittests/test_cosine_similarity_api.py   | 16 ++++----
 .../fluid/tests/unittests/test_cumsum_op.py   |  3 +-
 .../tests/unittests/test_default_dtype.py     |  1 -
 .../unittests/test_directory_migration.py     |  2 +-
 .../test_flatten_contiguous_range_op.py       |  2 +-
 .../tests/unittests/test_imperative_basic.py  |  4 +-
 .../test_imperative_selected_rows.py          |  2 +-
 .../tests/unittests/test_isfinite_v2_op.py    |  2 +-
 .../tests/unittests/test_jit_save_load.py     | 14 +++----
 .../tests/unittests/test_kldiv_loss_op.py     |  2 +-
 .../fluid/tests/unittests/test_l1_loss.py     |  8 ++--
 .../fluid/tests/unittests/test_log_softmax.py |  4 +-
 .../fluid/tests/unittests/test_logsumexp.py   |  4 +-
 .../fluid/tests/unittests/test_max_op.py      |  2 +-
 .../fluid/tests/unittests/test_maximum_op.py  |  8 ++--
 .../fluid/tests/unittests/test_mean_op.py     |  2 +-
 .../fluid/tests/unittests/test_min_op.py      |  2 +-
 .../fluid/tests/unittests/test_randn_op.py    |  2 +-
 .../tests/unittests/test_retain_graph.py      |  4 +-
 .../tests/unittests/test_transformer_api.py   | 40 +++++++++----------
 .../tests/unittests/test_zeros_like_op.py     |  2 +-
 python/paddle/tensor/linalg.py                | 18 +++------
 python/paddle/tensor/math.py                  |  3 +-
 tools/wlist.json                              |  5 ++-
 38 files changed, 102 insertions(+), 114 deletions(-)

diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index e749cf88b6a..40275a2ce71 100755
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -230,7 +230,6 @@ from .framework import CPUPlace  #DEFINE_ALIAS
 from .framework import CUDAPlace  #DEFINE_ALIAS
 from .framework import CUDAPinnedPlace  #DEFINE_ALIAS
 
-from .framework import to_variable  #DEFINE_ALIAS
 from .framework import grad  #DEFINE_ALIAS
 from .framework import no_grad  #DEFINE_ALIAS
 from .framework import save  #DEFINE_ALIAS
diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py
index a14c3a81c12..05269028acc 100644
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -3230,14 +3230,11 @@ class Flatten(layers.Layer):
         .. code-block:: python
 
           import paddle
-          from paddle import to_variable
           import numpy as np
+          paddle.disable_static()
 
           inp_np = np.ones([5, 2, 3, 4]).astype('float32')
-          
-          paddle.disable_static()
-          
-          inp_np = to_variable(inp_np)
+          inp_np = paddle.to_tensor(inp_np)
           flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2)
           flatten_res = flatten(inp_np)
 
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index 791f1ee2dfa..ad7539e76e4 100755
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -228,7 +228,7 @@ class TestTanhAPI(unittest.TestCase):
 
     def test_dygraph_api(self):
         paddle.disable_static(self.place)
-        x = paddle.to_variable(self.x_np)
+        x = paddle.to_tensor(self.x_np)
         out1 = F.tanh(x)
         out2 = paddle.tanh(x)
         th = paddle.nn.Tanh()
@@ -573,7 +573,7 @@ class TestHardShrinkAPI(unittest.TestCase):
 
     def test_dygraph_api(self):
         paddle.disable_static(self.place)
-        x = paddle.to_variable(self.x_np)
+        x = paddle.to_tensor(self.x_np)
         out1 = F.hardshrink(x)
         hd = paddle.nn.Hardshrink()
         out2 = hd(x)
@@ -639,7 +639,7 @@ class TestHardtanhAPI(unittest.TestCase):
 
     def test_dygraph_api(self):
         paddle.disable_static(self.place)
-        x = paddle.to_variable(self.x_np)
+        x = paddle.to_tensor(self.x_np)
         out1 = F.hardtanh(x)
         m = paddle.nn.Hardtanh()
         out2 = m(x)
@@ -1063,7 +1063,7 @@ class TestLeakyReluAPI(unittest.TestCase):
 
     def test_dygraph_api(self):
         paddle.disable_static(self.place)
-        x = paddle.to_variable(self.x_np)
+        x = paddle.to_tensor(self.x_np)
         out1 = F.leaky_relu(x)
         m = paddle.nn.LeakyReLU()
         out2 = m(x)
diff --git a/python/paddle/fluid/tests/unittests/test_adamax_api.py b/python/paddle/fluid/tests/unittests/test_adamax_api.py
index 5a33e11d286..6d2ec0eefbb 100644
--- a/python/paddle/fluid/tests/unittests/test_adamax_api.py
+++ b/python/paddle/fluid/tests/unittests/test_adamax_api.py
@@ -25,7 +25,7 @@ class TestAdamaxAPI(unittest.TestCase):
     def test_adamax_api_dygraph(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
-        a = paddle.to_variable(value)
+        a = paddle.to_tensor(value)
         linear = paddle.nn.Linear(13, 5)
         adam = paddle.optimizer.Adamax(
             learning_rate=0.01,
diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py
index cce24b57d2c..b799508f6b8 100644
--- a/python/paddle/fluid/tests/unittests/test_adamw_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py
@@ -22,7 +22,7 @@ class TestAdamWOp(unittest.TestCase):
     def test_adamw_op_dygraph(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
-        a = paddle.to_variable(value)
+        a = paddle.to_tensor(value)
         linear = paddle.nn.Linear(13, 5)
         adam = paddle.optimizer.AdamW(
             learning_rate=0.01,
@@ -37,7 +37,7 @@ class TestAdamWOp(unittest.TestCase):
     def test_adamw_op_coverage(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
-        a = paddle.to_variable(value)
+        a = paddle.to_tensor(value)
         linear = paddle.nn.Linear(13, 5)
         adam = paddle.optimizer.AdamW(
             learning_rate=0.0,
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py
index e3c70884ebc..b8c5bd29491 100644
--- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py
@@ -147,7 +147,7 @@ class TestAdaptiveAvgPool2dAPI(unittest.TestCase):
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.disable_static(place=place)
-            x = paddle.to_variable(self.x_np)
+            x = paddle.to_tensor(self.x_np)
 
             out_1 = paddle.nn.functional.adaptive_avg_pool2d(
                 x=x, output_size=[3, 3])
@@ -245,7 +245,7 @@ class TestAdaptiveAvgPool2dClassAPI(unittest.TestCase):
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.disable_static(place=place)
-            x = paddle.to_variable(self.x_np)
+            x = paddle.to_tensor(self.x_np)
 
             adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[3, 3])
             out_1 = adaptive_avg_pool(x=x)
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py
index a3c9dd91a69..bb36aaebf08 100755
--- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py
@@ -162,7 +162,7 @@ class TestAdaptiveAvgPool3dAPI(unittest.TestCase):
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.disable_static(place=place)
-            x = paddle.to_variable(self.x_np)
+            x = paddle.to_tensor(self.x_np)
 
             out_1 = paddle.nn.functional.adaptive_avg_pool3d(
                 x=x, output_size=[3, 3, 3])
@@ -262,7 +262,7 @@ class TestAdaptiveAvgPool3dClassAPI(unittest.TestCase):
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.disable_static(place=place)
-            x = paddle.to_variable(self.x_np)
+            x = paddle.to_tensor(self.x_np)
 
             adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
                 output_size=[3, 3, 3])
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py
index d78788eb1e7..dfa6f3226c8 100644
--- a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py
@@ -147,7 +147,7 @@ class TestAdaptiveMaxPool2dAPI(unittest.TestCase):
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.disable_static(place=place)
-            x = paddle.to_variable(self.x_np)
+            x = paddle.to_tensor(self.x_np)
 
             out_1 = paddle.nn.functional.adaptive_max_pool2d(
                 x=x, return_indices=False, output_size=[3, 3])
@@ -240,7 +240,7 @@ class TestAdaptiveMaxPool2dClassAPI(unittest.TestCase):
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.disable_static(place=place)
-            x = paddle.to_variable(self.x_np)
+            x = paddle.to_tensor(self.x_np)
 
             adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[3, 3])
             out_1 = adaptive_max_pool(x=x)
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py
index a7de0a5c6a7..1fa703688cd 100755
--- a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py
@@ -162,7 +162,7 @@ class TestAdaptiveMaxPool3dAPI(unittest.TestCase):
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.disable_static(place=place)
-            x = paddle.to_variable(self.x_np)
+            x = paddle.to_tensor(self.x_np)
 
             out_1 = paddle.nn.functional.adaptive_max_pool3d(
                 x=x, output_size=[3, 3, 3])
@@ -257,7 +257,7 @@ class TestAdaptiveMaxPool3dClassAPI(unittest.TestCase):
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.disable_static(place=place)
-            x = paddle.to_variable(self.x_np)
+            x = paddle.to_tensor(self.x_np)
 
             adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
                 output_size=[3, 3, 3])
diff --git a/python/paddle/fluid/tests/unittests/test_addmm_op.py b/python/paddle/fluid/tests/unittests/test_addmm_op.py
index 6e66c0c0029..6238d7dd4a1 100644
--- a/python/paddle/fluid/tests/unittests/test_addmm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_addmm_op.py
@@ -244,9 +244,9 @@ class TestAddMMAPI(unittest.TestCase):
 
         def test_error1():
             data_x_wrong = np.ones((2, 3)).astype(np.float32)
-            x = paddle.to_variable(data_x_wrong)
-            y = paddle.to_variable(data_y)
-            input = paddle.to_variable(data_input)
+            x = paddle.to_tensor(data_x_wrong)
+            y = paddle.to_tensor(data_y)
+            input = paddle.to_tensor(data_input)
             out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 )
         self.assertRaises(ValueError, test_error1)
 '''
diff --git a/python/paddle/fluid/tests/unittests/test_arange.py b/python/paddle/fluid/tests/unittests/test_arange.py
index 29003d28e44..d62c08b072b 100644
--- a/python/paddle/fluid/tests/unittests/test_arange.py
+++ b/python/paddle/fluid/tests/unittests/test_arange.py
@@ -98,9 +98,9 @@ class TestArangeImperative(unittest.TestCase):
         x2 = paddle.tensor.arange(5)
         x3 = paddle.tensor.creation.arange(5)
 
-        start = paddle.to_variable(np.array([0], 'float32'))
-        end = paddle.to_variable(np.array([5], 'float32'))
-        step = paddle.to_variable(np.array([1], 'float32'))
+        start = paddle.to_tensor(np.array([0], 'float32'))
+        end = paddle.to_tensor(np.array([5], 'float32'))
+        step = paddle.to_tensor(np.array([1], 'float32'))
         x4 = paddle.arange(start, end, step, 'int64')
         paddle.enable_static()
 
diff --git a/python/paddle/fluid/tests/unittests/test_cholesky_op.py b/python/paddle/fluid/tests/unittests/test_cholesky_op.py
index ab08a0aacbf..2fcec657c14 100644
--- a/python/paddle/fluid/tests/unittests/test_cholesky_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cholesky_op.py
@@ -96,7 +96,7 @@ class TestDygraph(unittest.TestCase):
         a = np.random.rand(3, 3)
         a_t = np.transpose(a, [1, 0])
         x_data = np.matmul(a, a_t) + 1e-03
-        x = paddle.to_variable(x_data)
+        x = paddle.to_tensor(x_data)
         out = paddle.cholesky(x, upper=False)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py
index b56d9f6668e..2946798a82f 100644
--- a/python/paddle/fluid/tests/unittests/test_clip_op.py
+++ b/python/paddle/fluid/tests/unittests/test_clip_op.py
@@ -168,9 +168,9 @@ class TestClipAPI(unittest.TestCase):
         paddle.disable_static(place)
         data_shape = [1, 9, 9, 4]
         data = np.random.random(data_shape).astype('float32')
-        images = paddle.to_variable(data, dtype='float32')
-        v_min = paddle.to_variable(np.array([0.2], dtype=np.float32))
-        v_max = paddle.to_variable(np.array([0.8], dtype=np.float32))
+        images = paddle.to_tensor(data, dtype='float32')
+        v_min = paddle.to_tensor(np.array([0.2], dtype=np.float32))
+        v_max = paddle.to_tensor(np.array([0.8], dtype=np.float32))
 
         out_1 = paddle.clip(images, min=0.2, max=0.8)
         out_2 = paddle.clip(images, min=0.2, max=0.9)
diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py
index b4dbba7eead..14c10e7aa20 100644
--- a/python/paddle/fluid/tests/unittests/test_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_concat_op.py
@@ -285,9 +285,9 @@ class TestConcatAPI(unittest.TestCase):
         in2 = np.array([[11, 12, 13], [14, 15, 16]])
         in3 = np.array([[21, 22], [23, 24]])
         paddle.disable_static()
-        x1 = paddle.to_variable(in1)
-        x2 = paddle.to_variable(in2)
-        x3 = paddle.to_variable(in3)
+        x1 = paddle.to_tensor(in1)
+        x2 = paddle.to_tensor(in2)
+        x3 = paddle.to_tensor(in3)
         out1 = fluid.layers.concat(input=[x1, x2, x3], axis=-1)
         out2 = paddle.concat(x=[x1, x2], axis=0)
         np_out1 = np.concatenate([in1, in2, in3], axis=-1)
diff --git a/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py b/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py
index 1e25613fa63..a8899d9f022 100644
--- a/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py
+++ b/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py
@@ -75,8 +75,8 @@ class TestCosineSimilarityAPI(unittest.TestCase):
         np_x2 = np.random.rand(*shape).astype(np.float32)
         np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
 
-        tesnor_x1 = paddle.to_variable(np_x1)
-        tesnor_x2 = paddle.to_variable(np_x2)
+        tesnor_x1 = paddle.to_tensor(np_x1)
+        tesnor_x2 = paddle.to_tensor(np_x2)
         y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps)
 
         self.assertTrue(np.allclose(y.numpy(), np_out))
@@ -92,8 +92,8 @@ class TestCosineSimilarityAPI(unittest.TestCase):
         np_x2 = np.random.rand(*shape).astype(np.float32)
         np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
 
-        tesnor_x1 = paddle.to_variable(np_x1)
-        tesnor_x2 = paddle.to_variable(np_x2)
+        tesnor_x1 = paddle.to_tensor(np_x1)
+        tesnor_x2 = paddle.to_tensor(np_x2)
         y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps)
 
         self.assertTrue(np.allclose(y.numpy(), np_out))
@@ -110,8 +110,8 @@ class TestCosineSimilarityAPI(unittest.TestCase):
         np_x2 = np.random.rand(*shape2).astype(np.float32)
         np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
 
-        tesnor_x1 = paddle.to_variable(np_x1)
-        tesnor_x2 = paddle.to_variable(np_x2)
+        tesnor_x1 = paddle.to_tensor(np_x1)
+        tesnor_x2 = paddle.to_tensor(np_x2)
         y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps)
 
         self.assertTrue(np.allclose(y.numpy(), np_out))
@@ -129,8 +129,8 @@ class TestCosineSimilarityAPI(unittest.TestCase):
         np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
 
         cos_sim_func = nn.CosineSimilarity(axis=axis, eps=eps)
-        tesnor_x1 = paddle.to_variable(np_x1)
-        tesnor_x2 = paddle.to_variable(np_x2)
+        tesnor_x1 = paddle.to_tensor(np_x1)
+        tesnor_x2 = paddle.to_tensor(np_x2)
         y = cos_sim_func(tesnor_x1, tesnor_x2)
 
         self.assertTrue(np.allclose(y.numpy(), np_out))
diff --git a/python/paddle/fluid/tests/unittests/test_cumsum_op.py b/python/paddle/fluid/tests/unittests/test_cumsum_op.py
index ad121fac8cc..818e15bb319 100644
--- a/python/paddle/fluid/tests/unittests/test_cumsum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cumsum_op.py
@@ -21,13 +21,12 @@ import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard
-from paddle import to_variable
 
 
 class TestCumsumOp(unittest.TestCase):
     def run_cases(self):
         data_np = np.arange(12).reshape(3, 4)
-        data = to_variable(data_np)
+        data = paddle.to_tensor(data_np)
 
         y = paddle.cumsum(data)
         z = np.cumsum(data_np)
diff --git a/python/paddle/fluid/tests/unittests/test_default_dtype.py b/python/paddle/fluid/tests/unittests/test_default_dtype.py
index 057933fc7a7..29ca9a93985 100644
--- a/python/paddle/fluid/tests/unittests/test_default_dtype.py
+++ b/python/paddle/fluid/tests/unittests/test_default_dtype.py
@@ -20,7 +20,6 @@ import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph import Linear
 import paddle.fluid.core as core
-from paddle import to_variable
 
 
 class TestDefaultType(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py
index 529fff158c5..2f35b45aa67 100644
--- a/python/paddle/fluid/tests/unittests/test_directory_migration.py
+++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py
@@ -36,7 +36,7 @@ class TestDirectory(unittest.TestCase):
     def test_new_directory(self):
         new_directory = [
             'paddle.enable_static', 'paddle.disable_static',
-            'paddle.in_dynamic_mode', 'paddle.to_variable', 'paddle.grad',
+            'paddle.in_dynamic_mode', 'paddle.to_tensor', 'paddle.grad',
             'paddle.no_grad', 'paddle.save', 'paddle.load',
             'paddle.static.save', 'paddle.static.load',
             'paddle.distributed.ParallelEnv',
diff --git a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py
index 642044bb4b1..e0e487eff11 100644
--- a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py
+++ b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py
@@ -195,7 +195,7 @@ class TestFlattenPython(unittest.TestCase):
 
         def test_Negative():
             paddle.disable_static()
-            img = paddle.to_variable(x)
+            img = paddle.to_tensor(x)
             out = paddle.flatten(img, start_axis=-2, stop_axis=-1)
             return out.numpy().shape
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
index 22f16287c33..7378975aa37 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
@@ -211,7 +211,7 @@ class TestImperative(unittest.TestCase):
         paddle.disable_static()
         self.assertTrue(paddle.in_dynamic_mode())
         np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
-        var_inp = paddle.to_variable(np_inp)
+        var_inp = paddle.to_tensor(np_inp)
         mlp = MLP(input_size=2)
         out = mlp(var_inp)
         dy_out1 = out.numpy()
@@ -221,7 +221,7 @@ class TestImperative(unittest.TestCase):
         self.assertFalse(paddle.in_dynamic_mode())
         paddle.disable_static()
         self.assertTrue(paddle.in_dynamic_mode())
-        var_inp = paddle.to_variable(np_inp)
+        var_inp = paddle.to_tensor(np_inp)
         mlp = MLP(input_size=2)
         out = mlp(var_inp)
         dy_out2 = out.numpy()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
index 59ddb365e53..97f7162e997 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
@@ -54,7 +54,7 @@ class TestSimpleNet(unittest.TestCase):
                     # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
 
                     input_word = np.array([[1, 2], [2, 1]]).astype('int64')
-                    input = paddle.to_variable(input_word)
+                    input = paddle.to_tensor(input_word)
 
                     simplenet = SimpleNet(20, 32, dtype)
                     adam = SGDOptimizer(
diff --git a/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py b/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py
index 8a868e751f0..281dc7caded 100644
--- a/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py
@@ -41,7 +41,7 @@ def run_dygraph(x_np, op_str, use_gpu=True):
     if use_gpu and fluid.core.is_compiled_with_cuda():
         place = paddle.CUDAPlace(0)
     paddle.disable_static(place)
-    x = paddle.to_variable(x_np)
+    x = paddle.to_tensor(x_np)
     dygraph_result = getattr(paddle.tensor, op_str)(x)
     return dygraph_result
 
diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
index 7e6ca8076de..99404246185 100644
--- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
@@ -543,9 +543,9 @@ class TestJitSaveMultiCases(unittest.TestCase):
         loaded_layer = paddle.jit.load(model_path)
         loaded_layer.eval()
         # inference & compare
-        x = paddle.to_variable(np.random.random((1, 784)).astype('float32'))
+        x = paddle.to_tensor(np.random.random((1, 784)).astype('float32'))
         if with_label:
-            y = paddle.to_variable(np.random.random((1, 1)).astype('int64'))
+            y = paddle.to_tensor(np.random.random((1, 1)).astype('int64'))
             pred, _ = layer(x, y)
             pred = pred.numpy()
         else:
@@ -677,7 +677,7 @@ class TestJitSaveMultiCases(unittest.TestCase):
 
         model_path = "test_not_prune_output_spec_name_warning"
         configs = paddle.SaveLoadConfig()
-        out = paddle.to_variable(np.random.random((1, 1)).astype('float'))
+        out = paddle.to_tensor(np.random.random((1, 1)).astype('float'))
         configs.output_spec = [out]
         paddle.jit.save(layer, model_path, configs=configs)
 
@@ -709,7 +709,7 @@ class TestJitSaveMultiCases(unittest.TestCase):
 
         model_path = "test_prune_to_static_after_train"
         configs = paddle.SaveLoadConfig()
-        out = paddle.to_variable(np.random.random((1, 1)).astype('float'))
+        out = paddle.to_tensor(np.random.random((1, 1)).astype('float'))
         configs.output_spec = [out]
         with self.assertRaises(ValueError):
             paddle.jit.save(
@@ -730,7 +730,7 @@ class TestJitSaveLoadEmptyLayer(unittest.TestCase):
 
     def test_save_load_empty_layer(self):
         layer = EmptyLayer()
-        x = paddle.to_variable(np.random.random((10)).astype('float32'))
+        x = paddle.to_tensor(np.random.random((10)).astype('float32'))
         out = layer(x)
         paddle.jit.save(layer, self.model_path)
         load_layer = paddle.jit.load(self.model_path)
@@ -746,8 +746,8 @@ class TestJitSaveLoadNoParamLayer(unittest.TestCase):
 
     def test_save_load_no_param_layer(self):
         layer = NoParamLayer()
-        x = paddle.to_variable(np.random.random((5)).astype('float32'))
-        y = paddle.to_variable(np.random.random((5)).astype('float32'))
+        x = paddle.to_tensor(np.random.random((5)).astype('float32'))
+        y = paddle.to_tensor(np.random.random((5)).astype('float32'))
         out = layer(x, y)
         paddle.jit.save(layer, self.model_path)
         load_layer = paddle.jit.load(self.model_path)
diff --git a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
index 041fe4e9043..3a3b7071e04 100644
--- a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
@@ -90,7 +90,7 @@ class TestKLDivLossDygraph(unittest.TestCase):
         with paddle.fluid.dygraph.guard():
             kldiv_criterion = paddle.nn.KLDivLoss(reduction)
             pred_loss = kldiv_criterion(
-                paddle.to_variable(x), paddle.to_variable(target))
+                paddle.to_tensor(x), paddle.to_tensor(target))
             self.assertTrue(np.allclose(pred_loss.numpy(), gt_loss))
 
     def test_kl_loss_batchmean(self):
diff --git a/python/paddle/fluid/tests/unittests/test_l1_loss.py b/python/paddle/fluid/tests/unittests/test_l1_loss.py
index 6a15fe49477..3c37397cae1 100644
--- a/python/paddle/fluid/tests/unittests/test_l1_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_l1_loss.py
@@ -26,8 +26,8 @@ class TestFunctionalL1Loss(unittest.TestCase):
         self.label_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
 
     def run_imperative(self):
-        input = paddle.to_variable(self.input_np)
-        label = paddle.to_variable(self.label_np)
+        input = paddle.to_tensor(self.input_np)
+        label = paddle.to_tensor(self.label_np)
         dy_result = paddle.nn.functional.l1_loss(input, label)
         expected = np.mean(np.abs(self.input_np - self.label_np))
         self.assertTrue(np.allclose(dy_result.numpy(), expected))
@@ -106,8 +106,8 @@ class TestClassL1Loss(unittest.TestCase):
         self.label_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
 
     def run_imperative(self):
-        input = paddle.to_variable(self.input_np)
-        label = paddle.to_variable(self.label_np)
+        input = paddle.to_tensor(self.input_np)
+        label = paddle.to_tensor(self.label_np)
         l1_loss = paddle.nn.loss.L1Loss()
         dy_result = l1_loss(input, label)
         expected = np.mean(np.abs(self.input_np - self.label_np))
diff --git a/python/paddle/fluid/tests/unittests/test_log_softmax.py b/python/paddle/fluid/tests/unittests/test_log_softmax.py
index e3d7003eced..9ac4895f499 100644
--- a/python/paddle/fluid/tests/unittests/test_log_softmax.py
+++ b/python/paddle/fluid/tests/unittests/test_log_softmax.py
@@ -96,7 +96,7 @@ class TestNNLogSoftmaxAPI(unittest.TestCase):
 
         # test dygrapg api
         paddle.disable_static()
-        x = paddle.to_variable(self.x)
+        x = paddle.to_tensor(self.x)
         y = logsoftmax(x)
         self.assertTrue(np.allclose(y.numpy(), ref_out))
         paddle.enable_static()
@@ -127,7 +127,7 @@ class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase):
         self.assertTrue(np.allclose(out[0], ref_out))
 
         paddle.disable_static()
-        x = paddle.to_variable(self.x)
+        x = paddle.to_tensor(self.x)
         y = F.log_softmax(x, axis, dtype)
         self.assertTrue(np.allclose(y.numpy(), ref_out), True)
         paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_logsumexp.py b/python/paddle/fluid/tests/unittests/test_logsumexp.py
index cf9203dffcb..9032293070a 100644
--- a/python/paddle/fluid/tests/unittests/test_logsumexp.py
+++ b/python/paddle/fluid/tests/unittests/test_logsumexp.py
@@ -111,7 +111,7 @@ class TestLogsumexpAPI(unittest.TestCase):
         self.assertTrue(np.allclose(res[0], out_ref))
 
         paddle.disable_static(self.place)
-        x = paddle.to_variable(self.x)
+        x = paddle.to_tensor(self.x)
         out = paddle.logsumexp(x, axis, keepdim)
         self.assertTrue(np.allclose(out.numpy(), out_ref))
         paddle.enable_static()
@@ -126,7 +126,7 @@ class TestLogsumexpAPI(unittest.TestCase):
 
     def test_alias(self):
         paddle.disable_static(self.place)
-        x = paddle.to_variable(self.x)
+        x = paddle.to_tensor(self.x)
         out1 = paddle.logsumexp(x)
         out2 = paddle.tensor.logsumexp(x)
         out3 = paddle.tensor.math.logsumexp(x)
diff --git a/python/paddle/fluid/tests/unittests/test_max_op.py b/python/paddle/fluid/tests/unittests/test_max_op.py
index c9afc4bec66..4786d790b14 100644
--- a/python/paddle/fluid/tests/unittests/test_max_op.py
+++ b/python/paddle/fluid/tests/unittests/test_max_op.py
@@ -80,7 +80,7 @@ class ApiMaxTest(unittest.TestCase):
     def test_imperative_api(self):
         paddle.disable_static()
         np_x = np.array([10, 10]).astype('float64')
-        x = paddle.to_variable(np_x)
+        x = paddle.to_tensor(np_x)
         z = paddle.max(x, axis=0)
         np_z = z.numpy()
         z_expected = np.array(np.max(np_x, axis=0))
diff --git a/python/paddle/fluid/tests/unittests/test_maximum_op.py b/python/paddle/fluid/tests/unittests/test_maximum_op.py
index 5645597007a..54657d7900e 100644
--- a/python/paddle/fluid/tests/unittests/test_maximum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_maximum_op.py
@@ -61,8 +61,8 @@ class ApiMaximumTest(unittest.TestCase):
     def test_dynamic_api(self):
         paddle.disable_static()
         np_x = np.array([10, 10]).astype('float64')
-        x = paddle.to_variable(self.input_x)
-        y = paddle.to_variable(self.input_y)
+        x = paddle.to_tensor(self.input_x)
+        y = paddle.to_tensor(self.input_y)
         z = paddle.maximum(x, y)
         np_z = z.numpy()
         z_expected = np.array(np.maximum(self.input_x, self.input_y))
@@ -73,8 +73,8 @@ class ApiMaximumTest(unittest.TestCase):
         np_x = np.random.rand(5, 4, 3, 2).astype("float64")
         np_y = np.random.rand(4, 3).astype("float64")
 
-        x = paddle.to_variable(self.input_x)
-        y = paddle.to_variable(self.input_y)
+        x = paddle.to_tensor(self.input_x)
+        y = paddle.to_tensor(self.input_y)
         result_1 = paddle.maximum(x, y, axis=1)
         result_2 = paddle.maximum(x, y, axis=-2)
         self.assertEqual((result_1.numpy() == result_2.numpy()).all(), True)
diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py
index 29e79b096cf..f0094e703cd 100644
--- a/python/paddle/fluid/tests/unittests/test_mean_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_op.py
@@ -204,7 +204,7 @@ class TestMeanAPI(unittest.TestCase):
         paddle.disable_static(self.place)
 
         def test_case(x, axis=None, keepdim=False):
-            x_tensor = paddle.to_variable(x)
+            x_tensor = paddle.to_tensor(x)
             out = paddle.mean(x_tensor, axis, keepdim)
             if isinstance(axis, list):
                 axis = tuple(axis)
diff --git a/python/paddle/fluid/tests/unittests/test_min_op.py b/python/paddle/fluid/tests/unittests/test_min_op.py
index b9eff05c5ea..9c15d721635 100644
--- a/python/paddle/fluid/tests/unittests/test_min_op.py
+++ b/python/paddle/fluid/tests/unittests/test_min_op.py
@@ -80,7 +80,7 @@ class ApiMinTest(unittest.TestCase):
     def test_imperative_api(self):
         paddle.disable_static()
         np_x = np.array([10, 10]).astype('float64')
-        x = paddle.to_variable(np_x)
+        x = paddle.to_tensor(np_x)
         z = paddle.min(x, axis=0)
         np_z = z.numpy()
         z_expected = np.array(np.min(np_x, axis=0))
diff --git a/python/paddle/fluid/tests/unittests/test_randn_op.py b/python/paddle/fluid/tests/unittests/test_randn_op.py
index 9d2c03f3bba..4ddd98a8a73 100644
--- a/python/paddle/fluid/tests/unittests/test_randn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_randn_op.py
@@ -63,7 +63,7 @@ class TestRandnOpForDygraph(unittest.TestCase):
         dim_2 = paddle.fill_constant([1], "int32", 50)
         x3 = paddle.randn(shape=[dim_1, dim_2, 784])
 
-        var_shape = paddle.to_variable(np.array(shape))
+        var_shape = paddle.to_tensor(np.array(shape))
         x4 = paddle.randn(var_shape)
 
         for out in [x1, x2, x3, x4]:
diff --git a/python/paddle/fluid/tests/unittests/test_retain_graph.py b/python/paddle/fluid/tests/unittests/test_retain_graph.py
index 9abbee17385..98c7e3800c2 100644
--- a/python/paddle/fluid/tests/unittests/test_retain_graph.py
+++ b/python/paddle/fluid/tests/unittests/test_retain_graph.py
@@ -105,8 +105,8 @@ class TestRetainGraph(unittest.TestCase):
         A = np.random.rand(2, 3, 32, 32).astype('float32')
         B = np.random.rand(2, 3, 32, 32).astype('float32')
 
-        realA = paddle.to_variable(A)
-        realB = paddle.to_variable(B)
+        realA = paddle.to_tensor(A)
+        realB = paddle.to_tensor(B)
         fakeB = g(realA)
 
         optim_d.clear_gradients()
diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py
index 7c7a71a3be1..067d1ea5f73 100644
--- a/python/paddle/fluid/tests/unittests/test_transformer_api.py
+++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py
@@ -487,24 +487,24 @@ class TestTransformer(unittest.TestCase):
                 dropout=dropout,
                 weight_attr=[None],
                 bias_attr=[False])
-            src = paddle.to_variable(
+            src = paddle.to_tensor(
                 np.random.rand(batch_size, source_length, d_model).astype(
                     "float32"))
-            tgt = paddle.to_variable(
+            tgt = paddle.to_tensor(
                 np.random.rand(batch_size, target_length, d_model).astype(
                     "float32"))
             src_mask = np.zeros((batch_size, n_head, source_length,
                                  source_length)).astype("float32")
             src_mask[0][0][0][0] = -np.inf
-            src_mask = paddle.to_variable(src_mask)
+            src_mask = paddle.to_tensor(src_mask)
             tgt_mask = np.zeros((batch_size, n_head, target_length,
                                  target_length)).astype("float32")
             tgt_mask[0][0][0][0] = -1e9
             memory_mask = np.zeros((batch_size, n_head, target_length,
                                     source_length)).astype("float32")
             memory_mask[0][0][0][0] = -1e9
-            tgt_mask, memory_mask = paddle.to_variable(
-                tgt_mask), paddle.to_variable(memory_mask)
+            tgt_mask, memory_mask = paddle.to_tensor(
+                tgt_mask), paddle.to_tensor(memory_mask)
             trans_output = transformer(src, tgt, src_mask, tgt_mask,
                                        memory_mask)
 
@@ -521,24 +521,24 @@ class TestTransformer(unittest.TestCase):
                 dropout=dropout,
                 weight_attr=[None, None],
                 bias_attr=[False, False])
-            src = paddle.to_variable(
+            src = paddle.to_tensor(
                 np.random.rand(batch_size, source_length, d_model).astype(
                     "float32"))
-            tgt = paddle.to_variable(
+            tgt = paddle.to_tensor(
                 np.random.rand(batch_size, target_length, d_model).astype(
                     "float32"))
             src_mask = np.zeros((batch_size, n_head, source_length,
                                  source_length)).astype("float32")
             src_mask[0][0][0][0] = -np.inf
-            src_mask = paddle.to_variable(src_mask)
+            src_mask = paddle.to_tensor(src_mask)
             tgt_mask = np.zeros((batch_size, n_head, target_length,
                                  target_length)).astype("float32")
             tgt_mask[0][0][0][0] = -1e9
             memory_mask = np.zeros((batch_size, n_head, target_length,
                                     source_length)).astype("float32")
             memory_mask[0][0][0][0] = -1e9
-            tgt_mask, memory_mask = paddle.to_variable(
-                tgt_mask), paddle.to_variable(memory_mask)
+            tgt_mask, memory_mask = paddle.to_tensor(
+                tgt_mask), paddle.to_tensor(memory_mask)
             trans_output = transformer(src, tgt, src_mask, tgt_mask,
                                        memory_mask)
 
@@ -555,24 +555,24 @@ class TestTransformer(unittest.TestCase):
                 dropout=dropout,
                 weight_attr=[None, None, None],
                 bias_attr=[False, False, True])
-            src = paddle.to_variable(
+            src = paddle.to_tensor(
                 np.random.rand(batch_size, source_length, d_model).astype(
                     "float32"))
-            tgt = paddle.to_variable(
+            tgt = paddle.to_tensor(
                 np.random.rand(batch_size, target_length, d_model).astype(
                     "float32"))
             src_mask = np.zeros((batch_size, n_head, source_length,
                                  source_length)).astype("float32")
             src_mask[0][0][0][0] = -np.inf
-            src_mask = paddle.to_variable(src_mask)
+            src_mask = paddle.to_tensor(src_mask)
             tgt_mask = np.zeros((batch_size, n_head, target_length,
                                  target_length)).astype("float32")
             tgt_mask[0][0][0][0] = -1e9
             memory_mask = np.zeros((batch_size, n_head, target_length,
                                     source_length)).astype("float32")
             memory_mask[0][0][0][0] = -1e9
-            tgt_mask, memory_mask = paddle.to_variable(
-                tgt_mask), paddle.to_variable(memory_mask)
+            tgt_mask, memory_mask = paddle.to_tensor(
+                tgt_mask), paddle.to_tensor(memory_mask)
             trans_output = transformer(src, tgt, src_mask, tgt_mask,
                                        memory_mask)
 
@@ -588,24 +588,24 @@ class TestTransformer(unittest.TestCase):
                 dim_feedforward=dim_feedforward,
                 dropout=dropout,
                 bias_attr=False)
-            src = paddle.to_variable(
+            src = paddle.to_tensor(
                 np.random.rand(batch_size, source_length, d_model).astype(
                     "float32"))
-            tgt = paddle.to_variable(
+            tgt = paddle.to_tensor(
                 np.random.rand(batch_size, target_length, d_model).astype(
                     "float32"))
             src_mask = np.zeros((batch_size, n_head, source_length,
                                  source_length)).astype("float32")
             src_mask[0][0][0][0] = -np.inf
-            src_mask = paddle.to_variable(src_mask)
+            src_mask = paddle.to_tensor(src_mask)
             tgt_mask = np.zeros((batch_size, n_head, target_length,
                                  target_length)).astype("float32")
             tgt_mask[0][0][0][0] = -1e9
             memory_mask = np.zeros((batch_size, n_head, target_length,
                                     source_length)).astype("float32")
             memory_mask[0][0][0][0] = -1e9
-            tgt_mask, memory_mask = paddle.to_variable(
-                tgt_mask), paddle.to_variable(memory_mask)
+            tgt_mask, memory_mask = paddle.to_tensor(
+                tgt_mask), paddle.to_tensor(memory_mask)
             trans_output = transformer(src, tgt, src_mask, tgt_mask,
                                        memory_mask)
 
diff --git a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py
index 21e618a4620..2cea3072809 100644
--- a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py
@@ -63,7 +63,7 @@ class TestZerosLikeImpeartive(unittest.TestCase):
         place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
         ) else fluid.CPUPlace()
         paddle.disable_static(place)
-        x = paddle.to_variable(np.ones(shape))
+        x = paddle.to_tensor(np.ones(shape))
         for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]:
             out = zeros_like(x, dtype)
             self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(),
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 26624d3b5ff..15580b6618e 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -707,20 +707,14 @@ def cross(x, y, axis=None, name=None):
     Examples:
         .. code-block:: python
             import paddle
-            from paddle import to_variable
-            import numpy as np
-
             paddle.disable_static()
 
-            data_x = np.array([[1.0, 1.0, 1.0],
-                               [2.0, 2.0, 2.0],
-                               [3.0, 3.0, 3.0]])
-            data_y = np.array([[1.0, 1.0, 1.0],
-                               [1.0, 1.0, 1.0],
-                               [1.0, 1.0, 1.0]])
-            x = to_variable(data_x)
-            y = to_variable(data_y)
-
+            x = paddle.to_tensor([[1.0, 1.0, 1.0],
+                                  [2.0, 2.0, 2.0],
+                                  [3.0, 3.0, 3.0]])
+            y = paddle.to_tensor([[1.0, 1.0, 1.0],
+                                  [1.0, 1.0, 1.0],
+                                  [1.0, 1.0, 1.0]])
             z1 = paddle.cross(x, y)
             print(z1.numpy())
             # [[-1. -1. -1.]
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 966544c7abb..ce32fb76f5c 100755
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -1650,12 +1650,11 @@ def cumsum(x, axis=None, dtype=None, name=None):
         .. code-block:: python
             
             import paddle
-            from paddle import to_variable
             import numpy as np
 
             paddle.disable_static()
             data_np = np.arange(12).reshape(3, 4)
-            data = to_variable(data_np)
+            data = paddle.to_tensor(data_np)
 
             y = paddle.cumsum(data)
             print(y.numpy())
diff --git a/tools/wlist.json b/tools/wlist.json
index 0ed0b4e4069..9b36ac6adc7 100644
--- a/tools/wlist.json
+++ b/tools/wlist.json
@@ -251,9 +251,10 @@
         "BilinearTensorProduct",
         "GroupNorm",
         "SpectralNorm",
-        "TreeConv",
+        "TreeConv"
+    ],
+    "wlist_temp":[
         "prroi_pool",
-        "to_tensor",
         "ChunkEvaluator",
         "EditDistance",
         "ErrorClipByValue",
-- 
GitLab


From 35074963e359ba9ce5e38279fc1205bcee67157d Mon Sep 17 00:00:00 2001
From: Leo Chen <chenqiuliang@baidu.com>
Date: Sun, 27 Sep 2020 12:17:50 +0800
Subject: [PATCH 247/261] Refine error msg in paddle/fluid/framework/details
 [part 2] (#27429)

* refine broadcast_op_handle

* refine some error messages

* refine some files

* fix bug

* fix bug

* fix bug

* follow comments

* follow comments
---
 .../framework/details/all_reduce_op_handle.cc |  4 +-
 .../framework/details/broadcast_op_handle.cc  | 32 ++++--
 .../details/broadcast_op_handle_test.h        | 41 ++++++--
 .../fluid/framework/details/build_strategy.cc |  3 +-
 .../details/eager_deletion_op_handle.cc       | 40 +++++---
 .../details/fused_all_reduce_op_handle.cc     | 85 +++++++++++++---
 .../details/fused_broadcast_op_handle.cc      | 11 ++-
 .../details/fused_broadcast_op_handle_test.cc |  5 +-
 .../framework/details/gather_op_handle.cc     | 39 ++++++--
 .../details/gather_op_handle_test.cc          | 28 +++++-
 .../fluid/framework/details/nccl_op_handle.h  | 61 +++++++-----
 .../fluid/framework/details/op_handle_base.cc | 49 +++++----
 paddle/fluid/framework/details/op_registry.h  | 14 +--
 .../framework/details/reduce_and_gather.h     | 28 +++++-
 .../framework/details/reduce_op_handle.cc     | 57 ++++++++---
 .../details/reduce_op_handle_test.cc          | 42 ++++++--
 .../details/share_tensor_buffer_functor.cc    |  9 +-
 .../details/sparse_all_reduce_op_handle.cc    | 99 ++++++++++++++-----
 18 files changed, 475 insertions(+), 172 deletions(-)

diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 939a2fc8fc9..78887f3ac51 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -76,7 +76,7 @@ void AllReduceOpHandle::AllReduceImpl(
                     platform::errors::InvalidArgument(
                         "The NoDummyInputSize should be equal "
                         "to the number of places, but got NoDummyInputSize is "
-                        "%d and the number of place is %d.",
+                        "%d and the number of places is %d.",
                         in_var_handles.size(), num_places));
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), out_var_handles.size(),
@@ -89,7 +89,7 @@ void AllReduceOpHandle::AllReduceImpl(
       platform::errors::InvalidArgument(
           "The number of local scopes should be equal "
           "to the number of places, but got the number of local scopes is "
-          "%d and the number of place is %d.",
+          "%d and the number of places is %d.",
           in_var_handles.size(), num_places));
 
   std::vector<const void *> lod_tensor_data;
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index 4c3b0a7c6a4..35b10660674 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
+
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -31,10 +32,15 @@ void BroadcastOpHandle::RunImpl() {
   auto out_var_handles = DynamicCast<VarHandle>(outputs_);
 
   PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
-                    "The number of input should be one.");
-  PADDLE_ENFORCE_EQ(
-      out_var_handles.size(), places_.size(),
-      "The number of output should equal to the number of places.");
+                    platform::errors::PreconditionNotMet(
+                        "The number of inputs should be 1, but got %d.",
+                        in_var_handles.size()));
+  PADDLE_ENFORCE_EQ(out_var_handles.size(), places_.size(),
+                    platform::errors::PreconditionNotMet(
+                        "The number of outputs and the number of places should "
+                        "be equal, but got the number of outputs is %d and the "
+                        "number of places is %d.",
+                        out_var_handles.size(), places_.size()));
 
   VarHandle *in_var_handle = in_var_handles[0];
 
@@ -47,7 +53,9 @@ void BroadcastOpHandle::BroadcastOneVar(
     const std::vector<Scope *> &var_scopes) {
   auto *in_var =
       var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name());
-  PADDLE_ENFORCE_NOT_NULL(in_var);
+  PADDLE_ENFORCE_NOT_NULL(
+      in_var, platform::errors::NotFound("Variable %s is not found in scopes.",
+                                         in_var_handle.name()));
   Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
   if (UNLIKELY(!in_tensor.IsInitialized())) {
     VLOG(3) << "in var " << in_var_handle.name() << "not inited, return!";
@@ -103,7 +111,7 @@ void BroadcastOpHandle::BroadcastOneVar(
 
       broadcast_calls.emplace_back(
           [send_recv_buffer, numel, type, root_id, &nccl_ctx] {
-            PADDLE_ENFORCE(platform::dynload::ncclBcast(
+            PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
                 send_recv_buffer, numel, static_cast<ncclDataType_t>(type),
                 root_id, nccl_ctx.comm_, nccl_ctx.stream()));
           });
@@ -131,7 +139,8 @@ void BroadcastOpHandle::BroadcastOneVar(
       nccl_ctxs_->DevCtx(p)->Wait();
     }
 #else
-    PADDLE_THROW("CUDA is not enabled.");
+    PADDLE_THROW(
+        platform::errors::PreconditionNotMet("Not compiled with NCLL."));
 #endif
   }
 }
@@ -154,10 +163,13 @@ void BroadcastOpHandle::InitOutputValue(
     auto t_out_p = out_var_handle->place();
     auto *out_var = var_scopes.at(out_var_handle->scope_idx())
                         ->FindVar(out_var_handle->name());
-    PADDLE_ENFORCE_NOT_NULL(out_var);
+    PADDLE_ENFORCE_NOT_NULL(out_var, platform::errors::NotFound(
+                                         "Variable %s is not found in scopes.",
+                                         out_var_handle->name()));
     if (is_gpu_place(in_tensor.place())) {
-      PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
-                     "Places of input and output must be all on GPU.");
+      PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true,
+                        platform::errors::PreconditionNotMet(
+                            "Places of input and output must be all on GPU."));
     } else {
       t_out_p = platform::CPUPlace();
     }
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h
index e455879a68f..4fdc420e1e0 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h
@@ -79,7 +79,8 @@ struct TestBroadcastOpHandle {
       }
       nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_));
 #else
-      PADDLE_THROW("CUDA is not support.");
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with NCLL."));
 #endif
     } else {
       int count = 8;
@@ -113,7 +114,8 @@ struct TestBroadcastOpHandle {
       op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                          place_list_, nccl_ctxs_.get());
 #else
-      PADDLE_THROW("CUDA is not support.");
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with NCLL."));
 #endif
     } else {
 #if defined(PADDLE_WITH_NCCL)
@@ -171,7 +173,9 @@ struct TestBroadcastOpHandle {
                                    float val_scalar = 0.0) {
     auto var = param_scopes_[input_scope_idx]->FindVar(varname);
 
-    PADDLE_ENFORCE_NOT_NULL(var);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                        varname));
     auto lod_tensor = var->GetMutable<f::LoDTensor>();
     std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
     for (size_t k = 0; k < send_vector.size(); ++k) {
@@ -194,7 +198,9 @@ struct TestBroadcastOpHandle {
     }
 
     auto var = param_scopes_[input_scope_idx]->FindVar(varname);
-    PADDLE_ENFORCE_NOT_NULL(var);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                        varname));
     auto selected_rows = var->GetMutable<f::SelectedRows>();
     auto value = selected_rows->mutable_value();
     value->mutable_data<float>(kDims, place_list_[input_scope_idx]);
@@ -211,13 +217,24 @@ struct TestBroadcastOpHandle {
                          const std::vector<float>& send_vector,
                          const std::vector<int64_t>& rows, int height) {
     auto var = param_scopes_[input_scope_idx]->FindVar(varname);
-    PADDLE_ENFORCE_NOT_NULL(var);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                        varname));
     auto& selected_rows = var->Get<f::SelectedRows>();
     auto rt = selected_rows.value();
-    PADDLE_ENFORCE_EQ(selected_rows.height(), height, "height is not equal.");
+    PADDLE_ENFORCE_EQ(selected_rows.height(), height,
+                      platform::errors::InvalidArgument(
+                          "The height of SelectedRows is not equal to "
+                          "the expected, expect %d, but got %ld.",
+                          height, selected_rows.height()));
 
     for (size_t k = 0; k < selected_rows.rows().size(); ++k) {
-      PADDLE_ENFORCE_EQ(selected_rows.rows()[k], rows[k]);
+      PADDLE_ENFORCE_EQ(
+          selected_rows.rows()[k], rows[k],
+          platform::errors::InvalidArgument(
+              "The item at position %zu of rows of SelectedRows "
+              "is not equal to the expected, expect %ld, but got %ld.",
+              k, rows[k], selected_rows.rows()[k]));
     }
 
     p::CPUPlace cpu_place;
@@ -235,9 +252,15 @@ struct TestBroadcastOpHandle {
                       framework::Scope* scope) {
     p::CPUPlace cpu_place;
     auto var = scope->FindVar(varname);
-    PADDLE_ENFORCE_NOT_NULL(var);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                        varname));
     auto tensor = var->Get<f::LoDTensor>();
-    PADDLE_ENFORCE_EQ(tensor.lod(), lod, "lod is not equal.");
+    PADDLE_ENFORCE_EQ(tensor.lod(), lod,
+                      platform::errors::InvalidArgument(
+                          "The LoD of tensor is not equal to "
+                          "the expected, expect %s, but got %s.",
+                          lod, tensor.lod()));
     f::Tensor result_tensor;
     f::TensorCopySync(tensor, cpu_place, &result_tensor);
     float* ct = result_tensor.mutable_data<float>(cpu_place);
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index ecdb8cc9b8c..962f968c84e 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -235,7 +235,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
               AppendPass("reduce_mode_multi_devices_pass").get();
           break;
         default:
-          PADDLE_THROW("Unknown reduce strategy.");
+          PADDLE_THROW(
+              platform::errors::Unimplemented("Unknown reduce strategy."));
       }
     }
     multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
index 7735f9720c1..266557cb855 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
@@ -12,11 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
+
 #include <memory>
 #include <unordered_set>
 #include <utility>
 
-#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/scope.h"
@@ -47,15 +48,19 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
     if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
       platform::CUDADeviceGuard guard(
           BOOST_GET_CONST(platform::CUDAPlace, place).device);
-      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
-      PADDLE_ENFORCE_NOT_NULL(event_);
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
+      PADDLE_ENFORCE_NOT_NULL(event_, platform::errors::InvalidArgument(
+                                          "The cuda envet created is NULL."));
     }
   }
 #endif
-  PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument(
-                                            "Variable names are empty."));
+  PADDLE_ENFORCE_NE(vars.empty(), true,
+                    platform::errors::InvalidArgument(
+                        "The variables to be deleted are empty."));
   for (auto *var : var_infos_) {
-    PADDLE_ENFORCE_NOT_NULL(var);
+    PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument(
+                                     "The memory optimization info is NULL."));
   }
 }
 
@@ -64,7 +69,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
   if (event_) {
     auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace());
     platform::CUDADeviceGuard guard(gpu_place.device);
-    PADDLE_ENFORCE(cudaEventDestroy(event_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
   }
 #endif
 }
@@ -78,12 +83,17 @@ void EagerDeletionOpHandle::InitCUDA() {
 }
 
 void EagerDeletionOpHandle::CallOnce() {
-  PADDLE_ENFORCE(vars_.empty(), "vars_ must be initialized here");
+  PADDLE_ENFORCE_EQ(
+      vars_.empty(), true,
+      platform::errors::InvalidArgument(
+          "The variables to be deleted should be initialized here."));
   Scope *exec_scope = local_exec_scopes_[0];
   for (auto *var_info : var_infos_) {
     auto *var = exec_scope->FindVar(var_info->Name());
-    PADDLE_ENFORCE_NOT_NULL(var, "Variable %s should not be nullptr",
-                            var_info->Name());
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound(
+                 "The variable(%s) to be inplaced is not found in scope.",
+                 var_info->Name()));
     vars_.emplace_back(var);
   }
 }
@@ -119,8 +129,9 @@ void EagerDeletionOpHandle::RunImpl() {
         garbages.emplace_back(t.MoveMemoryHolder());
       }
     } else {
-      PADDLE_THROW("Type %s of %s is not supported eager deletion",
-                   framework::ToTypeName(var->Type()), var_info->Name());
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "The variable(%s) of type %s is not supported in eager deletion.",
+          framework::ToTypeName(var->Type()), var_info->Name()));
     }
   }
 
@@ -137,8 +148,9 @@ void EagerDeletionOpHandle::ClearGarbages(
     auto callback_stream =
         reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
     auto callback_func = [=]() {
-      PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
-      PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, compute_stream));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          cudaStreamWaitEvent(callback_stream, event_, 0));
     };
     gc_->Add(std::move(*garbages), callback_func);
   } else {
diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
index c67e21d5c47..c5388116699 100644
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -12,8 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
+
 #include <algorithm>
 #include <utility>
+
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
@@ -56,10 +58,20 @@ void FusedAllReduceOpHandle::RunImpl() {
   size_t place_num = places_.size();
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), place_num * num_of_all_reduce_,
-      "The NoDummyInputSize should be equal to the number of places.");
+      platform::errors::PreconditionNotMet(
+          "The number of input variable handles should be equal to the number "
+          "of places plus the number of all reduce handles, "
+          "but got the number of input variable handles is %d, the "
+          "number of places is %d, and the number of all reduce handles "
+          "is %d.",
+          in_var_handles.size(), place_num, num_of_all_reduce_));
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), out_var_handles.size(),
-      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
+      platform::errors::PreconditionNotMet(
+          "The number of input variable handles should be equal to the number "
+          "of output variable handles, but got the number of input variable "
+          "handles is %d, and the number of  output variable handles is %d.",
+          in_var_handles.size(), out_var_handles.size()));
 
   // Note: some gradient op doesn't have CUDAKernel, so the gradients of
   // those op are in CPUPlace, in this case, the all reduce should not be fused.
@@ -106,7 +118,13 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
       dtype = ele_dtype;
     }
 
-    PADDLE_ENFORCE_EQ(ele_dtype, dtype);
+    PADDLE_ENFORCE_EQ(
+        ele_dtype, dtype,
+        platform::errors::InvalidArgument(
+            "The DataType of grad tensors of fused_all_reduce_op_handle  "
+            "must be consistent. The current dtype is %s, but the "
+            "previous dtype is %s.",
+            DataTypeToString(ele_dtype), DataTypeToString(dtype)));
 
     // Check whether the address space is contiguous.
     std::sort(
@@ -130,16 +148,29 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
           "input[%d] address: 0X%02x. The offset: %d",
           k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k,
           next_address, k, infer_next_address, offset);
-      PADDLE_ENFORCE_EQ(infer_next_address, next_address,
-                        "The address is not consistent.");
+      PADDLE_ENFORCE_EQ(
+          infer_next_address, next_address,
+          platform::errors::InvalidArgument(
+              "The infered address of the next tensor should be equal to the "
+              "real address of the next tensor. But got infered address is %p "
+              "and real address is %p.",
+              infer_next_address, next_address));
     }
   }
 
   if (!FLAGS_skip_fused_all_reduce_check) {
     for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
       for (size_t j = 1; j < num_of_all_reduce_; ++j) {
-        PADDLE_ENFORCE_EQ(grads_tensor.at(0).at(j).first,
-                          grads_tensor.at(scope_idx).at(j).first);
+        PADDLE_ENFORCE_EQ(
+            grads_tensor.at(0).at(j).first,
+            grads_tensor.at(scope_idx).at(j).first,
+            platform::errors::InvalidArgument(
+                "The variable name of grad tensors of "
+                "fused_all_reduce_op_handle  "
+                "must be consistent. The current name is %s, but the "
+                "previous name is %s.",
+                grads_tensor.at(0).at(j).first,
+                grads_tensor.at(scope_idx).at(j).first));
       }
     }
   }
@@ -167,7 +198,9 @@ bool FusedAllReduceOpHandle::InputIsInDifferentPlace(
     for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
       auto var_name = in_var_handles[j]->name();
       auto var = local_scope->FindVar(var_name);
-      PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
+      PADDLE_ENFORCE_NOT_NULL(
+          var, platform::errors::NotFound(
+                   "The variable '%s' is not found in local scope.", var_name));
       auto &lod_tensor = var->Get<LoDTensor>();
       if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) {
         return true;
@@ -185,14 +218,24 @@ void FusedAllReduceOpHandle::GetGradLoDTensor(
   size_t place_num = places_.size();
   for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
     auto var_name = in_var_handles[j]->name();
-    PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name());
+    PADDLE_ENFORCE_EQ(
+        var_name, out_var_handles[j]->name(),
+        platform::errors::InvalidArgument(
+            "The name of input variable should be equal "
+            "to the name of output variable. But got the name of input "
+            "variable is %s and the name of output variable is %s.",
+            var_name, out_var_handles[j]->name()));
     auto var = local_scope->FindVar(var_name);
-    PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound(
+                 "The variable '%s' is not found in local scope.", var_name));
     auto &lod_tensor = var->Get<LoDTensor>();
 
     PADDLE_ENFORCE_EQ(
         platform::is_same_place(lod_tensor.place(), places_.at(scope_idx)),
-        true, "%s(%d) is not in the right place.", var_name, scope_idx);
+        true, platform::errors::InvalidArgument(
+                  "The variable '%s' at scope %d is not in the right place.",
+                  var_name, scope_idx));
     grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor));
   }
 }
@@ -204,16 +247,26 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
   size_t size_of_dtype = 0;
   for (size_t i = 0; i < grad_tensor.size(); ++i) {
     // Get dtype
-    auto ele_type = grad_tensor.at(i).second->type();
+    auto ele_dtype = grad_tensor.at(i).second->type();
     if (i == 0) {
-      *dtype = ele_type;
-      size_of_dtype = framework::SizeOfType(ele_type);
+      *dtype = ele_dtype;
+      size_of_dtype = framework::SizeOfType(ele_dtype);
     }
-    PADDLE_ENFORCE_EQ(ele_type, *dtype);
+    PADDLE_ENFORCE_EQ(
+        ele_dtype, *dtype,
+        platform::errors::InvalidArgument(
+            "The DataType of grad tensors of fused_all_reduce_op_handle  "
+            "must be consistent. The current dtype is %s, but the "
+            "previous dtype is %s.",
+            DataTypeToString(ele_dtype), DataTypeToString(*dtype)));
 
     // Get element number
     int64_t len = grad_tensor.at(i).second->numel();
-    PADDLE_ENFORCE_GT(len, 0);
+    PADDLE_ENFORCE_GT(
+        len, 0, platform::errors::InvalidArgument(
+                    "The size of grad tensors of fused_all_reduce_op_handle  "
+                    "must be > 0, but got %d.",
+                    len));
     *numel +=
         platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
   }
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
index 59c5da0de8c..1ae09dcde9f 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
+
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -32,7 +33,15 @@ void FusedBroadcastOpHandle::RunImpl() {
   WaitInputVarGenerated();
 
   size_t place_num = places_.size();
-  PADDLE_ENFORCE_EQ(in_var_handles.size() * place_num, out_var_handles.size());
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size() * place_num, out_var_handles.size(),
+      platform::errors::PreconditionNotMet(
+          "The number of input variable handles plus the number "
+          "of places should be equal to the number of output variable handles, "
+          "but got the number of input variable handles is %d, the "
+          "number of places is %d, and the number of output variable handles "
+          "is %d.",
+          in_var_handles.size(), place_num, out_var_handles.size()));
 
   for (size_t i = 0; i < in_var_handles.size(); ++i) {
     BroadcastOneVar(
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
index 761a5b5a30a..ce7621d4e35 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
+
 #include <memory>
 #include <unordered_map>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
 #include "paddle/fluid/framework/details/op_handle_base.h"
@@ -58,7 +60,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
       op_handle_ = new FusedBroadcastOpHandle(
           nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else
-      PADDLE_THROW("CUDA is not supported.");
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
     } else {
 #if defined(PADDLE_WITH_NCCL)
diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc
index a039c6200e3..2d3b2fb39af 100644
--- a/paddle/fluid/framework/details/gather_op_handle.cc
+++ b/paddle/fluid/framework/details/gather_op_handle.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/gather_op_handle.h"
+
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
 
@@ -32,13 +33,20 @@ void GatherOpHandle::RunImpl() {
 
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), places_.size(),
-      "The number of output should equal to the number of places.");
+      platform::errors::InvalidArgument(
+          "The number of input variables should be equal "
+          "to the number of places, but got the number of input variables is "
+          "%d and the number of places is %d.",
+          in_var_handles.size(), places_.size()));
 
   VarHandle *out_var_handle;
   {
     auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
-    PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
-                      "The number of output should be one.");
+    PADDLE_ENFORCE_EQ(
+        out_var_handles.size(), 1,
+        platform::errors::InvalidArgument(
+            "The number of output variables should be 1, but got %d.",
+            out_var_handles.size()));
     out_var_handle = out_var_handles.front();
   }
 
@@ -47,10 +55,14 @@ void GatherOpHandle::RunImpl() {
   auto in_0_handle = in_var_handles[0];
   auto pre_in_var =
       var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
-  PADDLE_ENFORCE_NOT_NULL(pre_in_var);
+  PADDLE_ENFORCE_NOT_NULL(
+      pre_in_var,
+      platform::errors::NotFound("The variable '%s' is not found in the scope.",
+                                 in_0_handle->name()));
 
-  PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
-                 "Currently, gather_op only can gather SelectedRows.");
+  PADDLE_ENFORCE_EQ(pre_in_var->IsType<framework::SelectedRows>(), true,
+                    platform::errors::Unimplemented(
+                        "Currently, gather_op only supports SelectedRows."));
 
   // Wait input done, this Wait is asynchronous operation
   WaitInputVarGenerated();
@@ -63,7 +75,10 @@ void GatherOpHandle::RunImpl() {
   for (auto *in_handle : in_var_handles) {
     auto *in_var =
         var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
-    PADDLE_ENFORCE_NOT_NULL(in_var);
+    PADDLE_ENFORCE_NOT_NULL(
+        in_var,
+        platform::errors::NotFound(
+            "The variable '%s' is not found in the scope.", in_handle->name()));
     VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var);
 
     auto &in_sr_value = in_var->Get<framework::SelectedRows>();
@@ -76,15 +91,19 @@ void GatherOpHandle::RunImpl() {
   // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
   platform::Place t_out_p = out_var_handle->place();
   if (platform::is_gpu_place(pre_in_value.place())) {
-    PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
-                   "Places of input and output must be all on GPU.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true,
+                      platform::errors::PreconditionNotMet(
+                          "Places of input and output must be all on GPU."));
   } else {
     t_out_p = platform::CPUPlace();
   }
 
   auto out_var = var_scopes.at(out_var_handle->scope_idx())
                      ->FindVar(out_var_handle->name());
-  PADDLE_ENFORCE_NOT_NULL(out_var);
+  PADDLE_ENFORCE_NOT_NULL(
+      out_var,
+      platform::errors::NotFound("The variable '%s' is not found in the scope.",
+                                 out_var_handle->name()));
   auto out_value = out_var->GetMutable<framework::SelectedRows>();
   out_value->set_height(pre_in_value.height());
   out_value->set_rows(out_rows);
diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc
index f3fcc1a436d..60c1d0d39a5 100644
--- a/paddle/fluid/framework/details/gather_op_handle_test.cc
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/gather_op_handle.h"
+
 #include <memory>
 #include <unordered_map>
+
 #include "gtest/gtest.h"
 
 namespace paddle {
@@ -60,7 +62,8 @@ struct TestGatherOpHandle {
         ctxs_.emplace_back(new p::CUDADeviceContext(p));
       }
 #else
-      PADDLE_THROW("CUDA is not support.");
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
     } else {
       int count = 8;
@@ -141,7 +144,9 @@ struct TestGatherOpHandle {
     for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
          ++input_scope_idx) {
       auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input");
-      PADDLE_ENFORCE_NOT_NULL(in_var);
+      PADDLE_ENFORCE_NOT_NULL(
+          in_var, platform::errors::NotFound(
+                      "The variable '%s' is not found in the scope.", "input"));
       auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
       auto value = in_selected_rows->mutable_value();
       value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
@@ -155,7 +160,9 @@ struct TestGatherOpHandle {
     }
 
     auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out");
-    PADDLE_ENFORCE_NOT_NULL(out_var);
+    PADDLE_ENFORCE_NOT_NULL(
+        out_var, platform::errors::NotFound(
+                     "The variable '%s' is not found in the scope.", "out"));
     auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
 
     auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input");
@@ -173,9 +180,19 @@ struct TestGatherOpHandle {
     auto& out_select_rows = out_var->Get<f::SelectedRows>();
     auto rt = out_select_rows.value();
 
-    PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
+    PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
+                      platform::errors::InvalidArgument(
+                          "The height of SelectedRows is not equal to "
+                          "the expected, expect %d, but got %d.",
+                          height, out_select_rows.height()));
+
     for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
-      PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
+      PADDLE_ENFORCE_EQ(
+          out_select_rows.rows()[k], rows[k % rows.size()],
+          platform::errors::InvalidArgument(
+              "The item at position %d of rows of SelectedRows is not equal to "
+              "the expected, expect %d, but got %d.",
+              k, rows[k % rows.size()], out_select_rows.rows()[k]));
     }
 
     f::Tensor result_tensor;
@@ -207,6 +224,7 @@ TEST(GatherTester, TestGPUGatherTestSelectedRows) {
   test_op.TestGatherSelectedRows(input_scope_idx);
 }
 #endif
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/nccl_op_handle.h b/paddle/fluid/framework/details/nccl_op_handle.h
index 2d4d4122a3c..22a059773f5 100644
--- a/paddle/fluid/framework/details/nccl_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_op_handle.h
@@ -46,14 +46,17 @@ class NCCLOpHandleBase : public OpHandleBase {
   }
   virtual ~NCCLOpHandleBase() {
     for (auto& ev : inter_events_) {
-      PADDLE_ENFORCE(cudaEventDestroy(ev.second));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
     }
     for (auto& ev : exter_events_) {
-      PADDLE_ENFORCE(cudaEventDestroy(ev.second));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
     }
   }
   void SetRunEnv(int run_order, bool use_hierarchical_allreduce) {
-    PADDLE_ENFORCE(run_order >= 0, "run_order must >= 0");
+    PADDLE_ENFORCE_GE(
+        run_order, 0,
+        platform::errors::InvalidArgument(
+            "The argument run_order must be >= 0, but got %d.", run_order));
     run_order_ = run_order;
     use_hierarchical_allreduce_ = use_hierarchical_allreduce;
 
@@ -74,8 +77,11 @@ class NCCLOpHandleBase : public OpHandleBase {
       return;
     }
 
-    PADDLE_ENFORCE(places_.size() == 1,
-                   "HierarchicalAllReduce run one proc with one card mode.");
+    PADDLE_ENFORCE_EQ(places_.size(), 1,
+                      platform::errors::InvalidArgument(
+                          "HierarchicalAllReduce can only run "
+                          "one proccess with one card mode, but got %d cards.",
+                          places_.size()));
 
     for (auto& p : places_) {
       auto ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order);
@@ -88,11 +94,11 @@ class NCCLOpHandleBase : public OpHandleBase {
         continue;
       }
 
-      PADDLE_ENFORCE(cudaSetDevice(dev_id));
-      PADDLE_ENFORCE(cudaEventCreateWithFlags(&inter_events_[dev_id],
-                                              cudaEventDisableTiming));
-      PADDLE_ENFORCE(cudaEventCreateWithFlags(&exter_events_[dev_id],
-                                              cudaEventDisableTiming));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
+          &inter_events_[dev_id], cudaEventDisableTiming));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
+          &exter_events_[dev_id], cudaEventDisableTiming));
       VLOG(10) << "Create events on dev_id:" << dev_id
                << ", inter_event:" << &inter_events_[dev_id]
                << ", exter_event:" << &exter_events_[dev_id];
@@ -102,7 +108,10 @@ class NCCLOpHandleBase : public OpHandleBase {
   void FlatNCCLAllReduce(platform::Place place, const void* sendbuff,
                          void* recvbuff, size_t count, ncclDataType_t datatype,
                          ncclRedOp_t op) {
-    PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
+    PADDLE_ENFORCE_GE(
+        run_order_, 0,
+        platform::errors::InvalidArgument(
+            "The argument run_order_ must be >= 0, but got %d.", run_order_));
     auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_);
     int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
     auto& nccl_ctx = flat_nccl_ctxs->at(dev_id);
@@ -113,14 +122,17 @@ class NCCLOpHandleBase : public OpHandleBase {
              << ", dev_id:" << dev_id << ", dtype:" << datatype
              << ", place:" << place;
 
-    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
         sendbuff, recvbuff, count, datatype, op, comm, stream));
   }
 
   void NCCLAllReduce(platform::Place place, const void* sendbuff,
                      void* recvbuff, size_t count, ncclDataType_t datatype,
                      ncclRedOp_t op) {
-    PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
+    PADDLE_ENFORCE_GE(
+        run_order_, 0,
+        platform::errors::InvalidArgument(
+            "The argument run_order_ must be >= 0, but got %d.", run_order_));
     if (!use_hierarchical_allreduce_) {
       FlatNCCLAllReduce(place, sendbuff, recvbuff, count, datatype, op);
       return;
@@ -132,7 +144,10 @@ class NCCLOpHandleBase : public OpHandleBase {
   void HierarchicalAllReduce(platform::Place place, const void* sendbuff,
                              void* recvbuff, size_t count,
                              ncclDataType_t datatype, ncclRedOp_t op) {
-    PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
+    PADDLE_ENFORCE_GE(
+        run_order_, 0,
+        platform::errors::InvalidArgument(
+            "The argument run_order_ must be >= 0, but got %d.", run_order_));
     InterReduce(place, sendbuff, recvbuff, count, datatype, op);
     // When a trainer is not in exter allreduce ring
     // they need not to call this.
@@ -157,14 +172,13 @@ class NCCLOpHandleBase : public OpHandleBase {
              << ", dtype:" << datatype << ", place:" << place
              << ", stream:" << stream;
 
-    PADDLE_ENFORCE(platform::dynload::ncclReduce(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
         sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream));
 
     cudaEventRecord(inter_events_.at(dev_id), stream);
 
     if (FLAGS_sync_nccl_allreduce) {
-      PADDLE_ENFORCE(cudaStreamSynchronize(stream),
-                     "sync HierarchicalAllReduce inter stream error");
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
     }
   }
 
@@ -172,7 +186,9 @@ class NCCLOpHandleBase : public OpHandleBase {
                       void* recvbuff, size_t count, ncclDataType_t datatype,
                       ncclRedOp_t op) {
     auto nccl_ctxs = nccl_ctxs_->GetHierarchicalExterCtx(run_order_);
-    PADDLE_ENFORCE(nccl_ctxs_, "can't get exter %d nccl_ctxs", run_order_);
+    PADDLE_ENFORCE_NOT_NULL(
+        nccl_ctxs_, platform::errors::NotFound(
+                        "Can't get exter %d nccl contexts.", run_order_));
     int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
     auto& nccl_ctx = nccl_ctxs->at(dev_id);
     auto stream = nccl_ctx.stream();
@@ -185,14 +201,13 @@ class NCCLOpHandleBase : public OpHandleBase {
 
     cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0);
 
-    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
         sendbuff, recvbuff, count, datatype, op, comm, stream));
 
     cudaEventRecord(exter_events_.at(dev_id), stream);
 
     if (FLAGS_sync_nccl_allreduce) {
-      PADDLE_ENFORCE(cudaStreamSynchronize(stream),
-                     "sync HierarchicalAllReduce exter stream error");
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
     }
   }
 
@@ -210,8 +225,8 @@ class NCCLOpHandleBase : public OpHandleBase {
              << ", stream:" << stream;
 
     cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0);
-    PADDLE_ENFORCE(platform::dynload::ncclBcast(sendbuff, count, datatype, 0,
-                                                comm, stream));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
+        sendbuff, count, datatype, 0, comm, stream));
   }
 
  protected:
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 459bcff5c0b..105c37192f5 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -47,8 +47,8 @@ void OpHandleBase::InitCUDA() {
 #ifdef PADDLE_WITH_CUDA
   for (auto &p : dev_ctxes_) {
     int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
-    PADDLE_ENFORCE(cudaSetDevice(dev_id));
-    PADDLE_ENFORCE(
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
   }
   if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
@@ -62,17 +62,22 @@ void OpHandleBase::InitCUDA() {
       }
     }
   } else {
-    PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL,
-                      "%s should have only one dev_ctx.", Name());
+    PADDLE_ENFORCE_EQ(
+        dev_ctxes_.size(), 1UL,
+        platform::errors::InvalidArgument(
+            "Operator %s should have only one dev_ctx, but got %d.", Name(),
+            dev_ctxes_.size()));
     auto &place = dev_ctxes_.begin()->first;
     int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
     for (auto &out_var : outputs_) {
       auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
       if (out_var_handle) {
-        PADDLE_ENFORCE(platform::is_same_place(place, out_var_handle->place()),
-                       "The place of output(%s) is not consistent with the "
-                       "place of current op(%s).",
-                       out_var_handle->Name(), Name());
+        PADDLE_ENFORCE_EQ(
+            platform::is_same_place(place, out_var_handle->place()), true,
+            platform::errors::InvalidArgument(
+                "The place of output(%s) is not consistent with the "
+                "place of current op(%s).",
+                out_var_handle->Name(), Name()));
         out_var_handle->SetGenerateEvent(events_.at(dev_id));
       }
     }
@@ -86,7 +91,10 @@ void OpHandleBase::Run(bool use_cuda) {
     InitCUDA();
   }
 #else
-  PADDLE_ENFORCE(!use_cuda);
+  PADDLE_ENFORCE_EQ(use_cuda, false,
+                    platform::errors::InvalidArgument(
+                        "Argument use_cuda should be false when Paddle is not "
+                        "compiled with CUDA."));
 #endif
 
   // skip running current op, used with inplace_addto_op_pass
@@ -100,17 +108,20 @@ void OpHandleBase::Run(bool use_cuda) {
 
 void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
 #ifdef PADDLE_WITH_CUDA
-  PADDLE_ENFORCE_NOT_NULL(waited_ctx);
+  PADDLE_ENFORCE_NOT_NULL(waited_ctx, platform::errors::InvalidArgument(
+                                          "Argument waited_ctx is NULL."));
   if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) {
     for (auto &dev_ctx : dev_ctxes_) {
-      PADDLE_ENFORCE_NOT_NULL(dev_ctx.second);
+      PADDLE_ENFORCE_NOT_NULL(
+          dev_ctx.second,
+          platform::errors::InvalidArgument("The device context is NULL."));
       dev_ctx.second->Wait();
     }
   } else {
     auto stream =
         static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
     for (auto &ev : events_) {
-      PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0));
     }
   }
 #else
@@ -145,10 +156,11 @@ void OpHandleBase::WaitInputVarGenerated() {
           auto stream =
               static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
                   ->stream();
-          PADDLE_ENFORCE(
+          PADDLE_ENFORCE_CUDA_SUCCESS(
               cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
 #else
-          PADDLE_THROW("Doesn't compile the GPU.");
+          PADDLE_THROW(
+              platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
         }
         // There are nothing to do when the place is CPUPlace.
@@ -169,10 +181,11 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
           auto stream = static_cast<platform::CUDADeviceContext *>(
                             dev_ctxes_.at(in_var_handle->place()))
                             ->stream();
-          PADDLE_ENFORCE(
+          PADDLE_ENFORCE_CUDA_SUCCESS(
               cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
 #else
-          PADDLE_THROW("Doesn't compile the GPU.");
+          PADDLE_THROW(
+              platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
         }
         // There are nothing to do when the place is CPUPlace.
@@ -242,7 +255,9 @@ void OpHandleBase::SetLocalExecScopes(
   auto scopes = GetLocalScopes();
   for (auto *scope : scopes) {
     auto iter = scope_map.find(scope);
-    PADDLE_ENFORCE(iter != scope_map.end(), "Local scope not found");
+    PADDLE_ENFORCE_NE(
+        iter, scope_map.end(),
+        platform::errors::NotFound("Local scope not found in scope map."));
     local_exec_scopes_.emplace_back(iter->second);
   }
 }
diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h
index 1e608000e0a..453a25166b5 100644
--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/grad_op_desc_maker.h"
 #include "paddle/fluid/framework/inplace_op_inference.h"
 #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
@@ -186,19 +187,20 @@ struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
   void operator()(const char* op_type, OpInfo* info) const {
     PADDLE_ENFORCE_EQ(info->proto_, nullptr,
                       platform::errors::AlreadyExists(
-                          "OpProto of %s has been registered", op_type));
+                          "OpProto of %s has been registered.", op_type));
     PADDLE_ENFORCE_EQ(info->checker_, nullptr,
                       platform::errors::AlreadyExists(
-                          "OpAttrChecker of %s has been registered", op_type));
+                          "OpAttrChecker of %s has been registered.", op_type));
     info->proto_ = new proto::OpProto;
     info->checker_ = new OpAttrChecker();
     T maker;
     maker(info->proto_, info->checker_);
     info->proto_->set_type(op_type);
-    PADDLE_ENFORCE(
-        info->proto_->IsInitialized(),
-        "Fail to initialize %s's OpProto, because %s is not initialized",
-        op_type, info->proto_->InitializationErrorString());
+    PADDLE_ENFORCE_EQ(
+        info->proto_->IsInitialized(), true,
+        platform::errors::PreconditionNotMet(
+            "Fail to initialize %s's OpProto, because %s is not initialized.",
+            op_type, info->proto_->InitializationErrorString()));
   }
 };
 
diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h
index 11c4621fde3..9ecb2d8dbdd 100644
--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -16,6 +16,7 @@
 #include <algorithm>
 #include <map>
 #include <vector>
+
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows.h"
@@ -32,9 +33,13 @@ struct ReduceLoDTensor {
 
   template <typename T>
   void apply() const {
-    PADDLE_ENFORCE(!src_tensors_.empty());
+    PADDLE_ENFORCE_NE(src_tensors_.empty(), true,
+                      platform::errors::InvalidArgument(
+                          "The number of tensors to be reduced is 0."));
     auto &t0 = *src_tensors_[0];
-    PADDLE_ENFORCE_NE(t0.numel(), 0);
+    PADDLE_ENFORCE_NE(t0.numel(), 0,
+                      platform::errors::InvalidArgument(
+                          "The size of first tensor to be reduced is 0."));
 
     dst_tensor_.Resize(t0.dims());
     T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
@@ -45,8 +50,19 @@ struct ReduceLoDTensor {
         continue;
       }
 
-      PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
-      PADDLE_ENFORCE_EQ(t.type(), t0.type());
+      PADDLE_ENFORCE_EQ(t.dims(), t0.dims(),
+                        platform::errors::InvalidArgument(
+                            "The shape of tensors to be reduced must be "
+                            "consistent. The shape of current tensor is %s, "
+                            "but the shape of the first tensor is %s.",
+                            t.dims(), t0.dims()));
+
+      PADDLE_ENFORCE_EQ(t.type(), t0.type(),
+                        platform::errors::InvalidArgument(
+                            "The type of tensors to be reduced must be "
+                            "consistent. The type of current tensor is %s, "
+                            "but the type of the first tensor is %s.",
+                            t.type(), t0.type()));
       std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
                      [](T a, T b) -> T { return a + b; });
     }
@@ -88,7 +104,9 @@ struct GatherLocalSelectedRowsFunctor {
         in_places_(in_places),
         out_place_(out_place),
         dst_selected_rows_(dst_selected_rows) {
-    PADDLE_ENFORCE_EQ(src_selected_rows.empty(), false);
+    PADDLE_ENFORCE_NE(src_selected_rows.empty(), true,
+                      platform::errors::InvalidArgument(
+                          "The number of selected_rows to be gathered is 0."));
 
     std::vector<int64_t> out_rows;
 
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index d8f8cc994c0..d7f13f79f68 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
+
 #include <memory>
+
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
@@ -116,8 +118,15 @@ void ReduceOpHandle::GatherSelectedRows(
   merged_dev_ctx->Wait();
   scope->EraseVars(std::vector<std::string>{gathered_var_name});
 
-  PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope));
-  PADDLE_ENFORCE(remote.size() == vars.size());
+  PADDLE_ENFORCE_EQ(
+      client->Gather(vars, &remote, *merged_dev_ctx, scope), true,
+      platform::errors::PreconditionNotMet("Gather SelectedRows failed."));
+  PADDLE_ENFORCE_EQ(remote.size(), vars.size(),
+                    platform::errors::PreconditionNotMet(
+                        "The number of remotes should be equal to the number "
+                        "of variables to be gathered, but got the number of "
+                        "remotes is %d and the number of variables is %d.",
+                        remote.size(), vars.size()));
 
   // 4. merged local selected rows.
   std::vector<const SelectedRows *> all;
@@ -151,14 +160,19 @@ void ReduceOpHandle::RunImpl() {
 
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), places_.size(),
-      "The number of output should equal to the number of places.");
+      platform::errors::InvalidArgument(
+          "The number of inputs should equal to the number of places, but got "
+          "the number of inputs is %d and the number of places is %d.",
+          in_var_handles.size(), places_.size()));
 
   VarHandle *out_var_handle;
   {
     auto out_var_handles = DynamicCast<VarHandle>(outputs_);
 
     PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL,
-                      "The number of output should be one.");
+                      platform::errors::InvalidArgument(
+                          "The number of output should be one, but got %d.",
+                          out_var_handles.size()));
     out_var_handle = out_var_handles.front();
   }
 
@@ -168,7 +182,10 @@ void ReduceOpHandle::RunImpl() {
 
   auto pre_in_var =
       var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
-  PADDLE_ENFORCE_NOT_NULL(pre_in_var);
+
+  PADDLE_ENFORCE_NOT_NULL(pre_in_var, platform::errors::NotFound(
+                                          "Variable %s is not found in scope.",
+                                          in_0_handle->name()));
 
   // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
   std::vector<platform::Place> in_places;  // used to get dev_ctx
@@ -176,21 +193,29 @@ void ReduceOpHandle::RunImpl() {
     in_places.emplace_back(in_handle->place());
     auto in_var =
         var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
-    PADDLE_ENFORCE_NOT_NULL(in_var);
+
+    PADDLE_ENFORCE_NOT_NULL(
+        in_var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                           in_handle->name()));
+
     VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var);
   }
 
   auto out_var = var_scopes.at(out_var_handle->scope_idx())
                      ->FindVar(out_var_handle->name());
-  PADDLE_ENFORCE_NOT_NULL(out_var);
+
+  PADDLE_ENFORCE_NOT_NULL(
+      out_var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                          out_var_handle->name()));
 
   // NOTE: The tensors' Place of input and output must be all on GPU or all on
   // CPU.
   auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place();
   platform::Place t_out_p;
   if (platform::is_gpu_place(in_p)) {
-    PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place()),
-                   "Places of input and output must be all on GPU.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_var_handle->place()), true,
+                      platform::errors::PreconditionNotMet(
+                          "Places of input and output must be all on GPU."));
     t_out_p = out_var_handle->place();
   } else {
     t_out_p = platform::CPUPlace();
@@ -229,7 +254,10 @@ void ReduceOpHandle::RunImpl() {
             in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
             out_var->GetMutable<framework::SelectedRows>());
       } else {
-        PADDLE_THROW("only support double or float when gather SelectedRows");
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Only support double or float when gather SelectedRows, but got "
+            "%s.",
+            framework::DataTypeToString(in_selected_rows[0]->value().type())));
       }
 #endif
     });
@@ -292,7 +320,7 @@ void ReduceOpHandle::RunImpl() {
         size_t numel = static_cast<size_t>(lod_tensor.numel());
         all_reduce_calls.emplace_back(
             [buffer, recvbuffer, type, numel, root_id, &nccl_ctx] {
-              PADDLE_ENFORCE(platform::dynload::ncclReduce(
+              PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
                   buffer, recvbuffer, numel, static_cast<ncclDataType_t>(type),
                   ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream()));
             });
@@ -306,10 +334,13 @@ void ReduceOpHandle::RunImpl() {
         }
       });
 #else
-      PADDLE_THROW("CUDA is not enabled.");
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
     } else {
-      PADDLE_THROW("Place should be CPUPlace or CUDAPlace.");
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "The place of tensor should be CPUPlace or CUDAPlace, but got %s.",
+          lod_tensors[0]->place()));
     }
   }
 }
diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc
index d71251b76c7..ba03c3a267a 100644
--- a/paddle/fluid/framework/details/reduce_op_handle_test.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
+
 #include <unordered_map>
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/platform/device_context.h"
 
@@ -69,7 +71,8 @@ struct TestReduceOpHandle {
       }
       nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
 #else
-      PADDLE_THROW("CUDA is not support.");
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with NCLL."));
 #endif
     } else {
       int count = 8;
@@ -103,7 +106,8 @@ struct TestReduceOpHandle {
       op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
                                           gpu_list_, nccl_ctxs_.get()));
 #else
-      PADDLE_THROW("CUDA is not support.");
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with NCLL."));
 #endif
     } else {
 #if defined(PADDLE_WITH_NCCL)
@@ -164,7 +168,10 @@ struct TestReduceOpHandle {
     for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
          ++input_scope_idx) {
       auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
-      PADDLE_ENFORCE_NOT_NULL(in_var);
+
+      PADDLE_ENFORCE_NOT_NULL(
+          in_var, platform::errors::NotFound(
+                      "Variable %s is not found in scope.", "input"));
       auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
       auto value = in_selected_rows->mutable_value();
       value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
@@ -178,7 +185,9 @@ struct TestReduceOpHandle {
     }
 
     auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
-    PADDLE_ENFORCE_NOT_NULL(out_var);
+    PADDLE_ENFORCE_NOT_NULL(out_var,
+                            platform::errors::NotFound(
+                                "Variable %s is not found in scope.", "out"));
     auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
 
     auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
@@ -196,9 +205,18 @@ struct TestReduceOpHandle {
     auto &out_select_rows = out_var->Get<f::SelectedRows>();
     auto rt = out_select_rows.value();
 
-    PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
+    PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
+                      platform::errors::InvalidArgument(
+                          "The height of SelectedRows is not equal to "
+                          "the expected, expect %d, but got %d.",
+                          height, out_select_rows.height()));
     for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
-      PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
+      PADDLE_ENFORCE_EQ(
+          out_select_rows.rows()[k], rows[k % rows.size()],
+          platform::errors::InvalidArgument(
+              "The item at position %d of rows of SelectedRows is not equal to "
+              "the expected, expect %d, but got %d.",
+              k, rows[k % rows.size()], out_select_rows.rows()[k]));
     }
 
     f::Tensor result_tensor;
@@ -208,7 +226,7 @@ struct TestReduceOpHandle {
     for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
       ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
     }
-  }
+  }  // namespace details
 
   void TestReduceLodTensors(size_t output_scope_idx) {
     std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
@@ -220,7 +238,9 @@ struct TestReduceOpHandle {
     for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
          ++input_scope_idx) {
       auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
-      PADDLE_ENFORCE_NOT_NULL(in_var);
+      PADDLE_ENFORCE_NOT_NULL(
+          in_var, platform::errors::NotFound(
+                      "Variable %s is not found in scope.", "input"));
       auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
       in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
       in_lod_tensor->set_lod(lod);
@@ -230,7 +250,9 @@ struct TestReduceOpHandle {
     }
 
     auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
-    PADDLE_ENFORCE_NOT_NULL(out_var);
+    PADDLE_ENFORCE_NOT_NULL(out_var,
+                            platform::errors::NotFound(
+                                "Variable %s is not found in scope.", "out"));
     auto out_lodtensor = out_var->GetMutable<f::LoDTensor>();
 
     auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
@@ -254,7 +276,7 @@ struct TestReduceOpHandle {
       ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5);
     }
   }
-};
+};  // namespace details
 
 TEST(ReduceTester, TestCPUReduceTestSelectedRows) {
   TestReduceOpHandle test_op;
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
index bf93d8f85b1..079e9abc895 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
+++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
@@ -111,13 +111,12 @@ void ShareTensorBufferFunctor::CallOnce() {
     auto *out_var = exec_scope_->FindVar(out_var_names_[i]);
     PADDLE_ENFORCE_NOT_NULL(
         in_var, platform::errors::NotFound(
-                    "The input variable(%s)to be inplaced should not be NULL.",
+                    "The variable(%s) to be inplaced is not found in scope.",
                     in_var_infos_[i]->Name()));
     PADDLE_ENFORCE_NOT_NULL(
-        out_var,
-        platform::errors::NotFound(
-            "The output variable(%s) to be inplaced should not be NULL.",
-            out_var_names_[i]));
+        out_var, platform::errors::NotFound(
+                     "The variable(%s) to be inplaced is not found in scope.",
+                     out_var_names_[i]));
     PADDLE_ENFORCE_NE(
         in_var, out_var,
         platform::errors::PreconditionNotMet(
diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc
index 3f9af1c3a12..37399e5ddc0 100644
--- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc
@@ -12,8 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h"
+
 #include <algorithm>
 #include <utility>
+
 #include "dgc/dgc.h"
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
@@ -38,18 +40,23 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle(
       is_encoded_(is_encoded),
       nranks_(nranks) {
   // TODO(gongwb) :polish them!
-  PADDLE_ENFORCE_EQ(is_encoded, true);
+  PADDLE_ENFORCE_EQ(is_encoded, true, platform::errors::InvalidArgument(
+                                          "The argument is_encoded is false."));
   VLOG(1) << "Use dgc allreduce mode"
           << ", nranks:" << nranks_;
 
-  PADDLE_ENFORCE_GT(local_scopes_.size(), 0);
+  PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
+                    platform::errors::PreconditionNotMet(
+                        "The number of local scope should be > 0, but got %zu.",
+                        local_scopes_.size()));
   auto nranks_name = g_dgc_nranks;
   for (size_t i = 0; i < local_scopes_.size(); ++i) {
     auto *local_scope = local_scopes_[i];
     auto nranks_var = local_scope->FindVar(nranks_name);
-    if (nranks_var == nullptr) {
-      PADDLE_THROW("not find nranks_var:%s", nranks_name);
-    }
+
+    PADDLE_ENFORCE_NOT_NULL(
+        nranks_var, platform::errors::NotFound(
+                        "Variable %s is not found in scope.", nranks_name));
 
     float *dgc_nranks = nranks_var->GetMutable<LoDTensor>()->data<float>();
     *dgc_nranks = nranks;
@@ -64,10 +71,18 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
   auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), places_.size(),
-      "The NoDummyInputSize should be equal to the number of places.");
+      platform::errors::PreconditionNotMet(
+          "The number of input variables should be equal to the number of "
+          "places, but got the number of input variables is %zu and the the "
+          "number of places is %zu.",
+          in_var_handles.size(), places_.size()));
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), out_var_handles.size(),
-      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
+      platform::errors::PreconditionNotMet(
+          "The number of input variables should be equal to the number of "
+          "output variables, but got the number of input variables is %zu and "
+          "the the number of output variables is %zu.",
+          in_var_handles.size(), out_var_handles.size()));
 
   std::vector<const LoDTensor *> ins;
   std::vector<LoDTensor *> gathers;
@@ -80,14 +95,17 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
 
     auto encode_var_name = original_name + g_dgc_encoded;
     auto *in_var = local_scope->FindVar(encode_var_name);
-    PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
+    PADDLE_ENFORCE_NOT_NULL(
+        in_var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                           encode_var_name));
     auto &in = in_var->Get<LoDTensor>();
     ins.emplace_back(&in);
 
     auto gather_var_name = original_name + g_dgc_gather;
     auto *gather_var = local_scope->FindVar(gather_var_name);
-    PADDLE_ENFORCE_NOT_NULL(gather_var, "%s should not be null",
-                            gather_var_name);
+    PADDLE_ENFORCE_NOT_NULL(
+        gather_var, platform::errors::NotFound(
+                        "Variable %s is not found in scope.", gather_var));
     auto *gather = gather_var->GetMutable<LoDTensor>();
     gathers.emplace_back(gather);
 
@@ -100,14 +118,26 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
     }
   }
 
-  PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place()));
-  PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place()));
-  PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
+  PADDLE_ENFORCE_EQ(
+      platform::is_gpu_place(ins[0]->place()), true,
+      platform::errors::InvalidArgument(
+          "The place of input variable should be CUDAPlace, but got %s.",
+          ins[0]->place()));
+  PADDLE_ENFORCE_EQ(
+      platform::is_gpu_place(outs[0]->place()), true,
+      platform::errors::InvalidArgument(
+          "The place of input variable should be CUDAPlace, but got %s.",
+          outs[0]->place()));
+  PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, platform::errors::PreconditionNotMet(
+                                          "The nccl contexts are NULL."));
 
   int dtype = -1;
   size_t in_numel = 0;
   size_t out_numel = 0;
-  PADDLE_ENFORCE(nranks_ > 1);
+  PADDLE_ENFORCE_GT(
+      nranks_, 1,
+      platform::errors::PreconditionNotMet(
+          "The number of ranks should be > 1, but got %d.", nranks_));
   std::vector<std::function<void()>> all_gather_calls;
   std::vector<std::function<void()>> sparse_reduce_calls;
 
@@ -123,8 +153,16 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
 
     dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype;
     in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel;
-    PADDLE_ENFORCE(in_numel % 2 == 0);
-    PADDLE_ENFORCE(in_numel / 2 == static_cast<size_t>(k));
+    PADDLE_ENFORCE_EQ(in_numel % 2, 0,
+                      platform::errors::InvalidArgument(
+                          "The number of elements of input variable should be "
+                          "even, but got %zu.",
+                          in_numel));
+    PADDLE_ENFORCE_EQ(in_numel / 2, static_cast<size_t>(k),
+                      platform::errors::InvalidArgument(
+                          "The number of elements of input variable should be "
+                          "even, but got %zu.",
+                          in_numel));
     out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
 
     int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
@@ -154,7 +192,8 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
       PADDLE_ENFORCE_EQ(paddle::communication::dgc::sparseReduce(
                             gather_buff, k, out_tensor_buf,
                             static_cast<int>(out_numel), nranks_, stream),
-                        true);
+                        true, platform::errors::Unavailable(
+                                  "Calling sparseReduce() failed."));
     });
   }
 
@@ -187,11 +226,16 @@ void SparseAllReduceOpHandle::SparseAllReduceFunc(
 int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) {
   auto original_name = paddle::framework::GradOriginalVarName(grad_name);
   auto var_name = original_name + g_dgc_k;
-  PADDLE_ENFORCE(local_scopes_.size() > 0);
+  PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
+                    platform::errors::PreconditionNotMet(
+                        "The number of local scope should be > 0, but got %zu.",
+                        local_scopes_.size()));
 
   auto *scope = local_exec_scopes_[0];
   auto var = scope->FindVar(var_name);
-  PADDLE_ENFORCE_NOT_NULL(var);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                      var_name));
   auto tensor = var->Get<LoDTensor>().data<float>();
   return *tensor;
 }
@@ -202,15 +246,22 @@ bool SparseAllReduceOpHandle::IsEncoded() {
   }
   auto counter_name = g_dgc_counter_name;
   auto step_name = g_dgc_rampup_begin_step;
-  PADDLE_ENFORCE(local_scopes_.size() > 0);
+
+  PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
+                    platform::errors::PreconditionNotMet(
+                        "The number of local scope should be > 0, but got %zu.",
+                        local_scopes_.size()));
 
   auto *local_scope = local_exec_scopes_[0];
   auto count_var = local_scope->FindVar(counter_name);
   auto step_var = local_scope->FindVar(step_name);
-  if (count_var == nullptr || step_var == nullptr) {
-    PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name,
-                 step_var);
-  }
+
+  PADDLE_ENFORCE_NOT_NULL(
+      count_var, platform::errors::NotFound(
+                     "Variable %s is not found in scope.", counter_name));
+  PADDLE_ENFORCE_NOT_NULL(
+      step_var, platform::errors::NotFound("Variable %s is not found in scope.",
+                                           step_var));
 
   float count = *count_var->Get<LoDTensor>().data<float>();
   float step = *step_var->Get<LoDTensor>().data<float>();
-- 
GitLab


From d014e29fc611392a015dc54b20a0d347e92e65f7 Mon Sep 17 00:00:00 2001
From: Chengmo <cmchengmo@163.com>
Date: Sun, 27 Sep 2020 13:32:32 +0800
Subject: [PATCH 248/261] fix error message (#27318)

* fix sgd/momentum/dpsgd/rmsprop error message
---
 paddle/fluid/operators/optimizers/dpsgd_op.cc | 35 +++++---
 paddle/fluid/operators/optimizers/dpsgd_op.h  | 18 ++--
 .../fluid/operators/optimizers/momentum_op.h  | 79 +++++++++++------
 .../fluid/operators/optimizers/rmsprop_op.cc  | 88 ++++++++++++-------
 .../fluid/operators/optimizers/rmsprop_op.h   | 37 +++++---
 paddle/fluid/operators/optimizers/sgd_op.cc   | 34 ++++---
 paddle/fluid/operators/optimizers/sgd_op.cu   | 36 ++++++--
 paddle/fluid/operators/optimizers/sgd_op.h    | 79 +++++++++++++----
 8 files changed, 277 insertions(+), 129 deletions(-)

diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.cc b/paddle/fluid/operators/optimizers/dpsgd_op.cc
index 3bcf17fc7b3..bce00933420 100644
--- a/paddle/fluid/operators/optimizers/dpsgd_op.cc
+++ b/paddle/fluid/operators/optimizers/dpsgd_op.cc
@@ -24,32 +24,45 @@ class DpsgdOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
-                      "Input(Param) of DpsgdOp should not be null.");
+                      platform::errors::NotFound(
+                          "Input(Param) of DpsgdOp should not be null."));
     PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
-                      "Input(Grad) of DpsgdOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
-                      "Input(LearningRate) of DpsgdOp should not be null.");
+                      platform::errors::NotFound(
+                          "Input(Grad) of DpsgdOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("LearningRate"), true,
+        platform::errors::NotFound(
+            "Input(LearningRate) of DpsgdOp should not be null."));
     PADDLE_ENFORCE_EQ(
         ctx->GetInputsVarType("Param").front(),
         framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
+        platform::errors::InvalidArgument(
+            "The input var's type should be LoDTensor, but the received is %s",
+            ctx->GetInputsVarType("Param").front()));
     PADDLE_ENFORCE_EQ(
         ctx->GetInputsVarType("Grad").front(),
         framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
+        platform::errors::InvalidArgument(
+            "The input var's type should be LoDTensor, but the received is %s",
+            ctx->GetInputsVarType("Grad").front()));
 
     PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
-                      "Output(ParamOut) of DpsgdOp should not be null.");
+                      platform::errors::NotFound(
+                          "Output(ParamOut) of DpsgdOp should not be null."));
 
     auto lr_dims = ctx->GetInputDim("LearningRate");
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-                      "Learning rate should have 1 dimension");
+                      platform::errors::InvalidArgument(
+                          "Learning rate should have 1 dimension. But Received "
+                          "LearningRate's dims [%s].",
+                          framework::product(lr_dims)));
     auto param_dims = ctx->GetInputDim("Param");
     PADDLE_ENFORCE_EQ(
         param_dims, ctx->GetInputDim("Grad"),
-        "Param and Grad input of DpsgdOp should have same dimension");
+        platform::errors::InvalidArgument(
+            "Param and Grad input of DpsgdOp should have same dimension. But "
+            "received Para's dim [%s] and Grad's dim [%s].",
+            param_dims, ctx->GetInputDim("Grad")));
 
     ctx->SetOutputDim("ParamOut", param_dims);
   }
diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.h b/paddle/fluid/operators/optimizers/dpsgd_op.h
index 4eb52feb851..e52a1dd9db1 100644
--- a/paddle/fluid/operators/optimizers/dpsgd_op.h
+++ b/paddle/fluid/operators/optimizers/dpsgd_op.h
@@ -28,17 +28,19 @@ class DpsgdOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     const auto *param_var = ctx.InputVar("Param");
     PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
-                      "The Var(%s)'s type should be LoDTensor, "
-                      "but the received is %s",
-                      ctx.InputNames("Param").front(),
-                      framework::ToTypeName(param_var->Type()));
+                      platform::errors::InvalidArgument(
+                          "The Var(%s)'s type should be LoDTensor, "
+                          "but the received is %s",
+                          ctx.InputNames("Param").front(),
+                          framework::ToTypeName(param_var->Type())));
 
     const auto *grad_var = ctx.InputVar("Grad");
     PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
-                      "The Var(%s)'s type should be LoDTensor, "
-                      "but the received is %s",
-                      ctx.InputNames("Grad").front(),
-                      framework::ToTypeName(grad_var->Type()));
+                      platform::errors::InvalidArgument(
+                          "The Var(%s)'s type should be LoDTensor, "
+                          "but the received is %s",
+                          ctx.InputNames("Grad").front(),
+                          framework::ToTypeName(grad_var->Type())));
 
     const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
 
diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h
index 10b72524efd..083bd91abfc 100644
--- a/paddle/fluid/operators/optimizers/momentum_op.h
+++ b/paddle/fluid/operators/optimizers/momentum_op.h
@@ -40,43 +40,62 @@ class MomentumOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(param) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(grad) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Velocity"),
-                   "Input(velocity) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                   "Input(LearningRate) of Momentum should not be null.");
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Param").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"),
-                   "Output(VelocityOut) of Momentum should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
+                      platform::errors::NotFound(
+                          "Input(param) of Momentum should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
+                      platform::errors::NotFound(
+                          "Input(grad) of Momentum should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Velocity"), true,
+                      platform::errors::NotFound(
+                          "Input(velocity) of Momentum should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("LearningRate"), true,
+        platform::errors::NotFound(
+            "Input(LearningRate) of Momentum should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->GetInputsVarType("Param").front(),
+        framework::proto::VarType::LOD_TENSOR,
+        platform::errors::InvalidArgument(
+            "The input var's type should be LoDTensor, but the received is %s",
+            ctx->GetInputsVarType("Param").front()));
+
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
+                      platform::errors::NotFound(
+                          "Output(ParamOut) of Momentum should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("VelocityOut"), true,
+        platform::errors::NotFound(
+            "Output(VelocityOut) of Momentum should not be null."));
 
     auto lr_dims = ctx->GetInputDim("LearningRate");
     PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
-                      "Maybe the Input variable LearningRate has not "
-                      "been initialized. You may need to confirm "
-                      "if you put exe.run(startup_program) "
-                      "after optimizer.minimize function.");
+                      platform::errors::InvalidArgument(
+                          "Maybe the Input variable LearningRate has not "
+                          "been initialized. You may need to confirm "
+                          "if you put exe.run(startup_program) "
+                          "after optimizer.minimize function."));
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-                      "Learning_rate should be a scalar");
+                      platform::errors::InvalidArgument(
+                          "Learning_rate should be a scalar. But Received "
+                          "LearningRate's dim [%s]",
+                          framework::product(lr_dims)));
 
     auto param_dim = ctx->GetInputDim("Param");
     if (ctx->GetInputsVarType("Grad")[0] ==
         framework::proto::VarType::LOD_TENSOR) {
       PADDLE_ENFORCE_EQ(
           param_dim, ctx->GetInputDim("Grad"),
-          "Param and Grad input of MomentumOp should have the same dimension.");
+          platform::errors::InvalidArgument(
+              "Param and Grad input of MomentumOp should have the same "
+              "dimension. But received Param's dim [%s] and Grad's dim [%s].",
+              param_dim, ctx->GetInputDim("Grad")));
       PADDLE_ENFORCE_EQ(
           param_dim, ctx->GetInputDim("Velocity"),
-          "Param and Velocity of MomentumOp should have the same dimension.");
+          platform::errors::InvalidArgument(
+              "Param and Velocity of MomentumOp should have the same "
+              "dimension. But received Param's dim [%s] and Velocity [%s].",
+              param_dim, ctx->GetInputDim("Velocity")));
     }
 
     ctx->SetOutputDim("ParamOut", param_dim);
@@ -398,10 +417,12 @@ class MomentumOpKernel : public framework::OpKernel<T> {
         for_range(functor);
       }
     } else {
-      PADDLE_THROW(
-          string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows "
-                          "gradient, but the received Variable Type is %s",
-                          framework::ToTypeName(grad_var->Type())));
+      PADDLE_ENFORCE_EQ(false, true,
+                        platform::errors::PermissionDenied(
+                            "Unsupported Variable Type of Grad "
+                            "in MomentumOp. Excepted LodTensor "
+                            "or SelectedRows, But received [%s]",
+                            paddle::framework::ToTypeName(grad_var->Type())));
     }
   }
 };
diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cc b/paddle/fluid/operators/optimizers/rmsprop_op.cc
index eeee008cdc5..9e7960c237f 100644
--- a/paddle/fluid/operators/optimizers/rmsprop_op.cc
+++ b/paddle/fluid/operators/optimizers/rmsprop_op.cc
@@ -22,47 +22,75 @@ class RmspropOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("MeanSquare"),
-                   "Input(MeanSquare) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                   "Input(LearningRate) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Moment"),
-                   "Input(Moment) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Param").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(param_out) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
-                   "Output(MomentOut) of RmspropOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"),
-                   "Output(MeanSquareOut) of RmspropOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
+                      platform::errors::NotFound(
+                          "Input(Param) of RmspropOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("MeanSquare"), true,
+        platform::errors::NotFound(
+            "Input(MeanSquare) of RmspropOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("LearningRate"), true,
+        platform::errors::NotFound(
+            "Input(LearningRate) of RmspropOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
+                      platform::errors::NotFound(
+                          "Input(Grad) of RmspropOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Moment"), true,
+                      platform::errors::NotFound(
+                          "Input(Moment) of RmspropOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Param").front(),
+                      framework::proto::VarType::LOD_TENSOR,
+                      platform::errors::InvalidArgument(
+                          "The input var's type in RmspropOp should be "
+                          "LoDTensor, but the received is %s",
+                          ctx->GetInputsVarType("Param").front()));
+
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("ParamOut"), true,
+        platform::errors::NotFound(
+            "Output(param_out) of RmspropOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("MomentOut"), true,
+        platform::errors::NotFound(
+            "Output(MomentOut) of RmspropOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("MeanSquareOut"), true,
+        platform::errors::NotFound(
+            "Output(MeanSquareOut) of RmspropOp should not be null."));
     if (ctx->Attrs().Get<bool>("centered")) {
-      PADDLE_ENFORCE(ctx->HasOutput("MeanGradOut"),
-                     "Output(MeanGradOut) of RmspropOp should not be null.");
+      PADDLE_ENFORCE_EQ(
+          ctx->HasOutput("MeanGradOut"), true,
+          platform::errors::NotFound(
+              "Output(MeanGradOut) of RmspropOp should not be null."));
     }
 
     auto param_dim = ctx->GetInputDim("Param");
     PADDLE_ENFORCE_EQ(
         param_dim, ctx->GetInputDim("Grad"),
-        "Param and grad input of RmspropOp should have the same dimension.");
+        platform::errors::InvalidArgument(
+            "Param and grad input of RmspropOp should have the same dimension. "
+            "But received Param's dim [%s] and Grad's dim [%s].",
+            param_dim, ctx->GetInputDim("Grad")));
     PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Moment"),
-                      "Param and Momentum input of RmspropOp "
-                      "should have the same dimension.");
+                      platform::errors::InvalidArgument(
+                          "Param and Momentum input of RmspropOp "
+                          "should have the same dimension. But received "
+                          "Param's dim [%s] and Moment [%s]",
+                          param_dim, ctx->GetInputDim("Moment")));
     PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("MeanSquare"),
-                      "Param and Momentum input of RmspropOp "
-                      "should have the same dimension.");
+                      platform::errors::InvalidArgument(
+                          "Param and Momentum input of RmspropOp "
+                          "should have the same dimension. But received "
+                          "Param's dim [%s] and MeanSquare [%s]",
+                          param_dim, ctx->GetInputDim("MeanSquare")));
 
     auto lr_dim = ctx->GetInputDim("LearningRate");
     PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1,
-                      "Learning Rate should be a scalar.");
+                      platform::errors::InvalidArgument(
+                          "Learning Rate of RmspropOp should be a scalar. But "
+                          "received LearningRate's dim [%s]",
+                          framework::product(lr_dim)));
 
     ctx->SetOutputDim("ParamOut", param_dim);
     ctx->SetOutputDim("MomentOut", param_dim);
diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.h b/paddle/fluid/operators/optimizers/rmsprop_op.h
index 4550052b2d6..1ec712a1431 100644
--- a/paddle/fluid/operators/optimizers/rmsprop_op.h
+++ b/paddle/fluid/operators/optimizers/rmsprop_op.h
@@ -148,11 +148,15 @@ class RmspropOpKernel : public framework::OpKernel<T> {
     auto &mom_tensor = *ctx.Input<LoDTensor>("Moment");
 
     PADDLE_ENFORCE_EQ(&p_tensor, param_out,
-                      "Param and ParamOut must be the same Tensor");
+                      platform::errors::InvalidArgument(
+                          "Param and ParamOut must be the same Tensor"));
     PADDLE_ENFORCE_EQ(&mom_tensor, moment_out,
-                      "Moment and MomentOut must be the same Tensor");
-    PADDLE_ENFORCE_EQ(&ms_tensor, mean_square_out,
-                      "MeanSquare and MeanSquareOut must be the same Tensor");
+                      platform::errors::InvalidArgument(
+                          "Moment and MomentOut must be the same Tensor"));
+    PADDLE_ENFORCE_EQ(
+        &ms_tensor, mean_square_out,
+        platform::errors::InvalidArgument(
+            "MeanSquare and MeanSquareOut must be the same Tensor"));
 
     auto &dev_ctx = ctx.template device_context<DeviceContext>();
     size_t limit = static_cast<size_t>(ms_tensor.numel());
@@ -179,8 +183,10 @@ class RmspropOpKernel : public framework::OpKernel<T> {
           auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
           auto mg = EigenVector<T>::Flatten(mg_tensor);
           auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
-          PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out,
-                            "MeanGrad and MeanGradOut must be the same Tensor");
+          PADDLE_ENFORCE_EQ(
+              &mg_tensor, mean_grad_out,
+              platform::errors::InvalidArgument(
+                  "MeanGrad and MeanGradOut must be the same Tensor"));
           auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
 
           mg_out.device(place) = rho * mg + (1 - rho) * g;
@@ -198,8 +204,10 @@ class RmspropOpKernel : public framework::OpKernel<T> {
         if (centered) {
           auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
           auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
-          PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out,
-                            "MeanGrad and MeanGradOut must be the same Tensor");
+          PADDLE_ENFORCE_EQ(
+              &mg_tensor, mean_grad_out,
+              platform::errors::InvalidArgument(
+                  "MeanGrad and MeanGradOut must be the same Tensor"));
           for_range(CenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
               param_out->mutable_data<T>(ctx.GetPlace()),
               mean_square_out->mutable_data<T>(ctx.GetPlace()),
@@ -233,8 +241,10 @@ class RmspropOpKernel : public framework::OpKernel<T> {
       if (centered) {
         auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
         auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
-        PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out,
-                          "MeanGrad and MeanGradOut must be the same Tensor");
+        PADDLE_ENFORCE_EQ(
+            &mg_tensor, mean_grad_out,
+            platform::errors::InvalidArgument(
+                "MeanGrad and MeanGradOut must be the same Tensor"));
         for_range(CenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
             param_out->mutable_data<T>(ctx.GetPlace()),
             mean_square_out->mutable_data<T>(ctx.GetPlace()),
@@ -249,7 +259,12 @@ class RmspropOpKernel : public framework::OpKernel<T> {
             rho, epsilon, momentum, grad_func));
       }
     } else {
-      PADDLE_THROW("RMSProp only supports LoDTensor or SelectedRows gradient");
+      PADDLE_ENFORCE_EQ(false, true,
+                        platform::errors::PermissionDenied(
+                            "Unsupported Variable Type of Grad "
+                            "in RmspropOp. Excepted LodTensor "
+                            "or SelectedRows, But received [%s]",
+                            paddle::framework::ToTypeName(grad_var->Type())));
     }
   }
 };
diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc
index aeff8da70b9..569dbcd6a3e 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.cc
+++ b/paddle/fluid/operators/optimizers/sgd_op.cc
@@ -22,23 +22,31 @@ class SGDOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of SGDOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of SGDOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                   "Input(LearningRate) of SGDOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of SGDOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
+                      platform::errors::NotFound(
+                          "Input(Param) of SGDOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("Grad"), true,
+        platform::errors::NotFound("Input(Grad) of SGDOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
+                      platform::errors::NotFound(
+                          "Input(LearningRate) of SGDOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
+                      platform::errors::NotFound(
+                          "Output(ParamOut) of SGDOp should not be null."));
 
     auto lr_dims = ctx->GetInputDim("LearningRate");
     PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
-                      "Maybe the Input variable LearningRate has not "
-                      "been initialized. You may need to confirm "
-                      "if you put exe.run(startup_program) "
-                      "after optimizer.minimize function.");
+                      platform::errors::NotFound(
+                          "Maybe the Input variable LearningRate has not "
+                          "been initialized. You may need to confirm "
+                          "if you put exe.run(startup_program) "
+                          "after optimizer.minimize function."));
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-                      "Learning rate should have 1 element");
+                      platform::errors::InvalidArgument(
+                          "Learning rate should have 1 element. But received "
+                          "LearningRate dims [%s]",
+                          framework::product(lr_dims)));
     auto param_dim = ctx->GetInputDim("Param");
     if (ctx->GetInputsVarType("Grad")[0] ==
         framework::proto::VarType::LOD_TENSOR) {
diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu
index b70f24e0e5e..a5d9ad271f2 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.cu
+++ b/paddle/fluid/operators/optimizers/sgd_op.cu
@@ -57,11 +57,12 @@ class SGDOpKernel<platform::CUDADeviceContext, T>
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.InputNames("Param").front(),
-                   framework::ToTypeName(param_var->Type()));
+    PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
+                      platform::errors::InvalidArgument(
+                          "The Var(%s)'s type should be LoDTensor, "
+                          "but the received is %s",
+                          ctx.InputNames("Param").front(),
+                          paddle::framework::ToTypeName(param_var->Type())));
 
     auto* param = ctx.Input<framework::Tensor>("Param");
     auto* param_out = ctx.Output<framework::Tensor>("ParamOut");
@@ -91,18 +92,30 @@ class SGDOpKernel<platform::CUDADeviceContext, T>
       // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
       // This manual optimization brings difficulty to track data dependency.
       // It's better to find a more elegant solution.
-      PADDLE_ENFORCE_EQ(param, param_out);
+      PADDLE_ENFORCE_EQ(
+          param, param_out,
+          platform::errors::InvalidArgument(
+              "The input tensor Param of SgdOp should be equal with ParamOut "
+              "if variable's type is SelectedRows."));
       auto* grad = ctx.Input<framework::SelectedRows>("Grad");
 
       auto in_height = grad->height();
       auto out_dims = param_out->dims();
-      PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
+      PADDLE_ENFORCE_EQ(in_height, out_dims[0],
+                        platform::errors::InvalidArgument(
+                            "The input tensor Grad's height of SgdOp should be "
+                            "equal with ParamOut's dims. But received Grad's "
+                            "height [%s] and ParamOut's dims [%s]",
+                            in_height, out_dims[0]));
 
       auto& in_value = grad->value();
       auto& in_rows = grad->rows();
 
       int64_t in_row_numel = in_value.numel() / in_rows.size();
-      PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
+      PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height,
+                        platform::errors::InvalidArgument(
+                            "The in_row_numel of SgdOp should be equal with "
+                            "param_out's numel / in_height."));
 
       auto* in_data = in_value.data<T>();
       auto* out_data = param_out->data<T>();
@@ -118,7 +131,12 @@ class SGDOpKernel<platform::CUDADeviceContext, T>
           out_data, in_row_numel, in_rows.size());
 
     } else {
-      PADDLE_THROW("Unsupported Variable Type of Grad");
+      PADDLE_ENFORCE_EQ(false, true,
+                        platform::errors::PermissionDenied(
+                            "Unsupported Variable Type of Grad "
+                            "in SgdOp. Excepted LodTensor or "
+                            "SelectedRows, But received [%s]",
+                            paddle::framework::ToTypeName(grad_var->Type())));
     }
   }
 };
diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h
index 539d774a395..1aaf95efc32 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.h
+++ b/paddle/fluid/operators/optimizers/sgd_op.h
@@ -44,8 +44,20 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
       if (grad_var->IsType<framework::LoDTensor>()) {
         const auto *grad = ctx.Input<framework::Tensor>("Grad");
         auto sz = param_out->numel();
-        PADDLE_ENFORCE_EQ(param->numel(), sz);
-        PADDLE_ENFORCE_EQ(grad->numel(), sz);
+        PADDLE_ENFORCE_EQ(param->numel(), sz,
+                          platform::errors::InvalidArgument(
+                              "The input tensor Param's numel of SgdOp "
+                              "should be equal with ParamOut's numel. "
+                              "But received Param's "
+                              "numel = [%s], ParamOut's numel = [%s]",
+                              param->numel(), sz));
+        PADDLE_ENFORCE_EQ(grad->numel(), sz,
+                          platform::errors::InvalidArgument(
+                              "The input tensor Grad's numel of SgdOp "
+                              "should be equal with ParamOut's numel. "
+                              "But received Grad's "
+                              "numel = [%s], ParamOut's numel = [%s]",
+                              grad->numel(), sz));
 
         jit::sgd_attr_t attr(1, sz, 1, sz, 1);
         const T *lr = learning_rate->data<T>();
@@ -62,7 +74,11 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
         // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
         // This manual optimization brings difficulty to track data dependency.
         // It's better to find a more elegant solution.
-        PADDLE_ENFORCE_EQ(param, param_out);
+        PADDLE_ENFORCE_EQ(param, param_out,
+                          platform::errors::InvalidArgument(
+                              "The input tensor Param of SgdOp "
+                              "should be equal with ParamOut if variable's "
+                              "type is SelectedRows. "));
         const auto *grad = ctx.Input<framework::SelectedRows>("Grad");
         auto &grad_rows = grad->rows();
 
@@ -73,7 +89,13 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
         }
 
         auto out_dims = param_out->dims();
-        PADDLE_ENFORCE_EQ(grad->height(), out_dims[0]);
+        PADDLE_ENFORCE_EQ(
+            grad->height(), out_dims[0],
+            platform::errors::InvalidArgument(
+                "The input tensor Grad's height of SgdOp "
+                "should be equal with ParamOut's dims. But received  Grad's "
+                "height [%s] and ParamOut's dims [%s]",
+                grad->height(), out_dims[0]));
         auto &grad_value = grad->value();
         const T *param_data = param->data<T>();
         const T *grad_data = grad_value.data<T>();
@@ -87,19 +109,31 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
         attr.grad_height = grad_rows.size();  // note: it is not grad->height()
         attr.grad_width = grad_value.numel() / attr.grad_height;
         attr.selected_rows_size = grad_rows.size();
-        PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width);
+        PADDLE_ENFORCE_EQ(
+            attr.grad_width, attr.param_width,
+            platform::errors::InvalidArgument(
+                "The grad_value's numel of SgdOp "
+                "should be equal with param_out's numel. But received "
+                "grad_value's numel [%s] and param_out's numel [%s]",
+                attr.grad_width, attr.param_width));
 
         auto sgd =
             jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At(
                 attr);
         sgd(lr, param_data, grad_data, rows_data, out_data, &attr);
       } else {
-        PADDLE_THROW("Unsupported Variable Type of Grad");
+        PADDLE_ENFORCE_EQ(
+            false, true,
+            platform::errors::PermissionDenied(
+                "Unsupported Variable Type of Grad in SgdOp. Excepted "
+                "LodTensor or SelectedRows, But received [%s]",
+                paddle::framework::ToTypeName(grad_var->Type())));
       }
     } else if (param_var->IsType<framework::SelectedRows>()) {
-      PADDLE_ENFORCE(grad_var->IsType<framework::SelectedRows>(),
-                     "when param "
-                     "is SelectedRows, gradient should also be SelectedRows");
+      PADDLE_ENFORCE_EQ(grad_var->IsType<framework::SelectedRows>(), true,
+                        platform::errors::InvalidArgument(
+                            "when param is SelectedRows, "
+                            "gradient should also be SelectedRows"));
       const auto &param = param_var->Get<framework::SelectedRows>();
       auto *param_out = ctx.Output<framework::SelectedRows>("ParamOut");
       const auto &grad = grad_var->Get<framework::SelectedRows>();
@@ -112,27 +146,36 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
 
       auto param_row_width = param.value().dims()[1];
       auto grad_row_width = grad.value().dims()[1];
-      VLOG(4) << " param rows: " << param.rows().size()
-              << " param memory rows: " << param.value().dims()[0]
-              << " grad rows: " << grad.rows().size()
-              << " grad memory rows: " << grad.value().dims()[0];
-      PADDLE_ENFORCE_EQ(param_row_width, grad_row_width,
-                        "param_row should have the same size with grad_row");
+      PADDLE_ENFORCE_EQ(
+          param_row_width, grad_row_width,
+          platform::errors::InvalidArgument(
+              "The param_row in SgdOP should have the same size with grad_row. "
+              "But received param_row's width is [%s], and grad_row's width is "
+              "[%s]",
+              param_row_width, grad_row_width));
 
       const auto *lr = learning_rate->data<T>();
       const auto *grad_data = grad.value().data<T>();
       auto *out_data = param_out->mutable_value()->data<T>();
       for (size_t i = 0; i < grad.rows().size(); i++) {
         int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false);
-        PADDLE_ENFORCE_GE(id_index, static_cast<int64_t>(0),
-                          "id should be in the table");
+        PADDLE_ENFORCE_GE(
+            id_index, static_cast<int64_t>(0),
+            platform::errors::InvalidArgument(
+                "The id in SgdOp should be >= 0. But recevied id_index is [%s]",
+                id_index));
         for (int64_t j = 0; j < grad_row_width; j++) {
           out_data[id_index * grad_row_width + j] -=
               lr[0] * grad_data[i * grad_row_width + j];
         }
       }
     } else {
-      PADDLE_THROW("Unsupported Variable Type of Parameter");
+      PADDLE_ENFORCE_EQ(
+          false, true,
+          platform::errors::PermissionDenied(
+              "Unsupported Variable Type of Parameter in SgdOp. Excepted "
+              "LodTensor or SelectedRows, But received [%s]",
+              paddle::framework::ToTypeName(param_var->Type())));
     }
   }
 };
-- 
GitLab


From d37b3774fd4a9b544422ba5e8e335f879744c440 Mon Sep 17 00:00:00 2001
From: Jack Zhou <136876878@qq.com>
Date: Sun, 27 Sep 2020 13:54:48 +0800
Subject: [PATCH 249/261] register log double grad kernel for cpu and cuda

register log double grad kernel for cpu and cuda
---
 paddle/fluid/operators/activation_op.cc       | 51 +++++++++++++++++++
 paddle/fluid/operators/activation_op.cu       | 12 +++++
 paddle/fluid/operators/activation_op.h        | 36 ++++++++++++-
 .../unittests/test_activation_nn_grad.py      | 24 +++++++++
 4 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 95214484dca..a640a6c745c 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -891,6 +891,28 @@ class SquareDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
   }
 };
 
+// log Grad: dx = dout / x
+// log Grad Grad: ddout = ddx / x; dx = -(dout / x) * (ddx / x)
+template <typename T>
+class LogDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
+ public:
+  using ::paddle::framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("log_grad_grad");
+    op->SetInput("X", this->Input("X"));
+    // X@GRAD@GRAD: ddx
+    op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
+    op->SetInput("DOut", this->Input(framework::GradVarName("Out")));
+    op->SetAttrMap(this->Attrs());
+    // X@GRAD: dx
+    op->SetOutput("DX", this->InputGrad("X"));
+    // Out@GRAD@GRAD: ddy
+    op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out")));
+  }
+};
+
 DECLARE_INPLACE_OP_INFERER(ActivationGradOpInplaceInferer,
                            {framework::GradVarName("Out"),
                             framework::GradVarName("X")});
@@ -1272,6 +1294,35 @@ REGISTER_OP_CPU_KERNEL(
                                     ops::AbsGradGradFunctor<int64_t>>);
 /* ========================================================================== */
 
+/* ==========================  Log register ==================================*/
+REGISTER_OPERATOR(
+    log, ops::ActivationOp, ops::LogOpMaker, ops::ActivationOpInferVarType,
+    ops::ActivationGradOpMaker<ops::LogGradFunctor<float>::FwdDeps(),
+                               paddle::framework::OpDesc>,
+    ops::ActivationGradOpMaker<ops::LogGradFunctor<float>::FwdDeps(),
+                               paddle::imperative::OpBase>,
+    ops::ActFwdInplaceInferer);
+REGISTER_OPERATOR(log_grad, ops::ActivationOpGrad,
+                  ops::ActivationGradOpInplaceInferer,
+                  ops::LogDoubleGradMaker<paddle::framework::OpDesc>,
+                  ops::LogDoubleGradMaker<paddle::imperative::OpBase>);
+
+REGISTER_OPERATOR(
+    log_grad_grad,
+    ops::ActivationOpDoubleGrad<ops::LogGradGradFunctor<float>::FwdDeps()>,
+    ops::ActivationDoubleGradOpInplaceInferer);
+
+REGISTER_ACTIVATION_CPU_KERNEL(log, Log, LogFunctor, LogGradFunctor);
+
+REGISTER_OP_CPU_KERNEL(
+    log_grad_grad, ops::LogDoubleGradKernel<plat::CPUDeviceContext,
+                                            ops::LogGradGradFunctor<float>>,
+    ops::LogDoubleGradKernel<plat::CPUDeviceContext,
+                             ops::LogGradGradFunctor<double>>,
+    ops::LogDoubleGradKernel<plat::CPUDeviceContext,
+                             ops::LogGradGradFunctor<plat::float16>>);
+/* ========================================================================== */
+
 /* ==========================  register checkpoint ===========================*/
 REGISTER_OP_VERSION(leaky_relu)
     .AddCheckpoint(
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 072d952d261..839776ad58d 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -193,3 +193,15 @@ REGISTER_OP_CUDA_KERNEL(
     ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
                                     ops::AbsGradGradFunctor<int64_t>>);
 /* ========================================================================== */
+
+/* ==========================  Log register ==================================*/
+REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, LogFunctor, LogGradFunctor);
+
+REGISTER_OP_CUDA_KERNEL(
+    log_grad_grad, ops::LogDoubleGradKernel<plat::CUDADeviceContext,
+                                            ops::LogGradGradFunctor<float>>,
+    ops::LogDoubleGradKernel<plat::CUDADeviceContext,
+                             ops::LogGradGradFunctor<double>>,
+    ops::LogDoubleGradKernel<plat::CUDADeviceContext,
+                             ops::LogGradGradFunctor<plat::float16>>);
+/* ========================================================================== */
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 646f546bffb..a5c613297a4 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -1663,6 +1663,10 @@ class SquareDoubleGradKernel
   }
 };
 
+template <typename DeviceContext, typename Functor>
+class LogDoubleGradKernel
+    : public SquareDoubleGradKernel<DeviceContext, Functor> {};
+
 template <typename DeviceContext, typename Functor>
 class ELUDoubleGradKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
@@ -1852,6 +1856,37 @@ class PowGradKernel
     functor(*place, x, out, dout, dx);
   }
 };
+
+template <typename T>
+struct LogGradGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device>
+  void operator()(const Device& dev, const framework::Tensor* X,
+                  const framework::Tensor* ddX, framework::Tensor* ddOut,
+                  const framework::Tensor* dOut, framework::Tensor* dX) const {
+    auto* d = dev.eigen_device();
+    auto ddx = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "LogGradGrad"));
+    auto x = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(X, "Input", "X", "LogGradGrad"));
+    // ddout = ddx / x; dx = -(dout / x) * (ddx / x)
+    // calculate dx first, so ddout can inplace ddx
+    if (dX) {
+      auto dout = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOut, "Output", "DOut", "LogGradGrad"));
+      auto dx = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dX, "Output", "DX", "LogGradGrad"));
+      dx.device(*d) = dout * static_cast<T>(-1) * ddx / (x * x);
+    }
+    if (ddOut) {
+      auto ddout = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "LogGradGrad"));
+      ddout.device(*d) = ddx * static_cast<T>(1) / x;
+    }
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -1872,7 +1907,6 @@ class PowGradKernel
   __macro(cosh, Cosh, CoshFunctor, CoshGradFunctor);                          \
   __macro(round, Round, RoundFunctor, ZeroGradFunctor);                       \
   __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);  \
-  __macro(log, Log, LogFunctor, LogGradFunctor);                              \
   __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor);                      \
   __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor);                      \
   __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor);         \
diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
index c97cca654a7..6c4834b84f9 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
@@ -173,5 +173,29 @@ class TestAbsDoubleGradCheck(unittest.TestCase):
             self.func(p)
 
 
+class TestLogDoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        shape = [2, 3, 7, 9]
+        eps = 1e-6
+        dtype = np.float64
+
+        x = layers.data('x', shape, False, dtype)
+        x.persistable = True
+        y = layers.log(x)
+
+        x_arr = np.random.uniform(0.1, 1, shape).astype(dtype)
+
+        gradient_checker.double_grad_check(
+            [x], y, x_init=x_arr, place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab


From 6b727e08b1f38b3f4acc1708c163ed6ae5df8d58 Mon Sep 17 00:00:00 2001
From: QingshuChen <qingshu.chen714@gmail.com>
Date: Sun, 27 Sep 2020 13:56:14 +0800
Subject: [PATCH 250/261] support elementwise add, activation, matmul on Baidu
 Kunlun (#27143)

* support elementwise add, activation, matmul on Baidu Kunlun
* test=kunlun

* minor
* test=kunlun

* reconstuct the xpu directory
* test=kunlun

* minor
* test=kunlun

* minor
* test=kunlun

* minor
* test=kunlun

* minor
* test=kunlun

* minor
* test=kunlun
---
 cmake/external/xpu.cmake                      |   2 +-
 cmake/operators.cmake                         |   8 +-
 .../allocation/naive_best_fit_allocator.cc    |   9 +-
 paddle/fluid/operators/activation_op_xpu.cc   | 179 +++++++++
 .../elementwise/elementwise_add_op_xpu.cc     | 162 ++++++++
 .../operators/elementwise/elementwise_xpu.h   | 113 ++++++
 paddle/fluid/operators/matmul_op_xpu.cc       | 343 +++++++++++++++++
 .../{xpu/mul_xpu_op.cc => mul_op_xpu.cc}      |   2 +-
 paddle/fluid/platform/init_test.cc            |   1 +
 paddle/fluid/platform/xpu_header.h            |  27 ++
 python/paddle/__init__.py                     |   2 +
 python/paddle/device.py                       |  35 +-
 .../paddle/fluid/tests/unittests/op_test.py   |  16 +
 .../fluid/tests/unittests/test_matmul_op.py   |   1 +
 .../fluid/tests/unittests/test_mul_op.py      |  54 +--
 .../tests/unittests/xpu/test_activation_op.py | 215 +++++++++++
 .../unittests/xpu/test_elementwise_add_op.py  | 346 +++++++++++++++++
 .../tests/unittests/xpu/test_matmul_op.py     | 355 ++++++++++++++++++
 .../fluid/tests/unittests/xpu/test_mul_op.py  | 161 ++++++++
 tools/wlist.json                              |   4 +-
 20 files changed, 1970 insertions(+), 65 deletions(-)
 create mode 100644 paddle/fluid/operators/activation_op_xpu.cc
 create mode 100644 paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc
 create mode 100644 paddle/fluid/operators/elementwise/elementwise_xpu.h
 create mode 100644 paddle/fluid/operators/matmul_op_xpu.cc
 rename paddle/fluid/operators/{xpu/mul_xpu_op.cc => mul_op_xpu.cc} (100%)
 create mode 100755 python/paddle/fluid/tests/unittests/xpu/test_activation_op.py
 create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op.py
 create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_matmul_op.py
 create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_mul_op.py

diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index 8a927d8e282..07fe7d245ef 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -4,7 +4,7 @@ endif()
 
 INCLUDE(ExternalProject)
 SET(XPU_PROJECT                 "extern_xpu")
-SET(XPU_URL    "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_URL    "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_SOURCE_DIR              "${THIRD_PARTY_PATH}/xpu")
 SET(XPU_DOWNLOAD_DIR            "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
 SET(XPU_INSTALL_DIR             "${THIRD_PARTY_PATH}/install/xpu")
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 21080fbe8fd..7aa2766763c 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -62,9 +62,9 @@ function(op_library TARGET)
             endif()
         endif()
         if(WITH_XPU)
-            string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}")
-            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc)
-                list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc)
+            string(REPLACE "_op" "_op_xpu" XPU_FILE "${TARGET}")
+            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${XPU_FILE}.cc)
+                list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)
             endif()
         endif()
     else()
@@ -83,7 +83,7 @@ function(op_library TARGET)
                 list(APPEND mkldnn_cc_srcs ${src})
             elseif(${src} MATCHES ".*\\.cu.cc$")
                 list(APPEND cu_cc_srcs ${src})
-            elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op.cc$")
+            elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
                 list(APPEND xpu_cc_srcs ${src})
             elseif(${src} MATCHES ".*\\.cc$")
                 list(APPEND cc_srcs ${src})
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
index 92e3933a072..c661c9f9c37 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -127,11 +127,10 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
                         "Baidu Kunlun Card is properly installed.",
                         ret));
   ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
-  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
-                    platform::errors::External(
-                        "XPU API return wrong value[%d], please check whether "
-                        "Baidu Kunlun Card is properly installed.",
-                        ret));
+  PADDLE_ENFORCE_EQ(
+      ret, XPU_SUCCESS,
+      platform::errors::External(
+          "XPU API return wrong value[%d], no enough memory", ret));
   if (FLAGS_init_allocated_mem) {
     PADDLE_THROW(platform::errors::Unimplemented(
         "xpu memory FLAGS_init_allocated_mem is not implemented."));
diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc
new file mode 100644
index 00000000000..49b7a08a7b5
--- /dev/null
+++ b/paddle/fluid/operators/activation_op_xpu.cc
@@ -0,0 +1,179 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/activation_op.h"
+#include <string>
+#include "paddle/fluid/platform/xpu_header.h"
+
+namespace paddle {
+namespace operators {
+
+using paddle::framework::Tensor;
+
+template <typename Functor>
+class XPUActivationKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    Functor functor;
+
+    auto attrs = functor.GetAttrs();
+    for (auto &attr : attrs) {
+      *attr.second = context.Attr<float>(attr.first);
+    }
+    functor(context);
+  }
+};
+
+template <typename Functor>
+class XPUActivationGradKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    Functor functor;
+
+    auto attrs = functor.GetAttrs();
+    for (auto &attr : attrs) {
+      *attr.second = context.Attr<float>(attr.first);
+    }
+    functor(context);
+  }
+};
+
+template <typename DeviceContext, typename T>
+void xpu_activation_forward(const framework::ExecutionContext &ctx,
+                            xpu::Activation_t type) {
+  const auto *x = ctx.Input<Tensor>("X");
+  auto *y = ctx.Output<Tensor>("Out");
+  const T *x_data = x->data<T>();
+  T *y_data = y->mutable_data<T>(ctx.GetPlace());
+  int r = 0;
+  if (xpu::Activation_t::ACT_POW == type.type) {
+    type.pow_factor = ctx.Attr<float>("factor");
+  }
+  auto xpu_context = ctx.device_context<DeviceContext>().x_context();
+  r = xpu::activation_forward(xpu_context, type, x->numel(),
+                              reinterpret_cast<const float *>(x_data),
+                              reinterpret_cast<float *>(y_data));
+  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        r));
+}
+
+template <typename DeviceContext, typename T>
+void xpu_activation_backward(const framework::ExecutionContext &ctx,
+                             xpu::Activation_t type) {
+  /* TODO: relu tanh sigmoid are inplace */
+  const auto *x = ctx.Input<Tensor>("X");
+  auto *y = ctx.Input<Tensor>("Out");
+  auto *dOut = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+  auto *dX = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+  const T *x_data = nullptr;
+  const T *y_data = nullptr;
+  const T *y_grad = nullptr;
+  if (x != nullptr) x_data = x->data<T>();
+  if (y != nullptr) y_data = y->data<T>();
+  if (dOut != nullptr) y_grad = dOut->data<T>();
+  T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
+  auto xpu_context = ctx.device_context<DeviceContext>().x_context();
+  int r = xpu::activation_backward(xpu_context, type, dX->numel(),
+                                   reinterpret_cast<const float *>(x_data),
+                                   reinterpret_cast<const float *>(y_data),
+                                   reinterpret_cast<const float *>(y_grad),
+                                   reinterpret_cast<float *>(x_grad));
+  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        r));
+}
+
+template <typename T, xpu::Activation_t::act_enum algorithm>
+struct XPUActivationFunc : public BaseActivationFunctor<T> {
+  void operator()(const framework::ExecutionContext &ctx) const {
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(ctx,
+                                                                  algorithm);
+  }
+};
+
+template <typename T, xpu::Activation_t::act_enum algorithm>
+struct XPUActivationGradFunc : public BaseActivationFunctor<T> {
+  void operator()(const framework::ExecutionContext &ctx) const {
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T>(ctx,
+                                                                   algorithm);
+  }
+};
+
+template <typename T>
+using XPUReluFunctor = XPUActivationFunc<T, xpu::Activation_t::RELU>;
+template <typename T>
+using XPUSigmoidFunctor = XPUActivationFunc<T, xpu::Activation_t::SIGMOID>;
+template <typename T>
+using XPUTanhFunctor = XPUActivationFunc<T, xpu::Activation_t::TANH>;
+template <typename T>
+using XPUGeluFunctor = XPUActivationFunc<T, xpu::Activation_t::GELU>;
+template <typename T>
+using XPULogFunctor = XPUActivationFunc<T, xpu::Activation_t::LOG>;
+template <typename T>
+using XPUSquareFunctor = XPUActivationFunc<T, xpu::Activation_t::SQUARE>;
+template <typename T>
+using XPUSuareGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::SQUARE>;
+template <typename T>
+using XPUReluGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::RELU>;
+template <typename T>
+using XPUSigmoidGradFunctor =
+    XPUActivationGradFunc<T, xpu::Activation_t::SIGMOID>;
+template <typename T>
+using XPUTanhGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::TANH>;
+template <typename T>
+using XPUGeluGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::GELU>;
+template <typename T>
+using XPUSqrtFunctor = XPUActivationFunc<T, xpu::Activation_t::SQRT>;
+template <typename T>
+using XPUSqrtGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::SQRT>;
+template <typename T>
+using XPUACTPowFunctor = XPUActivationFunc<T, xpu::Activation_t::ACT_POW>;
+template <typename T>
+using XPUABSFunctor = XPUActivationFunc<T, xpu::Activation_t::ABS>;
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, functor, grad_functor)  \
+  REGISTER_OP_XPU_KERNEL(act_type,                                       \
+                         ops::XPUActivationKernel<ops::functor<float>>); \
+  REGISTER_OP_XPU_KERNEL(                                                \
+      act_type##_grad,                                                   \
+      ops::XPUActivationGradKernel<ops::grad_functor<float>>);
+
+REGISTER_ACTIVATION_XPU_KERNEL(relu, XPUReluFunctor, XPUReluGradFunctor)
+REGISTER_ACTIVATION_XPU_KERNEL(tanh, XPUTanhFunctor, XPUTanhGradFunctor)
+REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, XPUSigmoidFunctor,
+                               XPUSigmoidGradFunctor)
+REGISTER_ACTIVATION_XPU_KERNEL(gelu, XPUGeluFunctor, XPUGeluGradFunctor)
+REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor)
+REGISTER_ACTIVATION_XPU_KERNEL(square, XPUSquareFunctor, XPUSuareGradFunctor)
+REGISTER_OP_XPU_KERNEL(log,
+                       ops::XPUActivationKernel<ops::XPULogFunctor<float>>);
+REGISTER_OP_XPU_KERNEL(pow,
+                       ops::XPUActivationKernel<ops::XPUACTPowFunctor<float>>);
+REGISTER_OP_XPU_KERNEL(abs,
+                       ops::XPUActivationKernel<ops::XPUABSFunctor<float>>);
+
+#endif  // PADDLE_WITH_XPU
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc
new file mode 100644
index 00000000000..9ff7a71d7f0
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc
@@ -0,0 +1,162 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
+#include <memory>
+#include <string>
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+
+#include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class ElementwiseAddXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    XPUElementwise<T, XPUAddFunctor<T>>(ctx);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
+    using Tensor = framework::Tensor;
+
+    auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+
+    auto dx_dims = dout->dims();
+    auto dy_dims_untrimed = dout->dims();
+    T *dx_data = NULL;
+    T *dy_data = NULL;
+
+    int axis = ctx.Attr<int>("axis");
+    PADDLE_ENFORCE_GE(dx_dims.size(), dy_dims_untrimed.size(),
+                      "Rank of first input must >= rank of second input.");
+
+    if (dx != nullptr) {
+      dx->mutable_data<T>(ctx.GetPlace());
+      dx_dims = dx->dims();
+      dx_data = dx->data<T>();
+    }
+
+    if (dy != nullptr) {
+      dy->mutable_data<T>(ctx.GetPlace());
+      dy_dims_untrimed = dy->dims();
+      dy_data = dy->data<T>();
+    }
+
+    int pre, n, post, is_common_broadcast;
+    if (dx_dims == dy_dims_untrimed) {
+      pre = post = 1;
+      n = dout->numel();
+    } else {
+      axis = (axis == -1 ? dx_dims.size() - dy_dims_untrimed.size() : axis);
+      PADDLE_ENFORCE(axis >= 0 && axis < dx_dims.size(),
+                     "Axis should be in range [0, dx_dims)");
+      auto dy_dims = trim_trailing_singular_dims(dy_dims_untrimed);
+      axis = (dy_dims.size() == 0) ? dx_dims.size() : axis;
+      get_mid_dims(dx_dims, dy_dims, axis, &pre, &n, &post,
+                   &is_common_broadcast);
+    }
+    int len = pre * n * post;
+
+    auto &dev_ctx =
+        ctx.template device_context<paddle::platform::XPUDeviceContext>();
+    if (post == 1) {
+      int r = xpu::matrix_vector_add_grad(
+          dev_ctx.x_context(), dout->data<T>(), dout->data<T>(),
+          dout->data<T>(), dout->data<T>(), dx_data, dy_data, pre, n);
+      PADDLE_ENFORCE_EQ(
+          r, XPU_SUCCESS,
+          platform::errors::External(
+              "XPU API return wrong value[%d], please check whether "
+              "Baidu Kunlun Card is properly installed.",
+              r));
+      return;
+    }
+
+    if (dx == nullptr) {
+      PADDLE_ENFORCE_EQ(
+          xpu_malloc(reinterpret_cast<void **>(&dx_data), len * sizeof(float)),
+          XPU_SUCCESS, platform::errors::External("XPU has no enough memory"));
+    }
+
+    if (dy == nullptr) {
+      PADDLE_ENFORCE_EQ(
+          xpu_malloc(reinterpret_cast<void **>(&dy_data), len * sizeof(float)),
+          XPU_SUCCESS, platform::errors::External("XPU has no enough memory"));
+    } else {
+      if (len != n) {
+        PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void **>(&dy_data),
+                                     len * sizeof(float)),
+                          XPU_SUCCESS, platform::errors::External(
+                                           "XPU has no enough memory"));
+      }
+    }
+
+    int r = xpu::elementwise_add_grad(
+        dev_ctx.x_context(), dout->data<T>() /*x*/, dout->data<T>() /*y*/,
+        dout->data<T>() /*out*/, dout->data<T>(), dx_data, dy_data, len);
+    PADDLE_ENFORCE_EQ(
+        r, XPU_SUCCESS,
+        platform::errors::External(
+            "XPU API return wrong value[%d], please check whether "
+            "Baidu Kunlun Card is properly installed.",
+            r));
+
+    if ((dy != nullptr) && (len != n)) {
+      r = xpu::reduce_ew(dev_ctx.x_context(), dy_data, dy->data<T>(), pre, n,
+                         post, xpu::ElementwiseOp::ASSIGN);
+      PADDLE_ENFORCE_EQ(
+          r, XPU_SUCCESS,
+          platform::errors::External(
+              "XPU API return wrong value[%d], please check whether "
+              "Baidu Kunlun Card is properly installed.",
+              r));
+      dev_ctx.Wait();
+      xpu_free(dy_data);
+    }
+
+    if ((dx == nullptr || dy == nullptr) && !(dy != nullptr && len != n)) {
+      dev_ctx.Wait();
+    }
+
+    if (dx == nullptr) {
+      xpu_free(dx_data);
+    }
+    if (dy == nullptr) {
+      xpu_free(dy_data);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_XPU_KERNEL(
+    elementwise_add,
+    ops::ElementwiseAddXPUKernel<paddle::platform::XPUDeviceContext, float>);
+REGISTER_OP_XPU_KERNEL(elementwise_add_grad,
+                       ops::ElementwiseAddGradXPUKernel<
+                           paddle::platform::XPUDeviceContext, float>);
+#endif
diff --git a/paddle/fluid/operators/elementwise/elementwise_xpu.h b/paddle/fluid/operators/elementwise/elementwise_xpu.h
new file mode 100644
index 00000000000..53c4332e919
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/elementwise_xpu.h
@@ -0,0 +1,113 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct XPUAddFunctor {
+  int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) {
+    return xpu::elementwise_add(ctx, x, y, z, len);
+  }
+};
+
+template <typename T>
+struct XPUMulFunctor {
+  int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) {
+    return xpu::elementwise_mul(ctx, x, y, z, len);
+  }
+};
+
+template <typename T, typename Functor>
+void XPUElementwise(const framework::ExecutionContext& ctx) {
+  PADDLE_ENFORCE(platform::is_xpu_place(ctx.GetPlace()),
+                 "This kernel only runs on XPU device.");
+  auto x_var = ctx.InputVar("X");
+  PADDLE_ENFORCE_NE(x_var, nullptr,
+                    platform::errors::Fatal("Cannot get input Variable X"));
+  PADDLE_ENFORCE(x_var->IsType<framework::LoDTensor>(),
+                 "XPU only support LoDTensor");
+
+  auto x = x_var->Get<framework::LoDTensor>();
+  auto* y = ctx.Input<framework::LoDTensor>("Y");
+  auto* z = ctx.Output<framework::LoDTensor>("Out");
+  z->mutable_data<T>(ctx.GetPlace());
+
+  int axis = ctx.Attr<int>("axis");
+  auto x_dims = x.dims();
+  auto y_dims_untrimed = y->dims();
+  PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(),
+                    "Rank of first input must >= rank of second input.");
+  axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
+  PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
+                 "Axis should be in range [0, x_dims)");
+  auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
+  axis = (y_dims.size() == 0) ? x_dims.size() : axis;
+  int pre, n, post, is_common_broadcast;
+  get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post, &is_common_broadcast);
+  int len = pre * n * post;
+
+  const T* x_data = x.data<T>();
+  const T* y_data = y->data<T>();
+  T* z_data = z->data<T>();
+  T* y_broadcast = nullptr;
+
+  auto& dev_ctx =
+      ctx.template device_context<paddle::platform::XPUDeviceContext>();
+
+  if (post == 1) {
+    if (std::is_same<Functor, XPUAddFunctor<T>>::value) {
+      int res = xpu::matrix_vector_add(dev_ctx.x_context(), x_data, y_data,
+                                       z_data, pre, n);
+      PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
+                     res);
+      return;
+    }
+    if (std::is_same<Functor, XPUMulFunctor<T>>::value) {
+      int res = xpu::matrix_vector_mul(dev_ctx.x_context(), x_data, y_data,
+                                       z_data, pre, n);
+      PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
+                     res);
+      return;
+    }
+  }
+
+  if (pre != 1 || post != 1) {
+    PADDLE_ENFORCE(xpu_malloc(reinterpret_cast<void**>(&y_broadcast),
+                              len * sizeof(T)) == XPU_SUCCESS);
+    int res = xpu::broadcast_ew(dev_ctx.x_context(), y_data, y_broadcast, pre,
+                                n, post, xpu::ElementwiseOp::ASSIGN);
+    PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
+                   res);
+    y_data = y_broadcast;
+  }
+
+  Functor functor;
+  int res = functor(dev_ctx.x_context(), x_data, y_data, z_data, len);
+  PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
+                 res);
+
+  if (pre != 1 || post != 1) {
+    dev_ctx.Wait();
+    xpu_free(y_broadcast);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/operators/matmul_op_xpu.cc b/paddle/fluid/operators/matmul_op_xpu.cc
new file mode 100644
index 00000000000..ff038d7ef12
--- /dev/null
+++ b/paddle/fluid/operators/matmul_op_xpu.cc
@@ -0,0 +1,343 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
+
+namespace paddle {
+namespace operators {
+
+static framework::DDim RowMatrixFromVector(const framework::DDim &x_dim) {
+  if (x_dim.size() > 1) {
+    return x_dim;
+  }
+  return framework::make_ddim({1, x_dim[0]});
+}
+
+static framework::Tensor FoldInitDims(const framework::Tensor &input) {
+  auto output = input;
+  auto in_dims = input.dims();
+  if (in_dims.size() == 3) {
+    output.Resize({in_dims[0] * in_dims[1], in_dims[2]});
+  }
+  return output;
+}
+/**
+ * Get column matrix shape from a vector shape. If the ran of y_dim > 1, the
+ * original y_dim is returned.
+ */
+static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) {
+  if (y_dim.size() > 1) {
+    return y_dim;
+  }
+  return framework::make_ddim({y_dim[0], 1});
+}
+
+static void ReshapeTensorIntoMatrixSequence(
+    framework::Tensor *x, const math::MatDescriptor &descriptor) {
+  int64_t h, w;
+  h = descriptor.height_;
+  w = descriptor.width_;
+  if (descriptor.trans_) {
+    std::swap(w, h);
+  }
+  if (descriptor.batch_size_) {
+    x->Resize({descriptor.batch_size_, h, w});
+  } else {
+    x->Resize({h, w});
+  }
+}
+/**
+ * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor
+ * Out = matmul(x, y)
+ *
+ * This method will first calculate X,Y matrix sequence, and then calculate
+ * the out shape.
+ *
+ * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2]
+ * The out = [BatchSize, H1, W2]
+ *
+ * If there is no batch size in `X` and `Y`, the out will be [H1, W2]
+ * If any of `X` and `Y` has batch size BatchSize, the out will have the
+ * BatchSize.
+ */
+static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x,
+                                           framework::Tensor *y,
+                                           framework::Tensor *out, bool trans_x,
+                                           bool trans_y) {
+  auto x_dim = RowMatrixFromVector(x->dims());
+  auto y_dim = ColumnMatrixFromVector(y->dims());
+  auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x);
+  auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y);
+  if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) {
+    out->Resize({mat_dim_x.height_, mat_dim_y.width_});
+  } else {
+    out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_),
+                 mat_dim_x.height_, mat_dim_y.width_});
+  }
+
+  ReshapeTensorIntoMatrixSequence(x, mat_dim_x);
+  ReshapeTensorIntoMatrixSequence(y, mat_dim_y);
+}
+
+template <typename DeviceContext, typename T>
+class MatMulXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *x = context.Input<framework::Tensor>("X");
+    auto *y = context.Input<framework::Tensor>("Y");
+    auto *out = context.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+
+    auto mat_dim_a = math::CreateMatrixDescriptor(
+        RowMatrixFromVector(x->dims()), 0, context.Attr<bool>("transpose_X"));
+    auto mat_dim_b =
+        math::CreateMatrixDescriptor(ColumnMatrixFromVector(y->dims()), 0,
+                                     context.Attr<bool>("transpose_Y"));
+    PADDLE_ENFORCE_EQ(
+        mat_dim_a.width_, mat_dim_b.height_,
+        platform::errors::InvalidArgument("Shape mistake in matmul_op"));
+    PADDLE_ENFORCE_EQ(
+        mat_dim_a.batch_size_, mat_dim_b.batch_size_,
+        platform::errors::InvalidArgument("Shape mistake in matmul_op"));
+    T alpha = static_cast<T>(context.Attr<float>("alpha"));
+
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    float *data_c = out->data<T>();
+    if (mat_dim_a.batch_size_ == 0 || mat_dim_a.batch_size_ == 1) {
+      int r =
+          xpu::fc_int16(dev_ctx.x_context(), mat_dim_a.trans_, mat_dim_b.trans_,
+                        mat_dim_a.height_, mat_dim_b.width_, mat_dim_a.width_,
+                        alpha, x->data<T>(), y->data<T>(), 0.0f, data_c);
+      PADDLE_ENFORCE_EQ(
+          r, XPU_SUCCESS,
+          platform::errors::External(
+              "XPU API return wrong value[%d], please check whether "
+              "Baidu Kunlun Card is properly installed.",
+              r));
+    } else {
+      // batch matmul
+      int r = xpu::batched_gemm_int16(dev_ctx.x_context(), mat_dim_a.trans_,
+                                      mat_dim_b.trans_, mat_dim_a.batch_size_,
+                                      mat_dim_a.height_, mat_dim_b.width_,
+                                      mat_dim_a.width_, alpha, x->data<T>(),
+                                      y->data<T>(), data_c, nullptr, nullptr);
+      PADDLE_ENFORCE_EQ(
+          r, XPU_SUCCESS,
+          platform::errors::External(
+              "XPU API return wrong value[%d], please check whether "
+              "Baidu Kunlun Card is properly installed.",
+              r));
+    }
+  }
+};
+
+// Reshape a rank-3 tensor from P x M x N to M x (P * N).
+// (Warning: This requires transposing data and writes into new memory.)
+// Identity op if the tensor is not of rank 3.
+template <typename DeviceContext, typename T>
+static framework::Tensor XPUFoldHeadAndLastDims(
+    const DeviceContext &context, const framework::Tensor &input) {
+  auto in_dims = input.dims();
+  if (in_dims.size() != 3) {
+    return input;
+  }
+
+  framework::Tensor output;
+  output.Resize({in_dims[1], in_dims[0], in_dims[2]});
+  output.mutable_data<T>(context.GetPlace());
+  std::vector<int> in_shape_host = {static_cast<int>(in_dims[0]),
+                                    static_cast<int>(in_dims[1]),
+                                    static_cast<int>(in_dims[2])};
+  std::vector<int> axis_host = {1, 0, 2};
+
+  int r = xpu::transpose(context.x_context(), input.data<T>(), output.data<T>(),
+                         in_shape_host.data(), axis_host.data(), /*ndims=*/3);
+  PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                    platform::errors::External(
+                        "XPU API return wrong value[%d], please check whether "
+                        "Baidu Kunlun Card is properly installed.",
+                        r));
+  output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
+
+  return output;
+}
+
+// Using dimensional constraints on matrix multiplication, it is
+// straight-forward to check the following table for when X and Y
+// are both matrices.
+//
+// transpose_X | False    | True     | False    | True
+// transpose_Y | False    | False    | True     | True
+// -----------+----------+----------+----------+-----------
+//        dX = | dOut Y^T | Y dOut^T | dOut Y   | Y^T dOut^T
+//        dY = | X^T dOut | X dOut   | dOut^T X | dOut^T X^T
+//
+// When X is a vector of size K, we treat it instead as a matrix of shape
+// (1, K). Similarly, when Y is a vector of size K, we treat it instead as
+// a matrix of shape (K, 1).
+//
+// When X and Y are both 3-dimensional tensors, then the first dimension
+// the batch dimension can be ignored and the exact same formulas apply
+// as for two matrices.
+//
+// Finally, when, e.g., X is a 3-dimensional tensor but Y is a matrix, we end
+// up with formulas like
+//
+//   dY_{ij} = \sum_{p, m} X_{pmi} dOut_{pmj}
+//
+// To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N
+// to X: (P * M) x K, dOut: (P * M) x N.
+template <typename DeviceContext, typename T>
+class MatMulGradXPUKernel : public framework::OpKernel<T> {
+ public:
+  void MatMul(const framework::ExecutionContext &context,
+              const framework::Tensor &a, bool trans_a,
+              const framework::Tensor &b, bool trans_b,
+              framework::Tensor *out) const {
+    out->mutable_data<T>(context.GetPlace());
+    auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a);
+    auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b);
+    PADDLE_ENFORCE_EQ(
+        mat_dim_a.width_, mat_dim_b.height_,
+        platform::errors::InvalidArgument("Shape mistake in matmul_grad_op"));
+    PADDLE_ENFORCE_EQ(
+        mat_dim_a.batch_size_, mat_dim_b.batch_size_,
+        platform::errors::InvalidArgument("Shape mistake in matmul_grad_op"));
+    T alpha = static_cast<T>(context.Attr<float>("alpha"));
+
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    float *data_c = out->data<T>();
+    if (mat_dim_a.batch_size_ == 0 || mat_dim_a.batch_size_ == 1) {
+      int r =
+          xpu::fc_int16(dev_ctx.x_context(), mat_dim_a.trans_, mat_dim_b.trans_,
+                        mat_dim_a.height_, mat_dim_b.width_, mat_dim_a.width_,
+                        alpha, a.data<T>(), b.data<T>(), 0.0f, data_c);
+      PADDLE_ENFORCE_EQ(
+          r, XPU_SUCCESS,
+          platform::errors::External(
+              "XPU API return wrong value[%d], please check whether "
+              "Baidu Kunlun Card is properly installed.",
+              r));
+    } else {
+      // batch matmul
+      int r = xpu::batched_gemm_int16(dev_ctx.x_context(), mat_dim_a.trans_,
+                                      mat_dim_b.trans_, mat_dim_a.batch_size_,
+                                      mat_dim_a.height_, mat_dim_b.width_,
+                                      mat_dim_a.width_, alpha, a.data<T>(),
+                                      b.data<T>(), data_c, nullptr, nullptr);
+      PADDLE_ENFORCE_EQ(
+          r, XPU_SUCCESS,
+          platform::errors::External(
+              "XPU API return wrong value[%d], please check whether "
+              "Baidu Kunlun Card is properly installed.",
+              r));
+    }
+  }
+
+  void CalcInputGrad(const framework::ExecutionContext &context,
+                     const framework::Tensor &a, bool trans_a,
+                     bool is_fold_init_dims_a, const framework::Tensor &b,
+                     bool trans_b, bool is_fold_init_dims_b,
+                     framework::Tensor *out) const {
+    if (out == nullptr) return;
+    bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) &&
+                        out->dims().size() == 2;
+    if (!need_combine) {
+      MatMul(context, a, trans_a, b, trans_b, out);
+    } else {
+      auto &dev_ctx = context.template device_context<DeviceContext>();
+      MatMul(
+          context, is_fold_init_dims_a
+                       ? FoldInitDims(a)
+                       : XPUFoldHeadAndLastDims<DeviceContext, T>(dev_ctx, a),
+          trans_a, is_fold_init_dims_b
+                       ? FoldInitDims(b)
+                       : XPUFoldHeadAndLastDims<DeviceContext, T>(dev_ctx, b),
+          trans_b, out);
+    }
+  }
+
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto x = *context.Input<framework::Tensor>("X");
+    auto y = *context.Input<framework::Tensor>("Y");
+    auto dout =
+        *context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto *dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto *dy = context.Output<framework::Tensor>(framework::GradVarName("Y"));
+    bool transpose_x = context.Attr<bool>("transpose_X");
+    bool transpose_y = context.Attr<bool>("transpose_Y");
+
+    ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y);
+
+    framework::DDim dx_dims;
+    if (dx) {
+      dx_dims = dx->dims();
+      if (dx_dims != x.dims()) {
+        dx->Resize(x.dims());
+      }
+    }
+
+    framework::DDim dy_dims;
+    if (dy) {
+      dy_dims = dy->dims();
+      if (dy_dims != y.dims()) {
+        dy->Resize(y.dims());
+      }
+    }
+
+    if (transpose_x && transpose_y) {
+      CalcInputGrad(context, y, true, true, dout, true, false, dx);
+      CalcInputGrad(context, dout, true, true, x, true, false, dy);
+    } else if (transpose_x) {
+      CalcInputGrad(context, y, false, false, dout, true, false, dx);
+      CalcInputGrad(context, x, false, false, dout, false, true, dy);
+    } else if (transpose_y) {
+      CalcInputGrad(context, dout, false, false, y, false, true, dx);
+      CalcInputGrad(context, dout, true, true, x, false, true, dy);
+    } else {
+      CalcInputGrad(context, dout, false, false, y, true, false, dx);
+      CalcInputGrad(context, x, true, true, dout, false, true, dy);
+    }
+
+    if (dx) {
+      if (dx_dims != x.dims()) {
+        dx->Resize(dx_dims);
+      }
+    }
+
+    if (dy) {
+      if (dy_dims != y.dims()) {
+        dy->Resize(dy_dims);
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_XPU_KERNEL(
+    matmul, ops::MatMulXPUKernel<paddle::platform::XPUDeviceContext, float>);
+REGISTER_OP_XPU_KERNEL(
+    matmul_grad,
+    ops::MatMulGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
+#endif
diff --git a/paddle/fluid/operators/xpu/mul_xpu_op.cc b/paddle/fluid/operators/mul_op_xpu.cc
similarity index 100%
rename from paddle/fluid/operators/xpu/mul_xpu_op.cc
rename to paddle/fluid/operators/mul_op_xpu.cc
index 79aae71c304..0c8469101ab 100644
--- a/paddle/fluid/operators/xpu/mul_xpu_op.cc
+++ b/paddle/fluid/operators/mul_op_xpu.cc
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
+#include "paddle/fluid/operators/mul_op.h"
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
-#include "paddle/fluid/operators/mul_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc
index f14fbdd74f9..f1832206a1a 100644
--- a/paddle/fluid/platform/init_test.cc
+++ b/paddle/fluid/platform/init_test.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
+#include "paddle/fluid/platform/xpu_info.h"
 
 TEST(InitDevices, CPU) {
   using paddle::framework::InitDevices;
diff --git a/paddle/fluid/platform/xpu_header.h b/paddle/fluid/platform/xpu_header.h
index d8c5f85f9cf..95e4979951d 100644
--- a/paddle/fluid/platform/xpu_header.h
+++ b/paddle/fluid/platform/xpu_header.h
@@ -15,9 +15,36 @@
 #pragma once
 
 #ifdef PADDLE_WITH_XPU
+#include <string>
+#include <unordered_map>
+
+#include "paddle/fluid/platform/errors.h"
 #include "xpu/api.h"
 #include "xpu/runtime.h"
 #include "xpu/runtime_ex.h"
 
 namespace xpu = baidu::xpu::api;
+
+class XPUActHelper {
+ public:
+  // Convert string to activation type in xpu
+  static xpu::Activation_t ConvertToXpuActType(
+      const std::string& act_type_str) {
+    static std::unordered_map<std::string, xpu::Activation_t> str2act = {
+        {"linear", xpu::Activation_t::LINEAR},
+        {"relu", xpu::Activation_t::RELU},
+        {"sigmoid", xpu::Activation_t::SIGMOID},
+        {"tanh", xpu::Activation_t::TANH},
+        {"gelu", xpu::Activation_t::GELU},
+        {"leaky_relu", xpu::Activation_t::LEAKY_RELU},
+        {"sqrt", xpu::Activation_t::SQRT},
+        {"square", xpu::Activation_t::SQUARE}};
+
+    auto res = str2act.find(act_type_str);
+    PADDLE_ENFORCE_NE(res, str2act.end(),
+                      paddle::platform::errors::InvalidArgument(
+                          "Invalid activation type(%s) in XPU", act_type_str));
+    return res->second;
+  }
+};
 #endif
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 40275a2ce71..e707de8e068 100755
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -257,6 +257,8 @@ from .tensor.stat import numel  #DEFINE_ALIAS
 from .device import get_cudnn_version
 from .device import set_device
 from .device import get_device
+from .device import is_compiled_with_xpu
+from .device import XPUPlace
 # from .tensor.tensor import Tensor        #DEFINE_ALIAS
 # from .tensor.tensor import LoDTensor        #DEFINE_ALIAS
 # from .tensor.tensor import LoDTensorArray        #DEFINE_ALIAS
diff --git a/python/paddle/device.py b/python/paddle/device.py
index de24fd87513..46d0ff7bedc 100644
--- a/python/paddle/device.py
+++ b/python/paddle/device.py
@@ -22,7 +22,9 @@ from paddle.fluid.dygraph.parallel import ParallelEnv
 __all__ = [
     'get_cudnn_version',
     'set_device',
-    'get_device'
+    'get_device',
+    'XPUPlace',
+    'is_compiled_with_xpu'
     #            'cpu_places',
     #            'CPUPlace',
     #            'cuda_pinned_places',
@@ -35,6 +37,37 @@ __all__ = [
 _cudnn_version = None
 
 
+def is_compiled_with_xpu():
+    """
+    Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun
+
+    Returns (bool): whether paddle was built with WITH_XPU=ON
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            support_xpu = paddle.device.is_compiled_with_xpu()
+    """
+    return core.is_compiled_with_xpu()
+
+
+def XPUPlace(dev_id):
+    """
+    Return a Baidu Kunlun Place
+
+    Parameters:
+        dev_id(int): Baidu Kunlun device id
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            place = paddle.device.XPUPlace(0)
+    """
+    return core.XPUPlace(dev_id)
+
+
 def get_cudnn_version():
     """
     This funciton return the version of cudnn. the retuen value is int which represents the 
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index d02fdafe995..96efc36ed0a 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -217,6 +217,9 @@ class OpTest(unittest.TestCase):
                     return False
             return True
 
+        def is_xpu_op_test():
+            return hasattr(cls, "use_xpu") and cls.use_xpu == True
+
         def is_mkldnn_op_test():
             return hasattr(cls, "use_mkldnn") and cls.use_mkldnn == True
 
@@ -239,6 +242,7 @@ class OpTest(unittest.TestCase):
             if cls.dtype in [np.float32, np.float64] \
                 and cls.op_type not in op_accuracy_white_list.NO_FP64_CHECK_GRAD_OP_LIST \
                 and not hasattr(cls, 'exist_fp64_check_grad') \
+                and not is_xpu_op_test() \
                 and not is_mkldnn_op_test():
                 raise AssertionError(
                     "This test of %s op needs check_grad with fp64 precision." %
@@ -336,6 +340,11 @@ class OpTest(unittest.TestCase):
                     self.attrs["use_mkldnn"] == True):
             self.__class__.use_mkldnn = True
 
+        if (hasattr(self, "use_xpu") and self.use_xpu == True) or \
+            (hasattr(self, "attrs") and "use_xpu" in self.attrs and \
+                    self.attrs["use_xpu"] == True):
+            self.__class__.use_xpu = True
+
         op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
         "infer datatype from inputs and outputs for this test case"
         self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
@@ -932,6 +941,8 @@ class OpTest(unittest.TestCase):
         need_run_ops = self._get_need_run_ops(op_desc)
 
         res = {}
+        if hasattr(self, 'attrs') and bool(self.attrs.get('use_xpu', False)):
+            return
         for op_desc, father_op_desc in reversed(need_run_ops):
             # The first one is the forward op
             has_infer_inplace = fluid.core.has_infer_inplace(op_desc.type())
@@ -1203,6 +1214,11 @@ class OpTest(unittest.TestCase):
                     self.attrs["use_mkldnn"] == True):
             self.__class__.use_mkldnn = True
 
+        if (hasattr(self, "use_xpu") and self.use_xpu == True) or \
+            (hasattr(self, "attrs") and "use_xpu" in self.attrs and \
+                    self.attrs["use_xpu"] == True):
+            self.__class__.use_xpu = True
+
         places = self._get_places()
         for place in places:
             res = self.check_output_with_place(place, atol, no_check_set,
diff --git a/python/paddle/fluid/tests/unittests/test_matmul_op.py b/python/paddle/fluid/tests/unittests/test_matmul_op.py
index 3eb822bfed8..2d5f098a7fe 100644
--- a/python/paddle/fluid/tests/unittests/test_matmul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matmul_op.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 
+import paddle.fluid.core as core
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py
index 5f223de1954..927383c1223 100644
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
@@ -18,6 +18,8 @@ import unittest
 import numpy as np
 import paddle
 import paddle.fluid.core as core
+import sys
+sys.path.append("..")
 from op_test import OpTest
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
@@ -175,57 +177,5 @@ class TestFP16MulOp2(TestMulOp2):
                 no_grad_set=set('Y'))
 
 
-@unittest.skipIf(not core.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestXPUMulOp1(TestMulOp):
-    def init_dtype_type(self):
-        self.dtype = np.float32
-
-    def test_check_output(self):
-        place = core.XPUPlace(0)
-        self.check_output_with_place(place, atol=1e-1)
-
-    def test_check_grad_normal(self):
-        place = core.XPUPlace(0)
-        self.check_grad_with_place(
-            place, ['X', 'Y'], 'Out', max_relative_error=0.5)
-
-    def test_check_grad_ingore_x(self):
-        place = core.XPUPlace(0)
-        self.check_grad_with_place(
-            place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        place = core.XPUPlace(0)
-        self.check_grad_with_place(
-            place, ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-
-
-@unittest.skipIf(not core.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestXPUMulOp2(TestMulOp2):
-    def init_dtype_type(self):
-        self.dtype = np.float32
-
-    def test_check_output(self):
-        place = core.XPUPlace(0)
-        self.check_output_with_place(place, atol=2e-1)
-
-    def test_check_grad_normal(self):
-        place = core.XPUPlace(0)
-        self.check_grad_with_place(
-            place, ['X', 'Y'], 'Out', max_relative_error=0.9)
-
-    def test_check_grad_ingore_x(self):
-        place = core.XPUPlace(0)
-        self.check_grad_with_place(
-            place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        place = core.XPUPlace(0)
-        self.check_grad_with_place(
-            place, ['X'], 'Out', max_relative_error=0.9, no_grad_set=set('Y'))
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op.py
new file mode 100755
index 00000000000..788c110a592
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op.py
@@ -0,0 +1,215 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import sys
+sys.path.append("..")
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+from scipy.special import expit, erf
+import paddle
+import paddle.fluid as fluid
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.fluid import compiler, Program, program_guard
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUActivation(OpTest):
+    def setUp(self):
+        self.op_type = "exp"
+        self.init_dtype()
+        self.init_kernel_type()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        out = np.exp(x)
+
+        self.attrs = {'use_xpu': True}
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place, atol=1e-3)
+
+    def init_kernel_type(self):
+        pass
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUSigmoid(TestXPUActivation):
+    def setUp(self):
+        self.op_type = "sigmoid"
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+        out = 1 / (1 + np.exp(-x))
+
+        self.attrs = {'use_xpu': True}
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(
+                place, ['X'], 'Out', max_relative_error=0.01)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUTanh(TestXPUActivation):
+    def setUp(self):
+        self.op_type = "tanh"
+        self.init_dtype()
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        out = np.tanh(x)
+
+        self.attrs = {'use_xpu': True}
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUSqrt(TestXPUActivation):
+    def setUp(self):
+        self.op_type = "sqrt"
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        out = np.sqrt(x)
+
+        self.attrs = {'use_xpu': True}
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUAbs(TestXPUActivation):
+    def setUp(self):
+        self.op_type = "abs"
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 25]).astype(self.dtype)
+        # Because we set delta = 0.005 in calculating numeric gradient,
+        # if x is too small, such as 0.002, x_neg will be -0.003
+        # x_pos will be 0.007, so the numeric gradient is inaccurate.
+        # we should avoid this
+        x[np.abs(x) < 0.005] = 0.02
+        out = np.abs(x)
+
+        self.attrs = {'use_xpu': True}
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPURelu(TestXPUActivation):
+    def setUp(self):
+        self.op_type = "relu"
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+        # The same reason with TestAbs
+        x[np.abs(x) < 0.005] = 0.02
+        out = np.maximum(x, 0)
+
+        self.attrs = {'use_xpu': True}
+        self.inputs = {'X': x}
+        self.outputs = {'Out': out}
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUGelu(TestXPUActivation):
+    def setUp(self):
+        self.op_type = "gelu"
+        self.init_dtype()
+        approximate = False
+        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+        out = gelu(x, approximate)
+
+        self.inputs = {'X': x}
+        self.outputs = {'Out': out}
+        self.attrs = {"approximate": approximate, 'use_xpu': True}
+
+
+def gelu(x, approximate):
+    if approximate:
+        y_ref = 0.5 * x * (1.0 + np.tanh(
+            np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
+    else:
+        y_ref = 0.5 * x * (1 + erf(x / np.sqrt(2)))
+    return y_ref.astype(x.dtype)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPULog(TestXPUActivation):
+    def setUp(self):
+        self.op_type = "log"
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        out = np.log(x)
+
+        self.attrs = {'use_xpu': True}
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUSquare(TestXPUActivation):
+    def setUp(self):
+        self.op_type = "square"
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        out = np.square(x)
+
+        self.attrs = {'use_xpu': True}
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUPow(TestXPUActivation):
+    def setUp(self):
+        self.op_type = "pow"
+        self.init_dtype()
+
+        x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
+        out = np.power(x, 3)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.attrs = {'factor': 3.0, 'use_xpu': True}
+        self.outputs = {'Out': out}
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op.py
new file mode 100644
index 00000000000..9c6e7d21c1a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op.py
@@ -0,0 +1,346 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import sys
+sys.path.append("..")
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid.core as core
+from op_test import OpTest, skip_check_grad_ci
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard
+
+
+class TestElementwiseAddOp(OpTest):
+    def init_kernel_type(self):
+        self.use_mkldnn = False
+
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.init_dtype()
+        self.init_input_output()
+        self.init_kernel_type()
+        self.init_axis()
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
+        }
+        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
+        self.outputs = {'Out': self.out}
+
+    def test_check_output(self):
+        # TODO(wangzhongpu): support mkldnn op in dygraph mode
+        self.check_output(check_dygraph=(self.use_mkldnn == False))
+
+    def test_check_grad_normal(self):
+        # TODO(wangzhongpu): support mkldnn op in dygraph mode
+        if self.dtype == np.float16:
+            return
+        self.check_grad(
+            ['X', 'Y'], 'Out', check_dygraph=(self.use_mkldnn == False))
+
+    def test_check_grad_ingore_x(self):
+        # TODO(wangzhongpu): support mkldnn op in dygraph mode
+        if self.dtype == np.float16:
+            return
+        self.check_grad(
+            ['Y'],
+            'Out',
+            no_grad_set=set("X"),
+            check_dygraph=(self.use_mkldnn == False))
+
+    def test_check_grad_ingore_y(self):
+        # TODO(wangzhongpu): support mkldnn op in dygraph mode
+        if self.dtype == np.float16:
+            return
+        self.check_grad(
+            ['X'],
+            'Out',
+            no_grad_set=set('Y'),
+            check_dygraph=(self.use_mkldnn == False))
+
+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.out = np.add(self.x, self.y)
+
+    def init_dtype(self):
+        self.dtype = np.float64
+
+    def init_axis(self):
+        self.axis = -1
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUElementwiseAddOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.init_dtype()
+        self.init_input_output()
+        self.init_axis()
+
+        self.inputs = {'X': self.x, 'Y': self.y}
+        self.attrs = {'axis': self.axis, 'use_mkldnn': False, 'use_xpu': True}
+        self.outputs = {'Out': self.out}
+
+    def test_check_output(self):
+        if self.dtype == np.float32 and paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place)
+
+    def test_check_grad_normal(self):
+        if self.dtype == np.float32 and paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ['X', 'Y'], 'Out')
+
+    def test_check_grad_ingore_x(self):
+        if self.dtype == np.float32 and paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ['Y'], 'Out')
+
+    def test_check_grad_ingore_y(self):
+        if self.dtype == np.float32 and paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ['X'], 'Out')
+
+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.out = np.add(self.x, self.y)
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def init_axis(self):
+        self.axis = -1
+
+
+@skip_check_grad_ci(
+    reason="[skip shape check] Use y_shape(1) to test broadcast.")
+class TestElementwiseAddOp_scalar(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y
+
+
+@skip_check_grad_ci(
+    reason="[skip shape check] Use y_shape(1,1) to test broadcast.")
+class TestElementwiseAddOp_scalar2(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+
+class TestElementwiseAddOp_Vector(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.random((100, )).astype(self.dtype)
+        self.y = np.random.random((100, )).astype(self.dtype)
+        self.out = np.add(self.x, self.y)
+
+
+class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(100, 2, 3).astype(self.dtype)
+        self.y = np.random.rand(100).astype(self.dtype)
+        self.out = self.x + self.y.reshape(100, 1, 1)
+
+    def init_axis(self):
+        self.axis = 0
+
+
+class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 100, 3).astype(self.dtype)
+        self.y = np.random.rand(100).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 100, 1)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 100).astype(self.dtype)
+        self.y = np.random.rand(100).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1, 100)
+
+
+class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype)
+        self.y = np.random.rand(10, 12).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 10, 12, 1)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(100, 2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(100, 1).astype(self.dtype)
+        self.out = self.x + self.y.reshape(100, 1, 1, 1)
+
+    def init_axis(self):
+        self.axis = 0
+
+
+class TestElementwiseAddOp_broadcast_5(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(10, 3, 12).astype(self.dtype)
+        self.y = np.random.rand(10, 1, 12).astype(self.dtype)
+        self.out = self.x + self.y
+
+
+class TestElementwiseAddOp_broadcast_6(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype)
+        self.y = np.random.rand(2, 12, 1, 5).astype(self.dtype)
+        self.out = self.x + self.y
+
+
+class TestElementwiseAddOp_broadcast_7(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(1, 1, 20, 5).astype(self.dtype)
+        self.y = np.random.rand(20, 5, 1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+
+class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 10, 12).astype(self.dtype)
+        self.y = np.random.rand(10, 12).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 10, 12)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+@skip_check_grad_ci(
+    reason="[skip shape check] Use y_shape(1) to test broadcast.")
+class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(100, 1).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(100, 2, 3).astype(self.dtype)
+        self.y = np.random.rand(100, 1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = -1
+
+
+class TestElementwiseAddOp_commonuse_add1(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 100).astype(self.dtype)
+        self.y = np.random.rand(1, 1, 100).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = -1
+
+
+class TestElementwiseAddOp_commonuse_add2(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(10, 3, 1, 4).astype(self.dtype)
+        self.y = np.random.rand(10, 1, 12, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = -1
+
+
+class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(10, 12).astype(self.dtype)
+        self.y = np.random.rand(2, 3, 10, 12).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = 2
+
+
+class TestElementwiseAddOpError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+            # the input of elementwise_add must be Variable.
+            x1 = fluid.create_lod_tensor(
+                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+            y1 = fluid.create_lod_tensor(
+                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+            self.assertRaises(TypeError, fluid.layers.elementwise_add, x1, y1)
+
+            # the input dtype of elementwise_add must be float16 or float32 or float64 or int32 or int64
+            # float16 only can be set on GPU place
+            x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="uint8")
+            y2 = fluid.layers.data(name='y2', shape=[3, 4, 5, 6], dtype="uint8")
+            self.assertRaises(TypeError, fluid.layers.elementwise_add, x2, y2)
+
+
+class TestAddOp(unittest.TestCase):
+    def test_name(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data(name="x", shape=[2, 3], dtype="float32")
+            y = fluid.data(name='y', shape=[2, 3], dtype='float32')
+
+            y_1 = paddle.add(x, y, name='add_res')
+            self.assertEqual(('add_res' in y_1.name), True)
+
+    def test_declarative(self):
+        with fluid.program_guard(fluid.Program()):
+
+            def gen_data():
+                return {
+                    "x": np.array([2, 3, 4]).astype('float32'),
+                    "y": np.array([1, 5, 2]).astype('float32')
+                }
+
+            x = fluid.data(name="x", shape=[3], dtype='float32')
+            y = fluid.data(name="y", shape=[3], dtype='float32')
+            z = paddle.add(x, y)
+
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            z_value = exe.run(feed=gen_data(), fetch_list=[z.name])
+            z_expected = np.array([3., 8., 6.])
+            self.assertEqual((z_value == z_expected).all(), True)
+
+    def test_dygraph(self):
+        with fluid.dygraph.guard():
+            np_x = np.array([2, 3, 4]).astype('float64')
+            np_y = np.array([1, 5, 2]).astype('float64')
+            x = fluid.dygraph.to_variable(np_x)
+            y = fluid.dygraph.to_variable(np_y)
+            z = paddle.add(x, y)
+            np_z = z.numpy()
+            z_expected = np.array([3., 8., 6.])
+            self.assertEqual((np_z == z_expected).all(), True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op.py
new file mode 100644
index 00000000000..ac32d224910
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op.py
@@ -0,0 +1,355 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import sys
+sys.path.append("..")
+import paddle.fluid.core as core
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+
+
+def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y):
+    BATCH_SIZE = 2
+    M = 3
+    N = 4
+    K = 5
+    if (dim_X == 1 and transpose_X) or (dim_Y == 1 and transpose_Y):
+        K = 1
+    if dim_X == 1:
+        if transpose_X:
+            shape_X = [M]
+        else:
+            shape_X = [K]
+    if dim_Y == 1:
+        if transpose_Y:
+            shape_Y = [N]
+        else:
+            shape_Y = [K]
+    if dim_X >= 2:
+        if transpose_X:
+            shape_X = [K, M]
+        else:
+            shape_X = [M, K]
+    if dim_X == 3:
+        shape_X = [BATCH_SIZE] + shape_X
+    if dim_Y >= 2:
+        if transpose_Y:
+            shape_Y = [N, K]
+        else:
+            shape_Y = [K, N]
+    if dim_Y == 3:
+        shape_Y = [BATCH_SIZE] + shape_Y
+    return shape_X, shape_Y
+
+
+def reference_matmul(X, Y, transpose_X=False, transpose_Y=False):
+    """Reference forward implementation using np.matmul."""
+    # np.matmul does not support the transpose flags, so we manually
+    # transpose X and Y appropriately.
+    if transpose_X:
+        if X.ndim == 1:
+            X = X.reshape((X.size, 1))
+        elif X.ndim == 2:
+            X = X.T
+        else:
+            dim = [i for i in range(len(X.shape))]
+            dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1]
+            X = np.transpose(X, tuple(dim))
+    if transpose_Y:
+        if Y.ndim == 1:
+            Y = Y.reshape((1, Y.size))
+        else:
+            dim = [i for i in range(len(Y.shape))]
+            dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1]
+            Y = np.transpose(Y, tuple(dim))
+
+    Out = np.matmul(X, Y)
+    if not Out.shape:
+        # We do not support 0-dimensional Tensors (scalars). So where
+        # np.matmul outputs a scalar, we must convert to a Tensor of
+        # shape (1, ) instead.
+        # Everywhere else, we are compatible with np.matmul.
+        Out = np.array([Out], dtype="float32")
+    return Out
+
+
+class Generator(object):
+    def setUp(self):
+        self.op_type = "matmul"
+        X = np.random.random(self.shape_X).astype("float32")
+        Y = np.random.random(self.shape_Y).astype("float32")
+        Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y)
+        self.inputs = {'X': X, 'Y': Y}
+        self.attrs = {
+            'transpose_X': self.transpose_X,
+            'transpose_Y': self.transpose_Y
+        }
+        self.outputs = {'Out': Out}
+
+    def test_check_output(self):
+        self.check_output()
+        if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
+                self.inputs['Y'].shape) and self.inputs['X'].shape[
+                    0] == self.inputs['Y'].shape[0]:
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place, atol=1e-3)
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-3)
+        if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
+                self.inputs['Y'].shape) and self.inputs['X'].shape[
+                    0] == self.inputs['Y'].shape[0]:
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(
+                place, ['X', 'Y'], 'Out', max_relative_error=5e-2)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=1e-3, no_grad_set=set("X"))
+        if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
+                self.inputs['Y'].shape) and self.inputs['X'].shape[
+                    0] == self.inputs['Y'].shape[0]:
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(
+                place, ['Y'],
+                'Out',
+                max_relative_error=5e-2,
+                no_grad_set=set("X"))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=1e-3, no_grad_set=set('Y'))
+        if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
+                self.inputs['Y'].shape) and self.inputs['X'].shape[
+                    0] == self.inputs['Y'].shape[0]:
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(
+                place, ['X'],
+                'Out',
+                max_relative_error=5e-2,
+                no_grad_set=set('Y'))
+
+
+class TestMatmulOpError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+            # The inputs type of matmul_op must be Variable.
+            input1 = 12
+            self.assertRaises(TypeError, fluid.layers.matmul, input1, input1)
+            # The inputs dtype of matmul_op must be float32, float64.
+            input2 = fluid.layers.data(
+                name='input2', shape=[10, 10], dtype="int32")
+            self.assertRaises(TypeError, fluid.layers.matmul, input2, input2)
+            input3 = fluid.layers.data(
+                name='input3', shape=[2, 2], dtype="float16")
+            fluid.layers.matmul(input3, input3)
+
+
+# Negative dimension generation
+def generate_negative_dims(in_shape):
+    from itertools import combinations
+    size = len(in_shape)
+    indexs = list()
+    shapes = list()
+    for i in range(size):
+        indexs.extend(list(combinations([j for j in range(size)], i + 1)))
+    for idx in indexs:
+        shapes.append(
+            [in_shape[i] if i not in idx else -1 for i in range(size)])
+    return shapes
+
+
+# Build program with inputs sizes that contain negative numbers
+def test_negative_dims_program(obj):
+    for shape_x in generate_negative_dims(obj.shape_X):
+        for shape_y in generate_negative_dims(obj.shape_Y):
+            X = np.random.random(obj.shape_X).astype("float32")
+            Y = np.random.random(obj.shape_Y).astype("float32")
+            Ref = reference_matmul(X, Y, obj.transpose_X, obj.transpose_Y)
+            with program_guard(Program(), Program()):
+                x = fluid.data(name='x', shape=shape_x, dtype='float32')
+                y = fluid.data(name='y', shape=shape_y, dtype='float32')
+                output = fluid.layers.matmul(x, y, obj.transpose_X,
+                                             obj.transpose_Y)
+                obj.assertEqual(len(Ref.shape), len(output.shape))
+                for idx in range(len(Ref.shape)):
+                    if output.shape[idx] != -1:
+                        obj.assertEqual(Ref.shape[idx], output.shape[idx])
+                exe = fluid.Executor(fluid.CPUPlace())
+                res, = exe.run(fluid.default_main_program(),
+                               feed={'x': X,
+                                     'y': Y},
+                               fetch_list=[output])
+                np.allclose(res, Ref, atol=1e-5)
+
+
+# Generate program api cases for all negative possibilities
+def api_test(dim_x, dim_y, trans_x, trans_y):
+    test_name = ('TestMatMulAPI_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
+        dim_x, dim_y, trans_x, trans_y))
+    shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x,
+                                                  trans_y)
+    globals()[test_name] = type(test_name, (unittest.TestCase, ), {
+        'shape_X': shape_x,
+        'shape_Y': shape_y,
+        'transpose_X': trans_x,
+        'transpose_Y': trans_y,
+        'test_propram': test_negative_dims_program,
+    })
+
+
+# Generate operators cases for all possibilities
+def inject_test(dim_x, dim_y, trans_x, trans_y):
+    test_name = ('TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
+        dim_x, dim_y, trans_x, trans_y))
+    shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x,
+                                                  trans_y)
+    globals()[test_name] = type(test_name, (Generator, OpTest), {
+        'shape_X': shape_x,
+        'shape_Y': shape_y,
+        'transpose_X': trans_x,
+        'transpose_Y': trans_y,
+    })
+
+
+for dim_X in (1, 2, 3):
+    for dim_Y in (1, 2, 3):
+        for transose_x in (False, True):
+            for transose_y in (False, True):
+                inject_test(dim_X, dim_Y, transose_x, transose_y)
+                api_test(dim_X, dim_Y, transose_x, transose_y)
+
+
+# Test case n-dim
+def generate_compatible_shapes(dim, transpose_X, transpose_Y):
+    M = 2
+    N = 4
+    K = 3
+    shape_X = [2 for _ in range(dim - 2)]
+    shape_Y = [2 for _ in range(dim - 2)]
+
+    if transpose_X:
+        shape_X += [K, M]
+    else:
+        shape_X += [M, K]
+
+    if transpose_Y:
+        shape_Y += [N, K]
+    else:
+        shape_Y += [K, N]
+
+    return shape_X, shape_Y
+
+
+# # Test case n-dim
+for dim in [4]:
+    for transpose_X in [False, True]:
+        for transpose_Y in [False, True]:
+            test_name = (
+                'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
+                    dim, dim, transpose_X, transpose_Y))
+            shape_X, shape_Y = generate_compatible_shapes(dim, transpose_X,
+                                                          transpose_Y)
+            globals()[test_name] = type(test_name, (Generator, OpTest), {
+                'shape_X': shape_X,
+                'shape_Y': shape_Y,
+                'transpose_X': transpose_X,
+                'transpose_Y': transpose_Y,
+            })
+
+
+class API_TestMm(unittest.TestCase):
+    def test_out(self):
+        with fluid.program_guard(fluid.Program()):
+            x = fluid.data(name="x", shape=[2], dtype="float64")
+            y = fluid.data(name='y', shape=[2], dtype='float64')
+            res = fluid.data(name="output", shape=[1], dtype="float64")
+            result = paddle.mm(x, y)
+            exe = fluid.Executor(fluid.CPUPlace())
+            data1 = np.random.rand(2)
+            data2 = np.random.rand(2)
+            np_res = exe.run(feed={'x': data1, 'y': data2}, fetch_list=[result])
+            expected_result = np.matmul(
+                data1.reshape(1, 2), data2.reshape(2, 1))
+
+        self.assertTrue(
+            np.allclose(
+                np_res, expected_result, atol=1e-5),
+            "two value is\
+            {}\n{}, check diff!".format(np_res, expected_result))
+
+    def test_dygraph_without_out(self):
+        device = fluid.CPUPlace()
+        with fluid.dygraph.guard(device):
+            input_array1 = np.random.rand(3, 4).astype("float64")
+            input_array2 = np.random.rand(4, 3).astype("float64")
+            data1 = fluid.dygraph.to_variable(input_array1)
+            data2 = fluid.dygraph.to_variable(input_array2)
+            out = paddle.mm(data1, data2)
+            expected_result = np.matmul(input_array1, input_array2)
+        self.assertTrue(np.allclose(expected_result, out.numpy()))
+
+
+class Test_API_Matmul(unittest.TestCase):
+    def test_dygraph_without_out(self):
+        device = fluid.CPUPlace()
+        with fluid.dygraph.guard(device):
+            input_array1 = np.random.rand(3, 4).astype("float64")
+            input_array2 = np.random.rand(4, 3).astype("float64")
+            data1 = fluid.dygraph.to_variable(input_array1)
+            data2 = fluid.dygraph.to_variable(input_array2)
+            out = paddle.matmul(data1, data2)
+            expected_result = np.matmul(input_array1, input_array2)
+        self.assertTrue(np.allclose(expected_result, out.numpy()))
+
+
+class API_TestMmError(unittest.TestCase):
+    def test_errors(self):
+        def test_error1():
+            with fluid.program_guard(fluid.Program(), fluid.Program()):
+                data1 = fluid.data(name="data1", shape=[10, 2], dtype="float32")
+                data2 = fluid.data(name="data2", shape=[3, 10], dtype="float32")
+                paddle.mm(data1, data2)
+
+        self.assertRaises(ValueError, test_error1)
+
+        def test_error2():
+            with fluid.program_guard(fluid.Program(), fluid.Program()):
+                data1 = fluid.data(
+                    name="data1", shape=[-1, 10, 2], dtype="float32")
+                data2 = fluid.data(
+                    name="data2", shape=[-1, 2, 10], dtype="float32")
+                paddle.mm(data1, data2)
+
+        test_error2()
+
+        def test_error3():
+            with fluid.program_guard(fluid.Program(), fluid.Program()):
+                data1 = fluid.data(
+                    name="data1", shape=[10, 10, 2], dtype="float32")
+                data2 = fluid.data(
+                    name="data2", shape=[3, 2, 10], dtype="float32")
+                paddle.mm(data1, data2)
+
+        self.assertRaises(ValueError, test_error3)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mul_op.py b/python/paddle/fluid/tests/unittests/xpu/test_mul_op.py
new file mode 100644
index 00000000000..94ab5b71e4f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_mul_op.py
@@ -0,0 +1,161 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid.core as core
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+
+
+class TestMulOp(OpTest):
+    def setUp(self):
+        self.op_type = "mul"
+        self.dtype = np.float64
+        self.init_dtype_type()
+        self.inputs = {
+            'X': np.random.random((20, 5)).astype(self.dtype),
+            'Y': np.random.random((5, 21)).astype(self.dtype)
+        }
+        self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+
+    def init_dtype_type(self):
+        pass
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out')
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+
+
+class TestMulOpError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+            # The input type of mul_op must be Variable.
+            x1 = fluid.create_lod_tensor(
+                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            x2 = fluid.create_lod_tensor(
+                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            self.assertRaises(TypeError, fluid.layers.mul, x1, x2)
+            # The input dtype of mul_op must be float32 or float64.
+            x3 = fluid.layers.data(name='x3', shape=[4], dtype="int32")
+            x4 = fluid.layers.data(name='x4', shape=[4], dtype="int32")
+            self.assertRaises(TypeError, fluid.layers.mul, x3, x4)
+
+
+class TestMulOp2(OpTest):
+    def setUp(self):
+        self.op_type = "mul"
+        self.dtype = np.float64
+        self.init_dtype_type()
+        self.inputs = {
+            'X': np.random.random((3, 4, 2, 9)).astype(self.dtype),
+            'Y': np.random.random((3, 6, 1, 2, 3)).astype(self.dtype)
+        }
+        self.attrs = {
+            'x_num_col_dims': 2,
+            'y_num_col_dims': 2,
+        }
+        result = np.dot(self.inputs['X'].reshape(3 * 4, 2 * 9),
+                        self.inputs['Y'].reshape(3 * 6, 1 * 2 * 3))
+        result = result.reshape(3, 4, 1, 2, 3)
+        self.outputs = {'Out': result}
+
+    def init_dtype_type(self):
+        pass
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out')
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set('X'))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUMulOp1(TestMulOp):
+    def init_dtype_type(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        place = paddle.XPUPlace(0)
+        self.check_output_with_place(place, atol=1e-1)
+
+    def test_check_grad_normal(self):
+        place = paddle.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['X', 'Y'], 'Out', max_relative_error=0.5)
+
+    def test_check_grad_ingore_x(self):
+        place = paddle.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        place = paddle.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXPUMulOp2(TestMulOp2):
+    def init_dtype_type(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        place = paddle.XPUPlace(0)
+        self.check_output_with_place(place, atol=2e-1)
+
+    def test_check_grad_normal(self):
+        place = paddle.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['X', 'Y'], 'Out', max_relative_error=0.9)
+
+    def test_check_grad_ingore_x(self):
+        place = paddle.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        place = paddle.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['X'], 'Out', max_relative_error=0.9, no_grad_set=set('Y'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tools/wlist.json b/tools/wlist.json
index 9b36ac6adc7..3ca14cd1dd6 100644
--- a/tools/wlist.json
+++ b/tools/wlist.json
@@ -407,7 +407,9 @@
         "TransformerDecoder.prepare_incremental_cache",
         "LinearChainCRF.forward",
         "CRFDecoding.forward",
-        "SequenceTagging.forward"
+        "SequenceTagging.forward",
+        "XPUPlace",
+        "is_compiled_with_xpu"
     ],
     "gpu_not_white":[
         "deformable_conv",
-- 
GitLab


From 9b7ebf10993f81fd1b86cb58622c5b61a67b424a Mon Sep 17 00:00:00 2001
From: liym27 <33742067+liym27@users.noreply.github.com>
Date: Sun, 27 Sep 2020 17:06:55 +0800
Subject: [PATCH 251/261] [API 2.0] Fix example code of api 'case' and
 add/delete alias (#27577)

* Fix example code of api fluid.layers.case

* delete paddle.nn.case alias and add paddle.static.nn.case
---
 python/paddle/fluid/layers/control_flow.py | 43 +++++++++++-----------
 python/paddle/nn/__init__.py               |  1 -
 python/paddle/nn/control_flow.py           |  2 -
 python/paddle/static/nn/__init__.py        |  2 +
 4 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 2002b8a95de..013a842e112 100755
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -2488,9 +2488,6 @@ def _error_message(what, arg_name, op_name, right_value, error_value):
 def case(pred_fn_pairs, default=None, name=None):
     '''
     :api_attr: Static Graph
-	:alias_main: paddle.nn.case
-	:alias: paddle.nn.case,paddle.nn.control_flow.case
-	:old_api: paddle.fluid.layers.case
 
     This operator works like an if-elif-elif-else chain.
 
@@ -2500,7 +2497,7 @@ def case(pred_fn_pairs, default=None, name=None):
         name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        Variable|list(Variable): Tensors returned by the callable from the first pair whose pred is True,
+        Tensor|list(Tensor): Tensors returned by the callable from the first pair whose pred is True,
         or Tensors returned by ``default`` if no pred in ``pred_fn_pairs`` is True and ``default`` is not None,
         or Tensors returned by the last callable in ``pred_fn_pairs``  if no pred in ``pred_fn_pairs`` is True and ``default`` is None.
 
@@ -2508,45 +2505,47 @@ def case(pred_fn_pairs, default=None, name=None):
         TypeError: If the type of ``pred_fn_pairs`` is not list or tuple.
         TypeError: If the type of elements in ``pred_fn_pairs`` is not tuple.
         TypeError: If the size of tuples in ``pred_fn_pairs`` is not 2.
-        TypeError: If the first element of 2-tuple in ``pred_fn_pairs`` is not Variable.
+        TypeError: If the first element of 2-tuple in ``pred_fn_pairs`` is not a Tensor.
         TypeError: If the second element of 2-tuple in ``pred_fn_pairs`` is not callable.
         TypeError: If ``default`` is not None but it is not callable.
 
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
+            import paddle
+
+            paddle.enable_static()
 
             def fn_1():
-                return layers.fill_constant(shape=[1, 2], dtype='float32', value=1)
+                return paddle.fill_constant(shape=[1, 2], dtype='float32', value=1)
 
             def fn_2():
-                return layers.fill_constant(shape=[2, 2], dtype='int32', value=2)
+                return paddle.fill_constant(shape=[2, 2], dtype='int32', value=2)
 
             def fn_3():
-                return layers.fill_constant(shape=[3], dtype='int32', value=3)
+                return paddle.fill_constant(shape=[3], dtype='int32', value=3)
 
-            main_program = fluid.default_startup_program()
-            startup_program = fluid.default_main_program()
-            with fluid.program_guard(main_program, startup_program):
-                x = layers.fill_constant(shape=[1], dtype='float32', value=0.3)
-                y = layers.fill_constant(shape=[1], dtype='float32', value=0.1)
-                z = layers.fill_constant(shape=[1], dtype='float32', value=0.2)
+            main_program = paddle.static.default_startup_program()
+            startup_program = paddle.static.default_main_program()
 
-                pred_1 = layers.less_than(z, x)  # true: 0.2 < 0.3
-                pred_2 = layers.less_than(x, y)  # false: 0.3 < 0.1
-                pred_3 = layers.equal(x, y)      # false: 0.3 == 0.1
+            with paddle.static.program_guard(main_program, startup_program):
+                x = paddle.fill_constant(shape=[1], dtype='float32', value=0.3)
+                y = paddle.fill_constant(shape=[1], dtype='float32', value=0.1)
+                z = paddle.fill_constant(shape=[1], dtype='float32', value=0.2)
+
+                pred_1 = paddle.less_than(z, x)  # true: 0.2 < 0.3
+                pred_2 = paddle.less_than(x, y)  # false: 0.3 < 0.1
+                pred_3 = paddle.equal(x, y)      # false: 0.3 == 0.1
 
                 # Call fn_1 because pred_1 is True
-                out_1 = layers.case(
+                out_1 = paddle.static.nn.case(
                     pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)], default=fn_3)
 
                 # Argument default is None and no pred in pred_fn_pairs is True. fn_3 will be called.
                 # because fn_3 is the last callable in pred_fn_pairs.
-                out_2 = layers.case(pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)])
+                out_2 = paddle.static.nn.case(pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)])
 
-                exe = fluid.Executor(fluid.CPUPlace())
+                exe = paddle.static.Executor(paddle.CPUPlace())
                 res_1, res_2 = exe.run(main_program, fetch_list=[out_1, out_2])
                 print(res_1)  # [[1. 1.]]
                 print(res_2)  # [3 3 3]
diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py
index 79583f344f0..47a8668362e 100644
--- a/python/paddle/nn/__init__.py
+++ b/python/paddle/nn/__init__.py
@@ -38,7 +38,6 @@ from .clip import GradientClipByValue  #DEFINE_ALIAS
 # from .clip import set_gradient_clip        #DEFINE_ALIAS
 from .clip import clip  #DEFINE_ALIAS
 from .clip import clip_by_norm  #DEFINE_ALIAS
-from .control_flow import case  #DEFINE_ALIAS
 from .control_flow import cond  #DEFINE_ALIAS
 # from .control_flow import DynamicRNN        #DEFINE_ALIAS
 # from .control_flow import StaticRNN        #DEFINE_ALIAS
diff --git a/python/paddle/nn/control_flow.py b/python/paddle/nn/control_flow.py
index d3b1ec700fe..85f2fbcbe6e 100644
--- a/python/paddle/nn/control_flow.py
+++ b/python/paddle/nn/control_flow.py
@@ -13,14 +13,12 @@
 # limitations under the License.
 
 # TODO: define the control flow api  
-from ..fluid.layers import case  #DEFINE_ALIAS
 from ..fluid.layers import cond  #DEFINE_ALIAS
 from ..fluid.layers import while_loop  #DEFINE_ALIAS
 
 from ..fluid.layers import switch_case  #DEFINE_ALIAS
 
 __all__ = [
-    'case',
     'cond',
     #       'DynamicRNN',
     #       'StaticRNN',
diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py
index 91da0926b18..51d295d050e 100644
--- a/python/paddle/static/nn/__init__.py
+++ b/python/paddle/static/nn/__init__.py
@@ -17,6 +17,7 @@ __all__ = [
     'batch_norm',
     'embedding',
     'bilinear_tensor_product',
+    'case',
     'conv2d',
     'conv2d_transpose',
     'conv3d',
@@ -39,6 +40,7 @@ __all__ = [
 from ...fluid.layers import fc  #DEFINE_ALIAS
 from ...fluid.layers import batch_norm  #DEFINE_ALIAS
 from ...fluid.layers import bilinear_tensor_product  #DEFINE_ALIAS
+from ...fluid.layers import case  #DEFINE_ALIAS
 from ...fluid.layers import conv2d  #DEFINE_ALIAS
 from ...fluid.layers import conv2d_transpose  #DEFINE_ALIAS
 from ...fluid.layers import conv3d  #DEFINE_ALIAS
-- 
GitLab


From c5b6e44b4a67191a30624b05bc0093baebe76f35 Mon Sep 17 00:00:00 2001
From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Date: Sun, 27 Sep 2020 18:19:51 +0800
Subject: [PATCH 252/261] fix cholesky of test_math_op_patch_var_base (#27591)

---
 .../fluid/tests/unittests/test_math_op_patch_var_base.py  | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
index 5df04ddfc3d..d85521f7662 100644
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py
@@ -341,10 +341,12 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
             np.array_equal(x.rank().numpy(), paddle.rank(x).numpy()))
         self.assertTrue(
             np.array_equal(x[0].t().numpy(), paddle.t(x[0]).numpy()))
-        m = paddle.to_tensor(np.random.uniform(1, 2, [3, 3]), 'float32')
-        m = m.matmul(m.t())
+        d = paddle.to_tensor([[1.2285208, 1.3491015, 1.4899898],
+                              [1.30058, 1.0688717, 1.4928783],
+                              [1.0958099, 1.3724753, 1.8926544]])
+        d = d.matmul(d.t())
         self.assertTrue(
-            np.array_equal(m.cholesky().numpy(), paddle.cholesky(m).numpy()))
+            np.array_equal(d.cholesky().numpy(), paddle.cholesky(d).numpy()))
 
         self.assertTrue(
             np.array_equal(x.is_empty().numpy(), paddle.is_empty(x).numpy()))
-- 
GitLab


From 3f170dd83da487b7fc9e13dd9145ddf56a9a0fc4 Mon Sep 17 00:00:00 2001
From: liym27 <33742067+liym27@users.noreply.github.com>
Date: Sun, 27 Sep 2020 18:45:25 +0800
Subject: [PATCH 253/261] [API 2.0] Fix example code of api 'switch_case' and
 add/delete alias (#27578)

* Fix example code of api `fluid.layers.switch_case` to use api2.0

* delete `paddle.nn.switch_case` alias and add `paddle.static.nn.switch_case`
---
 python/paddle/fluid/layers/control_flow.py | 35 +++++++++++-----------
 python/paddle/nn/__init__.py               |  1 -
 python/paddle/nn/control_flow.py           |  3 --
 python/paddle/static/nn/__init__.py        |  2 ++
 4 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 013a842e112..498e7126d67 100755
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -3609,18 +3609,18 @@ def switch_case(branch_index, branch_fns, default=None, name=None):
     This operator is like a C++ switch/case statement.
 
     Args:
-        branch_index(Variable): A Tensor with shape [1] to specify which branch to execute. The data type is ``int32``, ``int64`` or ``uint8``.
+        branch_index(Tensor): A Tensor with shape [1] to specify which branch to execute. The data type is ``int32``, ``int64`` or ``uint8``.
         branch_fns(dict|list|tuple): If it's a list or tuple, the elements in it could be pairs of (int, callable) or simple callables whose actual index will be used as the index of callable. If it's a dict, its key is a python integer and the value is a callable. All callables return the same structure of Tensors.
         default(callable, optional): Callable that returns a structure of Tensors.
         name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        Variable|list(Variable): Tensors returned by the callable specified by ``branch_index`` in ``branch_fns``,
+        Tensor|list(Tensor): Tensors returned by the callable specified by ``branch_index`` in ``branch_fns``,
         or Tensors returned by ``default`` if ``default`` is not None and no index matches in ``branch_fns``,
         or Tensors returned by the callable with the max index in ``branch_fns`` if ``default`` is None and no index matches in ``branch_fns``.
 
     Raises:
-        TypeError: If the type of ``branch_index`` is not Variable.
+        TypeError: If the type of ``branch_index`` is not Tensor.
         TypeError: If the data type of ``branch_index`` is not ``int32``, ``int64`` or ``uint8``.
         TypeError: If the type of ``branch_fns`` is not dict, list or tuple.
         TypeError: If the elements of ``branch_fns`` is not 2-tuple.
@@ -3632,40 +3632,41 @@ def switch_case(branch_index, branch_fns, default=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            import paddle.fluid.layers as layers
+            import paddle
+
+            paddle.enable_static()
 
             def fn_1():
-                return layers.fill_constant(shape=[1, 2], dtype='float32', value=1)
+                return paddle.fill_constant(shape=[1, 2], dtype='float32', value=1)
 
             def fn_2():
-                return layers.fill_constant(shape=[2, 2], dtype='int32', value=2)
+                return paddle.fill_constant(shape=[2, 2], dtype='int32', value=2)
 
             def fn_3():
-                return layers.fill_constant(shape=[3], dtype='int32', value=3)
+                return paddle.fill_constant(shape=[3], dtype='int32', value=3)
 
-            main_program = fluid.default_startup_program()
-            startup_program = fluid.default_main_program()
-            with fluid.program_guard(main_program, startup_program):
-                index_1 = layers.fill_constant(shape=[1], dtype='int32', value=1)
-                index_2 = layers.fill_constant(shape=[1], dtype='int32', value=2)
+            main_program = paddle.static.default_startup_program()
+            startup_program = paddle.static.default_main_program()
+            with paddle.static.program_guard(main_program, startup_program):
+                index_1 = paddle.fill_constant(shape=[1], dtype='int32', value=1)
+                index_2 = paddle.fill_constant(shape=[1], dtype='int32', value=2)
 
-                out_1 = layers.switch_case(
+                out_1 = paddle.static.nn.switch_case(
                     branch_index=index_1,
                     branch_fns={1: fn_1, 2: fn_2},
                     default=fn_3)
 
-                out_2 = layers.switch_case(
+                out_2 = paddle.static.nn.switch_case(
                     branch_index=index_2,
                     branch_fns=[(1, fn_1), (2, fn_2)],
                     default=fn_3)
 
                 # Argument default is None and no index matches. fn_3 will be called because of the max index 7.
-                out_3 = layers.switch_case(
+                out_3 = paddle.static.nn.switch_case(
                     branch_index=index_2,
                     branch_fns=[(0, fn_1), (4, fn_2), (7, fn_3)])
 
-                exe = fluid.Executor(fluid.CPUPlace())
+                exe = paddle.static.Executor(paddle.CPUPlace())
                 res_1, res_2, res_3 = exe.run(main_program, fetch_list=[out_1, out_2, out_3])
                 print(res_1)  # [[1. 1.]]
                 print(res_2)  # [[2 2] [2 2]]
diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py
index 47a8668362e..b79b965f5b9 100644
--- a/python/paddle/nn/__init__.py
+++ b/python/paddle/nn/__init__.py
@@ -41,7 +41,6 @@ from .clip import clip_by_norm  #DEFINE_ALIAS
 from .control_flow import cond  #DEFINE_ALIAS
 # from .control_flow import DynamicRNN        #DEFINE_ALIAS
 # from .control_flow import StaticRNN        #DEFINE_ALIAS
-from .control_flow import switch_case  #DEFINE_ALIAS
 from .control_flow import while_loop  #DEFINE_ALIAS
 # from .control_flow import rnn        #DEFINE_ALIAS
 # from .decode import BeamSearchDecoder        #DEFINE_ALIAS
diff --git a/python/paddle/nn/control_flow.py b/python/paddle/nn/control_flow.py
index 85f2fbcbe6e..a78b65c3c6c 100644
--- a/python/paddle/nn/control_flow.py
+++ b/python/paddle/nn/control_flow.py
@@ -16,13 +16,10 @@
 from ..fluid.layers import cond  #DEFINE_ALIAS
 from ..fluid.layers import while_loop  #DEFINE_ALIAS
 
-from ..fluid.layers import switch_case  #DEFINE_ALIAS
-
 __all__ = [
     'cond',
     #       'DynamicRNN',
     #       'StaticRNN',
-    'switch_case',
     'while_loop',
     #       'rnn'
 ]
diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py
index 51d295d050e..510e11312f4 100644
--- a/python/paddle/static/nn/__init__.py
+++ b/python/paddle/static/nn/__init__.py
@@ -35,6 +35,7 @@ __all__ = [
     'prelu',
     'row_conv',
     'spectral_norm',
+    'switch_case',
 ]
 
 from ...fluid.layers import fc  #DEFINE_ALIAS
@@ -58,5 +59,6 @@ from ...fluid.layers import nce  #DEFINE_ALIAS
 from ...fluid.layers import prelu  #DEFINE_ALIAS
 from ...fluid.layers import row_conv  #DEFINE_ALIAS
 from ...fluid.layers import spectral_norm  #DEFINE_ALIAS
+from ...fluid.layers import switch_case  #DEFINE_ALIAS
 
 from ...fluid.input import embedding  #DEFINE_ALIAS
-- 
GitLab


From 1501a80f74a6e9c129889c3c48bdf86829105e1c Mon Sep 17 00:00:00 2001
From: Li Fuchen <lifuchen@baidu.com>
Date: Sun, 27 Sep 2020 19:28:52 +0800
Subject: [PATCH 254/261] add support to float64 input of warpctc op. (#27399)

* add float64 input to ctc_loss

* modified error message of  warpctc

* update repo and tag of warpctc

* add test for warpctc with float64 input

* modified warpctc.cmake to make sure build always

* resolved sample code bug of warpctc

* add core.ops in warpctc dygraph

* fix a bug of test
---
 cmake/external/warpctc.cmake                  |   5 +-
 paddle/fluid/operators/math/sequence_scale.cc |   1 +
 paddle/fluid/operators/math/sequence_scale.cu |   1 +
 paddle/fluid/operators/warpctc_op.cc          |  10 +-
 paddle/fluid/operators/warpctc_op.cu.cc       |   6 +-
 paddle/fluid/operators/warpctc_op.h           |  84 ++++++++++---
 paddle/fluid/platform/dynload/warpctc.h       |   4 +-
 paddle/fluid/pybind/op_function_generator.cc  |   1 +
 python/paddle/fluid/layers/loss.py            |  23 +++-
 .../fluid/tests/unittests/test_warpctc_op.py  | 119 ++++++++++++++++--
 python/paddle/nn/functional/loss.py           |   2 +-
 python/paddle/nn/layer/loss.py                |   2 +-
 12 files changed, 221 insertions(+), 37 deletions(-)

diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index ac6cf624e82..7f2ab1fb11d 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -18,7 +18,7 @@ SET(WARPCTC_PREFIX_DIR  ${THIRD_PARTY_PATH}/warpctc)
 SET(WARPCTC_SOURCE_DIR  ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
 SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
 set(WARPCTC_REPOSITORY  https://github.com/baidu-research/warp-ctc.git)
-set(WARPCTC_TAG         fc7f226b93758216a03b1be9d24593a12819b984)
+set(WARPCTC_TAG         95a461eddeabd51099ef059dcfada1117eb1bfb8)
 
 SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
     CACHE PATH "Warp-ctc Directory" FORCE)
@@ -44,8 +44,9 @@ ExternalProject_Add(
     "${WARPCTC_DOWNLOAD_CMD}"
     PREFIX          ${WARPCTC_PREFIX_DIR}
     SOURCE_DIR      ${WARPCTC_SOURCE_DIR}
-    UPDATE_COMMAND  ""
+    #UPDATE_COMMAND  ""
     PATCH_COMMAND   ""
+    BUILD_ALWAYS    1
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                     -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
diff --git a/paddle/fluid/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc
index 78cbdf311ad..8e58411a1f2 100644
--- a/paddle/fluid/operators/math/sequence_scale.cc
+++ b/paddle/fluid/operators/math/sequence_scale.cc
@@ -46,6 +46,7 @@ class ScaleLoDTensorFunctor<platform::CPUDeviceContext, T> {
 };
 
 template class ScaleLoDTensorFunctor<platform::CPUDeviceContext, float>;
+template class ScaleLoDTensorFunctor<platform::CPUDeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu
index 079338c1d3d..4a952afe15f 100644
--- a/paddle/fluid/operators/math/sequence_scale.cu
+++ b/paddle/fluid/operators/math/sequence_scale.cu
@@ -52,6 +52,7 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
 };
 
 template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, float>;
+template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc
index 5dcbabc96b4..f043b017949 100644
--- a/paddle/fluid/operators/warpctc_op.cc
+++ b/paddle/fluid/operators/warpctc_op.cc
@@ -103,13 +103,13 @@ class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker {
              "Target sequence length for Label when Label is a 2-D tensor.")
         .AsDispensable();
     AddOutput("WarpCTCGrad",
-              "(Tensor, default: Tensor<float>), a temporary "
+              "(Tensor), a temporary "
               "output Tensor to store the gradients of warp-ctc, which is "
               "computed with loss together in one call. It is a 3-D Tensor of "
               "the shape [max_sequence_length, batch_size, num_classes + 1].")
         .AsIntermediate();
     AddOutput("Loss",
-              "(Tensor, default: Tensor<float>), the Connectionist "
+              "(Tensor), the Connectionist "
               "Temporal Classification (CTC) loss, which is a 2-D Tensor of "
               "the shape [batch_size, 1]");
     AddAttr<int>("blank",
@@ -197,7 +197,9 @@ REGISTER_OPERATOR(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker,
 REGISTER_OPERATOR(warpctc_grad, ops::WarpCTCGradOp,
                   ops::WarpCTCGradOpNoNeedBufferVarInferer);
 REGISTER_OP_CPU_KERNEL(
-    warpctc, ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, float>);
+    warpctc, ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     warpctc_grad,
-    ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/warpctc_op.cu.cc b/paddle/fluid/operators/warpctc_op.cu.cc
index 6f8559f542f..a42093aaa29 100644
--- a/paddle/fluid/operators/warpctc_op.cu.cc
+++ b/paddle/fluid/operators/warpctc_op.cu.cc
@@ -16,7 +16,9 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    warpctc, ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, float>);
+    warpctc, ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     warpctc_grad,
-    ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h
index 951a258fd21..8b9276d4fa0 100644
--- a/paddle/fluid/operators/warpctc_op.h
+++ b/paddle/fluid/operators/warpctc_op.h
@@ -27,7 +27,52 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 
+template <typename DeviceContext, typename T>
+class ComputeCtcLossFunctor {
+ public:
+  ctcStatus_t operator()(const T* const activations, T* gradients,
+                         const int* const flat_labels,
+                         const int* const label_lengths,
+                         const int* const input_lengths, int alphabet_size,
+                         int minibatch, T* costs, void* workspace,
+                         ctcOptions options) {
+    return CTC_STATUS_EXECUTION_FAILED;
+  }
+};
+
+template <typename DeviceContext>
+class ComputeCtcLossFunctor<DeviceContext, float> {
+ public:
+  ctcStatus_t operator()(const float* const activations, float* gradients,
+                         const int* const flat_labels,
+                         const int* const label_lengths,
+                         const int* const input_lengths, int alphabet_size,
+                         int minibatch, float* costs, void* workspace,
+                         ctcOptions options) {
+    return platform::dynload::compute_ctc_loss(
+        activations, gradients, flat_labels, label_lengths, input_lengths,
+        static_cast<int>(alphabet_size), static_cast<int>(minibatch), costs,
+        workspace, options);
+  }
+};
+
 template <typename DeviceContext>
+class ComputeCtcLossFunctor<DeviceContext, double> {
+ public:
+  ctcStatus_t operator()(const double* const activations, double* gradients,
+                         const int* const flat_labels,
+                         const int* const label_lengths,
+                         const int* const input_lengths, int alphabet_size,
+                         int minibatch, double* costs, void* workspace,
+                         ctcOptions options) {
+    return platform::dynload::compute_ctc_loss_double(
+        activations, gradients, flat_labels, label_lengths, input_lengths,
+        static_cast<int>(alphabet_size), static_cast<int>(minibatch), costs,
+        workspace, options);
+  }
+};
+
+template <typename DeviceContext, typename T>
 class WarpCTCFunctor {
  public:
   /*
@@ -51,21 +96,29 @@ class WarpCTCFunctor {
    * \param blank             blank label used in ctc loss function.
    * \param cpu_losss         cost of each sequence in CPU memory.
    */
-  void operator()(const framework::ExecutionContext& ctx, const float* input,
-                  float* gradient, const int* cpu_labels,
+  void operator()(const framework::ExecutionContext& ctx, const T* input,
+                  T* gradient, const int* cpu_labels,
                   const int* cpu_label_lengths, const int* cpu_input_lengths,
                   const size_t sequence_width, const size_t num_sequences,
-                  const size_t blank, float* cpu_loss) {
+                  const size_t blank, T* cpu_loss) {
     // Init warp-ctc options
     init(ctx, blank);
 
     // Compute the required workspace size.
     // There is no memory allocated operations within warp-ctc.
     size_t workspace_bytes = 0;
-    ctcStatus_t status = platform::dynload::get_workspace_size(
-        cpu_label_lengths, cpu_input_lengths, static_cast<int>(sequence_width),
-        static_cast<int>(num_sequences), options_, &workspace_bytes);
-
+    ctcStatus_t status = CTC_STATUS_UNKNOWN_ERROR;
+    if (sizeof(T) == 4) {
+      status = platform::dynload::get_workspace_size(
+          cpu_label_lengths, cpu_input_lengths,
+          static_cast<int>(sequence_width), static_cast<int>(num_sequences),
+          options_, &workspace_bytes);
+    } else {
+      status = platform::dynload::get_workspace_size_double(
+          cpu_label_lengths, cpu_input_lengths,
+          static_cast<int>(sequence_width), static_cast<int>(num_sequences),
+          options_, &workspace_bytes);
+    }
     PADDLE_ENFORCE_EQ(
         CTC_STATUS_SUCCESS, status,
         platform::errors::PreconditionNotMet(
@@ -79,17 +132,17 @@ class WarpCTCFunctor {
             workspace_bytes));
 
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    size_t workspace_elements = workspace_bytes / sizeof(float) + 1UL;
-    Tensor workspace = ctx.AllocateTmpTensor<float, DeviceContext>(
+    size_t workspace_elements = workspace_bytes / sizeof(T) + 1UL;
+    Tensor workspace = ctx.AllocateTmpTensor<T, DeviceContext>(
         framework::make_ddim({static_cast<int64_t>(workspace_elements)}),
         dev_ctx);
-    float* workspace_data = workspace.data<float>();
-    math::SetConstant<DeviceContext, float>()(
+    T* workspace_data = workspace.data<T>();
+    math::SetConstant<DeviceContext, T>()(
         ctx.template device_context<DeviceContext>(), &workspace,
-        static_cast<float>(0));
+        static_cast<T>(0));
 
     // compute loss and gradient
-    status = platform::dynload::compute_ctc_loss(
+    status = ComputeCtcLossFunctor<DeviceContext, T>()(
         input, gradient, cpu_labels, cpu_label_lengths, cpu_input_lengths,
         static_cast<int>(sequence_width), static_cast<int>(num_sequences),
         cpu_loss, workspace_data, options_);
@@ -112,7 +165,8 @@ class WarpCTCFunctor {
                             ctx.device_context())
                             .stream();
 #else
-      PADDLE_THROW("[warpctc init] GPU is not enabled.");
+      PADDLE_THROW(platform::errors::PreconditionNotMet(
+          "[warpctc init] GPU is not enabled."));
 #endif
     } else {
       options_.loc = CTC_CPU;
@@ -292,7 +346,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
 
     const size_t blank = static_cast<size_t>(ctx.Attr<int>("blank"));
 
-    WarpCTCFunctor<DeviceContext>()(
+    WarpCTCFunctor<DeviceContext, T>()(
         ctx, warpctc_logits_data, warpctc_grad_data, warpctc_label_data,
         warpctc_label_lengths.data(), warpctc_logits_lengths.data(),
         sequence_width, num_sequences, blank, warpctc_loss_data);
diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h
index e10a7233b62..5f1b7612117 100644
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
@@ -53,7 +53,9 @@ extern void* warpctc_dso_handle;
   __macro(get_warpctc_version);       \
   __macro(ctcGetStatusString);        \
   __macro(compute_ctc_loss);          \
-  __macro(get_workspace_size)
+  __macro(compute_ctc_loss_double);   \
+  __macro(get_workspace_size);        \
+  __macro(get_workspace_size_double)
 
 WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP);
 
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index d3052ebd351..9bc603c0ecc 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -48,6 +48,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
     {"collect_fpn_proposals",
      {"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}},
     {"distribute_fpn_proposals", {"FpnRois", "RoisNum"}},
+    {"warpctc", {"Logits", "Label", "LogitsLength", "LabelLength"}},
 };
 
 // NOTE(zhiqiu): Like op_ins_map.
diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py
index f468815c99e..037c7e85004 100644
--- a/python/paddle/fluid/layers/loss.py
+++ b/python/paddle/fluid/layers/loss.py
@@ -541,7 +541,7 @@ def warpctc(input,
          (not including the blank label). When it is a 3-D Tensor, its shape 
          is `[max_logit_length, batch_size, num_classes + 1]`,
          where `max_logit_length` is the longest length of
-         input logit sequence. The data type must be float32.
+         input logit sequence. The data type should be float32 or float64.
        label (Variable): The ground truth of variable-length sequence,
          which must be a 2-D Tensor with LoD information or a 3-D Tensor without
          LoD information, needs to be consistent with the coressponding input. 
@@ -571,6 +571,7 @@ def warpctc(input,
         .. code-block:: python
 
             # using LoDTensor
+            import paddle
             import paddle.fluid as fluid
             import numpy as np
 
@@ -581,6 +582,7 @@ def warpctc(input,
             # class num
             class_num = 5
 
+            paddle.enable_static()
             logits = fluid.data(name='logits',shape=[None, class_num+1],
                                  dtype='float32',lod_level=1)
             label = fluid.data(name='label', shape=[None, 1],
@@ -602,6 +604,7 @@ def warpctc(input,
         .. code-block:: python
 
             # using Tensor
+            import paddle
             import paddle.fluid as fluid
             import numpy as np
 
@@ -613,6 +616,7 @@ def warpctc(input,
             batch_size = 16
             # class num
             class_num = 5
+            paddle.enable_static()
             logits = fluid.data(name='logits',
                            shape=[max_seq_length, batch_size, class_num+1],
                            dtype='float32')
@@ -637,8 +641,23 @@ def warpctc(input,
                                   fetch_list=[cost.name])
             print(output)
     """
+    if in_dygraph_mode():
+        if input_length is None or label_length is None:
+            raise ValueError(
+                "input_length and label_length must not be None in dygraph mode!"
+            )
+        grad, loss_out = core.ops.warpctc(
+            input,
+            label,
+            input_length,
+            label_length,
+            'blank',
+            blank,
+            'norm_by_times',
+            norm_by_times, )
+        return loss_out
     helper = LayerHelper('warpctc', **locals())
-    check_variable_and_dtype(input, 'input', ['float32'], "warpctc")
+    check_variable_and_dtype(input, 'input', ['float32', 'float64'], "warpctc")
     check_variable_and_dtype(label, 'label', ['int32'], "warpctc")
     this_inputs = {'Logits': [input], 'Label': [label]}
     if input_length is not None and label_length is not None:
diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
index c4155e0d826..b82ab04c986 100644
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
@@ -24,7 +24,7 @@ from paddle.fluid import Program, program_guard
 import paddle
 import paddle.nn.functional as F
 
-CUDA_BLOCK_SIZE = 512
+CUDA_BLOCK_SIZE = 32
 
 
 class CTCForward(object):
@@ -41,8 +41,8 @@ class CTCForward(object):
         self.num_classes = num_classes
         self.batch_size = batch_size
 
-        self.loss = np.zeros([self.batch_size, 1], dtype="float32")
-        self.gradient = np.zeros(self.softmax.shape, dtype="float32")
+        self.loss = np.zeros([self.batch_size, 1], dtype=softmax.dtype)
+        self.gradient = np.zeros(self.softmax.shape, dtype=softmax.dtype)
 
         # float64
         self.EXP_MAX = sys.float_info.max
@@ -112,13 +112,15 @@ class CTCForward(object):
         # calculate the forward and backward variables,
         # reference Chapter 7.3 of "Alex Grave, Supervised Sequence
         # Labelling with Recurrent Neural Networks"
-        log_acts = np.zeros([total_times, self.num_classes], dtype="float32")
+        log_acts = np.zeros(
+            [total_times, self.num_classes], dtype=softmax_a_sequence.dtype)
         for i in range(total_times):
             for j in range(self.num_classes):
                 log_acts[i, j] = self.safe_log(softmax_a_sequence[i, j])
 
         # calculate the forward variables
-        forward_vars = np.zeros([total_times, total_segments], dtype="float32")
+        forward_vars = np.zeros(
+            [total_times, total_segments], dtype=softmax_a_sequence.dtype)
         for i in range(total_times):
             for j in range(total_segments):
                 forward_vars[i, j] = self.LOG_ZERO
@@ -219,7 +221,7 @@ class TestWarpCTCOp(OpTest):
                                       self.logits_lod[0][i])
         self.gradient = np.zeros(
             [max_sequence_length, self.batch_size, self.num_classes],
-            dtype="float32")
+            dtype=logits.dtype)
 
         self.inputs = {
             "Logits": (logits, self.logits_lod),
@@ -287,7 +289,7 @@ class TestWarpCTCOpWithPadding(OpTest):
         # reshape logits to T*N*S
         new_logits = np.zeros(
             [max_sequence_length, self.batch_size, self.num_classes],
-            dtype="float32")
+            dtype=logits.dtype)
 
         cur = 0
         for batch_id in range(self.batch_size):
@@ -312,7 +314,7 @@ class TestWarpCTCOpWithPadding(OpTest):
 
         self.gradient = np.zeros(
             [max_sequence_length, self.batch_size, self.num_classes],
-            dtype="float32")
+            dtype=logits.dtype)
 
         self.inputs = {
             "Logits": new_logits,
@@ -347,6 +349,90 @@ class TestWarpCTCOpWithPaddingCase1(TestWarpCTCOpWithPadding):
         self.norm_by_times = False
 
 
+class TestWarpCTCOpFp64(OpTest):
+    def config(self):
+        self.batch_size = 4
+        self.num_classes = 8
+        self.logits_lod = [[4, 1, 5, 5]]
+        self.labels_lod = [[3, 1, 4, 2]]
+        self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
+        self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
+        self.blank = self.num_classes - 1
+        self.norm_by_times = False
+
+    def setUp(self):
+        self.op_type = "warpctc"
+        self.config()
+
+        logits = np.random.uniform(
+            0.1, 1.0,
+            [sum(self.logits_length), self.num_classes]).astype("float64")
+        softmax = np.apply_along_axis(stable_softmax, 1, logits)
+        # labels should not be blank
+        labels = np.random.randint(
+            0,
+            self.num_classes - 1, [sum(self.labels_length), 1],
+            dtype="int32")
+
+        ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
+                         self.num_classes, self.batch_size, self.blank,
+                         self.norm_by_times)
+        loss = ctc.forward()
+
+        max_sequence_length = 0
+        for i in range(self.batch_size):
+            max_sequence_length = max(max_sequence_length,
+                                      self.logits_length[i])
+        # reshape logits to T*N*S
+        new_logits = np.zeros(
+            [max_sequence_length, self.batch_size, self.num_classes],
+            dtype=logits.dtype)
+
+        cur = 0
+        for batch_id in range(self.batch_size):
+            for i in range(self.logits_length[batch_id]):
+                for j in range(self.num_classes):
+                    new_logits[i, batch_id, j] = logits[cur + i, j]
+            cur = cur + self.logits_length[batch_id]
+
+        # reshape labels to N*S
+        max_target_seq_length = 0
+        for i in range(self.batch_size):
+            max_target_seq_length = max(max_target_seq_length,
+                                        self.labels_length[i])
+        new_labels = np.zeros(
+            [self.batch_size, max_target_seq_length], dtype="int32")
+
+        cur = 0
+        for batch_id in range(self.batch_size):
+            for i in range(self.labels_length[batch_id]):
+                new_labels[batch_id, i] = labels[cur + i]
+            cur = cur + self.labels_length[batch_id]
+
+        self.gradient = np.zeros(
+            [max_sequence_length, self.batch_size, self.num_classes],
+            dtype=logits.dtype)
+
+        self.inputs = {
+            "Logits": new_logits,
+            "Label": new_labels,
+            "LogitsLength": self.logits_length,
+            "LabelLength": self.labels_length
+        }
+        self.outputs = {"Loss": loss}
+        self.attrs = {
+            "blank": self.blank,
+            "norm_by_times": self.norm_by_times,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.outputs['WarpCTCGrad'] = self.gradient
+        self.check_grad(["Logits"], "Loss")
+
+
 class TestWarpCTCOpError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
@@ -359,7 +445,7 @@ class TestWarpCTCOpError(unittest.TestCase):
                 name='labels_length', shape=[None], dtype='int64')
 
             def test_logits_Variable():
-                logits_data = np.random.rand(5, 16, 6).astype("float32")
+                logits_data = np.random.rand(5, 16, 6).astype(logits.dtype)
                 fluid.layers.warpctc(
                     input=logits_data,
                     label=label,
@@ -398,6 +484,21 @@ class TestWarpCTCOpError(unittest.TestCase):
 
             self.assertRaises(TypeError, test_label_len_Variable)
 
+    def test_dygraph_errors(self):
+        def test_dygraph_with_lod():
+
+            logits = np.random.uniform(0.1, 1.0, [20, 15]).astype("float32")
+            # labels should not be blank
+            labels = np.random.randint(0, 15 - 1, [15, 1], dtype="int32")
+            softmax = paddle.to_variable(logits)
+            labels = paddle.to_variable(labels)
+
+            fluid.layers.warpctc(input=softmax, label=labels)
+
+        paddle.disable_static()
+        self.assertRaises(ValueError, test_dygraph_with_lod)
+        paddle.enable_static()
+
 
 class TestCTCLossAPICase(unittest.TestCase):
     def test_functinal_api(self):
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index 4395520eec7..d27bac14d0a 100644
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -933,7 +933,7 @@ def ctc_loss(log_probs,
     is interated to the Warp-CTC library to normalize values for each row of the input tensor.
 
     Parameters:
-        log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type must be float32.
+        log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type should be float32 or float64.
         labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor. The tensor shape is [batch_size, max_label_length], where max_label_length is the longest length of label sequence. The data type must be int32.
         input_lengths (Tensor): The length for each input sequence, it should have shape [batch_size] and dtype int64.
         label_lengths (Tensor): The length for each label sequence, it should have shape [batch_size] and dtype int64.
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py
index 271dc9b4e68..98048bb7e64 100644
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -773,7 +773,7 @@ class CTCLoss(fluid.dygraph.Layer):
         reduction (string, optional): Indicate how to average the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``.
 
     Shape:
-        log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type must be float32.
+        log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type should be float32 or float64.
         labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor. The tensor shape is [batch_size, max_label_length], where max_label_length is the longest length of label sequence. The data type must be int32.
         input_lengths (Tensor): The length for each input sequence, it should have shape [batch_size] and dtype int64.
         label_lengths (Tensor): The length for each label sequence, it should have shape [batch_size] and dtype int64.
-- 
GitLab


From 96daa2594e9e1e92bff7fd54275a9b45ed7fda25 Mon Sep 17 00:00:00 2001
From: whs <wanghaoshuang@baidu.com>
Date: Sun, 27 Sep 2020 19:40:01 +0800
Subject: [PATCH 255/261] Fix padding in conv1d op (#27590)

---
 python/paddle/nn/layer/conv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py
index a610693a0a4..3cc6a5a15b6 100644
--- a/python/paddle/nn/layer/conv.py
+++ b/python/paddle/nn/layer/conv.py
@@ -248,7 +248,7 @@ class Conv1d(_ConvNd):
         padding = 0
         if self._padding_mode != "zeros":
             x = F.pad(x,
-                      self._padding,
+                      self._reversed_padding_repeated_twice,
                       mode=self._padding_mode,
                       data_format=self._data_format)
         else:
-- 
GitLab


From 6e41143ffeda7000ba105c2097fd2f40ac68e890 Mon Sep 17 00:00:00 2001
From: littletomatodonkey <2120160898@bit.edu.cn>
Date: Sun, 27 Sep 2020 19:49:10 +0800
Subject: [PATCH 256/261] remove paddle.metrics.cos_sim api (#27569)

* fix api alias

* remove cos_sim
---
 python/paddle/metric/__init__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/paddle/metric/__init__.py b/python/paddle/metric/__init__.py
index 6e197881fc0..fba45523889 100644
--- a/python/paddle/metric/__init__.py
+++ b/python/paddle/metric/__init__.py
@@ -16,12 +16,11 @@ from .metrics import *
 from . import metrics
 
 from ..fluid.layers.metric_op import accuracy, auc
-from ..fluid.layers.nn import chunk_eval, cos_sim, mean_iou
+from ..fluid.layers.nn import chunk_eval, mean_iou
 
 __all__ = metrics.__all__ + [
     'accuracy',
     'auc',
     'chunk_eval',
-    'cos_sim',
     'mean_iou',
 ]
-- 
GitLab


From d1c2a3bc6f39d0f10894fa24873c570a54560ef0 Mon Sep 17 00:00:00 2001
From: YUNSHEN XIE <1084314248@qq.com>
Date: Sun, 27 Sep 2020 21:00:32 +0800
Subject: [PATCH 257/261] disable ut test_warpctc_op,test=document_fix (#27632)

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 97a3ebc2135..5df32c5df42 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -394,7 +394,8 @@ foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
 py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4)
-py_test_modules(test_warpctc_op MODULES test_warpctc_op)
+# disable test_warpctc_op
+# py_test_modules(test_warpctc_op MODULES test_warpctc_op)
 py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${GC_ENVS})
 py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS})
 py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS
-- 
GitLab


From 9d783aeddd86cc95508ef4d6a7e0982b90be31d3 Mon Sep 17 00:00:00 2001
From: Double_V <liuvv0203@163.com>
Date: Sun, 27 Sep 2020 21:30:34 +0800
Subject: [PATCH 258/261] Error message opt, test=develop (#27467)

* Error message opt, test=develop

* solve comments, test=develop

* fix typo, test=develop
---
 paddle/fluid/operators/center_loss_op.cu     | 11 +--
 paddle/fluid/operators/ctc_align_op.cu       |  6 +-
 paddle/fluid/operators/ctc_align_op.h        |  7 +-
 paddle/fluid/operators/pool_cudnn_op.cu.cc   | 12 ++--
 paddle/fluid/operators/pool_op.cc            | 72 ++++++++++++--------
 paddle/fluid/operators/pool_op.h             | 18 +++--
 paddle/fluid/operators/pool_with_index_op.cc | 26 ++++---
 paddle/fluid/operators/pool_with_index_op.h  | 10 ++-
 paddle/fluid/operators/psroi_pool_op.cu      | 25 ++++---
 paddle/fluid/operators/roi_pool_op.cu        | 20 +++++-
 10 files changed, 138 insertions(+), 69 deletions(-)

diff --git a/paddle/fluid/operators/center_loss_op.cu b/paddle/fluid/operators/center_loss_op.cu
index 10b65fa215a..f15d1fe5e02 100644
--- a/paddle/fluid/operators/center_loss_op.cu
+++ b/paddle/fluid/operators/center_loss_op.cu
@@ -30,8 +30,10 @@ __global__ void ComputeDifferent(T *centers_diff, const T *X, const T *centers,
 
   while (idy < K) {
     int64_t id = ids[idy];
-    PADDLE_ENFORCE(id >= 0, "received id:", id);
-    PADDLE_ENFORCE(id < N, "received id:", id);
+    PADDLE_ENFORCE(id >= 0, "Id should larger than 0 but received id: %d.", id);
+    PADDLE_ENFORCE(id < N, "Id should smaller than %d but received id: %d.", N,
+                   id);
+
     T *out = centers_diff + idy * D;
     const T *x = X + idy * D;
     const T *cent = centers + id * D;
@@ -52,8 +54,9 @@ __global__ void UpdateCenters(T *centers, T *centers_diff, const int64_t *ids,
   while (idy < K) {
     int count = 1;
     int64_t id = ids[idy];
-    PADDLE_ENFORCE(id >= 0, "received id:", id);
-    PADDLE_ENFORCE(id < N, "received id:", id);
+    PADDLE_ENFORCE(id >= 0, "Id should larger than 0 but received id: %d.", id);
+    PADDLE_ENFORCE(id < N, "Id should smaller than %d but received id: %d.", N,
+                   id);
 
     for (int i = 0; i < K; i++) {
       if (ids[i] == id) {
diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu
index 44a7c16f96a..67bd71d4a1b 100644
--- a/paddle/fluid/operators/ctc_align_op.cu
+++ b/paddle/fluid/operators/ctc_align_op.cu
@@ -69,8 +69,10 @@ template <typename T>
 class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
+                      platform::errors::InvalidArgument(
+                          "CTCAlign operator CUDA kernel must use CUDAPlace "
+                          "rather than CPUPlace."));
     auto* input = ctx.Input<LoDTensor>("Input");
     auto* output = ctx.Output<LoDTensor>("Output");
     const int blank = ctx.Attr<int>("blank");
diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h
index ccf91471ab9..662f899c0a5 100644
--- a/paddle/fluid/operators/ctc_align_op.h
+++ b/paddle/fluid/operators/ctc_align_op.h
@@ -72,8 +72,11 @@ class CTCAlignKernel : public framework::OpKernel<T> {
       // check input dims and lod
       PADDLE_ENFORCE_EQ(
           input_dims[0], static_cast<int64_t>(input_lod[level].back()),
-          "The first dimension of Input(Input) should be equal to "
-          "the sum of all sequences' lengths.");
+          platform::errors::InvalidArgument(
+              "The first dimension %d of CTCAlign operator Input(Input) should "
+              "be equal to "
+              "the sum of all sequences' lengths %d.",
+              input_dims[0], static_cast<int64_t>(input_lod[level].back())));
 
       const size_t num_sequences = input_lod[level].size() - 1;
 
diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
index 9317a018333..3dc184facc7 100644
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
@@ -45,8 +45,10 @@ template <typename T>
 class PoolCUDNNOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      "It must use CUDAPlace.");
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::InvalidArgument("Pool operator CUDA kernel must use "
+                                          "CUDAPlace rather than CPUPlace."));
 
     const Tensor *input = ctx.Input<Tensor>("X");
     Tensor *output = ctx.Output<Tensor>("Out");
@@ -175,8 +177,10 @@ template <typename T>
 class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      "It must use CUDAPlace.");
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::InvalidArgument("Pool operator CUDA kernel must use "
+                                          "CUDAPlace rather than CPUPlace."));
 
     const Tensor *input = ctx.Input<Tensor>("X");
     const Tensor *output = ctx.Input<Tensor>("Out");
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index 9900120e6c5..ba468b79605 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -38,18 +38,22 @@ int PoolOutputSize(int input_size, int filter_size, int padding_1,
   }
   PADDLE_ENFORCE_GT(
       output_size, 0,
-      "ShapeError: the output size must be greater than 0. But received: "
-      "output_size = %d due to the settings of input_size(%d), padding(%d,%d), "
-      "k_size(%d) and stride(%d). Please check again!",
-      output_size, input_size, padding_1, padding_2, filter_size, stride);
+      platform::errors::InvalidArgument(
+          "the output size must be greater than 0. But received: "
+          "output_size = %d due to the settings of input_size(%d), "
+          "padding(%d,%d), "
+          "k_size(%d) and stride(%d). Please check again!",
+          output_size, input_size, padding_1, padding_2, filter_size, stride));
   return output_size;
 }
 
 void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                    "X(Input) of Pooling should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                    "Out(Output) of Pooling should not be null.");
+  PADDLE_ENFORCE_EQ(
+      ctx->HasInput("X"), true,
+      platform::errors::NotFound("Input(X) of Pool operator is not found."));
+  PADDLE_ENFORCE_EQ(
+      ctx->HasOutput("Out"), true,
+      platform::errors::NotFound("Output(Out) of Pool operator is not found."));
 
   std::string pooling_type = ctx->Attrs().Get<std::string>("pooling_type");
   std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
@@ -65,28 +69,32 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
   auto in_x_dims = ctx->GetInputDim("X");
   PADDLE_ENFORCE_EQ(
       in_x_dims.size() == 4 || in_x_dims.size() == 5, true,
-      "ShapeError: the input of Op(pool) should be 4-D or 5-D Tensor. But "
-      "received: %u-D Tensor and it's shape is [%s].",
-      in_x_dims.size(), in_x_dims);
+      platform::errors::InvalidArgument(
+          "the input of Op(pool) should be 4-D or 5-D Tensor. But "
+          "received: %u-D Tensor and it's shape is [%s].",
+          in_x_dims.size(), in_x_dims));
 
   PADDLE_ENFORCE_EQ(
       in_x_dims.size() - ksize.size(), 2U,
-      "ShapeError: the dimension of input minus the size of "
-      "Attr(ksize) must be euqal to 2 in Op(pool). "
-      "But received: the dimension of input minus the size "
-      "of Attr(ksize) is %d, the "
-      "input's dimension is %d, the shape of input "
-      "is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].",
-      in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims,
-      ksize.size(), framework::make_ddim(ksize));
-
-  PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
-                    "ShapeError: the size of Attr(ksize) and Attr(strides) in "
-                    "Op(pool) must be equal. "
-                    "But received: Attr(ksize)'s size is %d, Attr(strides)'s "
-                    "size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].",
-                    ksize.size(), strides.size(), framework::make_ddim(ksize),
-                    framework::make_ddim(strides));
+      platform::errors::InvalidArgument(
+          "the dimension of input minus the size of "
+          "Attr(ksize) must be euqal to 2 in Op(pool). "
+          "But received: the dimension of input minus the size "
+          "of Attr(ksize) is %d, the "
+          "input's dimension is %d, the shape of input "
+          "is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].",
+          in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims,
+          ksize.size(), framework::make_ddim(ksize)));
+
+  PADDLE_ENFORCE_EQ(
+      ksize.size(), strides.size(),
+      platform::errors::InvalidArgument(
+          "the size of Attr(ksize) and Attr(strides) in "
+          "Op(pool) must be equal. "
+          "But received: Attr(ksize)'s size is %d, Attr(strides)'s "
+          "size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].",
+          ksize.size(), strides.size(), framework::make_ddim(ksize),
+          framework::make_ddim(strides)));
 
   // MKL-DNN Kernels are using NCHW order of dims description
   // so we ignore data_format consideration for MKL-DNN kernel
@@ -182,9 +190,12 @@ framework::OpKernelType PoolOp::GetKernelTypeForVar(
 }
 
 void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) must not be null.");
+  PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
+                    platform::errors::NotFound(
+                        "Input(X) of Pool Gradoperator is not found."));
   PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true,
-                    "Input(X@GRAD) should not be null.");
+                    platform::errors::NotFound(
+                        "Input(X@GRAD) of Pool Gradoperator is not found."));
   ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
 }
 
@@ -210,7 +221,8 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
   auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
   if (input_data_type == framework::proto::VarType::FP16) {
     PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
-                      "float16 can only be used when CUDNN is used");
+                      platform::errors::InvalidArgument(
+                          "Float16 can only be used when CUDNN is used"));
   }
   return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
                                  library_);
diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h
index 48fb6793d2a..677c724069c 100644
--- a/paddle/fluid/operators/pool_op.h
+++ b/paddle/fluid/operators/pool_op.h
@@ -81,9 +81,11 @@ inline void UpdatePadding(std::vector<T>* paddings, const bool global_pooling,
       paddings->insert(paddings->begin() + 2 * i + 1, copy_pad);
     }
   } else {
-    PADDLE_ENFORCE_EQ(
-        data_dims.size() * 2, paddings->size(),
-        "Paddings size should be the same or twice as the pooling size.");
+    PADDLE_ENFORCE_EQ(data_dims.size() * 2, paddings->size(),
+                      platform::errors::InvalidArgument(
+                          "Paddings size %d should be the same or twice as the "
+                          "pooling size %d.",
+                          paddings->size(), data_dims.size() * 2));
   }
 
   // when padding_algorithm is "VALID" or "SAME"
@@ -200,7 +202,10 @@ class PoolKernel : public framework::OpKernel<T> {
                          pool_process, exclusive, adaptive, out);
         }
       } break;
-      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
+      default: {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Pool op only supports 2D and 3D input."));
+      }
     }
   }
 };
@@ -287,7 +292,10 @@ class PoolGradKernel : public framework::OpKernel<T> {
                             adaptive, in_x_grad);
           }
         } break;
-        default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
+        default: {
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "Pool op only supports 2D and 3D input."));
+        }
       }
     }
   }
diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc
index 0371ea5b09b..3e44025e5b0 100644
--- a/paddle/fluid/operators/pool_with_index_op.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cc
@@ -46,8 +46,11 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
     std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
     bool adaptive = ctx->Attrs().Get<bool>("adaptive");
 
-    PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
-                   "Pooling intput should be 4-D or 5-D tensor.");
+    PADDLE_ENFORCE(
+        in_x_dims.size() == 4 || in_x_dims.size() == 5,
+        platform::errors::InvalidArgument("Pooling intput should be 4-D or 5-D "
+                                          "tensor but received %dD-Tensor",
+                                          in_x_dims.size()));
 
     if (ctx->Attrs().Get<bool>("global_pooling")) {
       ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
@@ -57,16 +60,21 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
       }
     }
 
-    PADDLE_ENFORCE_EQ(in_x_dims.size() - ksize.size(), 2U,
-                      platform::errors::InvalidArgument(
-                          "Input size and pooling size should be consistent."));
-    PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
-                      platform::errors::InvalidArgument(
-                          "Strides size and pooling size should be the same."));
+    PADDLE_ENFORCE_EQ(
+        in_x_dims.size() - ksize.size(), 2U,
+        platform::errors::InvalidArgument(
+            "The input size %d minus the kernel size %d should equal to 2.",
+            in_x_dims.size(), ksize.size()));
+    PADDLE_ENFORCE_EQ(
+        ksize.size(), strides.size(),
+        platform::errors::InvalidArgument(
+            "Strides size %d and pooling size %d should be the same.",
+            strides.size(), ksize.size()));
     PADDLE_ENFORCE_EQ(
         ksize.size(), paddings.size(),
         platform::errors::InvalidArgument(
-            "Paddings size and pooling size should be the same."));
+            "Paddings size %d and pooling size %d should be the same.",
+            paddings.size(), ksize.size()));
 
     std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
     if (adaptive) {
diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h
index a6bec121d4f..065d90704cf 100644
--- a/paddle/fluid/operators/pool_with_index_op.h
+++ b/paddle/fluid/operators/pool_with_index_op.h
@@ -61,7 +61,10 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
         pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out,
                        mask);
       } break;
-      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
+      default: {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Pool op only supports 2D and 3D input."));
+      }
     }
   }
 };
@@ -106,7 +109,10 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
           pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides,
                           paddings, adaptive, in_x_grad);
         } break;
-        default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
+        default: {
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "Pool op only supports 2D and 3D input."));
+        }
       }
     }
   }
diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu
index 22fec3244fa..748b6036008 100644
--- a/paddle/fluid/operators/psroi_pool_op.cu
+++ b/paddle/fluid/operators/psroi_pool_op.cu
@@ -176,22 +176,31 @@ class GPUPSROIPoolOpKernel : public framework::OpKernel<T> {
     int height = in_dims[2];
     int width = in_dims[3];
 
-    PADDLE_ENFORCE_EQ(input_channels,
-                      output_channels * pooled_height * pooled_width,
-                      "the channels of input X should equal the product of "
-                      "output_channels x pooled_height x pooled_width");
+    PADDLE_ENFORCE_EQ(
+        input_channels, output_channels * pooled_height * pooled_width,
+        platform::errors::InvalidArgument(
+            "The channels %d of input X should equal the product of "
+            "output_channels %d x pooled_height %d x pooled_width %d.",
+            input_channels, output_channels, pooled_height, pooled_width));
 
     int rois_num = rois->dims()[0];
     if (rois_num == 0) return;
 
     auto rois_lod = rois->lod().back();
     int rois_batch_size = rois_lod.size() - 1;
-    PADDLE_ENFORCE_EQ(
-        rois_batch_size, batch_size,
-        "The rois_batch_size and input(X) batch_size must be the same.");
+    PADDLE_ENFORCE_EQ(rois_batch_size, batch_size,
+                      platform::errors::InvalidArgument(
+                          "The batch size of input(ROIs) and input(X) must be "
+                          "the same but received batch size of input(ROIs) and "
+                          "input(X) is %d and %d respectively.",
+                          rois_batch_size, batch_size));
     int rois_num_with_lod = rois_lod[rois_batch_size];
     PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
-                      "The rois_num from input and lod must be the same.");
+                      platform::errors::InvalidArgument(
+                          "The number of rois from input(ROIs) and its LOD "
+                          "must be the same. Received rois %d of input(ROIs) "
+                          "but the number of rois %d from its LOD is %d",
+                          rois_num, rois_num_with_lod));
 
     // set rois batch id
     framework::Tensor rois_batch_id_list;
diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu
index 98d9ef6b6e1..562ff8d576b 100644
--- a/paddle/fluid/operators/roi_pool_op.cu
+++ b/paddle/fluid/operators/roi_pool_op.cu
@@ -160,9 +160,14 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
     if (ctx.HasInput("RoisNum")) {
       auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
       int rois_batch_size = rois_num_t->numel();
+
       PADDLE_ENFORCE_EQ(
           rois_batch_size, batch_size,
-          "The rois_batch_size and imgs batch_size must be the same.");
+          platform::errors::InvalidArgument(
+              "The batch size of input(ROIs) and input(X) must be the same but "
+              "received batch size of input(ROIs) and input(X) is %d and %d "
+              "respectively.",
+              rois_batch_size, batch_size));
       std::vector<int> rois_num_list(rois_batch_size);
       memory::Copy(cplace, rois_num_list.data(), gplace,
                    rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
@@ -178,10 +183,19 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
       int rois_batch_size = rois_lod.size() - 1;
       PADDLE_ENFORCE_EQ(
           rois_batch_size, batch_size,
-          "The rois_batch_size and imgs batch_size must be the same.");
+          platform::errors::InvalidArgument(
+              "The batch size of input(ROIs) and input(X) must be the same but "
+              "received batch size of input(ROIs) and input(X) is %d and %d "
+              "respectively.",
+              rois_batch_size, batch_size));
+
       int rois_num_with_lod = rois_lod[rois_batch_size];
       PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
-                        "The rois_num from input and lod must be the same.");
+                        platform::errors::InvalidArgument(
+                            "The number of rois from input(ROIs) and its LOD "
+                            "must be the same. Received rois %d of input(ROIs) "
+                            "but the number of rois %d from its LOD is %d",
+                            rois_num, rois_num_with_lod));
       for (int n = 0; n < rois_batch_size; ++n) {
         for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
           roi_batch_id_data[i] = n;
-- 
GitLab


From d01f6269445aa81971d175fa1931e11601e67360 Mon Sep 17 00:00:00 2001
From: furnace <34057289+windstamp@users.noreply.github.com>
Date: Mon, 28 Sep 2020 00:06:46 +0800
Subject: [PATCH 259/261] update mv op according PR#27024 (#27474)

---
 paddle/fluid/operators/mv_op.cc               | 16 +++----
 paddle/fluid/operators/mv_op.cu               | 31 ++++++-------
 paddle/fluid/operators/mv_op.h                | 30 ++++++------
 .../fluid/tests/unittests/test_mv_op.py       | 46 +++++++++++++------
 4 files changed, 69 insertions(+), 54 deletions(-)

diff --git a/paddle/fluid/operators/mv_op.cc b/paddle/fluid/operators/mv_op.cc
index 1339982adaa..cce066ec404 100644
--- a/paddle/fluid/operators/mv_op.cc
+++ b/paddle/fluid/operators/mv_op.cc
@@ -42,21 +42,21 @@ class MVOp : public framework::OperatorWithKernel {
     OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "mv");
 
     auto dim_x = context->GetInputDim("X");
-    auto dim_y = context->GetInputDim("Vec");
+    auto dim_vec = context->GetInputDim("Vec");
     PADDLE_ENFORCE_EQ(
         dim_x.size(), 2,
         platform::errors::InvalidArgument(
             "The rank of input X should be 2, but is %d", dim_x.size()));
     PADDLE_ENFORCE_EQ(
-        dim_y.size(), 1,
+        dim_vec.size(), 1,
         platform::errors::InvalidArgument(
-            "The rank of input Vec should be 1, but is %d", dim_y.size()));
-    PADDLE_ENFORCE_EQ(dim_x[1] == dim_y[0], true,
+            "The rank of input Vec should be 1, but is %d", dim_vec.size()));
+    PADDLE_ENFORCE_EQ(dim_x[1], dim_vec[0],
                       platform::errors::InvalidArgument(
-                          "The length of input X' second dim should equal the "
-                          "length of input Vec,"
-                          " but X[%d, %d], Vec[%d]",
-                          dim_x[0], dim_x[1], dim_y[0]));
+                          "X's second dimension is expected to be equal to "
+                          "Vec's first dimension"
+                          "but recieved X'shape = [%s], Vec's shape = [%s]",
+                          dim_x, dim_vec));
 
     framework::DDim dim_out = framework::make_ddim({dim_x[0]});
 
diff --git a/paddle/fluid/operators/mv_op.cu b/paddle/fluid/operators/mv_op.cu
index 9a16fe025cd..b6d829392e3 100644
--- a/paddle/fluid/operators/mv_op.cu
+++ b/paddle/fluid/operators/mv_op.cu
@@ -19,8 +19,8 @@ namespace paddle {
 namespace operators {
 
 template <typename T>
-__global__ void MVGradCUDAKernel(const int m, const int n, const T *dout,
-                                 const T *vec, T *dx) {
+__global__ void MVGradDxCUDAKernel(const int m, const int n, const T *dout,
+                                   const T *vec, T *dx) {
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
   for (; idx < m * n; idx += blockDim.x * gridDim.x) {
     int i = idx / n;
@@ -52,32 +52,31 @@ class MVGradKernel<platform::CUDADeviceContext, T>
     int m = dim_x[0];
     int n = dim_x[1];
 
-    dx->Resize(framework::make_ddim({m * n}));
-
     // get data ptr
     const T *x_data = x->data<T>();
     const T *vec_data = vec->data<T>();
     const T *dout_data = dout->data<T>();
 
-    T *dx_data = dx->mutable_data<T>(context.GetPlace());
-    T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
-
     auto &dev_ctx =
         context.template device_context<platform::CUDADeviceContext>();
     auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
-
-    // calculate dx
     auto stream = context.cuda_device_context().stream();
     auto config = GetGpuLaunchConfig1D(dev_ctx, m * n);
-    MVGradCUDAKernel<
-        T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
-        m, n, dout_data, vec_data, dx_data);
 
-    dx->Resize(framework::make_ddim({m, n}));
+    if (dx) {
+      T *dx_data = dx->mutable_data<T>(context.GetPlace());
+
+      MVGradDxCUDAKernel<
+          T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
+          m, n, dout_data, vec_data, dx_data);
+    }
+
+    if (dvec) {
+      T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
 
-    // calculate dvec
-    blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
-              static_cast<T>(0), dvec_data);
+      blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
+                static_cast<T>(0), dvec_data);
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/mv_op.h b/paddle/fluid/operators/mv_op.h
index 3c63f3640ff..e2944996298 100644
--- a/paddle/fluid/operators/mv_op.h
+++ b/paddle/fluid/operators/mv_op.h
@@ -74,30 +74,30 @@ class MVGradKernel : public framework::OpKernel<T> {
     int m = dim_x[0];
     int n = dim_x[1];
 
-    dx->Resize(framework::make_ddim({m * n}));
-
     // get data ptr
     const T *x_data = x->data<T>();
     const T *vec_data = vec->data<T>();
     const T *dout_data = dout->data<T>();
 
-    T *dx_data = dx->mutable_data<T>(context.GetPlace());
-    T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
-
-    auto &dev_ctx = context.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    if (dx) {
+      T *dx_data = dx->mutable_data<T>(context.GetPlace());
 
-    // calculate dx
-    for (int i = 0; i < m; ++i) {
-      for (int j = 0; j < n; ++j)
-        dx_data[i * n + j] = dout_data[i] * vec_data[j];
+      for (int i = 0; i < m; ++i) {
+        for (int j = 0; j < n; ++j) {
+          dx_data[i * n + j] = dout_data[i] * vec_data[j];
+        }
+      }
     }
 
-    dx->Resize(framework::make_ddim({m, n}));
+    if (dvec) {
+      T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
+
+      auto &dev_ctx = context.template device_context<DeviceContext>();
+      auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
 
-    // calculate dvec
-    blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
-              static_cast<T>(0), dvec_data);
+      blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
+                static_cast<T>(0), dvec_data);
+    }
   }
 };
 
diff --git a/python/paddle/fluid/tests/unittests/test_mv_op.py b/python/paddle/fluid/tests/unittests/test_mv_op.py
index 6b930e59aa5..e0d23e7871f 100644
--- a/python/paddle/fluid/tests/unittests/test_mv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mv_op.py
@@ -20,6 +20,7 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 import paddle.fluid.core as core
+from paddle.static import program_guard, Program
 from op_test import OpTest
 
 
@@ -37,7 +38,7 @@ class TestMVOp(OpTest):
         self.check_grad(['X', 'Vec'], 'Out')
 
     def init_config(self):
-        self.x = np.random.random((5, 100)).astype("float64")
+        self.x = np.random.random((2, 100)).astype("float64")
         self.vec = np.random.random((100)).astype("float64")
 
 
@@ -57,21 +58,36 @@ class TestMVAPI(unittest.TestCase):
         paddle.enable_static()
 
     def test_static_graph(self):
-        paddle.enable_static()
+        for x_stop_gradient in [False, True]:
+            for vec_stop_gradient in [False, True]:
+
+                paddle.enable_static()
+
+                train_program = Program()
+                startup_program = Program()
+
+                self.input_x = np.random.rand(5, 100).astype("float64")
+                self.input_vec = np.random.rand(100).astype("float64")
+
+                with program_guard(train_program, startup_program):
+                    data_x = paddle.static.data(
+                        "x", shape=[5, 100], dtype="float64")
+                    data_vec = paddle.static.data(
+                        "vec", shape=[100], dtype="float64")
+
+                    data_x.stop_gradient = x_stop_gradient
+                    data_vec.stop_gradient = vec_stop_gradient
+
+                    result_vec = paddle.mv(data_x, data_vec)
 
-        self.input_x = np.random.rand(5, 100).astype("float64")
-        self.input_vec = np.random.rand(100).astype("float64")
-
-        data_x = paddle.static.data("x", shape=[5, 100], dtype="float64")
-        data_vec = paddle.static.data("vec", shape=[100], dtype="float64")
-        result_vec = paddle.mv(data_x, data_vec)
-        self.place = paddle.CPUPlace()
-        exe = paddle.static.Executor(self.place)
-        res, = exe.run(feed={"x": self.input_x,
-                             "vec": self.input_vec},
-                       fetch_list=[result_vec])
-        z_expected = np.array(np.dot(self.input_x, self.input_vec))
-        self.assertTrue(np.allclose(res, z_expected))
+                    self.place = paddle.CPUPlace()
+                    exe = paddle.static.Executor(self.place)
+                    res, = exe.run(
+                        feed={"x": self.input_x,
+                              "vec": self.input_vec},
+                        fetch_list=[result_vec])
+                    z_expected = np.array(np.dot(self.input_x, self.input_vec))
+                    self.assertTrue(np.allclose(res, z_expected))
 
 
 class TestMVError(unittest.TestCase):
-- 
GitLab


From 4e8f18ab2592da873bb72bc59ce836fde70c7002 Mon Sep 17 00:00:00 2001
From: Dong Daxiang <35550832+guru4elephant@users.noreply.github.com>
Date: Mon, 28 Sep 2020 07:56:50 +0800
Subject: [PATCH 260/261] Get final strategy (#27602)

* add get final strategy for user to print final strategy
---
 python/paddle/distributed/fleet/__init__.py   |  1 +
 .../fleet/base/distributed_strategy.py        |  3 +-
 .../distributed/fleet/base/fleet_base.py      | 41 ++++++++++++++-----
 .../test_dist_fleet_a_sync_optimizer_auto.py  |  4 +-
 ..._dist_fleet_a_sync_optimizer_auto_async.py |  4 +-
 ...st_dist_fleet_a_sync_optimizer_auto_geo.py |  4 +-
 .../test_fleet_amp_meta_optimizer.py          |  4 ++
 .../fluid/tests/unittests/test_fleet_auto.py  |  2 +
 .../fluid/tests/unittests/test_fleet_base.py  |  2 +
 .../test_fleet_lamb_meta_optimizer.py         |  2 +
 .../test_fleet_lars_meta_optimizer.py         |  2 +
 11 files changed, 50 insertions(+), 19 deletions(-)

diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py
index e89cb1f5ec4..ad5a942b53e 100644
--- a/python/paddle/distributed/fleet/__init__.py
+++ b/python/paddle/distributed/fleet/__init__.py
@@ -30,6 +30,7 @@ __all__ = [
 ]
 
 fleet = Fleet()
+_final_strategy = fleet._final_strategy
 init = fleet.init
 is_first_worker = fleet.is_first_worker
 worker_index = fleet.worker_index
diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index 316b6494e34..1fc29ad0428 100755
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -1244,8 +1244,7 @@ class DistributedStrategy(object):
                         if getattr(self.strategy, f.name):
                             draws += border + "\n"
                             draws += h1_format.format(
-                                "{} = True, please check {}_configs".format(
-                                    f.name, f.name))
+                                "{}=True <-> {}_configs".format(f.name, f.name))
                             draws += line + "\n"
                             my_configs = getattr(self.strategy,
                                                  f.name + "_configs")
diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
index d0658efdca3..3fdd6e92483 100644
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -119,6 +119,8 @@ class Fleet(object):
         self.strategy_compiler = None
         self._is_collective = False
         self._runtime_handle = None
+        self._util = None
+        self._context = {}
 
     def init(self, role_maker=None, is_collective=False):
         """
@@ -233,7 +235,7 @@ class Fleet(object):
 
         Returns:
             int: worker numbers
-
+        
         Examples:
             .. code-block:: python
 
@@ -569,8 +571,9 @@ class Fleet(object):
 
         if strategy == None:
             strategy = DistributedStrategy()
-        self.user_defined_strategy = strategy
-        self.valid_strategy = None
+
+        self._user_defined_strategy = copy.deepcopy(strategy)
+        self._context = {}
         return self
 
     @dygraph_only
@@ -909,6 +912,15 @@ class Fleet(object):
         # imitate target optimizer retrieval
         return self.user_defined_optimizer.clear_grad()
 
+    def _final_strategy(self):
+        if "valid_strategy" not in self._context:
+            print(
+                "WARNING: You may need to call minimize function before this function is called"
+            )
+            return {}
+        else:
+            return self._context["valid_strategy"]
+
     def minimize(self,
                  loss,
                  startup_program=None,
@@ -958,12 +970,15 @@ class Fleet(object):
                 # for more examples, please reference https://github.com/PaddlePaddle/FleetX
 
         """
+        context = {}
+        context["user_defined_strategy"] = copy.deepcopy(
+            self._user_defined_strategy)
         if paddle.fluid.framework.in_dygraph_mode():
             # imitate target optimizer retrieval
             target_opt = self.user_defined_optimizer
+            self._context = context
             return target_opt.minimize(loss)
 
-        context = {}
         # cache original feed forward program
         self.origin_main_program = loss.block.program
         context["origin_main_program"] = self.origin_main_program
@@ -984,17 +999,19 @@ class Fleet(object):
             MetaOptimizerFactory()._get_valid_meta_optimizers(
                 self.user_defined_optimizer)
 
-        context["user_defined_strategy"] = copy.copy(self.user_defined_strategy)
+        context["user_defined_strategy"] = copy.deepcopy(
+            self._user_defined_strategy)
+        copy_user_defined_strategy = copy.deepcopy(self._user_defined_strategy)
 
         # trigger the auto-parallel in very strict condition
         # strategy = DistributedStrategy()
         # strategy.auto = True
         # optimizer = paddle.optimizer.SGD(learning_rate=0.1)
         # optimizer = fleet.distributed_optimizer(optimizer, strategy)
-        if self.user_defined_strategy._is_strict_auto():
+        if copy_user_defined_strategy._is_strict_auto():
             # turn on all the strategy for each optimizer
             for opt in distributed_optimizer_list:
-                opt._enable_strategy(self.user_defined_strategy, context)
+                opt._enable_strategy(copy_user_defined_strategy, context)
 
         valid_optimizer_list = []
         valid_graph_optimizer_list = []
@@ -1003,7 +1020,7 @@ class Fleet(object):
         for opt in distributed_optimizer_list:
             opt._set_basic_info(loss, self._role_maker,
                                 self.user_defined_optimizer,
-                                self.user_defined_strategy)
+                                copy_user_defined_strategy)
             if opt._can_apply() and not opt._is_graph_out():
                 valid_optimizer_list.append(opt)
             elif opt._can_apply() and opt._is_graph_out():
@@ -1014,13 +1031,15 @@ class Fleet(object):
         meta_optimizer, graph_optimizer = \
             self.strategy_compiler.generate_optimizer(
                 loss, self._role_maker, self.user_defined_optimizer,
-                self.user_defined_strategy, valid_optimizer_list,
+                copy_user_defined_strategy, valid_optimizer_list,
                 valid_graph_optimizer_list)
 
         valid_strategy = self.strategy_compiler._get_valid_strategy(
-            self.user_defined_strategy, can_not_apply_optimizer_list)
+            copy_user_defined_strategy, can_not_apply_optimizer_list)
+
+        context["valid_strategy"] = copy.deepcopy(valid_strategy)
 
-        context["valid_strategy"] = valid_strategy
+        self._context = context
 
         self.valid_strategy = valid_strategy
         self.valid_strategy._enable_env()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py
index 5b7e0fb94c6..b8393f1e28a 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py
@@ -60,8 +60,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
-        self.assertTrue(optimizer.user_defined_strategy.a_sync)
-        a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
+        self.assertTrue(fleet._final_strategy().a_sync)
+        a_sync_configs = fleet._final_strategy().a_sync_configs
         self.assertTrue(a_sync_configs['k_steps'] == 0)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py
index 3dff9d0f9d8..49b34f059e8 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py
@@ -72,8 +72,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
-        self.assertTrue(optimizer.user_defined_strategy.a_sync)
-        a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
+        self.assertTrue(fleet._final_strategy().a_sync)
+        a_sync_configs = fleet._final_strategy().a_sync_configs
         self.assertTrue(a_sync_configs['k_steps'] == 0)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py
index bdfa3a9a7d5..334a4e028b2 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py
@@ -60,8 +60,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
-        self.assertTrue(optimizer.user_defined_strategy.a_sync)
-        a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
+        self.assertTrue(fleet._final_strategy().a_sync)
+        a_sync_configs = fleet._final_strategy().a_sync_configs
         self.assertTrue(a_sync_configs['k_steps'] == 800)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
index 73e014b3500..362428631e6 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
@@ -18,6 +18,8 @@ import unittest
 import paddle
 import os
 
+paddle.enable_static()
+
 
 class TestFleetAMPOptimizer(unittest.TestCase):
     def setUp(self):
@@ -55,6 +57,8 @@ class TestFleetAMPOptimizer(unittest.TestCase):
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
 
+        strategy = fleet._final_strategy()
+
         ops = [op.type for op in avg_cost.block.ops]
         self.assertIn('cast', ops)
         self.assertIn('check_finite_and_unscale', ops)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_auto.py b/python/paddle/fluid/tests/unittests/test_fleet_auto.py
index 020f2f4db38..0a4e2f631d6 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_auto.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_auto.py
@@ -18,6 +18,8 @@ import os
 import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker
 
+paddle.enable_static()
+
 
 class TestDistributedStrategyAuto(unittest.TestCase):
     def setUp(self):
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py
index ccd57c4d515..4945c158025 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py
@@ -167,6 +167,8 @@ class TestFleetDygraph(unittest.TestCase):
         state_dict = adam.state_dict()
         adam.set_state_dict(state_dict)
 
+        final_strategy = fleet._final_strategy()
+
 
 class TestFleetBaseSingleRunCollective(unittest.TestCase):
     def setUp(self):
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
index ec055178d90..022e0b99ce8 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py
@@ -19,6 +19,8 @@ import os
 import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker
 
+paddle.enable_static()
+
 
 class TestFleetLambMetaOptimizer(unittest.TestCase):
     def setUp(self):
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
index 0a70710b459..e4cc3682d1a 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py
@@ -19,6 +19,8 @@ import os
 import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker
 
+paddle.enable_static()
+
 
 class TestFleetLarsMetaOptimizer(unittest.TestCase):
     def setUp(self):
-- 
GitLab


From 7c5162400f546b5dd1f666e4b66f7e6256ac7932 Mon Sep 17 00:00:00 2001
From: Aurelius84 <zhangliujie@baidu.com>
Date: Mon, 28 Sep 2020 08:07:30 +0800
Subject: [PATCH 261/261] [API 2.0]Migrate api example for
 gradients/append_backward/program_guard (#27570)

* modify sample code

* variable -> tensor

* migrate program_guard sample code

* refine error message

* migrate program_guard

* refine comment style

* fix indent
---
 python/paddle/fluid/backward.py  | 126 ++++++++++++++++---------------
 python/paddle/fluid/framework.py |  39 +++++-----
 2 files changed, 86 insertions(+), 79 deletions(-)

diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 478fecf74e4..590d76ae170 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -1291,17 +1291,17 @@ def append_backward(loss,
     It will be automatically invoked by the optimizer's `minimize` function.
 
     Parameters:
-        loss( :ref:`api_guide_Variable_en` ): The loss variable of the network.
-        parameter_list(list[Variable|str], optional): List of Parameters or Parameter.names
+        loss(Tensor): The loss Tensor of the network.
+        parameter_list(list[Tensor|str], optional): List of Parameters or Parameter.names
                                            that need to be updated by optimizers.
                                            If it is None, all parameters
                                            will be updated.
                                            Default: None.
-        no_grad_set(set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients
-                               should be ignored. All variables with
+        no_grad_set(set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
+                               should be ignored. All Tensors with
                                `stop_gradient=True` from all blocks will
                                be automatically added into this set.
-                               If this parameter is not None, the Variables or Variable.names in this set will be added to the default set.
+                               If this parameter is not None, the Tensors or Tensor.names in this set will be added to the default set.
                                Default: None.
         callbacks(list[callable object], optional): List of callback functions.
                                                The callbacks are used for
@@ -1312,70 +1312,73 @@ def append_backward(loss,
                                                new gradient operator is added
                                                into the program. The callable
                                                object must have two input
-                                               parameters: 'block' and 'context'.
-                                               The 'block' is the :ref:`api_guide_Block_en` which
+                                               parameters: ``block`` and ``context`` .
+                                               The ``block`` is the :ref:`api_guide_Block_en` which
                                                the new gradient operator will
-                                               be added to. The 'context' is a
+                                               be added to. The ``context`` is a
                                                map, whose keys are gradient
-                                               variable names and values are
-                                               corresponding original :ref:`api_guide_Variable_en` .
-                                               In addition to this, the 'context'
+                                               Tensor names and values are
+                                               corresponding original :ref:`api_guide_tensor_en` .
+                                               In addition to this, the ``context``
                                                has another special key-value pair:
-                                               the key is string '__current_op_desc__'
+                                               the key is string ``__current_op_desc__``
                                                and the value is the op_desc of the
                                                gradient operator who has just
                                                triggered the callable object.
                                                Default: None.
 
     Returns:
-        list of tuple ( :ref:`api_guide_Variable_en` , :ref:`api_guide_Variable_en` ): Pairs of parameter and its corresponding gradients.
-        The key is the parameter and the value is gradient variable.
+        list of tuple ( :ref:`api_guide_tensor_en` , :ref:`api_guide_tensor_en` ): Pairs of parameter and its corresponding gradients.
+        The key is the parameter and the value is gradient Tensor.
 
     Raises:
-        AssertionError: If `loss` is not an instance of Variable.
+        AssertionError: If ``loss`` is not an instance of Tensor.
 
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
+            import paddle
+            import paddle.nn.functional as F
 
-            x = fluid.data(name='x', shape=[None, 13], dtype='int64')
-            y = fluid.data(name='y', shape=[None, 1], dtype='float32')
-            x_emb = fluid.embedding(x, size=[100, 256])
-            y_predict = fluid.layers.fc(input=x_emb, size=1, act=None, name='my_fc')
-            loss = fluid.layers.square_error_cost(input=y_predict, label=y)
-            avg_loss = fluid.layers.mean(loss)
+            paddle.enable_static()
+
+            x = paddle.static.data(name='x', shape=[None, 13], dtype='int64')
+            y = paddle.static.data(name='y', shape=[None, 1], dtype='float32')
+            x_emb = paddle.static.nn.embedding(x, size=[100, 256])
+            y_predict = paddle.static.nn.fc(input=x_emb, size=1, act=None, name='my_fc')
+            loss = F.square_error_cost(input=y_predict, label=y)
+            avg_loss = paddle.mean(loss)
 
             # Get all weights in main_program, not include bias.
-            all_weights = [param for param in fluid.default_main_program().block(0).all_parameters() if 'w_' in param.name]
+            all_weights = [param for param in paddle.static.default_main_program().block(0).all_parameters() if 'w_' in param.name]
             all_weights_name = [w.name for w in all_weights]
 
             # return all param_grads needed to be updated if parameter_list set default None.
-            p_g_list1 = fluid.backward.append_backward(loss=avg_loss)
+            p_g_list1 = paddle.static.append_backward(loss=avg_loss)
             # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)]
 
-            # return the param_grads corresponding to parameter_list that can be list of param (Variable).
-            p_g_list2 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights)
+            # return the param_grads corresponding to parameter_list that can be list of param (Tensor).
+            p_g_list2 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights)
             # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]
 
             # parameter_list can be list of param.name (str).
-            p_g_list3 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights_name)
+            p_g_list3 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights_name)
             # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]
 
-            # no_grad_set can be set of Variables that means grad will be cut off from these Variables.
-            p_g_list4 = fluid.backward.append_backward(loss=avg_loss, no_grad_set=set([x_emb]))
+            # no_grad_set can be set of Tensors that means grad will be cut off from these Tensors.
+            p_g_list4 = paddle.static.append_backward(loss=avg_loss, no_grad_set=set([x_emb]))
             # output: [(my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)]
 
-            # no_grad_set can be set of Variable.name when the Variable is created inside layers and can't be specified explicitly.
-            p_g_list5 = fluid.backward.append_backward(loss=avg_loss, no_grad_set=set(['my_fc.b_0']))
+            # no_grad_set can be set of Tensor.name when the Tensor is created inside layers and can't be specified explicitly.
+            p_g_list5 = paddle.static.append_backward(loss=avg_loss, no_grad_set=set(['my_fc.b_0']))
             # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]
 
             # return [] because all param_grads are filtered by no_grad_set.
-            p_g_list6 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights, no_grad_set=set(all_weights))
+            p_g_list6 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights, no_grad_set=set(all_weights))
 
     """
     check_type(loss, 'loss', framework.Variable,
-               'fluid.backward.append_backward')
+               'paddle.static.append_backward')
 
     if loss.op is None:
         # the loss is from a cloned program. Find loss op manually.
@@ -1387,7 +1390,7 @@ def append_backward(loss,
 
     if callbacks is not None:
         check_type(callbacks, 'callbacks', list,
-                   'fluid.backward.append_backward')
+                   'paddle.static.append_backward')
 
     program = loss.block.program
     root_block = program.block(0)
@@ -1727,21 +1730,21 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
     Backpropagate the gradients of targets to inputs.
 
     Args:
-        targets(Variable|list[Variable]): The target variables
-        inputs(Variable|list[Variable]): The input variables
-        target_gradients (Variable|list[Variable], optional): The gradient variables
+        targets(Tensor|list[Tensor]): The target Tensors
+        inputs(Tensor|list[Tensor]): The input Tensors
+        target_gradients (Tensor|list[Tensor], optional): The gradient Tensors
             of targets which has the same shape with targets, If None, ones will
             be created for them.
-        no_grad_set(set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients
-                               should be ignored. All variables with
+        no_grad_set(set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
+                               should be ignored. All Tensors with
                                `stop_gradient=True` from all blocks will
                                be automatically added into this set.
-                               If this parameter is not None, the Variables or Variable.names in this set will be added to the default set.
+                               If this parameter is not None, the Tensors or Tensor.names in this set will be added to the default set.
                                Default: None.
 
     Return:
-        (list[Variable]): A list of gradients for inputs
-        If an input does not affect targets, the corresponding gradient variable
+        (list[Tensor]): A list of gradients for inputs
+        If an input does not affect targets, the corresponding gradient Tensor
         will be None
     """
     targets = _as_list(targets)
@@ -1865,41 +1868,42 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None):
     Backpropagate the gradients of targets to inputs.
 
     Args:
-        targets (Variable|list[Variable]): The target variables.
-        inputs (Variable|list[Variable]): The input variables.
-        target_gradients (Variable|list[Variable], optional): The gradient variables
+        targets (Tensor|list[Tensor]): The target Tensors.
+        inputs (Tensor|list[Tensor]): The input Tensors.
+        target_gradients (Tensor|list[Tensor], optional): The gradient Tensor
             of targets which has the same shape with targets, If None, ones will
             be created for them.
-        no_grad_set (set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients
-            should be ignored. All variables with `stop_gradient=True` from all blocks will
-            be automatically added into this set. If this parameter is not None, the Variables or Variable.names
+        no_grad_set (set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
+            should be ignored. All Tensors with ``stop_gradient=True`` from all blocks will
+            be automatically added into this set. If this parameter is not None, the Tensors or Tensor.names
             in this set will be added to the default set. Default: None.
 
     Return:
-        (list[Variable]): A list of gradients for inputs
-        If an input does not affect targets, the corresponding gradient variable
+        (list[Tensor]): A list of gradients for inputs
+        If an input does not affect targets, the corresponding gradient Tensor
         will be None.
 
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
+            import paddle
+            import paddle.nn.functional as F
+
+            paddle.enable_static()
 
-            x = fluid.data(name='x', shape=[None,2,8,8], dtype='float32')
+            x = paddle.static.data(name='x', shape=[None, 2, 8, 8], dtype='float32')
             x.stop_gradient=False
-            y = fluid.layers.conv2d(x, 4, 1, bias_attr=False)
-            y = fluid.layers.relu(y)
-            y = fluid.layers.conv2d(y, 4, 1, bias_attr=False)
-            y = fluid.layers.relu(y)
-            z = fluid.gradients([y], x)
-            print(z)
+            y = paddle.static.nn.conv2d(x, 4, 1, bias_attr=False)
+            y = F.relu(y)
+            z = paddle.static.gradients([y], x)
+            print(z) # [var x@GRAD : fluid.VarType.LOD_TENSOR.shape(-1L, 2L, 8L, 8L).astype(VarType.FP32)]
     """
     check_type(targets, 'targets', (framework.Variable, list),
-               'fluid.backward.gradients')
+               'paddle.static.gradients')
     check_type(inputs, 'inputs', (framework.Variable, list),
-               'fluid.backward.gradients')
+               'paddle.static.gradients')
     check_type(target_gradients, 'target_gradients', (
-        framework.Variable, list, type(None)), 'fluid.backward.gradients')
+        framework.Variable, list, type(None)), 'paddle.static.gradients')
 
     outs = calc_gradient(targets, inputs, target_gradients, no_grad_set)
     return _as_list(outs)
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index b4cea6761dc..106c9a00361 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -5396,13 +5396,13 @@ def program_guard(main_program, startup_program=None):
     """
     :api_attr: Static Graph
 
-    Change the global main program and startup program with `"with"` statement.
-    Layer functions in the Python `"with"` block will append operators and
-    variables to the new main programs.
+    Change the global main program and startup program with ``with`` statement.
+    Layer functions in the Python ``with`` block will append operators and
+    Tensors to the new main programs.
 
     Args:
-        main_program(Program): New main program inside `"with"` statement.
-        startup_program(Program, optional): New startup program inside `"with"` 
+        main_program(Program): New main program inside ``with`` statement.
+        startup_program(Program, optional): New startup program inside ``with`` 
             statement. :code:`None` means not changing startup program, 
             default_startup_program is still used.
             Default: None.
@@ -5410,13 +5410,14 @@ def program_guard(main_program, startup_program=None):
     Examples:
        .. code-block:: python
        
-         import paddle.fluid as fluid
+          import paddle
 
-         main_program = fluid.Program()
-         startup_program = fluid.Program()
-         with fluid.program_guard(main_program, startup_program):
-             data = fluid.data(name='image', shape=[None, 784, 784], dtype='float32')
-             hidden = fluid.layers.fc(input=data, size=10, act='relu')
+          paddle.enable_static()
+          main_program = paddle.static.Program()
+          startup_program = paddle.static.Program()
+          with paddle.static.program_guard(main_program, startup_program):
+              data = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32')
+              hidden = paddle.static.nn.fc(input=data, size=10, act='relu')
 
     Notes: The temporary :code:`Program` can be used if the user does not need
     to construct either of startup program or main program.
@@ -5424,20 +5425,22 @@ def program_guard(main_program, startup_program=None):
     Examples:
        .. code-block:: python
 
-         import paddle.fluid as fluid
+          import paddle
 
-         main_program = fluid.Program()
-         # does not care about startup program. Just pass a temporary value.
-         with fluid.program_guard(main_program, fluid.Program()):
-             data = fluid.data(name='image', shape=[None, 784, 784], dtype='float32')
+          paddle.enable_static()
+          main_program = paddle.static.Program()
+          # does not care about startup program. Just pass a temporary value.
+          with paddle.static.program_guard(main_program, paddle.static.Program()):
+              data = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32')
     
     """
     from .data_feeder import check_type
-    check_type(main_program, 'main_program', Program, 'fluid.program_guard')
+    check_type(main_program, 'main_program', Program,
+               'paddle.static.program_guard')
     main_program = switch_main_program(main_program)
     if startup_program is not None:
         check_type(startup_program, 'startup_program', Program,
-                   'fluid.program_guard')
+                   'paddle.static.program_guard')
         startup_program = switch_startup_program(startup_program)
     try:
         yield
-- 
GitLab