Commit bddd4bc0, authored by dzhwinter

Merge remote-tracking branch 'origin/develop' into memory/stable

@@ -16,7 +16,9 @@ find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
           DOC "Path to TensorRT library.")
 
 if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
+  if(WITH_DSO)
     set(TENSORRT_FOUND ON)
+  endif(WITH_DSO)
 else()
   set(TENSORRT_FOUND OFF)
 endif()
...
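This guard ties TensorRT detection to `WITH_DSO`, the build mode in which Paddle loads vendor libraries dynamically at runtime rather than linking against them, so `TENSORRT_FOUND` stays off unless `WITH_DSO` is on. A minimal sketch of that runtime-loading pattern, assuming POSIX `dlopen` and the `getInferLibVersion` symbol exported by `libnvinfer` (illustrative only, not Paddle's actual dynload wrapper):

#include <cstdio>
#include <dlfcn.h>

int main() {
  // Resolve libnvinfer at runtime instead of link time; if the library
  // is missing, the binary still runs and can fall back gracefully.
  void* handle = dlopen("libnvinfer.so", RTLD_NOW | RTLD_GLOBAL);
  if (handle == nullptr) {
    std::fprintf(stderr, "TensorRT unavailable: %s\n", dlerror());
    return 1;
  }
  // Look up an entry point by name, the way dynload-style wrappers do.
  using GetVersionFn = int (*)();
  auto get_version =
      reinterpret_cast<GetVersionFn>(dlsym(handle, "getInferLibVersion"));
  if (get_version != nullptr) {
    std::printf("nvinfer version: %d\n", get_version());
  }
  dlclose(handle);
  return 0;
}

(Build with `-ldl`.)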
@@ -429,7 +429,7 @@ struct LSTM : public PatternBase {
 struct GRU : public PatternBase {
   GRU(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "lstm") {}
+      : PatternBase(pattern, name_scope, "gru") {}
 
   PDNode* operator()(PDNode* x);
...
@@ -125,7 +125,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
   VarHandlePtr h(new VarHandle(ep, "Get", var_name_val, p_ctx, p_scope));
   s->Prepare(h, time_out);
 
-  framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] {
+  framework::AsyncIO([var_name_val, s, this] {
     // prepare input
     sendrecv::VariableMessage req;
     req.set_varname(var_name_val);
@@ -166,7 +166,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
   s->Prepare(h, time_out);
 
   framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
-                      time_out, s, this] {
+                      s, this] {
     auto* var = p_scope->FindVar(in_var_name_val);
 
     ::grpc::ByteBuffer req;
...
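Both hunks prune the lambda capture lists down to what each callback body actually uses; the dropped captures (`p_scope` and `p_ctx` in `AsyncGetVar`, `time_out` in `AsyncPrefetchVar`) appear unused, which clang reports under `-Wunused-lambda-capture`, and capturing raw pointers "just in case" is a dangling-reference hazard on an asynchronous path. A minimal sketch of the pattern with a stand-in async runner (not the real `GRPCClient` code):

#include <chrono>
#include <functional>
#include <string>
#include <thread>

// Stand-in for framework::AsyncIO: run a callback on another thread.
void AsyncIO(std::function<void()> callback) {
  std::thread(std::move(callback)).detach();
}

void AsyncGet(const std::string& var_name) {
  // Capture by value only what the body needs; anything captured
  // "for later" may dangle by the time the callback actually runs.
  AsyncIO([var_name] {
    // ... build and send a request naming var_name ...
  });
}

int main() {
  AsyncGet("w@GRAD");
  // Crude synchronization for the sketch; real code joins or signals.
  std::this_thread::sleep_for(std::chrono::milliseconds(50));
  return 0;
}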
@@ -103,6 +103,58 @@ class MaxSeqPoolGradFunctor {
   }
 };
 
+template <typename T>
+class LastSeqPoolFunctor {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::LoDTensor& input,
+                  framework::Tensor* output) {
+    // Create pointers to input and output data
+    auto* in_data = input.data<T>();
+    auto* out_data = output->data<T>();
+
+    // Calculate the size of each item in the sequence
+    int64_t item_size = input.numel() / input.dims()[0];
+    auto lod = input.lod()[0];
+    int seq_num = static_cast<int>(lod.size()) - 1;
+    for (int i = 0; i < seq_num; ++i) {
+      // Calculate the length of each sequence
+      int64_t seq_len = static_cast<int64_t>(lod[i + 1] - lod[i]);
+      // Point to the beginning of the next sequence
+      in_data += seq_len * item_size;
+      // Copy the last item of the sequence to the output
+      std::memcpy(out_data, (in_data - item_size), item_size * sizeof(T));
+      out_data += item_size;
+    }
+  }
+};
+
+template <typename T>
+class FirstSeqPoolFunctor {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::LoDTensor& input,
+                  framework::Tensor* output) {
+    // Create pointers to input and output data
+    auto* in_data = input.data<T>();
+    auto* out_data = output->data<T>();
+
+    // Calculate the size of each item in the sequence
+    int64_t item_size = input.numel() / input.dims()[0];
+    auto lod = input.lod()[0];
+    int seq_num = static_cast<int>(lod.size()) - 1;
+    for (int i = 0; i < seq_num; ++i) {
+      // Calculate the length of each sequence
+      int64_t seq_len = static_cast<int64_t>(lod[i + 1] - lod[i]);
+      // Copy the first item of the sequence to the output
+      std::memcpy(out_data, in_data, item_size * sizeof(T));
+      // Point to the next sequence
+      in_data += seq_len * item_size;
+      out_data += item_size;
+    }
+  }
+};
+
 template <typename T>
 class SequencePoolFunctor<platform::CPUDeviceContext, T> {
  public:
@@ -116,6 +168,16 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
       max_pool(context, input, output, index);
       return;
     }
+    if (pooltype == "LAST") {
+      math::LastSeqPoolFunctor<T> last_pool;
+      last_pool(context, input, output);
+      return;
+    }
+    if (pooltype == "FIRST") {
+      math::FirstSeqPoolFunctor<T> first_pool;
+      first_pool(context, input, output);
+      return;
+    }
     auto lod = input.lod()[0];
     auto& place = *context.eigen_device();
     for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
@@ -133,10 +195,6 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
       } else if (pooltype == "SQRT") {
         out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
                               std::sqrt(static_cast<T>(h));
-      } else if (pooltype == "LAST") {
-        out_e.device(place) = in_e.chip(h - 1, 0);
-      } else if (pooltype == "FIRST") {
-        out_e.device(place) = in_e.chip(0, 0);
       } else {
         PADDLE_THROW("unsupported pooling pooltype");
       }
...
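The two functors added above replace the Eigen `chip` branches removed from the pooling loop: LAST and FIRST pooling perform no reduction, only one row copy per sequence, so a direct `memcpy` over the LoD offsets avoids constructing an Eigen expression for every sequence. A standalone sketch of the same offset arithmetic on toy data (plain vectors stand in for the LoDTensor; this is not Paddle code):

#include <cstring>
#include <iostream>
#include <vector>

int main() {
  // Two sequences with 2 features per item: s0 = [1 2][3 4],
  // s1 = [5 6][7 8][9 10]; LoD offsets {0, 2, 5} delimit the rows.
  std::vector<float> in = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  std::vector<std::size_t> lod = {0, 2, 5};
  const std::size_t item_size = 2;
  const std::size_t seq_num = lod.size() - 1;

  std::vector<float> first(seq_num * item_size);
  std::vector<float> last(seq_num * item_size);

  const float* in_data = in.data();
  float* first_out = first.data();
  float* last_out = last.data();
  for (std::size_t i = 0; i < seq_num; ++i) {
    const std::size_t seq_len = lod[i + 1] - lod[i];
    // FIRST: copy the leading item before advancing.
    std::memcpy(first_out, in_data, item_size * sizeof(float));
    // LAST: advance past the sequence, then copy the item just behind.
    in_data += seq_len * item_size;
    std::memcpy(last_out, in_data - item_size, item_size * sizeof(float));
    first_out += item_size;
    last_out += item_size;
  }

  for (float v : first) std::cout << v << ' ';  // 1 2 5 6
  std::cout << '\n';
  for (float v : last) std::cout << v << ' ';   // 3 4 9 10
  std::cout << '\n';
  return 0;
}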
@@ -33,6 +33,7 @@ function print_usage() {
     ${BLUE}single_test${NONE}: run a single unit test
     ${BLUE}bind_test${NONE}: parallel tests bind to different GPU
     ${BLUE}doc${NONE}: generate paddle documents
+    ${BLUE}gen_doc_lib${NONE}: generate paddle documents library
     ${BLUE}html${NONE}: convert C++ source code into HTML
     ${BLUE}dockerfile${NONE}: generate paddle release dockerfile
     ${BLUE}capi${NONE}: generate paddle CAPI package
@@ -431,24 +432,60 @@ EOF
     linkchecker doc/v2/cn/html/index.html
     linkchecker doc/v2/api/en/html/index.html
 
-    if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
-
-    # Deploy to the content server if it's a "develop" or "release/version" branch
-    # The "develop_doc" branch is reserved to test the full deploy process without impacting the real content.
-    if [ "$TRAVIS_BRANCH" == "develop_doc" ]; then
-        PPO_SCRIPT_BRANCH=develop
-    elif [[ "$TRAVIS_BRANCH" == "develop" || "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then
-        PPO_SCRIPT_BRANCH=master
-    else
-        # Early exit, this branch doesn't require documentation build
-        return 0;
-    fi
-    # Fetch the paddlepaddle.org deploy_docs.sh from the appropriate branch
-    export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/$PPO_SCRIPT_BRANCH/scripts/deploy/deploy_docs.sh
-    export PYTHONPATH=$PYTHONPATH:${PADDLE_ROOT}/build/python:/paddle/build/python
-    cd ..
-    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH ${PADDLE_ROOT} ${PADDLE_ROOT}/build/doc/ ${PPO_SCRIPT_BRANCH}
-    cd -
+    # if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
+    #
+    # # Deploy to the content server if it's a "develop" or "release/version" branch
+    # # The "develop_doc" branch is reserved to test the full deploy process without impacting the real content.
+    # if [ "$TRAVIS_BRANCH" == "develop_doc" ]; then
+    #    PPO_SCRIPT_BRANCH=develop
+    # elif [[ "$TRAVIS_BRANCH" == "develop" || "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then
+    #    PPO_SCRIPT_BRANCH=master
+    # else
+    #    # Early exit, this branch doesn't require documentation build
+    #    return 0;
+    # fi
+    # # Fetch the paddlepaddle.org deploy_docs.sh from the appropriate branch
+    # export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/$PPO_SCRIPT_BRANCH/scripts/deploy/deploy_docs.sh
+    # export PYTHONPATH=$PYTHONPATH:${PADDLE_ROOT}/build/python:/paddle/build/python
+    # cd ..
+    # curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH ${PADDLE_ROOT} ${PADDLE_ROOT}/build/doc/ ${PPO_SCRIPT_BRANCH}
+    # cd -
 }
 
+function gen_doc_lib() {
+    mkdir -p ${PADDLE_ROOT}/build
+    cd ${PADDLE_ROOT}/build
+    cat <<EOF
+    ========================================
+    Building documentation library ...
+    In /paddle/build
+    ========================================
+EOF
+    cmake .. \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DWITH_DOC=ON \
+        -DWITH_GPU=OFF \
+        -DWITH_MKL=OFF \
+        -DWITH_FLUID_ONLY=ON
+
+    local LIB_TYPE=$1
+    case $LIB_TYPE in
+      full)
+        # Build the full Paddle Python module. Will time out without caching 'copy_paddle_pybind' first.
+        make -j `nproc` gen_proto_py framework_py_proto copy_paddle_pybind paddle_python
+        ;;
+      pybind)
+        # Build only the paddle pybind library. Takes 49 minutes; might time out.
+        make -j `nproc` copy_paddle_pybind
+        ;;
+      proto)
+        # Even smaller library.
+        make -j `nproc` framework_py_proto
+        ;;
+      *)
+        exit 0
+        ;;
+    esac
+}
+
 function gen_html() {
@@ -608,6 +645,9 @@ function main() {
       doc)
         gen_docs
         ;;
+      gen_doc_lib)
+        gen_doc_lib $2
+        ;;
       html)
         gen_html
         ;;
...
@@ -92,7 +92,7 @@ class TrainTaskConfig(object):
     src_vocab_fpath = data_path + "vocab.bpe.32000"
     trg_vocab_fpath = data_path + "vocab.bpe.32000"
     train_file_pattern = data_path + "train.tok.clean.bpe.32000.en-de"
-    val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de"
+    val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de.cut"
     pool_size = 2000
     sort_type = None
     local = True
@@ -624,6 +624,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
                 init = True
 
             # Validate and save the model for inference.
+            if batch_id == 0 or batch_id == 4:
                 if TrainTaskConfig.val_file_pattern is not None:
                     val_avg_cost, val_ppl = test()
                     print("[%f]" % val_avg_cost)
@@ -1701,8 +1702,9 @@ class DistTransformer2x2(TestDistRunnerBase):
         exe.run(startup_prog)
         exe.run(pserver_prog)
 
-    def run_trainer(self, place, args):
+    def run_trainer(self, use_cuda, args):
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        TrainTaskConfig.use_gpu = use_cuda
         sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model(
             args.is_dist, not args.sync_mode)
...
@@ -61,9 +61,10 @@ class TestDistRunnerBase(object):
         exe.run(startup_prog)
         exe.run(pserver_prog)
 
-    def run_trainer(self, place, args):
+    def run_trainer(self, use_cuda, args):
         import paddle
         import paddle.fluid as fluid
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
         test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
             self.get_model(batch_size=2)
         if args.mem_opt:
@@ -91,7 +92,7 @@ class TestDistRunnerBase(object):
         build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
 
         exe = fluid.ParallelExecutor(
-            True,
+            use_cuda,
             loss_name=avg_cost.name,
             exec_strategy=strategy,
             build_strategy=build_stra)
@@ -142,9 +143,8 @@ def runtime_main(test_class):
     if args.role == "pserver" and args.is_dist:
         model.run_pserver(args)
     else:
-        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
-        model.run_trainer(p, args)
+        use_cuda = True if core.is_compiled_with_cuda() else False
+        model.run_trainer(use_cuda, args)
 
 import paddle.compat as cpt
@@ -225,11 +225,12 @@ class TestDistBase(unittest.TestCase):
     def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
         # TODO(typhoonzero): should auto adapt GPU count on the machine.
         required_envs = {
-            "PATH": os.getenv("PATH"),
-            "PYTHONPATH": os.getenv("PYTHONPATH"),
-            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
             "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
-            "FLAGS_cudnn_deterministic": "1"
+            "FLAGS_cudnn_deterministic": "1",
+            "CPU_NUM": "1"
         }
 
         if check_error_log:
...
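A note on the env-dict hunk: `os.getenv(name)` returns `None` when the variable is unset, and the environment passed to the trainer subprocess must contain only strings, hence the `""` defaults; `CPU_NUM` is presumably pinned to `"1"` so the CPU execution path sees a single device. The same look-up-with-fallback idiom in C++, as a hypothetical helper:

#include <cstdlib>
#include <iostream>
#include <string>

// Hypothetical GetEnvOr, mirroring Python's os.getenv(name, fallback):
// return the variable's value if it is set, otherwise the fallback.
std::string GetEnvOr(const char* name, const std::string& fallback) {
  const char* value = std::getenv(name);
  return value != nullptr ? std::string(value) : fallback;
}

int main() {
  std::cout << "PATH=" << GetEnvOr("PATH", "") << '\n';
  return 0;
}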
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 
+import os
 import unittest
 import paddle
 from test_dist_base import TestDistBase
@@ -44,6 +45,14 @@ def download_files():
         test_url = url_prefix + 'newstest2013.tok.bpe.32000.en-de'
         test_md5 = '9dd74a266dbdb25314183899f269b4a2'
         paddle.dataset.common.download(test_url, 'test_dist_transformer', test_md5)
+        # cut test data for faster CI
+        orig_path = os.path.join(paddle.dataset.common.DATA_HOME,
+                                 "test_dist_transformer",
+                                 "newstest2013.tok.bpe.32000.en-de")
+        head_path = os.path.join(paddle.dataset.common.DATA_HOME,
+                                 "test_dist_transformer",
+                                 "newstest2013.tok.bpe.32000.en-de.cut")
+        os.system("head -n10 %s > %s" % (orig_path, head_path))
 
 class TestDistTransformer2x2Sync(TestDistBase):
...