Merge remote-tracking branch 'yx/fix_bce_cdn_link' into feature/refine_parallel_executor

ceb150e9 · yuyang18 · 8a42c474 · 57734901 · ceb150e9 · ceb150e9
29 changed files
--- a/benchmark/fluid/mnist.py
+++ b/benchmark/fluid/mnist.py
@@ -159,6 +159,7 @@ def run_benchmark(model, args):
        paddle.dataset.mnist.train(), batch_size=args.batch_size)
    accuracy = fluid.metrics.Accuracy()
+    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
    iters, num_samples, start_time = 0, 0, time.time()
    for pass_id in range(args.pass_num):
        accuracy.reset()
@@ -175,17 +176,20 @@ def run_benchmark(model, args):
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([len(y_data), 1])
-            outs = exe.run(
+            outs = train_exe.run(
-                fluid.default_main_program(),
                feed={"pixel": img_data,
                      "label": y_data},
-                fetch_list=[avg_cost, batch_acc, batch_size_tensor]
+                fetch_list=[
+                    avg_cost.name, batch_acc.name, batch_size_tensor.name
+                ]
            )  # The accuracy is the accumulation of batches, but not the current batch.
-            accuracy.update(value=outs[1], weight=outs[2])
+            accuracy.update(
+                value=np.array(np.mean(outs[1])),
+                weight=np.mean(np.array(outs[2])))
            iters += 1
            num_samples += len(y_data)
-            loss = np.array(outs[0])
+            loss = np.mean(np.array(outs[0]))
-            acc = np.array(outs[1])
+            acc = np.mean(np.array(outs[1]))
            train_losses.append(loss)
            train_accs.append(acc)
            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %

--- a/benchmark/fluid/resnet.py
+++ b/benchmark/fluid/resnet.py
@@ -241,6 +241,7 @@ def run_benchmark(model, args):
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    accuracy = fluid.average.WeightedAverage()
+    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
    if args.use_fake_data:
        data = train_reader().next()
        image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype(
@@ -264,14 +265,17 @@ def run_benchmark(model, args):
                                     data)).astype('float32')
                label = np.array(map(lambda x: x[1], data)).astype('int64')
                label = label.reshape([-1, 1])
-            loss, acc, weight = exe.run(
+            loss, acc, weight = train_exe.run(
-                fluid.default_main_program(),
                feed={'data': image,
                      'label': label},
-                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
+                fetch_list=[
+                    avg_cost.name, batch_acc.name, batch_size_tensor.name
+                ])
            iters += 1
            num_samples += len(label)
-            accuracy.add(value=acc, weight=weight)
+            accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight))
+            loss = np.mean(np.array(loss))
+            acc = np.mean(np.array(acc))
            train_losses.append(loss)
            train_accs.append(acc)
            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %

--- a/benchmark/fluid/vgg.py
+++ b/benchmark/fluid/vgg.py
@@ -169,6 +169,7 @@ def main():
    iters, num_samples, start_time = 0, 0, time.time()
    accuracy = fluid.average.WeightedAverage()
+    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
    for pass_id in range(args.pass_num):
        accuracy.reset()
        train_accs = []
@@ -184,14 +185,17 @@ def main():
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([-1, 1])
-            loss, acc, weight = exe.run(
+            loss, acc, weight = train_exe.run(
-                fluid.default_main_program(),
                feed={"pixel": img_data,
                      "label": y_data},
-                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
+                fetch_list=[
-            accuracy.add(value=acc, weight=weight)
+                    avg_cost.name, batch_acc.name, batch_size_tensor.name
+                ])
+            accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight))
            iters += 1
            num_samples += len(y_data)
+            loss = np.mean(np.array(loss))
+            acc = np.mean(np.array(acc))
            print(
                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
                (pass_id, iters, loss, acc)

--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -24,7 +24,7 @@ set(BOOST_PROJECT       "extern_boost")
 # So we use 1.41.0 here.
 set(BOOST_VER           "1.41.0")
 set(BOOST_TAR           "boost_1_41_0")
-set(BOOST_URL           "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz")
+set(BOOST_URL           "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz")
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)

--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -21,11 +21,12 @@ else()
    ExternalProject_Add(
        extern_eigen3
        ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY  "https://github.com/RLovelett/eigen.git"
+        GIT_REPOSITORY  "https://github.com/eigenteam/eigen-git-mirror"
        # eigen on cuda9.1 missing header of math_functions.hpp
        # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
        GIT_TAG         917060c364181f33a735dc023818d5a54f60e54c
        PREFIX          ${EIGEN_SOURCE_DIR}
+        DOWNLOAD_NAME   "eigen"
        UPDATE_COMMAND  ""
        CONFIGURE_COMMAND ""
        BUILD_COMMAND     ""

--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)
 SET(MKLML_PROJECT       "extern_mklml")
 SET(MKLML_VER           "mklml_lnx_2018.0.3.20180406")
-SET(MKLML_URL           "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz")
+SET(MKLML_URL           "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -148,4 +148,10 @@ copy(string_lib
  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
 )
+set(module "pybind")
+copy(pybind_lib
+  SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h
+  DSTS ${dst_dir}/${module}
+)
 add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) 
--- a/doc/fluid/design/concepts/functions_operators_layers.md
+++ b/doc/fluid/design/concepts/functions_operators_layers.md
@@ -40,7 +40,7 @@ template <typename T>
 class FCOp : public OperatorBase {
 public:
  void Run(...) {
-    add(mul(Input<T>("X"), Input<T>("W")), Input<T>("b");
+    add(mul(Input<T>("X"), Input<T>("W")), Input<T>("b"));
  }
 };
 REGISTER_OP(FCOp, "fc");

--- a/paddle/fluid/framework/data_type.cc
+++ b/paddle/fluid/framework/data_type.cc
@@ -58,6 +58,7 @@ static DataTypeMap* InitDataTypeMap() {
  RegType(bool, proto::VarType::BOOL);
  RegType(size_t, proto::VarType::SIZE_T);
  RegType(int16_t, proto::VarType::INT16);
+  RegType(uint8_t, proto::VarType::UINT8);
 #undef RegType
  return retv;

--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -47,8 +47,14 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
    case proto::VarType::BOOL:
      visitor.template operator()<bool>();
      break;
+    case proto::VarType::UINT8:
+      visitor.template operator()<uint8_t>();
+      break;
+    case proto::VarType::INT16:
+      visitor.template operator()<int16_t>();
+      break;
    default:
-      PADDLE_THROW("Not supported");
+      PADDLE_THROW("Not supported %d", type);
  }
 }

--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -48,17 +48,18 @@ void FetchOpHandle::RunImpl() {
  WaitInputVarGenerated(platform::CPUPlace());
  tensors_.resize(inputs_.size());
-  auto *var_handle = static_cast<VarHandle *>(inputs_[0]);
-  auto &var_name = var_handle->name_;
  platform::CPUPlace cpu;
  auto &scopes = *local_scopes_;
-  for (size_t i = 0; i < scopes.size(); ++i) {
+  for (size_t i = 0; i < inputs_.size(); ++i) {
-    auto &scope = scopes[i];
+    auto *var_handle = static_cast<VarHandle *>(inputs_[i]);
-    auto *var =
+    auto &scope = scopes.at(var_handle->scope_idx_);
-        scope->FindVar(kLocalExecScopeName)->Get<Scope *>()->FindVar(var_name);
+    auto *var = scope->FindVar(kLocalExecScopeName)
+                    ->Get<Scope *>()
+                    ->FindVar(var_handle->name_);
    PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
-                            var_name);
+                            var_handle->name_);
    auto &t = var->Get<framework::LoDTensor>();
    if (platform::is_gpu_place(t.place())) {
 #ifdef PADDLE_WITH_CUDA

--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -70,6 +70,14 @@ class OpHandleBase {
  const std::vector<VarHandleBase *> &Inputs() const { return inputs_; }
+  size_t NoDupInputSize() const {
+    std::unordered_set<VarHandleBase *> res;
+    for (auto *var : inputs_) {
+      res.emplace(var);
+    }
+    return res.size();
+  }
  const std::vector<VarHandleBase *> &Outputs() const { return outputs_; }
 protected:

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -174,7 +174,7 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
 void ThreadedSSAGraphExecutor::InsertPendingOp(
    std::unordered_map<OpHandleBase *, size_t> *pending_ops,
    OpHandleBase *op_instance) const {
-  pending_ops->insert({op_instance, op_instance->Inputs().size()});
+  pending_ops->insert({op_instance, op_instance->NoDupInputSize()});
 }
 void ThreadedSSAGraphExecutor::InsertPendingVar(

--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -103,6 +103,7 @@ message VarType {
    FP64 = 6;
    // Tensor<size_t> is used in C++.
    SIZE_T = 19;
+    UINT8 = 20;
    // Other types that may need additional descriptions
    LOD_TENSOR = 7;

--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -228,11 +228,12 @@ TEST(LoD, CheckAbsLoD) {
  ASSERT_FALSE(CheckAbsLoD(abs_lod0));
 }
-TEST(LoDTensor, RecordIO) {
+template <typename T>
+static void TestRecordIO() {
  LoDTensor tensor;
-  int* tmp = tensor.mutable_data<int>(make_ddim({4, 5}), platform::CPUPlace());
+  T* tmp = tensor.mutable_data<T>(make_ddim({4, 5}), platform::CPUPlace());
  for (int i = 0; i < 20; ++i) {
-    tmp[i] = i;
+    tmp[i] = static_cast<T>(i);
  }
  std::stringstream* stream = new std::stringstream();
@@ -247,7 +248,7 @@ TEST(LoDTensor, RecordIO) {
  auto assert_tensor_ok = [](const LoDTensor& tensor) {
    for (int i = 0; i < 20; ++i) {
-      ASSERT_EQ(tensor.data<int>()[i], i);
+      ASSERT_EQ(tensor.data<T>()[i], static_cast<T>(i));
    }
  };
@@ -265,5 +266,13 @@ TEST(LoDTensor, RecordIO) {
  }
 }
+TEST(LoDTensor, RecordIO) {
+  TestRecordIO<int>();
+  TestRecordIO<int16_t>();
+  TestRecordIO<uint8_t>();
+  TestRecordIO<float>();
+  TestRecordIO<double>();
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -49,7 +49,7 @@ class OpConverter {
  // convert fluid block to tensorrt network
  void ConvertBlock(const framework::proto::BlockDesc& block,
                    TensorRTEngine* engine) {
-    for (size_t i = 0; i < block.ops_size(); i++) {
+    for (int i = 0; i < block.ops_size(); i++) {
      const auto& op = block.ops(i);
      OpConverter::Run(op, engine);
    }

--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -38,7 +38,9 @@ template struct SetConstant<platform::CPUDeviceContext, bool>;
  template struct Transpose<platform::CPUDeviceContext, double, RANK>;     \
  template struct Transpose<platform::CPUDeviceContext, int, RANK>;        \
  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>;    \
-  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;
+  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;       \
+  template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>;    \
+  template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>;
 DEFINE_CPU_TRANS(1);
 DEFINE_CPU_TRANS(2);

--- a/paddle/fluid/operators/smooth_l1_loss_op.cc
+++ b/paddle/fluid/operators/smooth_l1_loss_op.cc
@@ -105,7 +105,7 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
-    auto in_dims = ctx->GetInputDim("X");
+    auto in_dims = ctx->GetInputDim("Diff");
    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
    PADDLE_ENFORCE_GE(out_dims.size(), 2,
@@ -127,12 +127,33 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
  }
 };
+class SmoothL1LossGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType("smooth_l1_loss_grad");
+    op->SetInput("InsideWeight", Input("InsideWeight"));
+    op->SetInput("OutsideWeight", Input("OutsideWeight"));
+    op->SetInput("Diff", Output("Diff"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetAttrMap(Attrs());
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::SmoothL1LossGradMaker);
 REGISTER_OPERATOR(smooth_l1_loss_grad, ops::SmoothL1LossGradOp);
 REGISTER_OP_CPU_KERNEL(
    smooth_l1_loss,

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -20,19 +20,15 @@
 #=================================================
 function print_usage() {
-    RED='\033[0;31m'
-    BLUE='\033[0;34m'
-    BOLD='\033[1m'
-    NONE='\033[0m'
    echo -e "\n${RED}Usage${NONE}:
-    ${BOLD}$0${NONE} [OPTION]"
+    ${BOLD}${SCRIPT_NAME}${NONE} [OPTION]"
    echo -e "\n${RED}Options${NONE}:
    ${BLUE}build${NONE}: run build for x86 platform
    ${BLUE}build_android${NONE}: run build for android platform
    ${BLUE}build_ios${NONE}: run build for ios platform
    ${BLUE}test${NONE}: run all unit tests
+    ${BLUE}single_test${NONE}: run a single unit test
    ${BLUE}bind_test${NONE}: parallel tests bind to different GPU
    ${BLUE}doc${NONE}: generate paddle documents
    ${BLUE}html${NONE}: convert C++ source code into HTML
@@ -45,7 +41,15 @@ function print_usage() {
 }
 function init() {
+    RED='\033[0;31m'
+    BLUE='\033[0;34m'
+    BOLD='\033[1m'
+    NONE='\033[0m'
    PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )"
+    if [ -z "${SCRIPT_NAME}" ]; then
+        SCRIPT_NAME=$0
+    fi
 }
 function cmake_gen() {
@@ -91,7 +95,6 @@ function cmake_gen() {
        -DWITH_AVX=${WITH_AVX:-OFF}
        -DWITH_GOLANG=${WITH_GOLANG:-OFF}
        -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
-        -DWITH_SWIG_PY=ON
        -DWITH_C_API=${WITH_C_API:-OFF}
        -DWITH_PYTHON=${WITH_PYTHON:-ON}
        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
@@ -309,6 +312,25 @@ EOF
    fi
 }
+function single_test() {
+    TEST_NAME=$1
+    if [ -z "${TEST_NAME}" ]; then
+        echo -e "${RED}Usage:${NONE}"
+        echo -e "${BOLD}${SCRIPT_NAME}${NONE} ${BLUE}single_test${NONE} [test_name]"
+        exit 1
+    fi
+    mkdir -p ${PADDLE_ROOT}/build
+    cd ${PADDLE_ROOT}/build
+    if [ ${WITH_TESTING:-ON} == "ON" ] ; then
+    cat <<EOF
+    ========================================
+    Running ${TEST_NAME} ...
+    ========================================
+EOF
+        ctest --output-on-failure -R ${TEST_NAME}
+    fi
+}
 function bind_test() {
    # the number of process to run tests
    NUM_PROC=6
@@ -480,6 +502,7 @@ function main() {
      build)
        cmake_gen ${PYTHON_ABI:-""}
        build
+        gen_dockerfile
        ;;
      build_android)
        build_android
@@ -490,6 +513,9 @@ function main() {
      test)
        run_test
        ;;
+      single_test)
+        single_test $2
+        ;;
      bind_test)
        bind_test
        ;;

--- a/paddle/scripts/paddle_docker_build.sh
+++ b/paddle/scripts/paddle_docker_build.sh
@@ -63,6 +63,7 @@ EOL
    ${DOCKER_CMD} run -it \
        --name $CONTAINER_ID \
        ${DOCKER_ENV} \
+        -e SCRIPT_NAME=$0 \
        -v $PADDLE_ROOT:/paddle \
        -v ${HOME}/.ccache:/root/.ccache \
        -w /paddle \

--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -54,9 +54,9 @@ class DataToLoDTensorConverter(object):
            self.data.append(data)
        else:
            cur_lod_len = len(data)
-            lod[-1].append(lod[-1][-1] + cur_lod_len)
+            lod[0].append(lod[0][-1] + cur_lod_len)
            for each_data in data:
-                self._feed_impl_(each_data, lod[:-1], lod_level - 1)
+                self._feed_impl_(each_data, lod[1:], lod_level - 1)
    def done(self):
        arr = numpy.array(self.data, dtype=self.dtype).reshape(self.shape)

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1329,6 +1329,8 @@ def sequence_pool(input, pool_type):
         sqrt   : out.data = [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2),
                    6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2)
         max    : out.data = [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
+         last   : out.data = [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
+         first  : out.data = [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
    Args:
        input(variable): The input variable which is a LoDTensor.
@@ -1348,6 +1350,8 @@ def sequence_pool(input, pool_type):
             sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum')
             sqrt_x = fluid.layers.sequence_pool(input=x, pool_type='sqrt')
             max_x = fluid.layers.sequence_pool(input=x, pool_type='max')
+             last_x = fluid.layers.sequence_pool(input=x, pool_type='last')
+             first_x = fluid.layers.sequence_pool(input=x, pool_type='first')
    """
    helper = LayerHelper('sequence_pool', **locals())
    dtype = helper.input_dtype()
@@ -3263,35 +3267,35 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
    """
    **Smooth L1 Loss Operator.**
-    This operator computes the smooth l1 loss for X and Y.
+    This operator computes the smooth L1 loss for X and Y.
    The operator takes the first dimension of X and Y as batch size.
-    For each instance, it computes the smooth l1 loss element by element first
+    For each instance, it computes the smooth L1 loss element by element first
    and then sums all the losses. So the shape of Out is [batch_size, 1].
    Args:
        x (Variable): A tensor with rank at least 2. The input value of smooth
-            l1 loss op with shape [batch_size, dim1, ..., dimN].
+            L1 loss op with shape [batch_size, dim1, ..., dimN].
        y (Variable): A tensor with rank at least 2. The target value of smooth
-            l1 loss op with same shape as x.
+            L1 loss op with same shape as x.
        inside_weight (Variable|None):  A tensor with rank at least 2. This
            input is optional and should have same shape with x. If provided,
            the result of (x - y) will be multiplied by this tensor element by
            element.
        outside_weight (Variable|None): A tensor with rank at least 2. This
            input is optional and should have same shape with x. If provided,
-            the out smooth l1 loss will be multiplied by this tensor element
+            the out smooth L1 loss will be multiplied by this tensor element
            by element.
-        sigma (float|None): Hyper parameter of smooth l1 loss op. A float scalar
+        sigma (float|None): Hyper parameter of smooth L1 loss op. A float scalar
            with default value 1.0.
    Returns:
-        Variable: A tensor with rank be 2. The output smooth l1 loss with
+        Variable: A tensor with rank be 2. The output smooth L1 loss with
            shape [batch_size, 1].
    Examples:
        .. code-block:: python
            data = fluid.layers.data(name='data', shape=[128], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[100], dtype='int64')
+            label = fluid.layers.data(name='label', shape=[100], dtype='float32')
            fc = fluid.layers.fc(input=data, size=100)
            out = fluid.layers.smooth_l1(x=fc, y=label)
    """

--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
@@ -62,7 +62,10 @@ def train(use_cuda, train_program, save_dirname):
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    trainer = fluid.Trainer(
-        train_func=train_program, place=place, optimizer=optimizer)
+        train_func=train_program,
+        place=place,
+        optimizer=optimizer,
+        parallel=True)
    def event_handler(event):
        if isinstance(event, fluid.EndEpochEvent):
@@ -87,6 +90,9 @@ def train(use_cuda, train_program, save_dirname):
                    event.epoch + 1, float(avg_cost), float(acc)))
                if math.isnan(float(avg_cost)):
                    sys.exit("got NaN loss, training failed.")
+        elif isinstance(event, fluid.EndStepEvent):
+            print("Step {0}, Epoch {1} Metrics {2}".format(
+                event.step, event.epoch, map(numpy.array, event.metrics)))
    train_reader = paddle.batch(
        paddle.reader.shuffle(
@@ -131,4 +137,4 @@ def main(use_cuda):
 if __name__ == '__main__':
    # for use_cuda in (False, True):
-    main(use_cuda=False)
+    main(use_cuda=True)
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -182,12 +182,6 @@ def train(use_cuda, save_dirname=None, is_local=True):
    crf_decode = fluid.layers.crf_decoding(
        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
-    chunk_evaluator = fluid.evaluator.ChunkEvaluator(
-        input=crf_decode,
-        label=target,
-        chunk_scheme="IOB",
-        num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))
    train_data = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.conll05.test(), buf_size=8192),
@@ -203,7 +197,6 @@ def train(use_cuda, save_dirname=None, is_local=True):
    def train_loop(main_program):
        exe.run(fluid.default_startup_program())
        embedding_param = fluid.global_scope().find_var(
            embedding_name).get_tensor()
        embedding_param.set(
@@ -213,27 +206,19 @@ def train(use_cuda, save_dirname=None, is_local=True):
        start_time = time.time()
        batch_id = 0
        for pass_id in xrange(PASS_NUM):
-            chunk_evaluator.reset(exe)
            for data in train_data():
-                cost, precision, recall, f1_score = exe.run(
+                cost = exe.run(main_program,
-                    main_program,
                               feed=feeder.feed(data),
-                    fetch_list=[avg_cost] + chunk_evaluator.metrics)
+                               fetch_list=[avg_cost])
-                pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(
+                cost = cost[0]
-                    exe)
                if batch_id % 10 == 0:
-                    print("avg_cost:" + str(cost) + " precision:" + str(
+                    print("avg_cost:" + str(cost))
-                        precision) + " recall:" + str(recall) + " f1_score:" +
-                          str(f1_score) + " pass_precision:" + str(
-                              pass_precision) + " pass_recall:" + str(
-                                  pass_recall) + " pass_f1_score:" + str(
-                                      pass_f1_score))
                    if batch_id != 0:
                        print("second per batch: " + str((time.time(
                        ) - start_time) / batch_id))
                    # Set the threshold low to speed up the CI test
-                    if float(pass_precision) > 0.01:
+                    if float(cost) < 60.0:
                        if save_dirname is not None:
                            # TODO(liuyiqun): Change the target to crf_decode
                            fluid.io.save_inference_model(save_dirname, [

--- a/python/paddle/fluid/tests/test_data_feeder.py
+++ b/python/paddle/fluid/tests/test_data_feeder.py
@@ -13,15 +13,62 @@
 # limitations under the License.
 import paddle.fluid as fluid
+import unittest
-def test_converter():
+class TestDataFeeder(unittest.TestCase):
+    def test_lod_level_0_converter(self):
        img = fluid.layers.data(name='image', shape=[1, 28, 28])
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
        feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
-    result = feeder.feed([[[0] * 784, [9]], [[1] * 784, [1]]])
+        result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])])
        print(result)
+        self.assertEqual(result['image'].shape(), [2, 1, 28, 28])
+        self.assertEqual(result['label'].shape(), [2, 1])
+        self.assertEqual(result['image'].lod(), [])
+        self.assertEqual(result['label'].lod(), [])
+    def test_lod_level_1_converter(self):
+        # lod_level = 1
+        # each sentence has a different number of words
+        sentences = fluid.layers.data(
+            name='sentences', shape=[1], dtype='int64', lod_level=1)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        feeder = fluid.DataFeeder([sentences, label], fluid.CPUPlace())
+        # lod = [[0, 3, 5, 9]]
+        # data = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
+        # label = [1] * len(data)
+        result = feeder.feed(
+            [([1, 2, 3], [1]), ([4, 5], [1]), ([6, 7, 8, 9], [1])])
+        print(result)
+        self.assertEqual(result['sentences'].shape(), [9, 1])
+        self.assertEqual(result['label'].shape(), [3, 1])
+        self.assertEqual(result['sentences'].lod(), [[0, 3, 5, 9]])
+        self.assertEqual(result['label'].lod(), [])
+    def test_lod_level_2_converter(self):
+        # lod_level = 2
+        # paragraphs -> sentences -> words
+        paragraphs = fluid.layers.data(
+            name='paragraphs', shape=[1], dtype='int64', lod_level=2)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        feeder = fluid.DataFeeder([paragraphs, label], fluid.CPUPlace())
+        # lod = [[0, 2, 3], [0, 3, 5, 9]]
+        # data = [[[1, 2, 3], [4, 5]], [[6, 7, 8, 9]]]
+        # label = [1] * len(data)
+        result = feeder.feed(
+            [([[1, 2, 3], [4, 5]], [1]), ([[6, 7, 8, 9]], [1])])
+        print(result)
+        self.assertEqual(result['paragraphs'].shape(), [9, 1])
+        self.assertEqual(result['label'].shape(), [2, 1])
+        self.assertEqual(result['paragraphs'].lod(), [[0, 2, 3], [0, 3, 5, 9]])
+        self.assertEqual(result['label'].lod(), [])
 if __name__ == '__main__':
-    test_converter()
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -28,11 +28,11 @@ function(py_test_modules TARGET_NAME)
  if(WITH_TESTING)
    set(options "")
    set(oneValueArgs "")
-    set(multiValueArgs MODULES DEPS ARGS ENVS)
+    set(multiValueArgs MODULES DEPS ENVS)
    cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_test(NAME ${TARGET_NAME}
             COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS}
-             ${PYTHON_EXECUTABLE} -u -m unittest --verbose ${py_test_modules_MODULES} ${py_test_modules_ARGS}
+             ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
  endif()
 endfunction()

--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -778,7 +778,7 @@ class TestCRFModel(unittest.TestCase):
        build_strategy = fluid.BuildStrategy()
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
        self.check_network_convergence(
-            is_sparse=False, build_strategy=build_strategy)
+            is_sparse=True, build_strategy=build_strategy)
    def test_update_dense_parameter_reduce(self):
        build_strategy = fluid.BuildStrategy()
@@ -852,8 +852,7 @@ class TestFetchOp(unittest.TestCase):
                    assert not math.isnan(np.sum(ret[i])) and \
                           not math.isinf(np.sum(ret[i]))
-    @unittest.skip("this test is buggy")
+    def test_fetch_op(self):
-    def test_feed(self):
        tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16)
        tst_reader_iter = tst_reader()

--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -20,6 +20,7 @@ import data_feeder
 import contextlib
 import io
 import unique_name
+import parallel_executor
 # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module
 import optimizer as opt_module
@@ -48,12 +49,14 @@ class BeginStepEvent(object):
    def __init__(self, epoch_id, step_id):
        self.epoch = epoch_id
        self.step = step_id
+        self.fetch_metrics = True
 class EndStepEvent(object):
-    def __init__(self, epoch_id, step_id):
+    def __init__(self, epoch_id, step_id, metrics):
        self.epoch = epoch_id
        self.step = step_id
+        self.metrics = metrics
 def check_and_get_place(place):
@@ -87,12 +90,17 @@ class Trainer(object):
    Args:
        train_func(callable): A function which will return loss. The loss must be a scalar.
-        infer_func(callable): A function which will return predict, used to save inference model
        optimizer(optimizer.Optimizer): The optimizer should be an instance of Optimizer
        place: The device place of this trainer.
    """
-    def __init__(self, train_func, optimizer, param_path=None, place=None):
+    def __init__(self,
+                 train_func,
+                 optimizer,
+                 param_path=None,
+                 place=None,
+                 parallel=False):
+        self.parallel = parallel
        # 1. we need to generate a framework.Program by calling
        # program_func. Reference: fluid.program_guard in
        # test_word2vec.py
@@ -106,14 +114,14 @@ class Trainer(object):
        with framework.program_guard(self.train_program, self.startup_program):
            program_func_outs = train_func()
-            self.test_outputs = program_func_outs if isinstance(
+            self.train_func_outputs = program_func_outs if isinstance(
                program_func_outs, list) else [program_func_outs]
            self.test_program = self.train_program.clone()
            if not isinstance(optimizer, opt_module.Optimizer):
                raise TypeError(
                    "The optimizer should be an instance of Optimizer")
            # The first element of program_func_outs is loss.
-            loss = self.test_outputs[0]
+            loss = self.train_func_outputs[0]
            optimize_ops, params_grads = optimizer.minimize(loss)
        self.place = check_and_get_place(place)
@@ -131,7 +139,40 @@ class Trainer(object):
            # load params from param_path into scope
            io.load_persistables(exe, dirname=param_path)
+    def _transpile_nccl2_dist(self):
+        """
+        Set up NCCL2 distributed training when PADDLE_TRAINER_IPS is set.
+        Without that environment variable only self.nccl_id_var is set (to
+        None). Otherwise the trainer id, port, trainer ips and pod ip are
+        read from the environment, an "NCCLID" variable is created in the
+        startup program and a "gen_nccl_id" op is appended to it.
+        self.num_trainers counts every endpoint, including the current one;
+        the op's "endpoint_list" excludes the current endpoint.
+        """
+        if "PADDLE_TRAINER_IPS" not in os.environ:
+            self.nccl_id_var = None
+            return
+        self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        port = os.getenv("PADDLE_PSERVER_PORT")
+        trainer_ips = os.getenv("PADDLE_TRAINER_IPS")
+        worker_endpoints = [
+            ':'.join([ip, port]) for ip in trainer_ips.split(",")
+        ]
+        self.num_trainers = len(worker_endpoints)
+        current_endpoint = os.getenv("POD_IP") + ":" + port
+        # Only the other trainers are handed to the op as "endpoint_list".
+        worker_endpoints.remove(current_endpoint)
+        # TODO(wuyi): use self.nccl_id_var, self.num_trainers and self.trainer_id
+        # in ParallelExecutor to start
+        # distributed training using NCCL2
+        self.nccl_id_var = self.startup_program.global_block().create_var(
+            name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
+        op_attrs = {
+            "endpoint": current_endpoint,
+            "endpoint_list": worker_endpoints,
+            "trainer_id": self.trainer_id
+        }
+        self.startup_program.global_block().append_op(
+            type="gen_nccl_id",
+            inputs={},
+            outputs={"NCCLID": self.nccl_id_var},
+            attrs=op_attrs)
    def _dist_transpile_if_necessary(self, optimize_ops, params_grads):
+        self._transpile_nccl2_dist()
+        if self.nccl_id_var != None:
+            return
        if "PADDLE_TRAINING_ROLE" not in os.environ:
            return
@@ -169,12 +210,7 @@ class Trainer(object):
                    'TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
                )
-    def train(self,
+    def train(self, num_epochs, event_handler, reader=None, feed_order=None):
-              num_epochs,
-              event_handler,
-              reader,
-              feed_order,
-              parallel=False):
        """
        Train the model.
@@ -182,25 +218,24 @@ class Trainer(object):
            num_epochs: The number of epoch. An epoch will process all data in reader
            event_handler: The event handler. A function with type (ev:Event)->void
            reader:
-            parallel: True if use multi-CPUs or multi-GPUs
            feed_order: Feeding order of reader. None means following the
                defining order in the program
        Returns:
        """
-        if parallel:
-            raise NotImplementedError(
-                "Parallel Executor version of trainer is not implemented")
        training_role = os.getenv("PADDLE_TRAINING_ROLE", "")
        if training_role == "PSERVER":
            with self._prog_and_scope_guard():
                exe = executor.Executor(self.place)
                exe.run()
                return
+        if self.parallel:
-        self._train_by_executor(num_epochs, event_handler, reader, feed_order)
+            self._train_by_parallel_executor(num_epochs, event_handler, reader,
+                                             feed_order)
+        else:
+            self._train_by_executor(num_epochs, event_handler, reader,
+                                    feed_order)
    def test(self, reader, feed_order):
        """
@@ -212,7 +247,8 @@ class Trainer(object):
                order in program
        """
-        return self._test_by_executor(reader, feed_order, self.test_outputs)
+        return self._test_by_executor(reader, feed_order,
+                                      self.train_func_outputs)
    def save_params(self, param_path):
        # reference: save_persistables in io.py
@@ -246,12 +282,24 @@ class Trainer(object):
            feeder = data_feeder.DataFeeder(
                feed_list=feed_var_list, place=self.place)
            exe = executor.Executor(self.place)
+            reader = feeder.decorate_reader(reader, multi_devices=False)
+            self._train_by_any_executor(event_handler, exe, num_epochs, reader)
+    def _train_by_any_executor(self, event_handler, exe, num_epochs, reader):
+        # Shared training loop: runs num_epochs passes over reader() with exe.
+        # For every epoch the handler receives BeginEpochEvent/EndEpochEvent,
+        # and for every step BeginStepEvent/EndStepEvent. The handler is
+        # called with the BeginStepEvent before the step runs; if its
+        # fetch_metrics flag is still truthy afterwards, the variables in
+        # self.train_func_outputs are fetched by name, otherwise nothing is
+        # fetched. The result of exe.run is passed on as EndStepEvent.metrics.
        for epoch_id in range(num_epochs):
            event_handler(BeginEpochEvent(epoch_id))
            for step_id, data in enumerate(reader()):
-                    event_handler(BeginStepEvent(epoch_id, step_id))
+                begin_event = BeginStepEvent(epoch_id, step_id)
-                    exe.run(feed=feeder.feed(data), fetch_list=[])
+                event_handler(begin_event)
-                    event_handler(EndStepEvent(epoch_id, step_id))
+                if begin_event.fetch_metrics:
+                    metrics = exe.run(feed=data,
+                                      fetch_list=[
+                                          var.name
+                                          for var in self.train_func_outputs
+                                      ])
+                else:
+                    metrics = exe.run(feed=data, fetch_list=[])
+                event_handler(EndStepEvent(epoch_id, step_id, metrics))
            event_handler(EndEpochEvent(epoch_id))
    def _test_by_executor(self, reader, feed_order, fetch_list):
@@ -271,6 +319,28 @@ class Trainer(object):
            return [x / count for x in accumulated]
+    def _train_by_parallel_executor(self, num_epochs, event_handler, reader,
+                                    feed_order):
+        """
+        Train the model for num_epochs epochs with a ParallelExecutor.
+        Args:
+            num_epochs: The number of epochs to train.
+            event_handler: Callback invoked with the epoch/step events.
+            reader: The data reader; it is decorated by a DataFeeder with
+                multi_devices=True before being iterated.
+            feed_order: Feeding order used to build the feed variable list.
+        """
+        with self._prog_and_scope_guard():
+            pe = self._get_or_create_parallel_executor()
+            feed_var_list = build_feed_var_list(self.train_program, feed_order)
+            feeder = data_feeder.DataFeeder(
+                feed_list=feed_var_list, place=self.place)
+            reader = feeder.decorate_reader(reader, multi_devices=True)
+            # _train_by_any_executor already loops over num_epochs. Wrapping
+            # it in another epoch loop would train num_epochs * num_epochs
+            # epochs and replay the epoch events with repeated epoch ids.
+            self._train_by_any_executor(event_handler, pe, num_epochs, reader)
+    def _get_parallel_executor(self):
+        """
+        Return self.parallel_executor, or None if it has not been set yet.
+        """
+        try:
+            return self.parallel_executor
+        except AttributeError:
+            return None
+    def _get_or_create_parallel_executor(self):
+        """
+        Return the trainer's ParallelExecutor, creating it on first use.
+        A new executor uses CUDA when self.place is a core.CUDAPlace and
+        takes the first train_func output as the loss.
+        """
+        existing = self._get_parallel_executor()
+        if existing is not None:
+            return existing
+        on_cuda = isinstance(self.place, core.CUDAPlace)
+        loss_var = self.train_func_outputs[0]
+        self.parallel_executor = parallel_executor.ParallelExecutor(
+            use_cuda=on_cuda, loss_name=loss_var.name)
+        return self._get_parallel_executor()
 def build_feed_var_list(program, feed_order):
    if not isinstance(program, framework.Program):

--- a/tools/test_runner.py
+++ b/tools/test_runner.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import os
+import sys
+import paddle.fluid as fluid
+import importlib
+import cStringIO
+def main():
+    """
+    Import and run every unittest module named on the command line.
+    The current working directory is appended to sys.path so the modules can
+    be imported by name. Each module is loaded and run inside a freshly
+    created main/startup Program pair, Scope and unique-name guard. The test
+    runner's output is captured and written to stderr only for modules that
+    fail. The process exits with status 1 if any module failed.
+    """
+    sys.path.append(os.getcwd())
+    some_test_failed = False
+    for module_name in sys.argv[1:]:
+        # Local names avoid shadowing this function ("main") and the
+        # Python 2 builtin "buffer".
+        output = cStringIO.StringIO()
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.program_guard(main_program, startup_program):
+            with fluid.scope_guard(scope):
+                with fluid.unique_name.guard():
+                    test_loader = unittest.TestLoader()
+                    module = importlib.import_module(module_name)
+                    tests = test_loader.loadTestsFromModule(module)
+                    res = unittest.TextTestRunner(stream=output).run(tests)
+                    if not res.wasSuccessful():
+                        some_test_failed = True
+                        print >> sys.stderr, module_name, 'failed\n', output.getvalue(
+                        )
+    if some_test_failed:
+        # sys.exit is the programmatic API; the bare exit() helper is
+        # injected by the site module and is not guaranteed to exist.
+        sys.exit(1)
+# Run the test modules given on the command line when executed as a script.
+if __name__ == '__main__':
+    main()