Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into detection_output

b41894d1 · sweetsky0901 · fe177b62 · 5ba231d8 · b41894d1 · b41894d1
18 changed file
--- a/paddle/capi/Main.cpp
+++ b/paddle/capi/Main.cpp
@@ -43,4 +43,11 @@ paddle_error paddle_init(int argc, char** argv) {
  isInit = true;
  return kPD_NO_ERROR;
 }
+
+paddle_error paddle_init_thread() {
+  if (FLAGS_use_gpu) {
+    hl_init(FLAGS_gpu_id);
+  }
+  return kPD_NO_ERROR;
+}
 }
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -40,7 +40,7 @@ paddle_error paddle_matrix_destroy(paddle_matrix mat) {
 paddle_error paddle_matrix_set_row(paddle_matrix mat,
                                   uint64_t rowID,
                                   paddle_real* rowArray) {
-  if (mat == nullptr) return kPD_NULLPTR;
+  if (mat == nullptr || rowArray == nullptr) return kPD_NULLPTR;
  auto ptr = cast(mat);
  if (ptr->mat == nullptr) return kPD_NULLPTR;
  if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;

--- a/paddle/capi/error.cpp
+++ b/paddle/capi/error.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "error.h"
+
+const char* paddle_error_string(paddle_error err) {
+  switch (err) {
+    case kPD_NULLPTR:
+      return "nullptr error";
+    case kPD_OUT_OF_RANGE:
+      return "out of range error";
+    case kPD_PROTOBUF_ERROR:
+      return "protobuf error";
+    case kPD_NOT_SUPPORTED:
+      return "not supported error";
+    case kPD_UNDEFINED_ERROR:
+      return "undefined error";
+    default:
+      return "";
+  }
+}
--- a/paddle/capi/error.h
+++ b/paddle/capi/error.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #ifndef __PADDLE_CAPI_ERROR_H__
 #define __PADDLE_CAPI_ERROR_H__

+#include "config.h"
+
 /**
 * Error Type for Paddle API.
 */
@@ -27,4 +29,9 @@ typedef enum {
  kPD_UNDEFINED_ERROR = -1,
 } paddle_error;

+/**
+ * Error string for Paddle API.
+ */
+PD_API const char* paddle_error_string(paddle_error err);
+
 #endif
--- a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
+++ b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
 project(multi_thread)
 cmake_minimum_required(VERSION 2.8)
-aux_source_directory(. SRC_LIST)
-add_executable(${PROJECT_NAME} ${SRC_LIST})
+
 find_package (Threads)
+
+if(NOT PADDLE_ROOT)
+  set(PADDLE_ROOT $ENV{PADDLE_ROOT} CACHE PATH "Paddle Path")
+endif()
+if(PADDLE_ROOT)
+  include_directories(${PADDLE_ROOT}/include)
+  link_directories(${PADDLE_ROOT}/lib)
+endif()
+
+set(CPU_SRCS main.c)
+add_executable(${PROJECT_NAME} ${CPU_SRCS})
 set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
-target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared
+target_link_libraries(${PROJECT_NAME}
+                      -lpaddle_capi_shared
                      ${CMAKE_THREAD_LIBS_INIT})
+
+find_package(CUDA QUIET)
+if(CUDA_FOUND)
+  set(GPU_SRCS main_gpu.c)
+  cuda_add_executable(${PROJECT_NAME}_gpu ${GPU_SRCS})
+  set_property(TARGET ${PROJECT_NAME}_gpu PROPERTY C_STANDARD 99)
+  target_link_libraries(${PROJECT_NAME}_gpu
+                        -lpaddle_capi_shared
+                        ${CMAKE_THREAD_LIBS_INIT})
+endif(CUDA_FOUND)
--- a/paddle/capi/examples/model_inference/multi_thread/main_gpu.c
+++ b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c
+#include <paddle/capi.h>
+#include <pthread.h>
+#include <time.h>
+#include "../common/common.h"
+
+#define CONFIG_BIN "./trainer_config.bin"
+#define NUM_THREAD 4
+#define NUM_ITER 1000
+
+pthread_mutex_t mutex;
+
+/*
+ * @brief It is an simple inference example that runs multi-threads on a GPU.
+ *        Each thread holds it own local gradient_machine but shares the same
+ *        parameters.
+ *        If you want to run on different GPUs, you need to launch
+ *        multi-processes or set trainer_count > 1.
+ */
+void* thread_main(void* gm_ptr) {
+  // Initialize the thread environment of Paddle.
+  CHECK(paddle_init_thread());
+
+  paddle_gradient_machine machine = (paddle_gradient_machine)(gm_ptr);
+  // Create input arguments.
+  paddle_arguments in_args = paddle_arguments_create_none();
+  // Create input matrix.
+  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
+                                           /* size */ 784,
+                                           /* useGPU */ true);
+  // Create output arguments.
+  paddle_arguments out_args = paddle_arguments_create_none();
+  // Create output matrix.
+  paddle_matrix prob = paddle_matrix_create_none();
+
+  // CPU buffer to cache the input and output.
+  paddle_real* cpu_input = (paddle_real*)malloc(784 * sizeof(paddle_real));
+  paddle_real* cpu_output = (paddle_real*)malloc(10 * sizeof(paddle_real));
+  for (int iter = 0; iter < NUM_ITER; ++iter) {
+    // There is only one input layer of this network.
+    CHECK(paddle_arguments_resize(in_args, 1));
+    CHECK(paddle_arguments_set_value(in_args, 0, mat));
+
+    for (int i = 0; i < 784; ++i) {
+      cpu_input[i] = rand() / ((float)RAND_MAX);
+    }
+    CHECK(paddle_matrix_set_value(mat, cpu_input));
+
+    CHECK(paddle_gradient_machine_forward(machine,
+                                          in_args,
+                                          out_args,
+                                          /* isTrain */ false));
+
+    CHECK(paddle_arguments_get_value(out_args, 0, prob));
+    CHECK(paddle_matrix_get_value(prob, cpu_output));
+
+    pthread_mutex_lock(&mutex);
+    printf("Prob: ");
+    for (int i = 0; i < 10; ++i) {
+      printf("%.2f ", cpu_output[i]);
+    }
+    printf("\n");
+    pthread_mutex_unlock(&mutex);
+  }
+
+  CHECK(paddle_matrix_destroy(prob));
+  CHECK(paddle_arguments_destroy(out_args));
+  CHECK(paddle_matrix_destroy(mat));
+  CHECK(paddle_arguments_destroy(in_args));
+  CHECK(paddle_gradient_machine_destroy(machine));
+
+  free(cpu_input);
+  free(cpu_output);
+
+  return NULL;
+}
+
+int main() {
+  // Initalize Paddle
+  char* argv[] = {"--use_gpu=True"};
+  CHECK(paddle_init(1, (char**)argv));
+
+  // Reading config binary file. It is generated by `convert_protobin.sh`
+  long size;
+  void* buf = read_config(CONFIG_BIN, &size);
+
+  // Create a gradient machine for inference.
+  paddle_gradient_machine machine;
+  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
+  CHECK(paddle_gradient_machine_randomize_param(machine));
+
+  // Loading parameter. Uncomment the following line and change the directory.
+  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
+  //                                                "./some_where_to_params"));
+  srand(time(0));
+  pthread_mutex_init(&mutex, NULL);
+
+  pthread_t threads[NUM_THREAD];
+
+  for (int i = 0; i < NUM_THREAD; ++i) {
+    paddle_gradient_machine thread_local_machine;
+    CHECK(paddle_gradient_machine_create_shared_param(
+        machine, buf, size, &thread_local_machine));
+    pthread_create(&threads[i], NULL, thread_main, thread_local_machine);
+  }
+
+  for (int i = 0; i < NUM_THREAD; ++i) {
+    pthread_join(threads[i], NULL);
+  }
+
+  pthread_mutex_destroy(&mutex);
+
+  return 0;
+}
--- a/paddle/capi/main.h
+++ b/paddle/capi/main.h
@@ -26,6 +26,13 @@ extern "C" {
 */
 PD_API paddle_error paddle_init(int argc, char** argv);

+/**
+ * Initialize the thread environment of Paddle.
+ * @note it is requisite for GPU runs but optional for CPU runs.
+ *       For GPU runs, all threads will run on the same GPU devices.
+ */
+PD_API paddle_error paddle_init_thread();
+
 #ifdef __cplusplus
 }
 #endif

--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -59,7 +59,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
    auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
    auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
    if (in_var->GetType() != VarDesc::LOD_TENSOR) {
-      VLOG(3) << "input " << in << "is not LodTensor";
+      VLOG(3) << "input " << in << " is not LodTensor";
      return;
    }
    PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR,

--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
@@ -41,14 +41,18 @@ class ConcatOp : public framework::OperatorWithKernel {
      for (size_t j = 0; j < in_zero_dims_size; j++) {
        if (j == axis) {
          out_dims[axis] += ins[i][j];
-          continue;
-        }
+        } else {
          PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
                            "Input tensors should have the same "
                            "elements except the specify axis.");
        }
      }
+    }
+    if (out_dims[axis] < 0) {
+      out_dims[axis] = -1;
+    }
    ctx->SetOutputDim("Out", out_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
  }
 };


--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -95,6 +95,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
                        "Input(Label) should be 1.");
    }
    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    ctx->ShareLoD("X", framework::GradVarName("X"));
  }

 protected:

--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -122,10 +122,6 @@ Place CUDADeviceContext::GetPlace() const { return place_; }

 void CUDADeviceContext::Wait() const {
  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
-}
-
-void CUDADeviceContext::Finish() const {
-  Wait();
  PADDLE_ENFORCE(cudaGetLastError());
 }


--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -46,8 +46,6 @@ class DeviceContext {
  DeviceType* GetEigenDevice() const;

  virtual void Wait() const {}
-
-  virtual void Finish() const {}
 };

 class CPUDeviceContext : public DeviceContext {
@@ -79,9 +77,6 @@ class CUDADeviceContext : public DeviceContext {
  /*! \brief  Wait for all operations completion in the stream. */
  void Wait() const override;

-  /*! \brief  Check potential errors for the cuda kernel calls. */
-  void Finish() const override;
-
  /*! \brief  Return place in the device context. */
  Place GetPlace() const override;


--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -113,7 +113,10 @@ EOF
            -DWITH_SWIG_PY=ON \
            -DWITH_STYLE_CHECK=OFF
        make -j `nproc` gen_proto_py
+        make -j `nproc` paddle_python
        make -j `nproc` paddle_docs paddle_docs_cn
+        make -j `nproc` print_operators_doc
+        paddle/pybind/print_operators_doc > doc/en/html/operators.json
        popd
    fi

@@ -185,14 +188,6 @@ EOF
    ${DOCKERFILE_GPU_ENV}
    ADD go/cmd/pserver/pserver /usr/bin/
    ADD go/cmd/master/master /usr/bin/
-EOF
-
-    if [[ ${WITH_DOC:-OFF} == 'ON' ]]; then
-        cat >> /paddle/build/Dockerfile <<EOF
-        ADD paddle/pybind/print_operators_doc /usr/bin/
-EOF
-    fi
-    cat >> /paddle/build/Dockerfile <<EOF
    # default command shows the paddle version and exit
    CMD ["paddle", "version"]
 EOF

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -33,6 +33,12 @@ if(WITH_MKLDNN)
  list(APPEND MKL_DEPENDS mkldnn)
 endif()

+if(WITH_GPU)
+  SET(PACKAGE_NAME "paddlepaddle-gpu")
+else()
+  SET(PACKAGE_NAME "paddlepaddle")
+endif()
+
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
    ${CMAKE_CURRENT_BINARY_DIR}/setup.py)


--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -2722,15 +2722,15 @@ def img_pool_layer(input,

    ..  math::

-        w = 1 + int(ceil(input\_width + 2 * padding - pool\_size) / float(stride))
-        h = 1 + int(ceil(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y))
+        w = 1 + \frac{ceil(input\_width + 2 * padding - pool\_size)}{stride} \\\\
+        h = 1 + \frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y}

    - ceil_mode=False:

    ..  math::

-        w = 1 + int(floor(input\_width + 2 * padding - pool\_size) / float(stride))
-        h = 1 + int(floor(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y))
+        w = 1 + \frac{floor(input\_width + 2 * padding - pool\_size)}{stride} \\\\
+        h = 1 + \frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y}

    The example usage is:

@@ -2863,17 +2863,17 @@ def img_pool3d_layer(input,

    ..  math::

-        w = 1 + int(ceil(input\_width + 2 * padding - pool\_size) / float(stride))
-        h = 1 + int(ceil(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y))
-        d = 1 + int(ceil(input\_depth + 2 * padding\_z - pool\_size\_z) / float(stride\_z))
+        w = 1 + \frac{ceil(input\_width + 2 * padding - pool\_size)}{stride} \\\\
+        h = 1 + \frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} \\\\
+        d = 1 + \frac{ceil(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z}

    - ceil_mode=False:

    ..  math::

-        w = 1 + int(floor(input\_width + 2 * padding - pool\_size) / float(stride))
-        h = 1 + int(floor(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y))
-        d = 1 + int(floor(input\_depth + 2 * padding\_z - pool\_size\_z) / float(stride\_z))
+        w = 1 + \frac{floor(input\_width + 2 * padding - pool\_size)}{stride} \\\\
+        h = 1 + \frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} \\\\
+        d = 1 + \frac{floor(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z} \\\\

    The example usage is:

@@ -2996,7 +2996,7 @@ def spp_layer(input,

    Reference:
        `Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition
-        https://arxiv.org/abs/1406.4729`_
+        <https://arxiv.org/abs/1406.4729>`_

    The example usage is:

@@ -3098,7 +3098,7 @@ def img_cmrnorm_layer(input,

    Reference:
        `ImageNet Classification with Deep Convolutional Neural Networks
-        http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf`_
+        <http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf>`_

    The example usage is:

@@ -3166,7 +3166,7 @@ def batch_norm_layer(input,
    Reference:
        `Batch Normalization: Accelerating Deep Network Training by Reducing
        Internal Covariate Shift
-        http://arxiv.org/abs/1502.03167`_
+        <http://arxiv.org/abs/1502.03167>`_

    The example usage is:

@@ -5424,17 +5424,19 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):

    Reference:
        `Maxout Networks
-        http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf`_
+        <http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf>`_
        `Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks
-        https://arxiv.org/pdf/1312.6082v4.pdf`_
+        <https://arxiv.org/pdf/1312.6082v4.pdf>`_
+

    .. math::
-       y_{si+j} = \max_k x_{gsi + sk + j}
-       g = groups
-       s = input.size / num_channels
-       0 \le i < num_channels / groups
-       0 \le j < s
-       0 \le k < groups
+       out = \max_k (in[n, k, o_c , s])   \\\\
+       out_{i * s + j} = \max_k in_{  k * o_{c} * s + i * s + j}  \\\\
+       s = \frac{input.size}{ num\_channels}       \\\\
+       o_{c} =\frac{num\_channels}{groups}         \\\\
+       0 \le i < o_{c}                             \\\\
+       0 \le j < s                                 \\\\
+       0 \le k < groups                            \\\\

    The simple usage is:

@@ -5493,7 +5495,7 @@ def ctc_layer(input,
    Reference:
        `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
        with Recurrent Neural Networks
-        http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf`_
+        <http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf>`_

    Note:
        Considering the 'blank' label needed by CTC, you need to use (num_classes + 1)
@@ -5567,7 +5569,7 @@ def warp_ctc_layer(input,
    Reference:
        `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
        with Recurrent Neural Networks
-        http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf`_
+        <http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf>`_

    Note:
        - Let num_classes represents the category number. Considering the 'blank'
@@ -5788,7 +5790,7 @@ def nce_layer(input,

    Reference:
        `A fast and simple algorithm for training neural probabilistic language
-        models. https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf`_
+        models. <https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf>`_

    The example usage is:

@@ -5904,7 +5906,7 @@ def rank_cost(left,

    Reference:
        `Learning to Rank using Gradient Descent
-        http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf`_
+        <http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf>`_

    .. math::

@@ -6440,7 +6442,7 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):

    Reference:
        `Fast R-CNN
-        https://arxiv.org/pdf/1504.08083v2.pdf`_
+        <https://arxiv.org/pdf/1504.08083v2.pdf>`_

    The example usage is:

@@ -6647,7 +6649,7 @@ def prelu_layer(input,

    Reference:
        `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
-        ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf`_
+        ImageNet Classification <http://arxiv.org/pdf/1502.01852v1.pdf>`_

    .. math::
       z_i &\\quad if \\quad z_i > 0 \\\\
@@ -6744,7 +6746,7 @@ def gated_unit_layer(input,

    Reference:
        `Language Modeling with Gated Convolutional Networks
-        https://arxiv.org/abs/1612.08083`_
+        <https://arxiv.org/abs/1612.08083>`_

    .. math::
       y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c)

--- a/python/paddle/v2/fluid/layers.py
+++ b/python/paddle/v2/fluid/layers.py
@@ -430,7 +430,8 @@ def _create_op_func_(op_type):
                    dtype = each.dtype
                elif dtype != each.dtype:
                    raise ValueError(
-                        "operator {0} must input same dtype".format(op_type))
+                        "operator {0} must input same dtype. {1} vs {2}".format(
+                            op_type, dtype, each.dtype))

        return dtype


--- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
 import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.dataset.conll05 as conll05
+import paddle.v2.fluid as fluid
 import paddle.v2.fluid.core as core
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.executor import Executor, g_scope
-from paddle.v2.fluid.optimizer import SGDOptimizer
-import paddle.v2.fluid as fluid
-import paddle.v2.fluid.layers as pd
+from paddle.v2.fluid.executor import Executor

 dict_size = 30000
 source_dict_dim = target_dict_dim = dict_size
 src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
-hidden_dim = 512
-word_dim = 512
+hidden_dim = 32
+word_dim = 16
 IS_SPARSE = True
-batch_size = 50
+batch_size = 10
 max_length = 50
 topk_size = 50
 trg_dic_size = 10000

-src_word_id = layers.data(name="src_word_id", shape=[1], dtype='int64')
-src_embedding = layers.embedding(
+decoder_size = hidden_dim
+
+
+def encoder_decoder():
+    # encoder
+    src_word_id = layers.data(
+        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
+    src_embedding = layers.embedding(
        input=src_word_id,
        size=[dict_size, word_dim],
        dtype='float32',
        is_sparse=IS_SPARSE,
        param_attr=fluid.ParamAttr(name='vemb'))

+    fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = layers.sequence_pool(input=lstm_hidden0, pool_type="last")

-def encoder():
-
-    lstm_hidden0, lstm_0 = layers.dynamic_lstm(
-        input=src_embedding,
-        size=hidden_dim,
-        candidate_activation='sigmoid',
-        cell_activation='sigmoid')
-
-    lstm_hidden1, lstm_1 = layers.dynamic_lstm(
-        input=src_embedding,
-        size=hidden_dim,
-        candidate_activation='sigmoid',
-        cell_activation='sigmoid',
-        is_reverse=True)
-
-    bidirect_lstm_out = layers.concat([lstm_hidden0, lstm_hidden1], axis=0)
-
-    return bidirect_lstm_out
+    # decoder
+    trg_language_word = layers.data(
+        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
+    trg_embedding = layers.embedding(
+        input=trg_language_word,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))

+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        current_word = rnn.step_input(trg_embedding)
+        mem = rnn.memory(init=encoder_out)
+        fc1 = fluid.layers.fc(input=[current_word, mem],
+                              size=decoder_size,
+                              act='tanh')
+        out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
+        rnn.update_memory(mem, fc1)
+        rnn.output(out)

-def decoder_trainer(context):
-    '''
-    decoder with trainer
-    '''
-    pass
+    return rnn()


 def to_lodtensor(data, place):
@@ -72,13 +75,18 @@ def to_lodtensor(data, place):


 def main():
-    encoder_out = encoder()
-    # TODO(jacquesqiao) call here
-    decoder_trainer(encoder_out)
+    rnn_out = encoder_decoder()
+    label = layers.data(
+        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
+    cost = layers.cross_entropy(input=rnn_out, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
+    optimizer.minimize(avg_cost)

    train_data = paddle.batch(
        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(8000), buf_size=1000),
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
        batch_size=batch_size)

    place = core.CPUPlace()
@@ -88,15 +96,23 @@ def main():

    batch_id = 0
    for pass_id in xrange(2):
-        print 'pass_id', pass_id
        for data in train_data():
-            print 'batch', batch_id
-            batch_id += 1
-            if batch_id > 10: break
            word_data = to_lodtensor(map(lambda x: x[0], data), place)
+            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
+            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
            outs = exe.run(framework.default_main_program(),
-                           feed={'src_word_id': word_data, },
-                           fetch_list=[encoder_out])
+                           feed={
+                               'src_word_id': word_data,
+                               'target_language_word': trg_word,
+                               'target_language_next_word': trg_word_next
+                           },
+                           fetch_list=[avg_cost])
+            avg_cost_val = np.array(outs[0])
+            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
+                  " avg_cost=" + str(avg_cost_val))
+            if batch_id > 3:
+                exit(0)
+            batch_id += 1


 if __name__ == '__main__':

--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -5,7 +5,7 @@ class BinaryDistribution(Distribution):
        return True

 MAJOR   = 0
-MINOR   = 10
+MINOR   = 11
 PATCH   = 0
 RC      = 0
 ISTAGED = False
@@ -89,7 +89,7 @@ paddle_rt_libs = ['${WARPCTC_LIBRARIES}']
 if '${MKL_SHARED_LIBS}'!= '':
  paddle_rt_libs += '${MKL_SHARED_LIBS}'.split(';')

-setup(name='paddlepaddle',
+setup(name='${PACKAGE_NAME}',
      version='${PADDLE_VERSION}',
      description='Parallel Distributed Deep Learning',
      install_requires=setup_requires,