diff --git a/fluid/DeepASR/examples/aishell/download_pretrained_model.sh b/fluid/DeepASR/examples/aishell/download_pretrained_model.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a8813e241c4f6e40392dff6f173160d2bbd77175
--- /dev/null
+++ b/fluid/DeepASR/examples/aishell/download_pretrained_model.sh
@@ -0,0 +1,15 @@
+url=http://deep-asr-data.gz.bcebos.com/aishell_pretrained_model.tar.gz
+md5=7b51bde64e884f43901b7a3461ccbfa3
+
+wget -c $url
+
+echo "Checking md5 sum ..."
+md5sum_tmp=`md5sum aishell_pretrained_model.tar.gz | cut -d ' ' -f1`
+
+if [ $md5sum_tmp != $md5 ]; then
+  echo "Md5sum check failed, please remove and redownload" \
+       "aishell_pretrained_model.tar.gz."
+  exit 1
+fi
+
+tar xvf aishell_pretrained_model.tar.gz
diff --git a/fluid/DeepASR/score_error_rate.py b/fluid/DeepASR/score_error_rate.py
index 5ecbca0862e3ea5981ef9ed8537fb98fabf2f62d..dde5a2448afffcae61c4d033159a5b081e6c79e8 100644
--- a/fluid/DeepASR/score_error_rate.py
+++ b/fluid/DeepASR/score_error_rate.py
@@ -16,10 +16,18 @@ def parse_args():
         default='cer',
         choices=['cer', 'wer'],
         help="Error rate type. (default: %(default)s)")
+    parser.add_argument(
+        '--special_tokens',
+        type=str,
+        default='',
+        help="Special tokens in scoring CER, separated by space. "
+        "They shouldn't be split and should each be treated as one special "
+        "character. Example: ' ' "
+        "(default: %(default)s)")
     parser.add_argument(
         '--ref', type=str, required=True, help="The ground truth text.")
     parser.add_argument(
-        '--hyp', type=str, required=True, help="The decoding result.")
+        '--hyp', type=str, required=True, help="The decoding result text.")
     args = parser.parse_args()
     return args
@@ -31,6 +39,8 @@ if __name__ == '__main__':
     sum_errors, sum_ref_len = 0.0, 0
     sent_cnt, not_in_ref_cnt = 0, 0
+    special_tokens = args.special_tokens.split(" ")
+
     with open(args.ref, "r") as ref_txt:
         line = ref_txt.readline()
         while line:
@@ -51,6 +61,8 @@
                 continue
             if args.error_rate_type == 'cer':
+                for sp_tok in special_tokens:
+                    sent = sent.replace(sp_tok, '\0')
                 errors, ref_len = char_errors(
                     ref_dict[key].decode("utf8"),
                     sent.decode("utf8"),
diff --git a/fluid/face_detection/.gitignore b/fluid/face_detection/.gitignore
index ea3e7b052591ddb7d19525a685c13971bededf6f..0636bd5b2995e0a0fa27fe54be6ccbbb78074dca 100644
--- a/fluid/face_detection/.gitignore
+++ b/fluid/face_detection/.gitignore
@@ -9,3 +9,4 @@ log*
 output*
 pred
 eval_tools
+box*
diff --git a/fluid/face_detection/README_cn.md b/fluid/face_detection/README_cn.md
index 1213a59dba4dc7b4c001deef7e2029f45c232ff0..8987b00fb2c66daedd42993214b5c9bab42a99c3 100644
--- a/fluid/face_detection/README_cn.md
+++ b/fluid/face_detection/README_cn.md
@@ -93,7 +93,7 @@ tar -xf vgg_ilsvrc_16_fc_reduced.tar.gz && rm -f vgg_ilsvrc_16_fc_reduced.tar.gz
 `train.py` is the main entry point of the training module; an example invocation is:
 ```bash
-python -u train.py --batch_size=16 --pretrained_model=vgg_ilsvrc_16_fc_reduced
+python -u train.py --batch_size=12 --pretrained_model=vgg_ilsvrc_16_fc_reduced
 ```
 - You can set `export CUDA_VISIBLE_DEVICES=0,1,2,3` to specify which GPUs to use.
 - For more optional arguments, see:
diff --git a/fluid/face_detection/train.py b/fluid/face_detection/train.py
index b62ac26d0d7236421e80ed4396c6ed3d0f72c310..1680dc5ce06a6bd4e7dcc910a68382a6846adc77 100644
--- a/fluid/face_detection/train.py
+++ b/fluid/face_detection/train.py
@@ -16,14 +16,14 @@ add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
 add_arg('parallel', bool, True, "Whether use multi-GPU/threads or not.")
 add_arg('learning_rate', float, 0.001, "The start learning rate.")
-add_arg('batch_size', int, 16, "Minibatch size.")
+add_arg('batch_size', int, 12, "Minibatch size.")
 add_arg('num_passes', int, 160, "Epoch number.")
 add_arg('use_gpu', bool, True, "Whether use GPU.")
 add_arg('use_pyramidbox', bool, True, "Whether use PyramidBox model.")
 add_arg('model_save_dir', str, 'output', "The path to save model.")
 add_arg('resize_h', int, 640, "The resized image height.")
 add_arg('resize_w', int, 640, "The resized image width.")
-add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.")
+add_arg('with_mem_opt', bool, False, "Whether to use memory optimization or not.")
 add_arg('pretrained_model', str, './vgg_ilsvrc_16_fc_reduced/', "The init model path.")
 #yapf: enable
diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py b/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py
index 05fbd6b85c2d70124817e7c5a2d5a90e78ba7847..45b1f5303ce77de7c7f5e3a232517c26e159b2fa 100644
--- a/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py
+++ b/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py
@@ -24,15 +24,10 @@ def calc_diff(f1, f2):
     #print d2.shape
     #print d1[0, 0, 0:10, 0:10]
     #print d2[0, 0, 0:10, 0:10]
-    #d1 = d1[:, :, 1:-2, 1:-2]
-    #d2 = d2[:, :, 1:-2, 1:-2]
     d1 = d1.flatten()
     d2 = d2.flatten()
-    #print d1[:10]
-    #print d2[:10]
-
     d1_num = reduce(lambda x, y: x * y, d1.shape)
     d2_num = reduce(lambda x, y: x * y, d2.shape)
     if d1_num != d2_num:
@@ -41,7 +36,11 @@
     assert (d1_num == d2_num), "their shape is not consistent"
     try:
+        mask = np.abs(d1) >= np.abs(d2)
+        mask = mask.astype('int32')
+
         df = np.abs(d1 - d2)
+        df = df / (1.0e-10 + np.abs(d1) * mask + np.abs(d2) * (1 - mask))
         max_df = np.max(df)
         sq_df = np.mean(df * df)
         return max_df, sq_df
diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/__init__.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/__init__.py
index 703c6a0a8091df79c73465be8c52248af518f3ca..73c7bed2a4ce475c84337b813a5552abc57ab998 100644
--- a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/__init__.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/__init__.py
@@ -8,6 +8,12 @@ import axpy
 import flatten
 import argmax
 import reshape
+import roipooling
+import priorbox
+import permute
+import detection_out
+import normalize
+import select
 
 #custom layer import ends
diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/detection_out.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/detection_out.py
new file mode 100644
index 0000000000000000000000000000000000000000..b59930a74a28fa82b701b413556371075d6e8113
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/detection_out.py
@@ -0,0 +1,79 @@
+""" A custom layer for 'detectionout' used in the 'SSD' model to produce outputs
+    Note: Paddle's implementation of 'detectionout' applies 'flatten' and 'softmax' ops to the 'conf' input,
+    while Caffe's implementation does not. Hence, you should adjust the generated 'ssd.py' to remove the
+    'softmax' and 'flatten' ops applied to the 'conf' input.
+""" + +from .register import register + + +def detectionoutput_shape(input_shape): + """ the output shape of this layer is dynamic and not determined by 'input_shape' + + Args: + @input_shape (list of int): input shape + + Returns: + @output_shape (list of num): a list of numbers represent the output shape + """ + output_shape = [-1, 6] + return output_shape + + +def detectionoutput_layer(inputs, + name, + background_label=0, + share_location=True, + nms_param=None, + keep_top_k=100, + confidence_threshold=0.1): + """ build a layer of type 'detectionout' using fluid + + Args: + @inputs (list of variables): input fluid variables for this layer + @name (str): name for this layer + + Returns: + output (variable): output variable for this layer + """ + import paddle.fluid as fluid + + if nms_param is None: + nms_param = {"nms_threshold": 0.3, "top_k": 10, "eta": 1.0} + + mbox_conf_flatten = inputs[1] + mbox_priorbox = inputs[2] + mbox_priorbox_list = fluid.layers.split(mbox_priorbox, 2, dim=1) + pb = mbox_priorbox_list[0] + pbv = mbox_priorbox_list[1] + pb = fluid.layers.reshape(x=pb, shape=[-1, 4]) + pbv = fluid.layers.reshape(x=pbv, shape=[-1, 4]) + mbox_loc = inputs[0] + mbox_loc = fluid.layers.reshape( + x=mbox_loc, shape=[-1, mbox_conf_flatten.shape[1], 4]) + + default = {"nms_threshold": 0.3, "top_k": 10, "eta": 1.0} + fields = ['eta', 'top_k', 'nms_threshold'] + + for f in default.keys(): + if not nms_param.has_key(f): + nms_param[f] = default[f] + + nmsed_outs = fluid.layers.detection_output( + scores=mbox_conf_flatten, + loc=mbox_loc, + prior_box=pb, + prior_box_var=pbv, + background_label=background_label, + nms_threshold=nms_param["nms_threshold"], + nms_top_k=nms_param["top_k"], + keep_top_k=keep_top_k, + score_threshold=confidence_threshold, + nms_eta=nms_param["eta"]) + + return nmsed_outs + + +register( + kind='DetectionOutput', + shape=detectionoutput_shape, + layer=detectionoutput_layer) diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/flatten.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/flatten.py index 8f7af4266f7fd4b7b6e8ee868f44f1b35f35cb00..ebb97718e3294bb473752bc6235917bed0db0650 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/flatten.py +++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/flatten.py @@ -4,11 +4,6 @@ from .register import register -def import_fluid(): - import paddle.fluid as fluid - return fluid - - def flatten_shape(input_shape, axis=1, end_axis=-1): """ calculate the output shape of this layer using input shape @@ -28,7 +23,7 @@ def flatten_shape(input_shape, axis=1, end_axis=-1): start_axis += len(input_shape) if end_axis < 0: - end_axis += len(input_shape) + end_axis += len(input_shape) + 1 assert start_axis <= end_axis, 'invalid axis[%d] or end_axis[%d] params'\ % (start_axis, end_axis) @@ -52,18 +47,16 @@ def flatten_layer(input, name, axis=1, end_axis=-1): Returns: output (variable): output variable for this layer """ - fluid = import_fluid() + import paddle.fluid as fluid input_shape = list(input.shape) - dims = len(input_shape) - start_axis = axis if axis >= 0 else axis + dims - end_axis = end_axis if end_axis >= 0 else end_axis + dims - assert start_axis <= end_axis, 'invalid axis or end_axis params' - output_shape = input_shape[0:start_axis] - flat_sz = reduce(lambda a, b: a * b, input_shape[start_axis:end_axis]) - output_shape += [flat_sz] - output_shape += input_shape[end_axis:-1] + if input_shape[0] == -1: + input_shape[0] = 1 + output_shape = 
flatten_shape(input_shape, axis=axis, end_axis=end_axis)
+        output_shape[0] = -1
+    else:
+        output_shape = flatten_shape(input_shape, axis=axis, end_axis=end_axis)
 
     output = fluid.layers.reshape(input, shape=output_shape, name=name)
diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/normalize.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/normalize.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6e8c00fb126009c5724fa6509c32c2b8c96bace
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/normalize.py
@@ -0,0 +1,56 @@
+""" A custom layer for 'normalize' op
+"""
+
+from .register import register
+
+
+def normalize_shape(input_shape,
+                    across_spatial=True,
+                    scale_filler=True,
+                    eps=1e-10):
+    """ calculate the output shape of this layer using input shapes
+
+    Args:
+        @input_shape (list of tuples): input shape
+
+    Returns:
+        @output_shape (list of num): a list of numbers represent the output shape
+    """
+    output_shape = input_shape
+    return output_shape
+
+
+def normalize_layer(input,
+                    name,
+                    across_spatial=True,
+                    scale_filler=True,
+                    channel_shared=False,
+                    eps=1e-10):
+    """ build a layer of type 'normalize' using fluid
+
+    Args:
+        @inputs (list of variables): input fluid variables for this layer
+        @name (str): name for this layer
+
+    Returns:
+        output (variable): output variable for this layer
+    """
+    import paddle.fluid as fluid
+
+    param_prefix = name.split('.')[0]
+
+    assert across_spatial == False, "Only support across_spatial == False for Normalize[%s]" % (
+        name)
+    l2_norm = fluid.layers.l2_normalize(input, axis=1)  # l2 norm along channel
+
+    shape = [1] if channel_shared else [input.shape[1]]
+    scale_attr = fluid.ParamAttr(name=param_prefix + '_scale')
+    scale_param = fluid.layers.create_parameter(
+        shape=shape, dtype=input.dtype, name=name, attr=scale_attr)
+
+    out = fluid.layers.elementwise_mul(
+        x=l2_norm, y=scale_param, axis=-1 if channel_shared else 1)
+    return out
+
+
+register(kind='Normalize', shape=normalize_shape, layer=normalize_layer)
diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/permute.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/permute.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0633fd5ff6b24a47adcd765e221e916bb1508f6
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/permute.py
@@ -0,0 +1,40 @@
+""" A custom layer for 'Permute' which is equivalent to transpose in paddle
+"""
+
+from .register import register
+
+
+def permute_shape(input_shape, order):
+    """ calculate the output shape of this layer using input shapes
+
+    Args:
+        @input_shape (list of numbers): input shape
+
+    Returns:
+        @output_shape (list of num): a list of numbers represent the output shape
+    """
+    output_shape = []
+    for ii in order:
+        assert ii < len(input_shape), "invalid order for permute layer"
+        output_shape.append(input_shape[ii])
+    return output_shape
+
+
+def permute_layer(input, name, order):
+    """ build a layer of type 'permute' using fluid
+
+    Args:
+        @input (input variable): input fluid variables for this layer
+        @name (str): name for this layer
+        @order (list of int): order to permute the dims
+
+    Returns:
+        output (variable): output variable for this layer
+    """
+    import paddle.fluid as fluid
+    output = fluid.layers.transpose(input, order, name=name)
+
+    return output
+
+
+register(kind='Permute', shape=permute_shape, layer=permute_layer)
diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/priorbox.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/priorbox.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3c23fbdb17a4992f41946a9889790f0782bd7e7
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/priorbox.py
@@ -0,0 +1,100 @@
+""" A custom layer for 'priorbox' which is used in SSD to generate prior box info
+    Since the order of prior box is different between caffe and paddle,
+    we use 'slice' and 'concat' ops to align them.
+"""
+
+from .register import register
+
+
+def priorbox_shape(input_shapes, min_size, max_size=None, aspect_ratio=None):
+    """ calculate the output shape of this layer using input shapes
+
+    Args:
+        @input_shapes (list of tuples): a list of input shapes
+
+    Returns:
+        @output_shape (list of num): a list of numbers represent the output shape
+    """
+    assert len(input_shapes) == 2, "invalid inputs for Priorbox layer"
+    fc_shape = input_shapes[0]
+    N = 1
+    if max_size is not None:
+        N += 1
+    if aspect_ratio is not None:
+        N += 2 * len(aspect_ratio)
+
+    N_bbx = fc_shape[2] * fc_shape[3] * N
+    output_shape = [1, 2, 4 * N_bbx]
+    return output_shape
+
+
+def priorbox_layer(inputs,
+                   name,
+                   min_size,
+                   step,
+                   max_size=None,
+                   aspect_ratio=None,
+                   flip=True,
+                   clip=False,
+                   variance=[],
+                   offset=0.5):
+    """ build a layer of type 'Priorbox' using fluid
+
+    Args:
+        @inputs (list of variables): input fluid variables for this layer
+        @name (str): name for this layer
+
+    Returns:
+        output (variable): output variable for this layer
+    """
+    import paddle.fluid as fluid
+
+    assert len(inputs) == 2, "invalid inputs for Priorbox[%s]" % (name)
+    input = inputs[0]
+    image = inputs[1]
+    box, variance_ = fluid.layers.prior_box(
+        input,
+        image,
+        min_size,
+        max_size,
+        aspect_ratio,
+        variance,
+        flip,
+        clip, (step, step),
+        offset,
+        min_max_aspect_ratios_order=True)
+    """
+    #adjust layout when the output is not consistent with caffe's
+
+    feat_shape = list(input.shape)
+    H = feat_shape[2]
+    W = feat_shape[3]
+    box_tmp = fluid.layers.reshape(box, [H, W, -1, 4])
+    nb_prior_bbx = int(box_tmp.shape[2])
+    tensor_list = fluid.layers.split(box_tmp, nb_prior_bbx, 2)
+
+    #TODO:
+    #   current implementation for this layer is not efficient
+    #   and we should fix this bug in future when Paddle support the same prior-box layout with Caffe
+    index_list = [0]
+    index_list = index_list * nb_prior_bbx
+    index_offset = 0
+    if max_size is not None:
+        index_list[1] = -1
+        index_offset = 1
+    for ii in xrange(2 * len(aspect_ratio)):
+        index_list[ii + 1 + index_offset] = ii + 1
+
+    tensor_list_gathered = [tensor_list[ii] for ii in index_list]
+    caffe_prior_bbx = fluid.layers.concat(tensor_list_gathered, axis=2)
+    box = fluid.layers.reshape(caffe_prior_bbx, [1, 1, -1])
+    """
+
+    box = fluid.layers.reshape(box, [1, 1, -1])
+    variance_ = fluid.layers.reshape(variance_, [1, 1, -1])
+    output = fluid.layers.concat([box, variance_], axis=1)
+
+    return output
+
+
+register(kind='PriorBox', shape=priorbox_shape, layer=priorbox_layer)
diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/reshape.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/reshape.py
index 6b8d5681ec68c7a899cb3fdbd4fca0249402bfa0..da82e4d67c7cbb558c223bce528cb23c7feb91c8 100644
--- a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/reshape.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/reshape.py
@@ -68,15 +68,23 @@ def reshape_shape(input_sp,
shape, axis=0, num_axes=-1): top_dim = shape['dim'][i] if top_dim == 0: copy_axes.append(i) + copy_axis_index = start_axis + i + output_shape[copy_axis_index] = input_shape[copy_axis_index] elif top_dim == -1: assert inferred_axis == -1, "[Reshape]new shape contains multiple -1 dims" + inferred_axis = i else: constant_count *= top_dim if inferred_axis >= 0: explicit_count = constant_count - explicit_count *= count(input_shape[0:start_axis]) - explicit_count *= count(input_shape[end_axis:]) + l = input_shape[0:start_axis] + if len(l) > 0: + explicit_count *= count(l) + + l = input_shape[end_axis:] + if len(l) > 0: + explicit_count *= count(l) for i in range(len(copy_axes)): explicit_count *= output_shape[start_axis + copy_axes[i]] @@ -84,6 +92,7 @@ def reshape_shape(input_sp, shape, axis=0, num_axes=-1): assert input_count % explicit_count == 0, "[Reshape]botom count[%d] "\ "must be divisible by product of the specified dimensions[%d] "\ % (input_count, explicit_count) + output_shape[start_axis + inferred_axis] = input_count / explicit_count output_count = count(output_shape) assert output_count == input_count, "[Reshape]output count[%d] must match input count[%d]" % ( @@ -117,6 +126,7 @@ def reshape_layer(input, name, shape, axis=0, num_axes=-1): output_shape = reshape_shape(input_shape, shape, axis, num_axes) output = fluid.layers.reshape(input, shape=output_shape, name=name) + return output diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/roipooling.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/roipooling.py new file mode 100644 index 0000000000000000000000000000000000000000..ccbf24ab7e7ed624f76dff3c9392315f8020a6bf --- /dev/null +++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/roipooling.py @@ -0,0 +1,53 @@ +""" a custom layer for 'ROIPooling', maybe we should implement this in standard way. 
+    more info can be found here: http://caffe.berkeleyvision.org/tutorial/layers/ROIPooling.html
+"""
+from .register import register
+
+
+def roipooling_shape(input_shapes, pooled_h, pooled_w, spatial_scale):
+    """ calculate the output shape of this layer using input shape
+
+    Args:
+        @input_shapes (list of tuples): input shapes of the base feature map and the rois
+        @pooled_h (int): parameter from caffe's ROIPooling layer
+        @pooled_w (int): parameter from caffe's ROIPooling layer
+        @spatial_scale (float): parameter from caffe's ROIPooling layer
+
+    Returns:
+        @output_shape (list of num): a list of numbers represent the output shape
+    """
+    assert len(input_shapes) == 2, "not valid input shape for roipooling layer"
+    base_fea_shape = input_shapes[0]
+    rois_shape = input_shapes[1]
+    output_shape = base_fea_shape
+    output_shape[0] = rois_shape[0]
+    output_shape[2] = pooled_h
+    output_shape[3] = pooled_w
+    return output_shape
+
+
+def roipooling_layer(inputs, name, pooled_h, pooled_w, spatial_scale):
+    """ build a layer of type 'ROIPooling' using fluid
+
+    Args:
+        @inputs (list of variables): input fluid variables for this layer
+        @name (str): name for this layer
+        @pooled_h (int): parameter from caffe's ROIPooling layer
+        @pooled_w (int): parameter from caffe's ROIPooling layer
+        @spatial_scale (float): parameter from caffe's ROIPooling layer
+
+    Returns:
+        output (variable): output variable for this layer
+    """
+
+    import paddle.fluid as fluid
+    assert len(inputs) == 2, "not valid input shape for roipooling layer"
+    base_fea = inputs[0]
+    rois = inputs[1][:, 1:5]
+    rois_fea = fluid.layers.roi_pool(base_fea, rois, pooled_h, pooled_w,
+                                     spatial_scale)
+
+    return rois_fea
+
+
+register(kind='ROIPooling', shape=roipooling_shape, layer=roipooling_layer)
diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/select.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/select.py
new file mode 100644
index 0000000000000000000000000000000000000000..708ac64801914fde8792e4f26edf561829063e14
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/select.py
@@ -0,0 +1,67 @@
+""" a custom layer for 'select', used to replace the standard 'Slice' layer
+    when converting a layer with multiple different output tensors
+"""
+from .register import register
+
+
+def select_shape(input_shape, slice_point, axis=1):
+    """ calculate the output shape of this layer using input shape
+
+    Args:
+        @input_shape (list of num): a list of number which represents the input shape
+        @slice_point (list): parameter from caffe's Slice layer
+        @axis (int): parameter from caffe's Slice layer
+
+    Returns:
+        @output_shape (list of num): a list of numbers represent the output shape
+    """
+
+    input_shape = list(input_shape)
+    start = slice_point[0]
+    if len(slice_point) == 2:
+        end = slice_point[1]
+    else:
+        end = input_shape[axis]
+
+    assert end > start, "invalid slice_point with [start:%d, end:%d]"\
+        % (start, end)
+    output_shape = input_shape
+    output_shape[axis] = end - start
+    return output_shape
+
+
+def select_layer(input, name, slice_point, axis=1):
+    """ build a layer of type 'Slice' using fluid
+
+    Args:
+        @input (variable): input fluid variable for this layer
+        @name (str): name for this layer
+        @slice_point (list): parameter from caffe's Slice layer
+        @axis (int): parameter from caffe's Slice layer
+
+    Returns:
+        output (variable): output variable for this layer
+    """
+    import paddle.fluid as fluid
+    input_shape = list(input.shape)
+
+    start = slice_point[0]
+    if len(slice_point) == 2:
+        end = slice_point[1]
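+    # a single slice_point means the selected span runs to the end of this axis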
+ else: + end = input_shape[axis] + + sections = [] + if start > 0: + sections.append(start) + + pos = len(sections) + sections.append(end - start) + if end != input_shape[axis]: + sections.append(input_shape[axis] - end) + + outputs = fluid.layers.split(input, sections, dim=axis, name=name) + return outputs[pos] + + +register(kind='Select', shape=select_shape, layer=select_layer) diff --git a/fluid/image_classification/caffe2fluid/kaffe/layers.py b/fluid/image_classification/caffe2fluid/kaffe/layers.py index f2d54c59fe8ee78840ce7d23a67694e495ceddf8..98ef6b65329dd7ba314efdd638f72313d796e39f 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/layers.py +++ b/fluid/image_classification/caffe2fluid/kaffe/layers.py @@ -16,7 +16,7 @@ LAYER_DESCRIPTORS = { 'Concat': shape_concat, 'ContrastiveLoss': shape_scalar, 'Convolution': shape_convolution, - 'Deconvolution': shape_not_implemented, + 'Deconvolution': shape_deconvolution, 'Data': shape_data, 'Dropout': shape_identity, 'DummyData': shape_data, @@ -39,6 +39,7 @@ LAYER_DESCRIPTORS = { 'Pooling': shape_pool, 'Power': shape_identity, 'ReLU': shape_identity, + 'PReLU': shape_identity, 'Scale': shape_identity, 'Sigmoid': shape_identity, 'SigmoidCrossEntropyLoss': shape_scalar, @@ -179,6 +180,11 @@ class LayerAdapter(object): @property def parameters(self): name = NodeDispatch.get_handler_name(self.kind) + if self.kind.lower() == "normalize": + name = "norm" + elif self.kind.lower() == "deconvolution": + name = "convolution" + name = '_'.join((name, 'param')) try: return getattr(self.layer, name) @@ -207,7 +213,9 @@ class LayerAdapter(object): @property def kernel_parameters(self): - assert self.kind in (NodeKind.Convolution, NodeKind.Pooling) + assert self.kind in (NodeKind.Convolution, NodeKind.Pooling,\ + NodeKind.Deconvolution) + params = self.parameters k_h = self.get_kernel_value(params.kernel_h, params.kernel_size, 0) k_w = self.get_kernel_value(params.kernel_w, params.kernel_size, 1) @@ -217,9 +225,25 @@ class LayerAdapter(object): params.stride_w, params.stride, 1, default=1) p_h = self.get_kernel_value(params.pad_h, params.pad, 0, default=0) p_w = self.get_kernel_value(params.pad_w, params.pad, 1, default=0) - return KernelParameters(k_h, k_w, s_h, s_w, p_h, p_w) - -KernelParameters = namedtuple('KernelParameters', [ - 'kernel_h', 'kernel_w', 'stride_h', 'stride_w', 'pad_h', 'pad_w' -]) + dila_h = dila_w = 1 + if self.kind in (NodeKind.Convolution, NodeKind.Deconvolution): + dila_len = len(params.dilation) + if dila_len == 2: + dila_h = params.dilation[0] + dila_w = params.dilation[1] + elif dila_len == 1: + dila_h = dila_w = params.dilation[0] + else: + assert dila_len == 0, "invalid length[%s] of dilation in convolution" % ( + dila_len) + + return KernelParameters(k_h, k_w, s_h, s_w, p_h, p_w, dila_h, dila_w) + + +KernelParameters = namedtuple( + 'KernelParameters', + [ + 'kernel_h', 'kernel_w', 'stride_h', 'stride_w', 'pad_h', 'pad_w', + 'dila_h', 'dila_w' + ], ) diff --git a/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py b/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py index e8b0f2c3a91aaafcfc0951524ac64ed9723ad902..1fc98b057dbf16228c834674f5aee8c4bd123935 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py +++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py @@ -91,7 +91,7 @@ class Network(object): name = '%s_%s' % (op_name, param_name) v = fluid.global_scope().find_var(name) w = v.get_tensor() - w.set(data, place) + w.set(data.reshape(w.shape()), place) 
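+                # the converted weights may be stored in a different (size-compatible)
+                # layout, so align them with the fluid variable's shape before copying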
except ValueError: if not ignore_missing: raise @@ -144,6 +144,7 @@ class Network(object): relu=True, relu_negative_slope=0.0, padding=None, + dilation=1, group=1, biased=True): if padding is None: @@ -173,6 +174,7 @@ class Network(object): num_filters=c_o, stride=[s_h, s_w], padding=padding, + dilation=dilation, groups=group, param_attr=fluid.ParamAttr(name=prefix + "weights"), bias_attr=fluid.ParamAttr(name=prefix + "biases"), @@ -183,13 +185,71 @@ class Network(object): return output + @layer + def deconv(self, + input, + k_h, + k_w, + c_o, + s_h, + s_w, + name, + relu=True, + relu_negative_slope=0.0, + padding=None, + dilation=1, + biased=True): + if padding is None: + padding = [0, 0] + + # Get the number of channels in the input + c_i, h_i, w_i = input.shape[1:] + + fluid = import_fluid() + prefix = name + '_' + leaky_relu = False + act = 'relu' + if relu is False: + act = None + elif relu_negative_slope != 0.0: + leaky_relu = True + act = None + + p_h = padding[0] + p_w = padding[1] + h_o = (h_i - 1) * s_h - 2 * p_h + dilation * (k_h - 1) + 1 + w_o = (w_i - 1) * s_w - 2 * p_w + dilation * (k_w - 1) + 1 + output = fluid.layers.conv2d_transpose( + name=self.get_unique_output_name(name, 'conv2d_transpose'), + input=input, + num_filters=c_o, + output_size=[h_o, w_o], + filter_size=[k_h, k_w], + padding=padding, + stride=[s_h, s_w], + dilation=dilation, + param_attr=fluid.ParamAttr(name=prefix + "weights"), + bias_attr=fluid.ParamAttr(name=prefix + "biases"), + act=act) + + if leaky_relu: + output = fluid.layers.leaky_relu(output, alpha=relu_negative_slope) + + return output + @layer def relu(self, input, name): fluid = import_fluid() - output = fluid.layers.relu( - name=self.get_unique_output_name(name, 'relu'), x=input) + output = fluid.layers.relu(input) return output + @layer + def prelu(self, input, channel_shared, name): + #fluid = import_fluid() + #output = fluid.layers.relu(input) + #return output + raise NotImplementedError('prelu not implemented') + def pool(self, pool_type, input, k_h, k_w, s_h, s_w, ceil_mode, padding, name): # Get the number of channels in the input @@ -256,6 +316,12 @@ class Network(object): return fluid.layers.sigmoid( input, name=self.get_unique_output_name(name, 'sigmoid')) + @layer + def tanh(self, input, name): + fluid = import_fluid() + return fluid.layers.tanh( + input, name=self.get_unique_output_name(name, 'tanh')) + @layer def lrn(self, input, radius, alpha, beta, name, bias=1.0): fluid = import_fluid() @@ -322,7 +388,8 @@ class Network(object): name, scale_offset=True, eps=1e-5, - relu=False): + relu=False, + relu_negative_slope=0.0): # NOTE: Currently, only inference is supported fluid = import_fluid() prefix = name + '_' @@ -332,6 +399,15 @@ class Network(object): name=prefix + 'offset') mean_name = prefix + 'mean' variance_name = prefix + 'variance' + + leaky_relu = False + act = 'relu' + if relu is False: + act = None + elif relu_negative_slope != 0.0: + leaky_relu = True + act = None + output = fluid.layers.batch_norm( name=self.get_unique_output_name(name, 'batch_norm'), input=input, @@ -341,7 +417,10 @@ class Network(object): moving_mean_name=mean_name, moving_variance_name=variance_name, epsilon=eps, - act='relu' if relu is True else None) + act=act) + + if leaky_relu: + output = fluid.layers.leaky_relu(output, alpha=relu_negative_slope) return output diff --git a/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py b/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py index 
02a600bcd0ac7732b5162070064cd10ff1359dc2..76a318d68de2932c83d158f38a5619043c55f0a8 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py +++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py @@ -9,21 +9,6 @@ from ..transformers import (DataInjector, DataReshaper, NodeRenamer, from . import network -def get_padding_type(kernel_params, input_shape, output_shape): - '''Translates Caffe's numeric padding to one of ('SAME', 'VALID'). - Caffe supports arbitrary padding values, while Paddle only - supports 'SAME' and 'VALID' modes. So, not all Caffe paddings - can be translated to Paddle. There are some subtleties to - how the padding edge-cases are handled. These are described here: - https://github.com/Yangqing/caffe2/blob/master/caffe2/proto/caffe2_legacy.proto - ''' - k_h, k_w, s_h, s_w, p_h, p_w = kernel_params - if p_h > 0 or p_w > 0: - return [p_h, p_w] - else: - return None - - class PaddleNode(object): '''An intermediate representation for Paddle operations.''' @@ -78,10 +63,11 @@ class PaddleMapper(NodeMapper): def get_kernel_params(self, node): kernel_params = node.layer.kernel_parameters input_shape = node.get_only_parent().output_shape - padding = get_padding_type(kernel_params, input_shape, - node.output_shape) - # Only emit the padding if it's not the default value. - padding = {'padding': padding} if padding is not None else {} + padding = [kernel_params.pad_h, kernel_params.pad_w] + if padding[0] == 0 and padding[1] == 0: + padding = {} + else: + padding = {'padding': padding} return (kernel_params, padding) def map_convolution(self, node): @@ -95,15 +81,44 @@ class PaddleMapper(NodeMapper): kwargs['group'] = group if not node.parameters.bias_term: kwargs['biased'] = False + + if kernel_params.dila_h != 1 or kernel_params.dila_w != 1: + kwargs['dilation'] = (kernel_params.dila_h, kernel_params.dila_w) + assert kernel_params.kernel_h == h assert kernel_params.kernel_w == w return MaybeActivated(node)( 'conv', kernel_params.kernel_h, kernel_params.kernel_w, c_o, kernel_params.stride_h, kernel_params.stride_w, **kwargs) + def map_deconvolution(self, node): + (kernel_params, kwargs) = self.get_kernel_params(node) + h = kernel_params.kernel_h + w = kernel_params.kernel_w + c_o = node.output_shape[1] + c_i = node.parents[0].output_shape[1] + if not node.parameters.bias_term: + kwargs['biased'] = False + + if kernel_params.dila_h != 1 or kernel_params.dila_w != 1: + kwargs['dilation'] = (kernel_params.dila_h, kernel_params.dila_w) + + assert kernel_params.kernel_h == h + assert kernel_params.kernel_w == w + return MaybeActivated(node)( + 'deconv', kernel_params.kernel_h, kernel_params.kernel_w, c_o, + kernel_params.stride_h, kernel_params.stride_w, **kwargs) + def map_relu(self, node): return PaddleNode('relu') + def map_prelu(self, node): + channel_shared = getattr(node.parameters, 'channel_shared', False) + return PaddleNode('prelu', channel_shared) + + def map_tanh(self, node): + return PaddleNode('tanh') + def map_pooling(self, node): pool_type = node.parameters.pool if pool_type == 0: diff --git a/fluid/image_classification/caffe2fluid/kaffe/shapes.py b/fluid/image_classification/caffe2fluid/kaffe/shapes.py index 379cfce6dd3d4c562fd5b89d3b13c467f65c83f8..0e00dca55f1c4df7a3ce8924836db42b00641a32 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/shapes.py +++ b/fluid/image_classification/caffe2fluid/kaffe/shapes.py @@ -6,6 +6,8 @@ from .errors import KaffeError Tensor4DShape = namedtuple('Tensor4DShape', ['batch_size', 'channels', 
'height', 'width']) +Tensor3DShape = namedtuple('Tensor3DShape', ['batch_size', 'data1', 'data2']) + Tensor2DShape = namedtuple('Tensor2DShape', ['batch_size', 'data']) ScalarShape = namedtuple('ScalarShape', ['batch_size']) @@ -14,6 +16,8 @@ ScalarShape = namedtuple('ScalarShape', ['batch_size']) def make_tensor(batch_size, d1=None, d2=None, d3=None): if d3 is not None: return Tensor4DShape(batch_size, d1, d2, d3) + elif d1 is not None and d2 is not None: + return Tensor3DShape(batch_size, d1, d2) elif d1 is not None and d2 is None: return Tensor2DShape(batch_size, d1) elif d1 is None and d2 is None and d3 is None: @@ -24,10 +28,14 @@ def make_tensor(batch_size, d1=None, d2=None, d3=None): def get_filter_output_shape(i_h, i_w, params, round_func): - o_h = (i_h + 2 * params.pad_h - params.kernel_h - ) / float(params.stride_h) + 1 - o_w = (i_w + 2 * params.pad_w - params.kernel_w - ) / float(params.stride_w) + 1 + dila_h = getattr(params, 'dila_h', 1) + dila_w = getattr(params, 'dila_w', 1) + + o_h = (i_h + 2 * params.pad_h - + (dila_h * (params.kernel_h - 1) + 1)) / float(params.stride_h) + 1 + o_w = (i_w + 2 * params.pad_w - + (dila_w * (params.kernel_w - 1) + 1)) / float(params.stride_w) + 1 + return (int(round_func(o_h)), int(round_func(o_w))) @@ -97,6 +105,34 @@ def shape_convolution(node): return get_strided_kernel_output_shape(node, math.floor) +def shape_deconvolution(node): + assert node.layer is not None + input_shape = node.get_only_parent().output_shape + h_i = input_shape.height + w_i = input_shape.width + + params = node.layer.kernel_parameters + p_h = params.pad_h + p_w = params.pad_w + + dila_h = params.dila_h + dila_w = params.dila_w + + k_h = params.kernel_h + k_w = params.kernel_w + + s_h = params.stride_h + s_w = params.stride_w + + h_o = (h_i - 1) * s_h - 2 * p_h + dila_h * (k_h - 1) + 1 + w_o = (w_i - 1) * s_w - 2 * p_w + dila_w * (k_w - 1) + 1 + + params = node.layer.parameters + has_c_o = hasattr(params, 'num_output') + c = params.num_output if has_c_o else input_shape.channels + return make_tensor(input_shape.batch_size, c, h_o, w_o) + + def shape_pool(node): global_pool = getattr(node.layer.parameters, 'global_pooling', False) if global_pool: diff --git a/fluid/image_classification/caffe2fluid/kaffe/transformers.py b/fluid/image_classification/caffe2fluid/kaffe/transformers.py index 6b53e05a57a657015662c24ae2d974d6f25e7d00..a07ad42541cd342f70a87974ec140e23a10b4efe 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/transformers.py +++ b/fluid/image_classification/caffe2fluid/kaffe/transformers.py @@ -325,7 +325,8 @@ class ParameterNamer(object): for node in graph.nodes: if node.data is None: continue - if node.kind in (NodeKind.Convolution, NodeKind.InnerProduct): + if node.kind in (NodeKind.Convolution, NodeKind.InnerProduct,\ + NodeKind.Deconvolution): names = ('weights', ) if node.parameters.bias_term: names += ('biases', ) @@ -337,6 +338,8 @@ class ParameterNamer(object): names = ('scale', ) if getattr(node.parameters, 'bias_term', False): names = ('scale', 'offset') + elif node.kind == "Normalize": + names = ('scale', ) else: warn('Unhandled parameters when naming this it[%s]' % (node.kind)) diff --git a/fluid/image_classification/data/ILSVRC2012/download_imagenet2012.sh b/fluid/image_classification/data/ILSVRC2012/download_imagenet2012.sh index 947b8900bd944759437a55c20fb32bca4a1b9380..3e6e0ce6d6df0b8c5a5e7814e510eb64006ce34d 100644 --- a/fluid/image_classification/data/ILSVRC2012/download_imagenet2012.sh +++ 
b/fluid/image_classification/data/ILSVRC2012/download_imagenet2012.sh @@ -34,7 +34,7 @@ tar xf ${valid_tar} -C ${valid_folder} echo "Download imagenet label file: val_list.txt & train_list.txt" label_file=ImageNet_label.tgz -label_url=http://imagenet-data.bj.bcebos.com/${label_file} +label_url=http://paddle-imagenet-models.bj.bcebos.com/${label_file} wget -nd -c ${label_url} tar zxf ${label_file} diff --git a/fluid/image_classification/reader.py b/fluid/image_classification/reader.py index b503b67ce09fba80bc49a07665ba0290e75f1ed1..3ea26ccbfead97dc97e8858ee05a6582f2b3bc9e 100644 --- a/fluid/image_classification/reader.py +++ b/fluid/image_classification/reader.py @@ -160,5 +160,5 @@ def val(file_list=TEST_LIST): return _reader_creator(file_list, 'val', shuffle=False) -def test(file_list): +def test(file_list=TEST_LIST): return _reader_creator(file_list, 'test', shuffle=False) diff --git a/fluid/image_classification/train.py b/fluid/image_classification/train.py index 74588e21c93e40ee7f5bcde7d6cbbc7c873278ba..51bf9901246cb554baaef22a8e526d0ecd81bd0a 100644 --- a/fluid/image_classification/train.py +++ b/fluid/image_classification/train.py @@ -157,7 +157,8 @@ def train(args): test_reader = paddle.batch(reader.val(), batch_size=test_batch_size) feeder = fluid.DataFeeder(place=place, feed_list=[image, label]) - train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name) + train_exe = fluid.ParallelExecutor( + use_cuda=True if args.use_gpu else False, loss_name=avg_cost.name) fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name] diff --git a/fluid/language_model/.run_ce.sh b/fluid/language_model/.run_ce.sh new file mode 100644 index 0000000000000000000000000000000000000000..5ee2d8aa0582b2b8504f9ba645b6252aa75f23bf --- /dev/null +++ b/fluid/language_model/.run_ce.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + +cudaid=${language_model:=0} # use 0-th card as default +export CUDA_VISIBLE_DEVICES=$cudaid + +FLAGS_benchmark=true python train.py --enable_ce | python _ce.py + +cudaid=${language_model_m:=0,1,2,3} # use 0,1,2,3 card as default +export CUDA_VISIBLE_DEVICES=$cudaid + +FLAGS_benchmark=true python train.py --enable_ce | python _ce.py diff --git a/fluid/language_model/_ce.py b/fluid/language_model/_ce.py new file mode 100644 index 0000000000000000000000000000000000000000..d4999d7a1e14e333f1c7056b3dc2c5b506682ec6 --- /dev/null +++ b/fluid/language_model/_ce.py @@ -0,0 +1,62 @@ +# this file is only used for continuous evaluation test! + +import os +import sys +sys.path.append(os.environ['ceroot']) +from kpi import CostKpi +from kpi import DurationKpi + +imikolov_20_avg_ppl_kpi = CostKpi('imikolov_20_avg_ppl', 0.2, 0) +imikolov_20_pass_duration_kpi = DurationKpi( + 'imikolov_20_pass_duration', 0.02, 0, actived=True) +imikolov_20_avg_ppl_kpi_card4 = CostKpi('imikolov_20_avg_ppl_card4', 0.2, 0) +imikolov_20_pass_duration_kpi_card4 = DurationKpi( + 'imikolov_20_pass_duration_card4', 0.03, 0, actived=True) + +tracking_kpis = [ + imikolov_20_avg_ppl_kpi, + imikolov_20_pass_duration_kpi, + imikolov_20_avg_ppl_kpi_card4, + imikolov_20_pass_duration_kpi_card4, +] + + +def parse_log(log): + ''' + This method should be implemented by model developers. 
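+    (Only lines of the exact tab-separated form "kpis\t<kpi_name>\t<kpi_value>"
+    are picked up by the loop below.)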
+
+    The suggestion:
+
+    each line in the log should be key, value, for example:
+
+    "
+    train_cost\t1.0
+    test_cost\t1.0
+    train_cost\t1.0
+    train_cost\t1.0
+    train_acc\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    log_to_ce(log)
diff --git a/fluid/language_model/train.py b/fluid/language_model/train.py
index 59fc3a987746af7aec9b61b5c817400b6b6546d0..f3e7a7398bf13e14c74ce1d10d90b7bf34031698 100644
--- a/fluid/language_model/train.py
+++ b/fluid/language_model/train.py
@@ -1,14 +1,28 @@
+import os
 import sys
 import time
 import numpy as np
 import math
-
+import argparse
 import paddle.fluid as fluid
-import paddle.v2 as paddle
+import paddle
 import utils
+
+SEED = 102
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("language_model benchmark.")
+    parser.add_argument(
+        '--enable_ce',
+        action='store_true',
+        help='If set, run the task with continuous evaluation logs.')
+    args = parser.parse_args()
+    return args
+
 
 def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
     """ network definition """
@@ -63,31 +77,26 @@ def train(train_reader,
           init_low_bound=-0.04,
           init_high_bound=0.04):
     """ train network """
+
+    args = parse_args()
+    if args.enable_ce:
+        # the random seed must be set before configuring the network.
+        fluid.default_startup_program().random_seed = SEED
     vocab_size = len(vocab)
+
+    # Input data
     src_wordseq = fluid.layers.data(
         name="src_wordseq", shape=[1], dtype="int64", lod_level=1)
     dst_wordseq = fluid.layers.data(
         name="dst_wordseq", shape=[1], dtype="int64", lod_level=1)
+
+    # Train program
     avg_cost = None
-    if not parallel:
-        cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
-                       init_low_bound, init_high_bound)
-        avg_cost = fluid.layers.mean(x=cost)
-    else:
-        places = fluid.layers.get_places()
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            cost = network(
-                pd.read_input(src_wordseq),
-                pd.read_input(dst_wordseq), vocab_size, hid_size,
-                init_low_bound, init_high_bound)
-            pd.write_output(cost)
-
-        cost = pd()
-        avg_cost = fluid.layers.mean(x=cost)
+    cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
+                   init_low_bound, init_high_bound)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Optimization to minimize loss
     sgd_optimizer = fluid.optimizer.SGD(
         learning_rate=fluid.layers.exponential_decay(
             learning_rate=base_lr,
@@ -96,39 +105,56 @@ def train(train_reader,
             staircase=True))
     sgd_optimizer.minimize(avg_cost)
 
+    # Initialize executor
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = fluid.Executor(place)
-
     exe.run(fluid.default_startup_program())
+
+    train_exe = fluid.ParallelExecutor(use_cuda=use_cuda, loss_name=avg_cost.name)
+    total_time = 0.0
+    fetch_list = [avg_cost.name]
     for pass_idx in xrange(pass_num):
         epoch_idx = pass_idx + 1
         print "epoch_%d start" % epoch_idx
         t0 = time.time()
         i = 0
+        newest_ppl = 0
         for data in train_reader():
             i += 1
             lod_src_wordseq = utils.to_lodtensor(
                 map(lambda x: x[0], data), place)
             lod_dst_wordseq = utils.to_lodtensor(
                 map(lambda x: x[1], data), place)
-            ret_avg_cost = exe.run(fluid.default_main_program(),
-                                   feed={
-                                       "src_wordseq": lod_src_wordseq,
-                                       "dst_wordseq": lod_dst_wordseq
-                                   },
-                                   fetch_list=[avg_cost],
-                                   use_program_cache=True)
-            avg_ppl = math.exp(ret_avg_cost[0])
+            ret_avg_cost = train_exe.run(feed={
+                "src_wordseq": lod_src_wordseq,
+                "dst_wordseq": lod_dst_wordseq
+            },
+                                         fetch_list=fetch_list)
+            avg_ppl = np.exp(ret_avg_cost[0])
+            newest_ppl = np.mean(avg_ppl)
             if i % 100 == 0:
-                print "step:%d ppl:%.3f" % (i, avg_ppl)
+                print "step:%d ppl:%.3f" % (i, newest_ppl)
 
         t1 = time.time()
         total_time += t1 - t0
         print "epoch:%d num_steps:%d time_cost(s):%f" % (epoch_idx, i,
                                                          total_time / epoch_idx)
+
+        if pass_idx == pass_num - 1 and args.enable_ce:
+            # Note: the following logs are special for CE monitoring.
+            # Other situations do not need to care about these logs.
+            gpu_num = get_cards(args.enable_ce)
+            if gpu_num == 1:
+                print("kpis imikolov_20_pass_duration %s" %
+                      (total_time / epoch_idx))
+                print("kpis imikolov_20_avg_ppl %s" % newest_ppl)
+            else:
+                print("kpis imikolov_20_pass_duration_card%s %s" % \
+                      (gpu_num, total_time / epoch_idx))
+                print("kpis imikolov_20_avg_ppl_card%s %s" %
+                      (gpu_num, newest_ppl))
         save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
         feed_var_names = ["src_wordseq", "dst_wordseq"]
         fetch_vars = [avg_cost]
@@ -138,11 +164,22 @@ def train(train_reader,
     print("finish training")
 
 
+def get_cards(enable_ce):
+    if enable_ce:
+        cards = os.environ.get('CUDA_VISIBLE_DEVICES')
+        num = len(cards.split(","))
+        return num
+    else:
+        return fluid.core.get_cuda_device_count()
+
+
 def train_net():
     """ do training """
     batch_size = 20
+    args = parse_args()
     vocab, train_reader, test_reader = utils.prepare_data(
-        batch_size=batch_size, buffer_size=1000, word_freq_threshold=0)
+        batch_size=batch_size * get_cards(args.enable_ce), buffer_size=1000, \
+        word_freq_threshold=0, enable_ce=args.enable_ce)
     train(
         train_reader=train_reader,
         vocab=vocab,
@@ -152,7 +189,7 @@ def train_net():
         batch_size=batch_size,
         pass_num=12,
         use_cuda=True,
-        parallel=False,
+        parallel=True,
         model_dir="model",
         init_low_bound=-0.1,
         init_high_bound=0.1)
diff --git a/fluid/language_model/utils.py b/fluid/language_model/utils.py
index c5909046176586556a2aedba5dd5d12810b3ea8d..dd03a89835e620dc8432a6ca16392fc5173a12d4 100644
--- a/fluid/language_model/utils.py
+++ b/fluid/language_model/utils.py
@@ -3,7 +3,7 @@ import time
 import numpy as np
 import paddle.fluid as fluid
-import paddle.v2 as paddle
+import paddle
 
 
 def to_lodtensor(data, place):
@@ -22,17 +22,28 @@ def to_lodtensor(data, place):
     return res
 
 
-def prepare_data(batch_size, buffer_size=1000, word_freq_threshold=0):
+def prepare_data(batch_size,
+                 buffer_size=1000,
+                 word_freq_threshold=0,
+                 enable_ce=False):
     """ prepare the English Pann Treebank (PTB) data """
     vocab = paddle.dataset.imikolov.build_dict(word_freq_threshold)
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
+    if enable_ce:
+        train_reader = paddle.batch(
             paddle.dataset.imikolov.train(
                 vocab,
                 buffer_size,
                 data_type=paddle.dataset.imikolov.DataType.SEQ),
-            buf_size=buffer_size),
-        batch_size)
+            batch_size)
+    else:
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.imikolov.train(
+                    vocab,
+                    buffer_size,
+                    data_type=paddle.dataset.imikolov.DataType.SEQ),
+                buf_size=buffer_size),
+            batch_size)
     test_reader = paddle.batch(
         paddle.dataset.imikolov.test(
             vocab, buffer_size, data_type=paddle.dataset.imikolov.DataType.SEQ),
diff --git a/fluid/mnist/.run_ce.sh b/fluid/mnist/.run_ce.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d6ccf429b52da1ff26ac02df5af287461a823a98
--- /dev/null
+++ b/fluid/mnist/.run_ce.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# This file is only used for continuous evaluation.
+
+rm -rf *_factor.txt
+model_file='model.py'
+python $model_file --batch_size 128 --pass_num 5 --device CPU | python _ce.py
diff --git a/fluid/mnist/_ce.py b/fluid/mnist/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c2dba53526d2e976252fce05c7ff7f0f44b39b2
--- /dev/null
+++ b/fluid/mnist/_ce.py
@@ -0,0 +1,61 @@
+# this file is only used for continuous evaluation test!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+# NOTE kpi.py should be shared in models in some way!!!!
+
+train_cost_kpi = CostKpi('train_cost', 0.02, actived=True)
+test_acc_kpi = AccKpi('test_acc', 0.005, actived=True)
+train_duration_kpi = DurationKpi('train_duration', 0.06, actived=True)
+train_acc_kpi = AccKpi('train_acc', 0.005, actived=True)
+
+tracking_kpis = [
+    train_acc_kpi,
+    train_cost_kpi,
+    test_acc_kpi,
+    train_duration_kpi,
+]
+
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+
+    The suggestion:
+
+    each line in the log should be key, value, for example:
+
+    "
+    train_cost\t1.0
+    test_cost\t1.0
+    train_cost\t1.0
+    train_cost\t1.0
+    train_acc\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    log_to_ce(log)
diff --git a/fluid/mnist/model.py b/fluid/mnist/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..e719cca404092fdf96484093d39f2c7c54cd9988
--- /dev/null
+++ b/fluid/mnist/model.py
@@ -0,0 +1,199 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import cProfile
+import time
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+
+SEED = 90
+DTYPE = "float32"
+
+# the random seed must be set before configuring the network.
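+# (a fixed seed keeps parameter initialization deterministic, so CE runs stay comparable)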
+fluid.default_startup_program().random_seed = SEED
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("mnist model benchmark.")
+    parser.add_argument(
+        '--batch_size', type=int, default=128, help='The minibatch size.')
+    parser.add_argument(
+        '--iterations', type=int, default=35, help='The number of minibatches.')
+    parser.add_argument(
+        '--pass_num', type=int, default=5, help='The number of passes.')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    parser.add_argument(
+        '--use_cprof', action='store_true', help='If set, use cProfile.')
+    parser.add_argument(
+        '--use_nvprof',
+        action='store_true',
+        help='If set, use nvprof for CUDA.')
+    args = parser.parse_args()
+    return args
+
+
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('----------- Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def cnn_model(data):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=data,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+
+    # TODO(dzhwinter) : refine the initializer and random seed setting
+    SIZE = 10
+    input_shape = conv_pool_2.shape
+    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
+    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
+
+    predict = fluid.layers.fc(
+        input=conv_pool_2,
+        size=SIZE,
+        act="softmax",
+        param_attr=fluid.param_attr.ParamAttr(
+            initializer=fluid.initializer.NormalInitializer(
+                loc=0.0, scale=scale)))
+    return predict
+
+
+def eval_test(exe, batch_acc, batch_size_tensor, inference_program):
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=args.batch_size)
+    test_pass_acc = fluid.average.WeightedAverage()
+    for batch_id, data in enumerate(test_reader()):
+        img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
+                                data)).astype(DTYPE)
+        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+        y_data = y_data.reshape([len(y_data), 1])
+
+        acc, weight = exe.run(inference_program,
+                              feed={"pixel": img_data,
+                                    "label": y_data},
+                              fetch_list=[batch_acc, batch_size_tensor])
+        test_pass_acc.add(value=acc, weight=weight)
+    pass_acc = test_pass_acc.eval()
+    return pass_acc
+
+
+def run_benchmark(model, args):
+    if args.use_cprof:
+        pr = cProfile.Profile()
+        pr.enable()
+    start_time = time.time()
+    # Input data
+    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    # Train program
+    predict = model(images)
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Evaluator
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size_tensor)
+
+    # inference program
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc, batch_size_tensor])
+
+    # Optimization
+    opt = fluid.optimizer.AdamOptimizer(
+        learning_rate=0.001, beta1=0.9, beta2=0.999)
+    opt.minimize(avg_cost)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    # Initialize executor
+    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+
+    # Parameter initialization
+    exe.run(fluid.default_startup_program())
+
+    # Reader
+    train_reader = paddle.batch(
+        paddle.dataset.mnist.train(), batch_size=args.batch_size)
+
+    accuracy = fluid.average.WeightedAverage()
+    for pass_id in range(args.pass_num):
+        accuracy.reset()
+        pass_start = time.time()
+        every_pass_loss = []
+        for batch_id, data in enumerate(train_reader()):
+            img_data = np.array(
+                map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
+            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+            y_data = y_data.reshape([len(y_data), 1])
+
+            start = time.time()
+            loss, acc, weight = exe.run(
+                fluid.default_main_program(),
+                feed={"pixel": img_data,
+                      "label": y_data},
+                fetch_list=[avg_cost, batch_acc, batch_size_tensor]
+            )  # the accuracy is accumulated over batches, not just the current batch.
+            end = time.time()
+            accuracy.add(value=acc, weight=weight)
+            every_pass_loss.append(loss)
+            print("Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+                  (pass_id, batch_id, loss, acc))
+
+        pass_end = time.time()
+
+        train_avg_acc = accuracy.eval()
+        train_avg_loss = np.mean(every_pass_loss)
+        test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
+                                 inference_program)
+
+        print(
+            "pass=%d, train_avg_acc=%f,train_avg_loss=%f, test_avg_acc=%f, elapse=%f"
+            % (pass_id, train_avg_acc, train_avg_loss, test_avg_acc,
+               (pass_end - pass_start)))
+        # Note: the following logs are special for CE monitoring.
+        # Other situations do not need to care about these logs.
+        print("kpis train_acc %f" % train_avg_acc)
+        print("kpis train_cost %f" % train_avg_loss)
+        print("kpis test_acc %f" % test_avg_acc)
+        print("kpis train_duration %f" % (pass_end - pass_start))
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+    if args.use_nvprof and args.device == 'GPU':
+        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+            run_benchmark(cnn_model, args)
+    else:
+        run_benchmark(cnn_model, args)
diff --git a/fluid/neural_machine_translation/rnn_search/.run_ce.sh b/fluid/neural_machine_translation/rnn_search/.run_ce.sh
new file mode 100755
index 0000000000000000000000000000000000000000..6be159cb5268ae215998e7a19045f7aa0d620f63
--- /dev/null
+++ b/fluid/neural_machine_translation/rnn_search/.run_ce.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+#### This file is only used for continuous evaluation.
+
+model_file='train.py'
+python $model_file --pass_num 1 --learning_rate 0.001 --save_interval 10 --enable_ce | python _ce.py
diff --git a/fluid/neural_machine_translation/rnn_search/_ce.py b/fluid/neural_machine_translation/rnn_search/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..e948336e82141c4a2072a02f73b51cb7b4396ca0
--- /dev/null
+++ b/fluid/neural_machine_translation/rnn_search/_ce.py
@@ -0,0 +1,63 @@
+#### this file is only used for continuous evaluation test!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+#### NOTE kpi.py should be shared in models in some way!!!!
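+#### each Kpi name below must match the <name> field of a "kpis\t<name>\t<value>"
+#### line emitted by train.py and parsed by parse_log()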
+ +train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True) +test_cost_kpi = CostKpi('test_cost', 0.005, 0, actived=True) +train_duration_kpi = DurationKpi('train_duration', 0.06, 0, actived=True) + +tracking_kpis = [ + train_cost_kpi, + test_cost_kpi, + train_duration_kpi, +] + + +def parse_log(log): + ''' + This method should be implemented by model developers. + + The suggestion: + + each line in the log should be key, value, for example: + + " + train_cost\t1.0 + test_cost\t1.0 + train_cost\t1.0 + train_cost\t1.0 + train_acc\t1.2 + " + ''' + for line in log.split('\n'): + fs = line.strip().split('\t') + print(fs) + if len(fs) == 3 and fs[0] == 'kpis': + print("-----%s" % fs) + kpi_name = fs[1] + kpi_value = float(fs[2]) + yield kpi_name, kpi_value + + +def log_to_ce(log): + kpi_tracker = {} + for kpi in tracking_kpis: + kpi_tracker[kpi.name] = kpi + + for (kpi_name, kpi_value) in parse_log(log): + print(kpi_name, kpi_value) + kpi_tracker[kpi_name].add_record(kpi_value) + kpi_tracker[kpi_name].persist() + + +if __name__ == '__main__': + log = sys.stdin.read() + print("*****") + print(log) + print("****") + log_to_ce(log) diff --git a/fluid/neural_machine_translation/rnn_search/args.py b/fluid/neural_machine_translation/rnn_search/args.py new file mode 100644 index 0000000000000000000000000000000000000000..fc0b61b2da1f1a4c2ddbe5785cb4f2f6aad92af6 --- /dev/null +++ b/fluid/neural_machine_translation/rnn_search/args.py @@ -0,0 +1,97 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import distutils.util + + +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--embedding_dim", + type=int, + default=512, + help="The dimension of embedding table. (default: %(default)d)") + parser.add_argument( + "--encoder_size", + type=int, + default=512, + help="The size of encoder bi-rnn unit. (default: %(default)d)") + parser.add_argument( + "--decoder_size", + type=int, + default=512, + help="The size of decoder rnn unit. (default: %(default)d)") + parser.add_argument( + "--batch_size", + type=int, + default=32, + help="The sequence number of a mini-batch data. (default: %(default)d)") + parser.add_argument( + "--dict_size", + type=int, + default=30000, + help="The dictionary capacity. Dictionaries of source sequence and " + "target dictionary have same capacity. (default: %(default)d)") + parser.add_argument( + "--pass_num", + type=int, + default=5, + help="The pass number to train. (default: %(default)d)") + parser.add_argument( + "--learning_rate", + type=float, + default=0.01, + help="Learning rate used to train the model. 
(default: %(default)f)") + parser.add_argument( + "--no_attention", + action='store_true', + help="If set, run no attention model instead of attention model.") + parser.add_argument( + "--beam_size", + type=int, + default=3, + help="The width for beam searching. (default: %(default)d)") + parser.add_argument( + "--use_gpu", + type=distutils.util.strtobool, + default=True, + help="Whether to use gpu. (default: %(default)d)") + parser.add_argument( + "--max_length", + type=int, + default=50, + help="The maximum length of sequence when doing generation. " + "(default: %(default)d)") + parser.add_argument( + "--save_dir", + type=str, + default="model", + help="Specify the path to save trained models.") + parser.add_argument( + "--save_interval", + type=int, + default=1, + help="Save the trained model every n passes." + "(default: %(default)d)") + parser.add_argument( + "--enable_ce", + action='store_true', + help="If set, run the task with continuous evaluation logs.") + args = parser.parse_args() + return args diff --git a/fluid/neural_machine_translation/rnn_search/attention_model.py b/fluid/neural_machine_translation/rnn_search/attention_model.py new file mode 100644 index 0000000000000000000000000000000000000000..3cf23a96efcdf1fe69fbf26905bcd8a113db6a7d --- /dev/null +++ b/fluid/neural_machine_translation/rnn_search/attention_model.py @@ -0,0 +1,221 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle.fluid as fluid +from paddle.fluid.contrib.decoder.beam_search_decoder import * + + +def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): + def linear(inputs): + return fluid.layers.fc(input=inputs, size=size, bias_attr=True) + + forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t])) + + cell_t = fluid.layers.sums(input=[ + fluid.layers.elementwise_mul( + x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul( + x=input_gate, y=cell_tilde) + ]) + + hidden_t = fluid.layers.elementwise_mul( + x=output_gate, y=fluid.layers.tanh(x=cell_t)) + + return hidden_t, cell_t + + +def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, + target_dict_dim, is_generating, beam_size, max_length): + """Construct a seq2seq network.""" + + def bi_lstm_encoder(input_seq, gate_size): + # A bi-directional lstm encoder implementation. + # Linear transformation part for input gate, output gate, forget gate + # and cell activation vectors need be done outside of dynamic_lstm. + # So the output size is 4 times of gate_size. 
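+        # For example, with the default --encoder_size=512 in args.py, the fc
+        # layers below project each step to 4 * 512 = 2048 units, which
+        # dynamic_lstm then splits among the three gates and the cell input.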
+ input_forward_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act='tanh', + bias_attr=False) + forward, _ = fluid.layers.dynamic_lstm( + input=input_forward_proj, size=gate_size * 4, use_peepholes=False) + input_reversed_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act='tanh', + bias_attr=False) + reversed, _ = fluid.layers.dynamic_lstm( + input=input_reversed_proj, + size=gate_size * 4, + is_reverse=True, + use_peepholes=False) + return forward, reversed + + # The encoding process. Encodes the input words into tensors. + src_word_idx = fluid.layers.data( + name='source_sequence', shape=[1], dtype='int64', lod_level=1) + + src_embedding = fluid.layers.embedding( + input=src_word_idx, + size=[source_dict_dim, embedding_dim], + dtype='float32') + + src_forward, src_reversed = bi_lstm_encoder( + input_seq=src_embedding, gate_size=encoder_size) + + encoded_vector = fluid.layers.concat( + input=[src_forward, src_reversed], axis=1) + + encoded_proj = fluid.layers.fc(input=encoded_vector, + size=decoder_size, + bias_attr=False) + + backward_first = fluid.layers.sequence_pool( + input=src_reversed, pool_type='first') + + decoder_boot = fluid.layers.fc(input=backward_first, + size=decoder_size, + bias_attr=False, + act='tanh') + + cell_init = fluid.layers.fill_constant_batch_size_like( + input=decoder_boot, + value=0.0, + shape=[-1, decoder_size], + dtype='float32') + cell_init.stop_gradient = False + + # Create an RNN state cell by providing the input and hidden states, and + # specify the hidden state as the output. + h = InitState(init=decoder_boot, need_reorder=True) + c = InitState(init=cell_init) + + state_cell = StateCell( + inputs={'x': None, + 'encoder_vec': None, + 'encoder_proj': None}, + states={'h': h, + 'c': c}, + out_state='h') + + def simple_attention(encoder_vec, encoder_proj, decoder_state): + # The implementation of a simple attention model + decoder_state_proj = fluid.layers.fc(input=decoder_state, + size=decoder_size, + bias_attr=False) + decoder_state_expand = fluid.layers.sequence_expand( + x=decoder_state_proj, y=encoder_proj) + # the lod of the concatenation should be inherited from encoder_proj + concatenated = fluid.layers.concat( + input=[encoder_proj, decoder_state_expand], axis=1) + attention_weights = fluid.layers.fc(input=concatenated, + size=1, + bias_attr=False) + attention_weights = fluid.layers.sequence_softmax( + input=attention_weights) + weights_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1]) + scaled = fluid.layers.elementwise_mul( + x=encoder_vec, y=weights_reshape, axis=0) + context = fluid.layers.sequence_pool(input=scaled, pool_type='sum') + return context + + @state_cell.state_updater + def state_updater(state_cell): + # Define the updater of the RNN state cell + current_word = state_cell.get_input('x') + encoder_vec = state_cell.get_input('encoder_vec') + encoder_proj = state_cell.get_input('encoder_proj') + prev_h = state_cell.get_state('h') + prev_c = state_cell.get_state('c') + context = simple_attention(encoder_vec, encoder_proj, prev_h) + decoder_inputs = fluid.layers.concat( + input=[context, current_word], axis=1) + h, c = lstm_step(decoder_inputs, prev_h, prev_c, decoder_size) + state_cell.set_state('h', h) + state_cell.set_state('c', c) + + # Define the decoding process + if not is_generating: + # Training process + trg_word_idx = fluid.layers.data( + name='target_sequence', shape=[1], dtype='int64', lod_level=1) + + trg_embedding = fluid.layers.embedding( + input=trg_word_idx, + size=[target_dict_dim, embedding_dim], + 
dtype='float32') + + # A decoder for training + decoder = TrainingDecoder(state_cell) + + with decoder.block(): + current_word = decoder.step_input(trg_embedding) + encoder_vec = decoder.static_input(encoded_vector) + encoder_proj = decoder.static_input(encoded_proj) + decoder.state_cell.compute_state(inputs={ + 'x': current_word, + 'encoder_vec': encoder_vec, + 'encoder_proj': encoder_proj + }) + h = decoder.state_cell.get_state('h') + decoder.state_cell.update_states() + out = fluid.layers.fc(input=h, + size=target_dict_dim, + bias_attr=True, + act='softmax') + decoder.output(out) + + label = fluid.layers.data( + name='label_sequence', shape=[1], dtype='int64', lod_level=1) + cost = fluid.layers.cross_entropy(input=decoder(), label=label) + avg_cost = fluid.layers.mean(x=cost) + feeding_list = ["source_sequence", "target_sequence", "label_sequence"] + return avg_cost, feeding_list + + else: + # Inference + init_ids = fluid.layers.data( + name="init_ids", shape=[1], dtype="int64", lod_level=2) + init_scores = fluid.layers.data( + name="init_scores", shape=[1], dtype="float32", lod_level=2) + + # A beam search decoder + decoder = BeamSearchDecoder( + state_cell=state_cell, + init_ids=init_ids, + init_scores=init_scores, + target_dict_dim=target_dict_dim, + word_dim=embedding_dim, + input_var_dict={ + 'encoder_vec': encoded_vector, + 'encoder_proj': encoded_proj + }, + topk_size=50, + sparse_emb=True, + max_len=max_length, + beam_size=beam_size, + end_id=1, + name=None) + + decoder.decode() + + translation_ids, translation_scores = decoder() + feeding_list = ["source_sequence"] + + return translation_ids, translation_scores, feeding_list diff --git a/fluid/neural_machine_translation/rnn_search/infer.py b/fluid/neural_machine_translation/rnn_search/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..51bdf9cda4694d4d849ff333e5c8e47978fb8815 --- /dev/null +++ b/fluid/neural_machine_translation/rnn_search/infer.py @@ -0,0 +1,136 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
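The inference entry further down in this file seeds beam search with two-level LoD tensors: every source sentence starts a single beam holding start-token id 0 with score 1.0. A minimal standalone sketch of that construction (assuming a PaddlePaddle Fluid build contemporary with this code; the batch size is illustrative):

```python
import numpy as np
import paddle.fluid as fluid

# Seed tensors for beam search over a batch of 2 source sentences,
# mirroring the construction inside infer() below.
batch_size = 2
init_ids_data = np.zeros((batch_size, 1), dtype='int64')      # start token id 0
init_scores_data = np.ones((batch_size, 1), dtype='float32')  # initial score 1.0

# Two-level LoD: level 0 holds one beam per sentence, level 1 one candidate
# per beam, i.e. recursive sequence lengths [[1, 1], [1, 1]] for this batch.
init_recursive_seq_lens = [[1] * batch_size, [1] * batch_size]

place = fluid.CPUPlace()
init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens, place)
init_scores = fluid.create_lod_tensor(init_scores_data, init_recursive_seq_lens,
                                      place)
```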
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import os + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +from paddle.fluid.executor import Executor +from paddle.fluid.contrib.decoder.beam_search_decoder import * + +from args import * +import attention_model +import no_attention_model + + +def infer(): + args = parse_args() + + # Inference + if args.no_attention: + translation_ids, translation_scores, feed_order = \ + no_attention_model.seq_to_seq_net( + args.embedding_dim, + args.encoder_size, + args.decoder_size, + args.dict_size, + args.dict_size, + True, + beam_size=args.beam_size, + max_length=args.max_length) + else: + translation_ids, translation_scores, feed_order = \ + attention_model.seq_to_seq_net( + args.embedding_dim, + args.encoder_size, + args.decoder_size, + args.dict_size, + args.dict_size, + True, + beam_size=args.beam_size, + max_length=args.max_length) + + test_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.test(args.dict_size), buf_size=1000), + batch_size=args.batch_size, + drop_last=False) + + place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace() + exe = Executor(place) + exe.run(framework.default_startup_program()) + + model_path = os.path.join(args.save_dir, str(args.pass_num)) + fluid.io.load_persistables( + executor=exe, + dirname=model_path, + main_program=framework.default_main_program()) + + src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size) + + feed_list = [ + framework.default_main_program().global_block().var(var_name) + for var_name in feed_order[0:1] + ] + feeder = fluid.DataFeeder(feed_list, place) + + for batch_id, data in enumerate(test_batch_generator()): + # The value of batch_size may vary in the last batch + batch_size = len(data) + + # Setup initial ids and scores lod tensor + init_ids_data = np.array([0 for _ in range(batch_size)], dtype='int64') + init_scores_data = np.array( + [1. 
for _ in range(batch_size)], dtype='float32') + init_ids_data = init_ids_data.reshape((batch_size, 1)) + init_scores_data = init_scores_data.reshape((batch_size, 1)) + init_recursive_seq_lens = [1] * batch_size + init_recursive_seq_lens = [ + init_recursive_seq_lens, init_recursive_seq_lens + ] + init_ids = fluid.create_lod_tensor(init_ids_data, + init_recursive_seq_lens, place) + init_scores = fluid.create_lod_tensor(init_scores_data, + init_recursive_seq_lens, place) + + # Feed dict for inference + feed_dict = feeder.feed(map(lambda x: [x[0]], data)) + feed_dict['init_ids'] = init_ids + feed_dict['init_scores'] = init_scores + + fetch_outs = exe.run(framework.default_main_program(), + feed=feed_dict, + fetch_list=[translation_ids, translation_scores], + return_numpy=False) + + # Split the output words by lod levels + lod_level_1 = fetch_outs[0].lod()[1] + token_array = np.array(fetch_outs[0]) + result = [] + for i in xrange(len(lod_level_1) - 1): + sentence_list = [ + trg_dict[token] + for token in token_array[lod_level_1[i]:lod_level_1[i + 1]] + ] + sentence = " ".join(sentence_list[1:-1]) + result.append(sentence) + lod_level_0 = fetch_outs[0].lod()[0] + paragraphs = [ + result[lod_level_0[i]:lod_level_0[i + 1]] + for i in xrange(len(lod_level_0) - 1) + ] + + for paragraph in paragraphs: + print(paragraph) + + +if __name__ == '__main__': + infer() diff --git a/fluid/neural_machine_translation/rnn_search/no_attention_model.py b/fluid/neural_machine_translation/rnn_search/no_attention_model.py new file mode 100644 index 0000000000000000000000000000000000000000..57e7dbe42ad37bbd5d4c85ab4d58b2e1dd3d961b --- /dev/null +++ b/fluid/neural_machine_translation/rnn_search/no_attention_model.py @@ -0,0 +1,127 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
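+# The no-attention counterpart of attention_model.py: the source sequence is
+# compressed into its last LSTM step, which seeds the decoder state cell; the
+# seq_to_seq_net interface is the same and is selected via --no_attention.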
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle.fluid.layers as layers +from paddle.fluid.contrib.decoder.beam_search_decoder import * + + +def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, + target_dict_dim, is_generating, beam_size, max_length): + def encoder(): + # Encoder implementation of the RNN translation model + src_word = layers.data( + name="src_word", shape=[1], dtype='int64', lod_level=1) + src_embedding = layers.embedding( + input=src_word, + size=[source_dict_dim, embedding_dim], + dtype='float32', + is_sparse=True) + + fc1 = layers.fc(input=src_embedding, size=encoder_size * 4, act='tanh') + lstm_hidden0, lstm_0 = layers.dynamic_lstm( + input=fc1, size=encoder_size * 4) + encoder_out = layers.sequence_last_step(input=lstm_hidden0) + return encoder_out + + def decoder_state_cell(context): + # Decoder state cell: specifies the hidden state variable and its updater + h = InitState(init=context, need_reorder=True) + state_cell = StateCell( + inputs={'x': None}, states={'h': h}, out_state='h') + + @state_cell.state_updater + def updater(state_cell): + current_word = state_cell.get_input('x') + prev_h = state_cell.get_state('h') + # make sure the lod of h is inherited from prev_h + h = layers.fc(input=[prev_h, current_word], + size=decoder_size, + act='tanh') + state_cell.set_state('h', h) + + return state_cell + + def decoder_train(state_cell): + # Training decoder implementation of the RNN translation model + trg_word = layers.data( + name="target_word", shape=[1], dtype='int64', lod_level=1) + trg_embedding = layers.embedding( + input=trg_word, + size=[target_dict_dim, embedding_dim], + dtype='float32', + is_sparse=True) + + # A training decoder + decoder = TrainingDecoder(state_cell) + + # Define the computation done by the decoder in each RNN step + with decoder.block(): + current_word = decoder.step_input(trg_embedding) + decoder.state_cell.compute_state(inputs={'x': current_word}) + current_score = layers.fc(input=decoder.state_cell.get_state('h'), + size=target_dict_dim, + act='softmax') + decoder.state_cell.update_states() + decoder.output(current_score) + + return decoder() + + def decoder_infer(state_cell): + # Inference decoder implementation + init_ids = layers.data( + name="init_ids", shape=[1], dtype="int64", lod_level=2) + init_scores = layers.data( + name="init_scores", shape=[1], dtype="float32", lod_level=2) + + # A beam search decoder for inference + decoder = BeamSearchDecoder( + state_cell=state_cell, + init_ids=init_ids, + init_scores=init_scores, + target_dict_dim=target_dict_dim, + word_dim=embedding_dim, + input_var_dict={}, + topk_size=50, + sparse_emb=True, + max_len=max_length, + beam_size=beam_size, + end_id=1, + name=None) + decoder.decode() + translation_ids, translation_scores = decoder() + + return translation_ids, translation_scores + + context = encoder() + state_cell = decoder_state_cell(context) + + if not is_generating: + label = layers.data( + name="target_next_word", shape=[1], dtype='int64', lod_level=1) + + rnn_out = decoder_train(state_cell) + + cost = layers.cross_entropy(input=rnn_out, label=label) + avg_cost = layers.mean(x=cost) + + feeding_list = ['src_word', 'target_word', 'target_next_word'] + return avg_cost, feeding_list + else: + translation_ids, translation_scores = decoder_infer(state_cell) + feeding_list = ['src_word'] + return translation_ids, translation_scores, feeding_list diff --git 
a/fluid/neural_machine_translation/rnn_search/train.py b/fluid/neural_machine_translation/rnn_search/train.py new file mode 100644 index 0000000000000000000000000000000000000000..ade0dd751af1a2e83bb99da22281061dce44fbd1 --- /dev/null +++ b/fluid/neural_machine_translation/rnn_search/train.py @@ -0,0 +1,170 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import time +import os + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +from paddle.fluid.executor import Executor +from paddle.fluid.contrib.decoder.beam_search_decoder import * + +from args import * +import attention_model +import no_attention_model + + +def train(): + args = parse_args() + + if args.enable_ce: + framework.default_startup_program().random_seed = 111 + + # Training process + if args.no_attention: + avg_cost, feed_order = no_attention_model.seq_to_seq_net( + args.embedding_dim, + args.encoder_size, + args.decoder_size, + args.dict_size, + args.dict_size, + False, + beam_size=args.beam_size, + max_length=args.max_length) + else: + avg_cost, feed_order = attention_model.seq_to_seq_net( + args.embedding_dim, + args.encoder_size, + args.decoder_size, + args.dict_size, + args.dict_size, + False, + beam_size=args.beam_size, + max_length=args.max_length) + + # clone from default main program and use it as the validation program + main_program = fluid.default_main_program() + inference_program = fluid.default_main_program().clone() + + optimizer = fluid.optimizer.Adam( + learning_rate=args.learning_rate, + regularization=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=1e-5)) + + optimizer.minimize(avg_cost) + + # Disable shuffle for Continuous Evaluation only + if not args.enable_ce: + train_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(args.dict_size), buf_size=1000), + batch_size=args.batch_size, + drop_last=False) + + test_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.test(args.dict_size), buf_size=1000), + batch_size=args.batch_size, + drop_last=False) + else: + train_batch_generator = paddle.batch( + paddle.dataset.wmt14.train(args.dict_size), + batch_size=args.batch_size, + drop_last=False) + + test_batch_generator = paddle.batch( + paddle.dataset.wmt14.test(args.dict_size), + batch_size=args.batch_size, + drop_last=False) + + place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace() + exe = Executor(place) + exe.run(framework.default_startup_program()) + + feed_list = [ + main_program.global_block().var(var_name) for var_name in feed_order + ] + feeder = fluid.DataFeeder(feed_list, place) + + def validation(): + # Use test set as validation each pass + total_loss = 0.0 + count = 0 + val_feed_list = [ + inference_program.global_block().var(var_name) + for var_name 
in feed_order + ] + val_feeder = fluid.DataFeeder(val_feed_list, place) + + for batch_id, data in enumerate(test_batch_generator()): + val_fetch_outs = exe.run(inference_program, + feed=val_feeder.feed(data), + fetch_list=[avg_cost], + return_numpy=False) + + total_loss += np.array(val_fetch_outs[0])[0] + count += 1 + + return total_loss / count + + for pass_id in range(1, args.pass_num + 1): + pass_start_time = time.time() + words_seen = 0 + for batch_id, data in enumerate(train_batch_generator()): + words_seen += len(data) * 2 + + fetch_outs = exe.run(framework.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost]) + + avg_cost_train = np.array(fetch_outs[0]) + print('pass_id=%d, batch_id=%d, train_loss: %f' % + (pass_id, batch_id, avg_cost_train)) + # This is for continuous evaluation only + if args.enable_ce and batch_id >= 100: + break + + pass_end_time = time.time() + test_loss = validation() + time_consumed = pass_end_time - pass_start_time + words_per_sec = words_seen / time_consumed + print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" % + (pass_id, test_loss, words_per_sec, time_consumed)) + + # This log is for continuous evaluation only + if args.enable_ce: + print("kpis\ttrain_cost\t%f" % avg_cost_train) + print("kpis\ttest_cost\t%f" % test_loss) + print("kpis\ttrain_duration\t%f" % time_consumed) + + if pass_id % args.save_interval == 0: + model_path = os.path.join(args.save_dir, str(pass_id)) + if not os.path.isdir(model_path): + os.makedirs(model_path) + + fluid.io.save_persistables( + executor=exe, + dirname=model_path, + main_program=framework.default_main_program()) + + +if __name__ == '__main__': + train() diff --git a/fluid/neural_machine_translation/transformer/README_cn.md b/fluid/neural_machine_translation/transformer/README_cn.md index 547b525b40abbfc3009e3948273db52ff394e535..561c5c30debc60a07050a2988bde8a70f9bc3bb5 100644 --- a/fluid/neural_machine_translation/transformer/README_cn.md +++ b/fluid/neural_machine_translation/transformer/README_cn.md @@ -9,13 +9,14 @@ ```text . 
├── images # README 文档中的图片 -├── optim.py # learning rate scheduling 计算程序 +├── config.py # 训练、预测以及模型参数配置 ├── infer.py # 预测脚本 ├── model.py # 模型定义 +├── optim.py # learning rate scheduling 计算程序 ├── reader.py # 数据读取接口 ├── README.md # 文档 ├── train.py # 训练脚本 -└── config.py # 训练、预测以及模型参数配置 +└── util.py # wordpiece 数据解码工具 ``` ### 简介 @@ -58,34 +59,43 @@ Decoder 具有和 Encoder 类似的结构,只是相比于组成 Encoder 的 la ### 数据准备 -我们以 [WMT'16 EN-DE 数据集](http://www.statmt.org/wmt16/translation-task.html)作为示例,同时参照论文中的设置使用 BPE(byte-pair encoding)[4]编码的数据,使用这种方式表示的数据能够更好的解决未登录词(out-of-vocabulary,OOV)的问题。用到的 BPE 数据可以参照[这里](https://github.com/google/seq2seq/blob/master/docs/data.md)进行下载,下载后解压,其中 `train.tok.clean.bpe.32000.en` 和 `train.tok.clean.bpe.32000.de` 为使用 BPE 的训练数据(平行语料,分别对应了英语和德语,经过了 tokenize 和 BPE 的处理),`newstest2013.tok.bpe.32000.en` 和 `newstest2013.tok.bpe.32000.de` 等为测试数据(`newstest2013.tok.en` 和 `newstest2013.tok.de` 等则为对应的未使用 BPE 的测试数据),`vocab.bpe.32000` 为相应的词典文件(源语言和目标语言共享该词典文件)。 +WMT 数据集是机器翻译领域公认的主流数据集;WMT 英德和英法数据集也是 Transformer 论文中所用数据集,其中英德数据集使用了 BPE(byte-pair encoding)[4]编码的数据,英法数据集使用了 wordpiece [5]的数据。我们这里也将使用 WMT 英德和英法翻译数据,并和论文保持一致使用 BPE 和 wordpiece 的数据,下面给出了使用的方法。对于其他自定义数据,参照下文遵循或转换为类似的数据格式即可。 + +#### WMT 英德翻译数据 -由于本示例中的数据读取脚本 `reader.py` 使用的样本数据的格式为 `\t` 分隔的的源语言和目标语言句子对(句子中的词之间使用空格分隔), 因此需要将源语言到目标语言的平行语料库文件合并为一个文件,可以执行以下命令进行合并: +[WMT'16 EN-DE 数据集](http://www.statmt.org/wmt16/translation-task.html)是一个中等规模的数据集。参照论文,英德数据集我们使用 BPE 编码的数据,这能够更好的解决未登录词(out-of-vocabulary,OOV)的问题[4]。用到的 BPE 数据可以参照[这里](https://github.com/google/seq2seq/blob/master/docs/data.md)进行下载(如果希望在自定义数据中使用 BPE 编码,可以参照[这里](https://github.com/rsennrich/subword-nmt)进行预处理),下载后解压,其中 `train.tok.clean.bpe.32000.en` 和 `train.tok.clean.bpe.32000.de` 为使用 BPE 的训练数据(平行语料,分别对应了英语和德语,经过了 tokenize 和 BPE 的处理),`newstest2013.tok.bpe.32000.en` 和 `newstest2013.tok.bpe.32000.de` 等为测试数据(`newstest2013.tok.en` 和 `newstest2013.tok.de` 等则为对应的未使用 BPE 的测试数据),`vocab.bpe.32000` 为相应的词典文件(源语言和目标语言共享该词典文件)。 + +由于本示例中的数据读取脚本 `reader.py` 默认使用的样本数据的格式为 `\t` 分隔的的源语言和目标语言句子对(默认句子中的词之间使用空格分隔),因此需要将源语言到目标语言的平行语料库文件合并为一个文件,可以执行以下命令进行合并: ```sh paste -d '\t' train.tok.clean.bpe.32000.en train.tok.clean.bpe.32000.de > train.tok.clean.bpe.32000.en-de ``` -此外,下载的词典文件 `vocab.bpe.32000` 中未包含表示序列开始、序列结束和未登录词的特殊符号,可以使用如下命令在词典中加入 `` 、`` 和 `` 作为这三个特殊符号。 +此外,下载的词典文件 `vocab.bpe.32000` 中未包含表示序列开始、序列结束和未登录词的特殊符号,可以使用如下命令在词典中加入 `` 、`` 和 `` 作为这三个特殊符号(用 BPE 表示数据已有效避免了未登录词的问题,这里加入只是做通用处理)。 ```sh sed -i '1i\\n\n' vocab.bpe.32000 ``` -对于其他自定义数据,遵循或转换为上述的数据格式即可。如果希望在自定义数据中使用 BPE 编码,可以参照[这里](https://github.com/rsennrich/subword-nmt)进行预处理。 +#### WMT 英法翻译数据 + +[WMT'14 EN-FR 数据集](http://www.statmt.org/wmt14/translation-task.html)是一个较大规模的数据集。参照论文,英法数据我们使用 wordpiece 表示的数据,wordpiece 和 BPE 类似同为采用 sub-word units 来解决 OOV 问题的方法[5]。我们提供了已完成预处理的 wordpiece 数据的下载,可以从[这里](http://transformer-data.bj.bcebos.com/wmt14_enfr.tar)下载,其中 `train.wordpiece.en-fr` 为使用 wordpiece 的训练数据,`newstest2014.wordpiece.en-fr` 为测试数据(`newstest2014.tok.en` 和 `newstest2014.tok.fr` 为对应的未经 wordpiece 处理过的测试数据,使用[脚本](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl)进行了 tokenize 的处理),`vocab.wordpiece.en-fr` 为相应的词典文件(源语言和目标语言共享该词典文件)。 + +提供的英法翻译数据无需进行额外的处理,可以直接使用;需要注意的是,这些用 wordpiece 表示的数据中句子内的 token 之间使用 `\x01` 而非空格进行分隔(因部分 token 内包含空格),这需要在训练时进行指定。 ### 模型训练 -`train.py` 是模型训练脚本,可以执行以下命令进行模型训练: +`train.py` 是模型训练脚本。以英德翻译数据为例,可以执行以下命令进行模型训练: ```sh python -u train.py \ --src_vocab_fpath data/vocab.bpe.32000 \ --trg_vocab_fpath data/vocab.bpe.32000 \ --special_token '' '' '' \ --train_file_pattern 
data/train.tok.clean.bpe.32000.en-de \ + --token_delimiter ' ' \ --use_token_batch True \ --batch_size 3200 \ --sort_type pool \ - --pool_size 200000 \ + --pool_size 200000 ``` -上述命令中设置了源语言词典文件路径(`src_vocab_fpath`)、目标语言词典文件路径(`trg_vocab_fpath`)、训练数据文件(`train_file_pattern`,支持通配符)等数据相关的参数和构造 batch 方式(`use_token_batch` 指出数据按照 token 数目或者 sequence 数目组成 batch)等 reader 相关的参数。有关这些参数更详细的信息可以通过执行以下命令查看: +上述命令中设置了源语言词典文件路径(`src_vocab_fpath`)、目标语言词典文件路径(`trg_vocab_fpath`)、训练数据文件(`train_file_pattern`,支持通配符)等数据相关的参数和构造 batch 方式(`use_token_batch` 指定了数据按照 token 数目或者 sequence 数目组成 batch)等 reader 相关的参数。有关这些参数更详细的信息可以通过执行以下命令查看: ```sh python train.py --help ``` @@ -98,19 +108,20 @@ python -u train.py \ --trg_vocab_fpath data/vocab.bpe.32000 \ --special_token '' '' '' \ --train_file_pattern data/train.tok.clean.bpe.32000.en-de \ + --token_delimiter ' ' \ --use_token_batch True \ --batch_size 3200 \ --sort_type pool \ --pool_size 200000 \ - n_layer 8 \ + n_layer 6 \ n_head 16 \ d_model 1024 \ d_inner_hid 4096 \ dropout 0.3 ``` -有关这些参数更详细信息的还请参考 `config.py` 中的注释说明。 +有关这些参数更详细信息的请参考 `config.py` 中的注释说明。对于英法翻译数据,执行训练和英德翻译训练类似,修改命令中的词典和数据文件为英法数据相应文件的路径,另外要注意的是由于英法翻译数据 token 间不是使用空格进行分隔,需要修改 `token_delimiter` 参数的设置为 `--token_delimiter '\x01'`。 -训练时默认使用所有 GPU,可以通过 `CUDA_VISIBLE_DEVICES` 环境变量来设置使用的 GPU 数目。在训练过程中,每个 epoch 结束后将保存模型到参数 `model_dir` 指定的目录,每个 iteration 将打印如下的日志到标准输出: +训练时默认使用所有 GPU,可以通过 `CUDA_VISIBLE_DEVICES` 环境变量来设置使用的 GPU 数目。也可以只使用 CPU 训练(通过参数 `--divice CPU` 设置),训练速度相对较慢。在训练过程中,每个 epoch 结束后将保存模型到参数 `model_dir` 指定的目录,每个 epoch 内也会每隔1000个 iteration 进行一次保存,每个 iteration 将打印如下的日志到标准输出: ```txt epoch: 0, batch: 0, sum loss: 258793.343750, avg loss: 11.069005, ppl: 64151.644531 epoch: 0, batch: 1, sum loss: 256140.718750, avg loss: 11.059616, ppl: 63552.148438 @@ -126,38 +137,120 @@ epoch: 0, batch: 9, sum loss: 245157.500000, avg loss: 10.966562, ppl: 57905.187 ### 模型预测 -`infer.py` 是模型预测脚本,模型训练完成后可以执行以下命令对指定文件中的文本进行翻译: +`infer.py` 是模型预测脚本。以英德翻译数据为例,模型训练完成后可以执行以下命令对指定文件中的文本进行翻译: ```sh python -u infer.py \ --src_vocab_fpath data/vocab.bpe.32000 \ --trg_vocab_fpath data/vocab.bpe.32000 \ --special_token '' '' '' \ --test_file_pattern data/newstest2013.tok.bpe.32000.en-de \ + --use_wordpiece False \ + --token_delimiter ' ' \ --batch_size 4 \ model_path trained_models/pass_20.infer.model \ - beam_size 5 + beam_size 5 \ max_out_len 256 ``` 和模型训练时类似,预测时也需要设置数据和 reader 相关的参数,并可以执行 `python infer.py --help` 查看这些参数的说明(部分参数意义和训练时略有不同);同样可以在预测命令中设置模型超参数,但应与模型训练时的设置一致;此外相比于模型训练,预测时还有一些额外的参数,如需要设置 `model_path` 来给出模型所在目录,可以设置 `beam_size` 和 `max_out_len` 来指定 Beam Search 算法的搜索宽度和最大深度(翻译长度),这些参数也可以在 `config.py` 中的 `InferTaskConfig` 内查阅注释说明并进行更改设置。 -执行以上预测命令会打印翻译结果到标准输出,每行输出是对应行输入的得分最高的翻译。需要注意,对于使用 BPE 的数据,预测出的翻译结果也将是 BPE 表示的数据,要恢复成原始的数据(这里指 tokenize 后的数据)才能进行正确的评估,可以使用以下命令来恢复 `predict.txt` 内的翻译结果到 `predict.tok.txt` 中。 - +执行以上预测命令会打印翻译结果到标准输出,每行输出是对应行输入的得分最高的翻译。对于使用 BPE 的英德数据,预测出的翻译结果也将是 BPE 表示的数据,要还原成原始的数据(这里指 tokenize 后的数据)才能进行正确的评估,可以使用以下命令来恢复 `predict.txt` 内的翻译结果到 `predict.tok.txt` 中(无需再次 tokenize 处理): ```sh sed 's/@@ //g' predict.txt > predict.tok.txt ``` -接下来就可以使用参考翻译(这里使用的是 `newstest2013.tok.de`)对翻译结果进行 BLEU 指标的评估了。计算 BLEU 值的一个较为广泛使用的脚本可以从[这里](https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl)获取,获取后执行如下命令: +对于英法翻译的 wordpiece 数据,执行预测和英德翻译预测类似,修改命令中的词典和数据文件为英法数据相应文件的路径,另外需要注意修改 `token_delimiter` 参数的设置为 `--token_delimiter '\x01'`;同时要修改 `use_wordpiece` 参数的设置为 `--use_wordpiece True`,这会在预测时将翻译得到的 wordpiece 数据还原为原始数据输出。为了使用 tokenize 的数据进行评估,还需要对翻译结果进行 tokenize 
的处理,[Moses](https://github.com/moses-smt/mosesdecoder) 提供了一系列机器翻译相关的脚本。执行 `git clone https://github.com/moses-smt/mosesdecoder.git` 克隆 mosesdecoder 仓库后,可以使用其中的 `tokenizer.perl` 脚本对 `predict.txt` 内的翻译结果进行 tokenize 处理并输出到 `predict.tok.txt` 中,如下: +```sh +perl mosesdecoder/scripts/tokenizer/tokenizer.perl -l fr < predict.txt > predict.tok.txt +``` + +接下来就可以使用参考翻译对翻译结果进行 BLEU 指标的评估了。计算 BLEU 值的脚本也在 Moses 中包含,以英德翻译 `newstest2013.tok.de` 数据为例,执行如下命令: ```sh -perl multi-bleu.perl data/newstest2013.tok.de < predict.tok.txt +perl mosesdecoder/scripts/generic/multi-bleu.perl data/newstest2013.tok.de < predict.tok.txt ``` 可以看到类似如下的结果。 ``` BLEU = 25.08, 58.3/31.5/19.6/12.6 (BP=0.966, ratio=0.967, hyp_len=61321, ref_len=63412) ``` +目前在未使用 model average 的情况下,使用默认配置单机八卡(同论文中 base model 的配置)进行训练,英德翻译在 `newstest2013` 上测试 BLEU 值为25.,在 `newstest2014` 上测试 BLEU 值为26.;英法翻译在 `newstest2014` 上测试 BLEU 值为36.。 -### 参考文献 +### 分布式训练 + +Transformer 模型支持同步或者异步的分布式训练。分布式的配置主要两个方面: + +1 命令行配置 + + - `--local`,有两个取值,`True`表示单机训练,而`False`表示使用分布式训练。默认为单机训练模式。 + + - `--sync`,有两个取值,但只有当`--local`参数为False才会产生影响,其中`True`表示同步训练模式,`False`表示异步训练模式。默认为同步训练模式。 + +2 环境变量配置 + + 在分布式训练模式下,会手动配置训练的trainer数量和pserver数量。在网络拓扑上,每一个trainer都会和每一个pserver相连,pserver作为服务端,而trainer作为客户端。下面分pserver和trainer说明具体的参数配置: + +1) pserver配置 + +- `PADDLE_IS_LOCAL=[0|1]` 是否是分布式训练,`0`标识是分布式,`1`标识是单机 +- `TRAINING_ROLE=PSERVER` 标识当前节点是pserver + +- `POD_IP=ip` 设置当前pserver使用对外服务的地址 + +- `PADDLE_PORT=port` 设置当前pserver对外服务监听端口号,和`POD_IP`共同构成对外的唯一标识 + +- `PADDLE_TRAINERS_NUM=num` 设置pserver连接的trainer的数量 + +下面是配置的示例, 使用两个pserver, 192.168.2.2上的配置如下: +``` +export PADDLE_PSERVERS=192.168.2.2,192.168.2.3 +export POD_IP=192.168.2.2 +export PADDLE_TRAINERS_NUM=2 +export TRAINING_ROLE=PSERVER +export PADDLE_IS_LOCAL=0 +export PADDLE_PORT=6177 +``` +192.168.2.3上的配置如下: +``` +export PADDLE_PSERVERS=192.168.2.2,192.168.2.3 +export POD_IP=192.168.2.3 +export PADDLE_TRAINERS_NUM=2 +export TRAINING_ROLE=PSERVER +export PADDLE_IS_LOCAL=0 +export PADDLE_PORT=6177 +``` +2) trainer配置 + +- `PADDLE_IS_LOCAL=[0|1]` 是否是分布式训练,`0`标识是分布式,`1`标识是单机 + +- `TRAINING_ROLE=TRAINER` 标识当前节点是trainer + +- `PADDLE_PSERVERS=[ip1,ip2,……]` 设置pserver的ip地址,用于告知trainer互联的pserver的ip, 使用`,`分割 + +- `PADDLE_TRAINER_ID=num` 设置当前节点的编号, 编号的取值范围为0到N-1的整数 + +- `PADDLE_PORT=port` 设置请求的pserver服务端口号 + +下面是配置的示例, 使用两个trainer, trainer 1上的配置如下: +``` +export TRAINING_ROLE=TRAINER +export PADDLE_PSERVERS=192.168.2.2,192.168.2.3 +export PADDLE_TRAINERS_NUM=2 +export PADDLE_TRAINER_ID=0 +export PADDLE_IS_LOCAL=0 +export PADDLE_PORT=6177 +``` +trainer 2上的配置如下: +``` +export TRAINING_ROLE=TRAINER +export PADDLE_PSERVERS=192.168.2.2,192.168.2.3 +export PADDLE_TRAINERS_NUM=2 +export PADDLE_TRAINER_ID=1 +export PADDLE_IS_LOCAL=0 +export PADDLE_PORT=6177 +``` + +### 参考文献 1. Vaswani A, Shazeer N, Parmar N, et al. [Attention is all you need](http://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf)[C]//Advances in Neural Information Processing Systems. 2017: 6000-6010. 2. He K, Zhang X, Ren S, et al. [Deep residual learning for image recognition](http://openaccess.thecvf.com/content_cvpr_2016/papers/He_Deep_Residual_Learning_CVPR_2016_paper.pdf)[C]//Proceedings of the IEEE conference on computer vision and pattern recognition. 2016: 770-778. 3. Ba J L, Kiros J R, Hinton G E. [Layer normalization](https://arxiv.org/pdf/1607.06450.pdf)[J]. arXiv preprint arXiv:1607.06450, 2016. 4. Sennrich R, Haddow B, Birch A. [Neural machine translation of rare words with subword units](https://arxiv.org/pdf/1508.07909)[J]. 
arXiv preprint arXiv:1508.07909, 2015. +5. Wu Y, Schuster M, Chen Z, et al. [Google's neural machine translation system: Bridging the gap between human and machine translation](https://arxiv.org/pdf/1609.08144.pdf)[J]. arXiv preprint arXiv:1609.08144, 2016. diff --git a/fluid/neural_machine_translation/transformer/infer.py b/fluid/neural_machine_translation/transformer/infer.py index 505bf0b0062bda27a0299ed7d844e2f05abd95b8..8ac6837a3bddf0d280e10aee92964c1a501c0626 100644 --- a/fluid/neural_machine_translation/transformer/infer.py +++ b/fluid/neural_machine_translation/transformer/infer.py @@ -1,5 +1,7 @@ import argparse +import ast import numpy as np +from functools import partial import paddle import paddle.fluid as fluid @@ -11,6 +13,7 @@ from model import fast_decode as fast_decoder from config import * from train import pad_batch_data import reader +import util def parse_args(): @@ -46,6 +49,22 @@ def parse_args(): default=["", "", ""], nargs=3, help="The , and tokens in the dictionary.") + parser.add_argument( + "--use_wordpiece", + type=ast.literal_eval, + default=False, + help="The flag indicating whether the data is in wordpiece format. " + "The EN-FR data we provided is wordpiece data. For wordpiece data, " + "converting ids to original words is a little different and some " + "helper code is provided in util.py to do this.") + parser.add_argument( + "--token_delimiter", + type=partial( + str.decode, encoding="string-escape"), + default=" ", + help="The delimiter used to split tokens in source or target sentences. " + "For the EN-DE BPE data we provided, use spaces as the token delimiter; " + "For the EN-FR wordpiece data we provided, use '\x01' as the token delimiter.") parser.add_argument( 'opts', help='See config.py for all options', @@ -320,7 +339,7 @@ def post_process_seq(seq, seq) -def py_infer(test_data, trg_idx2word): +def py_infer(test_data, trg_idx2word, use_wordpiece): """ Inference by beam search implemented in python, while the calculations from symbols to probabilities are executed by Fluid operators. @@ -399,7 +418,10 @@ def py_infer(test_data, trg_idx2word): seqs = map(post_process_seq, batch_seqs[i]) scores = batch_scores[i] for seq in seqs: - print(" ".join([trg_idx2word[idx] for idx in seq])) + if use_wordpiece: + print(util.subtoken_ids_to_str(seq, trg_idx2word)) + else: + print(" ".join([trg_idx2word[idx] for idx in seq])) def prepare_batch_input(insts, data_input_names, util_input_names, src_pad_idx, @@ -465,7 +487,7 @@ def prepare_batch_input(insts, data_input_names, util_input_names, src_pad_idx, return input_dict -def fast_infer(test_data, trg_idx2word): +def fast_infer(test_data, trg_idx2word, use_wordpiece): """ Inference by beam search decoder based solely on Fluid operators. 
""" @@ -520,7 +542,9 @@ def fast_infer(test_data, trg_idx2word): trg_idx2word[idx] for idx in post_process_seq( np.array(seq_ids)[sub_start:sub_end]) - ])) + ]) if not use_wordpiece else util.subtoken_ids_to_str( + post_process_seq(np.array(seq_ids)[sub_start:sub_end]), + trg_idx2word)) scores[i].append(np.array(seq_scores)[sub_end - 1]) print hyps[i][-1] if len(hyps[i]) >= InferTaskConfig.n_best: @@ -534,8 +558,9 @@ def infer(args, inferencer=fast_infer): src_vocab_fpath=args.src_vocab_fpath, trg_vocab_fpath=args.trg_vocab_fpath, fpattern=args.test_file_pattern, - batch_size=args.batch_size, + token_delimiter=args.token_delimiter, use_token_batch=False, + batch_size=args.batch_size, pool_size=args.pool_size, sort_type=reader.SortType.NONE, shuffle=False, @@ -548,7 +573,7 @@ def infer(args, inferencer=fast_infer): clip_last_batch=False) trg_idx2word = test_data.load_dict( dict_path=args.trg_vocab_fpath, reverse=True) - inferencer(test_data, trg_idx2word) + inferencer(test_data, trg_idx2word, args.use_wordpiece) if __name__ == "__main__": diff --git a/fluid/neural_machine_translation/transformer/profile.py b/fluid/neural_machine_translation/transformer/profile.py new file mode 100644 index 0000000000000000000000000000000000000000..caf3125b8ae3ed666eb42b4bbcde73b2f0c42ca3 --- /dev/null +++ b/fluid/neural_machine_translation/transformer/profile.py @@ -0,0 +1,244 @@ +import os +import time +import argparse +import ast +import numpy as np +import multiprocessing + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler + +from train import split_data, read_multiple, prepare_batch_input +from model import transformer, position_encoding_init +from optim import LearningRateScheduler +from config import * +import reader + + +def parse_args(): + parser = argparse.ArgumentParser( + "Profile the training process for Transformer.") + parser.add_argument( + "--src_vocab_fpath", + type=str, + required=True, + help="The path of vocabulary file of source language.") + parser.add_argument( + "--trg_vocab_fpath", + type=str, + required=True, + help="The path of vocabulary file of target language.") + parser.add_argument( + "--train_file_pattern", + type=str, + required=True, + help="The pattern to match training data files.") + parser.add_argument( + "--use_token_batch", + type=ast.literal_eval, + default=True, + help="The flag indicating whether to " + "produce batch data according to token number.") + parser.add_argument( + "--batch_size", + type=int, + default=2048, + help="The number of sequences contained in a mini-batch, or the maximum " + "number of tokens (include paddings) contained in a mini-batch. 
Note " + "that this represents the number on single device and the actual batch " + "size for multi-devices will multiply the device number.") + parser.add_argument( + "--num_iters", + type=int, + default=10, + help="The maximum number of iterations profiling over.") + parser.add_argument( + "--pool_size", + type=int, + default=10000, + help="The buffer size to pool data.") + parser.add_argument( + "--special_token", + type=str, + default=["", "", ""], + nargs=3, + help="The , and tokens in the dictionary.") + parser.add_argument( + 'opts', + help='See config.py for all options', + default=None, + nargs=argparse.REMAINDER) + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help="The device type.") + + args = parser.parse_args() + # Append args related to dict + src_dict = reader.DataReader.load_dict(args.src_vocab_fpath) + trg_dict = reader.DataReader.load_dict(args.trg_vocab_fpath) + dict_args = [ + "src_vocab_size", str(len(src_dict)), "trg_vocab_size", + str(len(trg_dict)), "bos_idx", str(src_dict[args.special_token[0]]), + "eos_idx", str(src_dict[args.special_token[1]]), "unk_idx", + str(src_dict[args.special_token[2]]) + ] + merge_cfg_from_list(args.opts + dict_args, + [TrainTaskConfig, ModelHyperParams]) + return args + + +def train_loop(exe, train_progm, init, num_iters, train_data, dev_count, + sum_cost, avg_cost, lr_scheduler, token_num, predict): + + data_input_names = encoder_data_input_fields + decoder_data_input_fields[: + -1] + label_data_input_fields + util_input_names = encoder_util_input_fields + decoder_util_input_fields + + start_time = time.time() + exec_time = 0.0 + for batch_id, data in enumerate(train_data()): + if batch_id >= num_iters: + break + feed_list = [] + total_num_token = 0 + for place_id, data_buffer in enumerate( + split_data( + data, num_part=dev_count)): + data_input_dict, util_input_dict, num_token = prepare_batch_input( + data_buffer, data_input_names, util_input_names, + ModelHyperParams.eos_idx, ModelHyperParams.eos_idx, + ModelHyperParams.n_head, ModelHyperParams.d_model) + total_num_token += num_token + feed_kv_pairs = data_input_dict.items() + util_input_dict.items() + lr_rate = lr_scheduler.update_learning_rate() + feed_kv_pairs += {lr_scheduler.learning_rate.name: lr_rate}.items() + feed_list.append(dict(feed_kv_pairs)) + + if not init: + for pos_enc_param_name in pos_enc_param_names: + pos_enc = position_encoding_init( + ModelHyperParams.max_length + 1, + ModelHyperParams.d_model) + feed_list[place_id][pos_enc_param_name] = pos_enc + for feed_dict in feed_list: + feed_dict[sum_cost.name + "@GRAD"] = 1. 
/ total_num_token + + exe_start_time = time.time() + if dev_count > 1: + # parallel executor + outs = exe.run(fetch_list=[sum_cost.name, token_num.name], + feed=feed_list) + else: + # executor + outs = exe.run(fetch_list=[sum_cost, token_num], feed=feed_list[0]) + exec_time += time.time() - exe_start_time + + sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1]) + total_sum_cost = sum_cost_val.sum() # sum the cost over multiple devices + total_token_num = token_num_val.sum() + total_avg_cost = total_sum_cost / total_token_num + print("batch: %d, sum loss: %f, avg loss: %f, ppl: %f" % + (batch_id, total_sum_cost, total_avg_cost, + np.exp([min(total_avg_cost, 100)]))) + init = True + return time.time() - start_time, exec_time + + +def profile(args): + print args + + if args.device == 'CPU': + TrainTaskConfig.use_gpu = False + + if not TrainTaskConfig.use_gpu: + place = fluid.CPUPlace() + dev_count = multiprocessing.cpu_count() + else: + place = fluid.CUDAPlace(0) + dev_count = fluid.core.get_cuda_device_count() + + exe = fluid.Executor(place) + + sum_cost, avg_cost, predict, token_num = transformer( + ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, + ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, + ModelHyperParams.n_head, ModelHyperParams.d_key, + ModelHyperParams.d_value, ModelHyperParams.d_model, + ModelHyperParams.d_inner_hid, ModelHyperParams.dropout, + ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps) + lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model, + TrainTaskConfig.warmup_steps, + TrainTaskConfig.learning_rate) + + optimizer = fluid.optimizer.Adam( + learning_rate=lr_scheduler.learning_rate, + beta1=TrainTaskConfig.beta1, + beta2=TrainTaskConfig.beta2, + epsilon=TrainTaskConfig.eps) + optimizer.minimize(sum_cost) + + # Initialize the parameters. + if TrainTaskConfig.ckpt_path: + fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path) + lr_scheduler.current_steps = TrainTaskConfig.start_step + else: + exe.run(fluid.framework.default_startup_program()) + + # Disable all sorting and shuffling, for they would be done in the 1st batch. 
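+    # With shuffle off and sort_type='none' below, repeated profiling runs
+    # consume an identical batch stream, keeping timings comparable.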
+ train_data = reader.DataReader( + src_vocab_fpath=args.src_vocab_fpath, + trg_vocab_fpath=args.trg_vocab_fpath, + fpattern=args.train_file_pattern, + use_token_batch=args.use_token_batch, + batch_size=args.batch_size * (1 if args.use_token_batch else dev_count), + pool_size=args.pool_size, + sort_type='none', + shuffle=False, + shuffle_batch=False, + start_mark=args.special_token[0], + end_mark=args.special_token[1], + unk_mark=args.special_token[2], + # count start and end tokens out + max_length=ModelHyperParams.max_length - 2, + clip_last_batch=False) + train_data = read_multiple( + reader=train_data.batch_generator, + count=dev_count if args.use_token_batch else 1) + + if dev_count > 1: + build_strategy = fluid.BuildStrategy() + build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized + train_exe = fluid.ParallelExecutor( + use_cuda=TrainTaskConfig.use_gpu, + loss_name=sum_cost.name, + main_program=fluid.default_main_program(), + build_strategy=build_strategy) + + print("Warming up ...") + train_loop(exe if dev_count == 1 else train_exe, + fluid.default_main_program(), False, 3, train_data, dev_count, + sum_cost, avg_cost, lr_scheduler, token_num, predict) + + print("\nProfiling ...") + if dev_count == 1: + with profiler.profiler('All', 'total', '/tmp/profile_file'): + total_time, exec_time = train_loop( + exe, + fluid.default_main_program(), True, args.num_iters, train_data, + dev_count, sum_cost, avg_cost, lr_scheduler, token_num, predict) + else: + total_time, exec_time = train_loop( + train_exe, + fluid.default_main_program(), True, args.num_iters, train_data, + dev_count, sum_cost, avg_cost, lr_scheduler, token_num, predict) + print("Elapsed time: total %f s, in executor %f s" % + (total_time, exec_time)) + + +if __name__ == "__main__": + args = parse_args() + profile(args) diff --git a/fluid/neural_machine_translation/transformer/reader.py b/fluid/neural_machine_translation/transformer/reader.py index 27bd82b13a0480e80bdfcdc72eaa670854f4cd3a..a67d0e6d8aa48ef54709b250ce1763c2b0bb524c 100644 --- a/fluid/neural_machine_translation/transformer/reader.py +++ b/fluid/neural_machine_translation/transformer/reader.py @@ -116,9 +116,12 @@ class DataReader(object): :param use_token_batch: Whether to produce batch data according to token number. :type use_token_batch: bool - :param delimiter: The delimiter used to split source and target in each - line of data file. - :type delimiter: basestring + :param field_delimiter: The delimiter used to split source and target in + each line of data file. + :type field_delimiter: basestring + :param token_delimiter: The delimiter used to split tokens in source or + target sentences. + :type token_delimiter: basestring :param start_mark: The token representing for the beginning of sentences in dictionary. 
:type start_mark: basestring @@ -145,7 +148,8 @@ class DataReader(object): shuffle=True, shuffle_batch=False, use_token_batch=False, - delimiter="\t", + field_delimiter="\t", + token_delimiter=" ", start_mark="", end_mark="", unk_mark="", @@ -164,7 +168,8 @@ class DataReader(object): self._shuffle_batch = shuffle_batch self._min_length = min_length self._max_length = max_length - self._delimiter = delimiter + self._field_delimiter = field_delimiter + self._token_delimiter = token_delimiter self._epoch_batches = [] src_seq_words, trg_seq_words = self._load_data(fpattern, tar_fname) @@ -196,7 +201,7 @@ class DataReader(object): trg_seq_words = [] for line in f_obj: - fields = line.strip().split(self._delimiter) + fields = line.strip().split(self._field_delimiter) if (not self._only_src and len(fields) != 2) or (self._only_src and len(fields) != 1): @@ -207,7 +212,7 @@ class DataReader(object): max_len = -1 for i, seq in enumerate(fields): - seq_words = seq.split() + seq_words = seq.split(self._token_delimiter) max_len = max(max_len, len(seq_words)) if len(seq_words) == 0 or \ len(seq_words) < self._min_length or \ @@ -258,9 +263,9 @@ class DataReader(object): with open(dict_path, "r") as fdict: for idx, line in enumerate(fdict): if reverse: - word_dict[idx] = line.strip() + word_dict[idx] = line.strip('\n') else: - word_dict[line.strip()] = idx + word_dict[line.strip('\n')] = idx return word_dict def _sample_generator(self): diff --git a/fluid/neural_machine_translation/transformer/train.py b/fluid/neural_machine_translation/transformer/train.py index cdd7dfed8235a42da867e08e16e0aef4ba500fa1..5175c48e62aa6cc480e766478a5be154791c362e 100644 --- a/fluid/neural_machine_translation/transformer/train.py +++ b/fluid/neural_machine_translation/transformer/train.py @@ -3,6 +3,8 @@ import time import argparse import ast import numpy as np +import multiprocessing +from functools import partial import paddle import paddle.fluid as fluid @@ -75,11 +77,33 @@ def parse_args(): default=["", "", ""], nargs=3, help="The , and tokens in the dictionary.") + parser.add_argument( + "--token_delimiter", + type=partial( + str.decode, encoding="string-escape"), + default=" ", + help="The delimiter used to split tokens in source or target sentences. " + "For EN-DE BPE data we provided, use spaces as token delimiter. " + "For EN-FR wordpiece data we provided, use '\x01' as token delimiter.") parser.add_argument( 'opts', help='See config.py for all options', default=None, nargs=argparse.REMAINDER) + parser.add_argument( + '--local', + type=ast.literal_eval, + default=True, + help='Whether to run as local mode.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help="The device type.") + parser.add_argument( + '--sync', type=ast.literal_eval, default=True, help="sync mode.") + args = parser.parse_args() # Append args related to dict src_dict = reader.DataReader.load_dict(args.src_vocab_fpath) @@ -247,40 +271,81 @@ def split_data(data, num_part): ] -def train(args): - dev_count = fluid.core.get_cuda_device_count() +def test_context(train_progm, avg_cost, train_exe, dev_count, data_input_names, + util_input_names, sum_cost, token_num): + # Context to do validation. 
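+    # The program is cloned from the training program, and the test executor
+    # below shares variables with train_exe, so validation always evaluates
+    # the parameters currently being trained.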
+ test_program = train_progm.clone() + with fluid.program_guard(test_program): + test_program = fluid.io.get_inference_program([avg_cost]) - sum_cost, avg_cost, predict, token_num = transformer( - ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, - ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, - ModelHyperParams.n_head, ModelHyperParams.d_key, - ModelHyperParams.d_value, ModelHyperParams.d_model, - ModelHyperParams.d_inner_hid, ModelHyperParams.dropout, - ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps) + val_data = reader.DataReader( + src_vocab_fpath=args.src_vocab_fpath, + trg_vocab_fpath=args.trg_vocab_fpath, + fpattern=args.val_file_pattern, + token_delimiter=args.token_delimiter, + use_token_batch=args.use_token_batch, + batch_size=args.batch_size * (1 if args.use_token_batch else dev_count), + pool_size=args.pool_size, + sort_type=args.sort_type, + start_mark=args.special_token[0], + end_mark=args.special_token[1], + unk_mark=args.special_token[2], + # count start and end tokens out + max_length=ModelHyperParams.max_length - 2, + clip_last_batch=False, + shuffle=False, + shuffle_batch=False) - lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model, - TrainTaskConfig.warmup_steps, - TrainTaskConfig.learning_rate) - optimizer = fluid.optimizer.Adam( - learning_rate=lr_scheduler.learning_rate, - beta1=TrainTaskConfig.beta1, - beta2=TrainTaskConfig.beta2, - epsilon=TrainTaskConfig.eps) - optimizer.minimize(sum_cost) - - place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace() - exe = fluid.Executor(place) + test_exe = fluid.ParallelExecutor( + use_cuda=TrainTaskConfig.use_gpu, + main_program=test_program, + share_vars_from=train_exe) + + def test(exe=test_exe): + test_total_cost = 0 + test_total_token = 0 + test_data = read_multiple( + reader=val_data.batch_generator, + count=dev_count if args.use_token_batch else 1) + for batch_id, data in enumerate(test_data()): + feed_list = [] + for place_id, data_buffer in enumerate( + split_data( + data, num_part=dev_count)): + data_input_dict, util_input_dict, _ = prepare_batch_input( + data_buffer, data_input_names, util_input_names, + ModelHyperParams.eos_idx, ModelHyperParams.eos_idx, + ModelHyperParams.n_head, ModelHyperParams.d_model) + feed_list.append( + dict(data_input_dict.items() + util_input_dict.items())) + + outs = exe.run(feed=feed_list, + fetch_list=[sum_cost.name, token_num.name]) + sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1]) + test_total_cost += sum_cost_val.sum() + test_total_token += token_num_val.sum() + test_avg_cost = test_total_cost / test_total_token + test_ppl = np.exp([min(test_avg_cost, 100)]) + return test_avg_cost, test_ppl + + return test + + +def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, + token_num, predict): # Initialize the parameters. 
if TrainTaskConfig.ckpt_path: fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path) lr_scheduler.current_steps = TrainTaskConfig.start_step else: + print "init fluid.framework.default_startup_program" exe.run(fluid.framework.default_startup_program()) train_data = reader.DataReader( src_vocab_fpath=args.src_vocab_fpath, trg_vocab_fpath=args.trg_vocab_fpath, fpattern=args.train_file_pattern, + token_delimiter=args.token_delimiter, use_token_batch=args.use_token_batch, batch_size=args.batch_size * (1 if args.use_token_batch else dev_count), pool_size=args.pool_size, @@ -305,77 +370,26 @@ def train(args): train_exe = fluid.ParallelExecutor( use_cuda=TrainTaskConfig.use_gpu, loss_name=sum_cost.name, + main_program=train_progm, build_strategy=build_strategy) - def test_context(): - # Context to do validation. - test_program = fluid.default_main_program().clone(for_test=True) - test_exe = fluid.ParallelExecutor( - use_cuda=TrainTaskConfig.use_gpu, - main_program=test_program, - share_vars_from=train_exe) - - val_data = reader.DataReader( - src_vocab_fpath=args.src_vocab_fpath, - trg_vocab_fpath=args.trg_vocab_fpath, - fpattern=args.val_file_pattern, - use_token_batch=args.use_token_batch, - batch_size=args.batch_size * - (1 if args.use_token_batch else dev_count), - pool_size=args.pool_size, - sort_type=args.sort_type, - start_mark=args.special_token[0], - end_mark=args.special_token[1], - unk_mark=args.special_token[2], - # count start and end tokens out - max_length=ModelHyperParams.max_length - 2, - clip_last_batch=False, - shuffle=False, - shuffle_batch=False) - - def test(exe=test_exe): - test_total_cost = 0 - test_total_token = 0 - test_data = read_multiple( - reader=val_data.batch_generator, - count=dev_count if args.use_token_batch else 1) - for batch_id, data in enumerate(test_data()): - feed_list = [] - for place_id, data_buffer in enumerate( - split_data( - data, num_part=dev_count)): - data_input_dict, util_input_dict, _ = prepare_batch_input( - data_buffer, data_input_names, util_input_names, - ModelHyperParams.eos_idx, ModelHyperParams.eos_idx, - ModelHyperParams.n_head, ModelHyperParams.d_model) - feed_list.append( - dict(data_input_dict.items() + util_input_dict.items())) - - outs = exe.run(feed=feed_list, - fetch_list=[sum_cost.name, token_num.name]) - sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[ - 1]) - test_total_cost += sum_cost_val.sum() - test_total_token += token_num_val.sum() - test_avg_cost = test_total_cost / test_total_token - test_ppl = np.exp([min(test_avg_cost, 100)]) - return test_avg_cost, test_ppl - - return test - - if args.val_file_pattern is not None: - test = test_context() - data_input_names = encoder_data_input_fields + decoder_data_input_fields[: -1] + label_data_input_fields util_input_names = encoder_util_input_fields + decoder_util_input_fields + + if args.val_file_pattern is not None: + test = test_context(train_progm, avg_cost, train_exe, dev_count, + data_input_names, util_input_names, sum_cost, + token_num) + init = False for pass_id in xrange(TrainTaskConfig.pass_num): pass_start_time = time.time() for batch_id, data in enumerate(train_data()): feed_list = [] total_num_token = 0 - lr_rate = lr_scheduler.update_learning_rate() + if args.local: + lr_rate = lr_scheduler.update_learning_rate() for place_id, data_buffer in enumerate( split_data( data, num_part=dev_count)): @@ -384,11 +398,15 @@ def train(args): ModelHyperParams.eos_idx, ModelHyperParams.eos_idx, ModelHyperParams.n_head, ModelHyperParams.d_model) total_num_token 
+= num_token - feed_list.append( - dict(data_input_dict.items() + util_input_dict.items() + - {lr_scheduler.learning_rate.name: lr_rate}.items())) - - if not init: # init the position encoding table + feed_kv_pairs = data_input_dict.items() + util_input_dict.items( + ) + if args.local: + feed_kv_pairs += { + lr_scheduler.learning_rate.name: lr_rate + }.items() + feed_list.append(dict(feed_kv_pairs)) + + if not init: for pos_enc_param_name in pos_enc_param_names: pos_enc = position_encoding_init( ModelHyperParams.max_length + 1, @@ -406,12 +424,16 @@ def train(args): print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" % (pass_id, batch_id, total_sum_cost, total_avg_cost, np.exp([min(total_avg_cost, 100)]))) + if batch_id > 0 and batch_id % 1000 == 0: + fluid.io.save_persistables( + exe, + os.path.join(TrainTaskConfig.ckpt_dir, "latest.checkpoint")) init = True # Validate and save the model for inference. - print("epoch: %d, " % pass_id + ( - "val avg loss: %f, val ppl: %f, " % test() - if args.val_file_pattern is not None else "") + "consumed %fs" % ( - time.time() - pass_start_time)) + print("epoch: %d, " % pass_id + + ("val avg loss: %f, val ppl: %f, " % test() + if args.val_file_pattern is not None else "") + "consumed %fs" % + (time.time() - pass_start_time)) fluid.io.save_persistables( exe, os.path.join(TrainTaskConfig.ckpt_dir, @@ -422,6 +444,107 @@ def train(args): data_input_names[:-2] + util_input_names, [predict], exe) +def train(args): + # priority: ENV > args > config + is_local = os.getenv("PADDLE_IS_LOCAL", "1") + if is_local == '0': + args.local = False + print args + + if args.device == 'CPU': + TrainTaskConfig.use_gpu = False + + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + + if training_role == "PSERVER" or (not TrainTaskConfig.use_gpu): + place = fluid.CPUPlace() + dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + else: + place = fluid.CUDAPlace(0) + dev_count = fluid.core.get_cuda_device_count() + + exe = fluid.Executor(place) + + sum_cost, avg_cost, predict, token_num = transformer( + ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, + ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, + ModelHyperParams.n_head, ModelHyperParams.d_key, + ModelHyperParams.d_value, ModelHyperParams.d_model, + ModelHyperParams.d_inner_hid, ModelHyperParams.dropout, + ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps) + lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model, + TrainTaskConfig.warmup_steps, + TrainTaskConfig.learning_rate) + + if args.local: + optimizer = fluid.optimizer.Adam( + learning_rate=lr_scheduler.learning_rate, + beta1=TrainTaskConfig.beta1, + beta2=TrainTaskConfig.beta2, + epsilon=TrainTaskConfig.eps) + optimizer.minimize(sum_cost) + elif args.sync == False: + optimizer = fluid.optimizer.SGD(0.003) + optimizer.minimize(sum_cost) + else: + lr_decay = fluid.layers\ + .learning_rate_scheduler\ + .noam_decay(ModelHyperParams.d_model, + TrainTaskConfig.warmup_steps) + + optimizer = fluid.optimizer.Adam( + learning_rate=lr_decay, + beta1=TrainTaskConfig.beta1, + beta2=TrainTaskConfig.beta2, + epsilon=TrainTaskConfig.eps) + optimizer.minimize(sum_cost) + + if args.local: + print("local start_up:") + train_loop(exe, + fluid.default_main_program(), dev_count, sum_cost, avg_cost, + lr_scheduler, token_num, predict) + else: + port = os.getenv("PADDLE_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVERS") # ip,ip... 
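+        # e.g. PADDLE_PSERVERS="192.168.2.2,192.168.2.3" with PADDLE_PORT=6177
+        # becomes "192.168.2.2:6177,192.168.2.3:6177" after the join below.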
+ eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... + trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + t = fluid.DistributeTranspiler() + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + + if training_role == "PSERVER": + current_endpoint = os.getenv("POD_IP") + ":" + os.getenv( + "PADDLE_PORT") + if not current_endpoint: + print("need env POD_IP and PADDLE_PORT") + exit(1) + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + + print "pserver begin run" + with open('pserver_startup.desc', 'w') as f: + f.write(str(pserver_startup)) + with open('pserver_prog.desc', 'w') as f: + f.write(str(pserver_prog)) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + + trainer_prog = t.get_trainer_program() + with open('trainer_prog.desc', 'w') as f: + f.write(str(trainer_prog)) + train_loop(exe, trainer_prog, dev_count, sum_cost, avg_cost, + lr_scheduler, token_num, predict) + else: + print("environment variable TRAINING_ROLE should be TRAINER or PSERVER") + + if __name__ == "__main__": args = parse_args() train(args) diff --git a/fluid/neural_machine_translation/transformer/util.py b/fluid/neural_machine_translation/transformer/util.py new file mode 100644 index 0000000000000000000000000000000000000000..190abf92f4f48bfc943bd99bf61a222cc6c9d2f0 --- /dev/null +++ b/fluid/neural_machine_translation/transformer/util.py @@ -0,0 +1,68 @@ +import sys +import re +import six +import unicodedata + +# Regular expression for unescaping token strings. +# '\u' is converted to '_' +# '\\' is converted to '\' +# '\213;' is converted to unichr(213) +# Inverse of escaping. +_UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);") + +# This set contains all letter and number characters. +_ALPHANUMERIC_CHAR_SET = set( + six.unichr(i) for i in range(sys.maxunicode) + if (unicodedata.category(six.unichr(i)).startswith("L") or + unicodedata.category(six.unichr(i)).startswith("N"))) + + +def unescape_token(escaped_token): + """ + Inverse of encoding escaping. + """ + + def match(m): + if m.group(1) is None: + return u"_" if m.group(0) == u"\\u" else u"\\" + + try: + return six.unichr(int(m.group(1))) + except (ValueError, OverflowError) as _: + return u"\u3013" # Unicode for undefined character. + + trimmed = escaped_token[:-1] if escaped_token.endswith( + "_") else escaped_token + return _UNESCAPE_REGEX.sub(match, trimmed) + + +def subtoken_ids_to_str(subtoken_ids, vocabs): + """ + Convert a list of subtoken (word piece) ids to a native string. + Refer to SubwordTextEncoder in Tensor2Tensor. + """ + subtokens = [vocabs.get(subtoken_id, u"") for subtoken_id in subtoken_ids] + + # Convert a list of subtokens to a list of tokens. + concatenated = "".join([ + t if isinstance(t, unicode) else t.decode("utf-8") for t in subtokens + ]) + split = concatenated.split("_") + tokens = [] + for t in split: + if t: + unescaped = unescape_token(t + "_") + if unescaped: + tokens.append(unescaped) + + # Convert a list of tokens to a unicode string (by inserting spaces between + # word tokens).
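+ # Worked example (hypothetical vocab): subtoken ids mapping to [u"hello_", u"world_"] yield + # tokens [u"hello", u"world"]; the loop below joins them into u"hello world", inserting a + # space only between two adjacent alphanumeric tokens.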
+ token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens] + ret = [] + for i, token in enumerate(tokens): + if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]: + ret.append(u" ") + ret.append(token) + seq = "".join(ret) + + return seq.encode("utf-8") diff --git a/fluid/neural_machine_translation_rnn_search b/fluid/neural_machine_translation_rnn_search new file mode 120000 index 0000000000000000000000000000000000000000..29002f1776a3f4e0bfa0b32a1aebc44d66b65628 --- /dev/null +++ b/fluid/neural_machine_translation_rnn_search @@ -0,0 +1 @@ +./neural_machine_translation/rnn_search \ No newline at end of file diff --git a/fluid/object_detection/.run_ce.sh b/fluid/object_detection/.run_ce.sh new file mode 100755 index 0000000000000000000000000000000000000000..50809e77043e0eb0bb5f6bf5a9904d8113c85756 --- /dev/null +++ b/fluid/object_detection/.run_ce.sh @@ -0,0 +1,19 @@ +#!/bin/bash +#### This file is only used for continuous evaluation. + +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + +if [ ! -d "/root/.cache/paddle/dataset/pascalvoc" ];then + mkdir -p /root/.cache/paddle/dataset/pascalvoc + ./data/pascalvoc/download.sh + cp -r ./data/pascalvoc/. /root/.cache/paddle/dataset/pascalvoc +fi + +cudaid=${object_detection_cudaid:=0} +export CUDA_VISIBLE_DEVICES=$cudaid +FLAGS_benchmark=true python train.py --enable_ce=True --batch_size=64 --num_passes=2 --data_dir=/root/.cache/paddle/dataset/pascalvoc/ | python _ce.py + +cudaid=${object_detection_cudaid:=0,1,2,3} +export CUDA_VISIBLE_DEVICES=$cudaid +FLAGS_benchmark=true python train.py --enable_ce=True --batch_size=64 --num_passes=2 --data_dir=/root/.cache/paddle/dataset/pascalvoc/ | python _ce.py diff --git a/fluid/object_detection/_ce.py b/fluid/object_detection/_ce.py new file mode 100644 index 0000000000000000000000000000000000000000..4f17ff324d8c4bb1d0cecca2401e584a7ec5e3af --- /dev/null +++ b/fluid/object_detection/_ce.py @@ -0,0 +1,72 @@ +#### This file is only used for continuous evaluation test! + +import os +import sys +sys.path.append(os.environ['ceroot']) +from kpi import CostKpi, DurationKpi, AccKpi + +#### NOTE: kpi.py should be shared across models in some way!!!! + +train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True) +test_acc_kpi = AccKpi('test_acc', 0.01, 0, actived=True) +train_speed_kpi = AccKpi('train_speed', 0.2, 0, actived=True) +train_cost_card4_kpi = CostKpi('train_cost_card4', 0.02, 0, actived=True) +test_acc_card4_kpi = AccKpi('test_acc_card4', 0.01, 0, actived=True) +train_speed_card4_kpi = AccKpi('train_speed_card4', 0.2, 0, actived=True) + +tracking_kpis = [ + train_cost_kpi, + test_acc_kpi, + train_speed_kpi, + train_cost_card4_kpi, + test_acc_card4_kpi, + train_speed_card4_kpi, +] + + +def parse_log(log): + ''' + This method should be implemented by model developers.
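+ + Note: the parser below only accepts tab-separated lines of the form + 'kpis<TAB>kpi_name<TAB>kpi_value', so each of the suggested lines below + needs a leading 'kpis' field.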
+ + The suggestion: + + each line in the log should be key, value, for example: + + " + train_cost\t1.0 + test_cost\t1.0 + train_cost\t1.0 + train_cost\t1.0 + train_acc\t1.2 + " + ''' + #kpi_map = {} + for line in log.split('\n'): + fs = line.strip().split('\t') + print(fs) + if len(fs) == 3 and fs[0] == 'kpis': + print("-----%s" % fs) + kpi_name = fs[1] + kpi_value = float(fs[2]) + #kpi_map[kpi_name] = kpi_value + yield kpi_name, kpi_value + #return kpi_map + + +def log_to_ce(log): + kpi_tracker = {} + for kpi in tracking_kpis: + kpi_tracker[kpi.name] = kpi + + for (kpi_name, kpi_value) in parse_log(log): + print(kpi_name, kpi_value) + kpi_tracker[kpi_name].add_record(kpi_value) + kpi_tracker[kpi_name].persist() + + +if __name__ == '__main__': + log = sys.stdin.read() + print("*****") + print(log) + print("****") + log_to_ce(log) diff --git a/fluid/object_detection/data/coco/download.sh b/fluid/object_detection/data/coco/download.sh index 50bc8a6894463549a2b18197704450621e969c9d..6f262ccebb635e993b35349890a793430d9ad597 100644 --- a/fluid/object_detection/data/coco/download.sh +++ b/fluid/object_detection/data/coco/download.sh @@ -11,10 +11,10 @@ wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip # Extract the data. echo "Extracting..." -unzip train2014.tar -unzip val2014.tar -unzip train2017.tar -unzip val2017.tar -unzip annotations_trainval2014.tar -unzip annotations_trainval2017.tar +unzip train2014.zip +unzip val2014.zip +unzip train2017.zip +unzip val2017.zip +unzip annotations_trainval2014.zip +unzip annotations_trainval2017.zip diff --git a/fluid/object_detection/mobilenet_ssd.py b/fluid/object_detection/mobilenet_ssd.py index c39883196056aede5d410554e14a0198e540d754..b87c0558447397e0a5b6a7a1e689a316d1ee8e14 100644 --- a/fluid/object_detection/mobilenet_ssd.py +++ b/fluid/object_detection/mobilenet_ssd.py @@ -1,4 +1,3 @@ -import paddle.v2 as paddle import paddle.fluid as fluid from paddle.fluid.initializer import MSRA from paddle.fluid.param_attr import ParamAttr diff --git a/fluid/object_detection/train.py b/fluid/object_detection/train.py index c29bd070eda4cf82f5ac36a3eb5699ae13ae86d2..aadcc904f55f077c06630a1f8e27a6bf4b422c05 100644 --- a/fluid/object_detection/train.py +++ b/fluid/object_detection/train.py @@ -23,7 +23,7 @@ add_arg('dataset', str, 'pascalvoc', "coco2014, coco2017, and pascalv add_arg('model_save_dir', str, 'model', "The path to save model.") add_arg('pretrained_model', str, 'pretrained/ssd_mobilenet_v1_coco/', "The init model path.") add_arg('apply_distort', bool, True, "Whether apply distort.") -add_arg('apply_expand', bool, True, "Whether appley expand.") +add_arg('apply_expand', bool, True, "Whether apply expand.") add_arg('nms_threshold', float, 0.45, "NMS threshold.") add_arg('ap_version', str, '11point', "integral, 11point.") add_arg('resize_h', int, 300, "The resized image height.") @@ -32,6 +32,8 @@ add_arg('mean_value_B', float, 127.5, "Mean value for B channel which will add_arg('mean_value_G', float, 127.5, "Mean value for G channel which will be subtracted.") #116.78 add_arg('mean_value_R', float, 127.5, "Mean value for R channel which will be subtracted.") #103.94 add_arg('is_toy', int, 0, "Toy for quick debug, 0 means using all data, while n means using only n sample.") +add_arg('data_dir', str, 'data/pascalvoc', "data directory") +add_arg('enable_ce', bool, False, "Whether use CE to evaluate the model") #yapf: enable @@ -44,6 +46,9 @@ def 
train(args, num_passes, model_save_dir, pretrained_model=None): + if args.enable_ce: + fluid.framework.default_startup_program().random_seed = 111 + image_shape = [3, data_args.resize_h, data_args.resize_w] if 'coco' in data_args.dataset: num_classes = 91 @@ -117,8 +122,12 @@ def train(args, train_exe = fluid.ParallelExecutor( use_cuda=args.use_gpu, loss_name=loss.name) - train_reader = paddle.batch( - reader.train(data_args, train_file_list), batch_size=batch_size) + if not args.enable_ce: + train_reader = paddle.batch( + reader.train(data_args, train_file_list), batch_size=batch_size) + else: + train_reader = paddle.batch( + reader.train(data_args, train_file_list, False), batch_size=batch_size) test_reader = paddle.batch( reader.test(data_args, val_file_list), batch_size=batch_size) feeder = fluid.DataFeeder( @@ -136,22 +145,29 @@ def train(args, def test(pass_id, best_map): _, accum_map = map_eval.get_map_var() map_eval.reset(exe) + every_pass_map=[] for batch_id, data in enumerate(test_reader()): test_map, = exe.run(test_program, feed=feeder.feed(data), fetch_list=[accum_map]) if batch_id % 20 == 0: + every_pass_map.append(test_map) print("Batch {0}, map {1}".format(batch_id, test_map)) + mean_map = np.mean(every_pass_map) if test_map[0] > best_map: best_map = test_map[0] save_model('best_model') print("Pass {0}, test map {1}".format(pass_id, test_map)) - return best_map + return best_map, mean_map + total_time = 0.0 for pass_id in range(num_passes): + epoch_idx = pass_id + 1 start_time = time.time() prev_start_time = start_time - end_time = 0 + every_pass_loss = [] + iter = 0 + pass_duration = 0.0 for batch_id, data in enumerate(train_reader()): prev_start_time = start_time start_time = time.time() @@ -165,26 +181,40 @@ def train(args, loss_v, = exe.run(fluid.default_main_program(), feed=feeder.feed(data), fetch_list=[loss]) - end_time = time.time() loss_v = np.mean(np.array(loss_v)) + every_pass_loss.append(loss_v) if batch_id % 20 == 0: print("Pass {0}, batch {1}, loss {2}, time {3}".format( pass_id, batch_id, loss_v, start_time - prev_start_time)) - best_map = test(pass_id, best_map) + + end_time = time.time() + best_map, mean_map = test(pass_id, best_map) + if args.enable_ce and pass_id == 1: + total_time += end_time - start_time + train_avg_loss = np.mean(every_pass_loss) + if devices_num == 1: + print ("kpis train_cost %s" % train_avg_loss) + print ("kpis test_acc %s" % mean_map) + print ("kpis train_speed %s" % (total_time / epoch_idx)) + else: + print ("kpis train_cost_card%s %s" % (devices_num, train_avg_loss)) + print ("kpis test_acc_card%s %s" % (devices_num, mean_map)) + print ("kpis train_speed_card%s %f" % (devices_num, total_time / epoch_idx)) + + if pass_id % 10 == 0 or pass_id == num_passes - 1: save_model(str(pass_id)) print("Best test map {0}".format(best_map)) - if __name__ == '__main__': args = parser.parse_args() print_arguments(args) - data_dir = 'data/pascalvoc' - train_file_list = 'trainval.txt' - val_file_list = 'test.txt' + data_dir = args.data_dir label_file = 'label_list' model_save_dir = args.model_save_dir + train_file_list = 'trainval.txt' + val_file_list = 'test.txt' if 'coco' in args.dataset: data_dir = 'data/coco' if '2014' in args.dataset: diff --git a/fluid/ocr_recognition/README.md b/fluid/ocr_recognition/README.md index 4475695fdd06596b44967bd2d5d44530cccbf2e5..50b72440818384a0d8e80ab214faaabddbd93f90 100644 --- a/fluid/ocr_recognition/README.md +++ b/fluid/ocr_recognition/README.md @@ -113,6 +113,10 @@ data/test_images/00003.jpg ``` env 
CUDA_VISIBLE_DEVICES=0 python ctc_train.py ``` +Train with the default data on CPU: +``` +env OMP_NUM_THREADS=<num_of_physical_cores> python ctc_train.py --use_gpu False --parallel=False +``` Train with the default data on multiple GPUs: diff --git a/fluid/ocr_recognition/crnn_ctc_model.py b/fluid/ocr_recognition/crnn_ctc_model.py index 79cf7b23954ce3331f46c50ee165dac720deae43..a5d4c70f868a6c973ff3e8b372a2eb387d1f191f 100644 --- a/fluid/ocr_recognition/crnn_ctc_model.py +++ b/fluid/ocr_recognition/crnn_ctc_model.py @@ -12,7 +12,8 @@ def conv_bn_pool(input, bias=None, param_0=None, is_test=False, - pooling=True): + pooling=True, + use_cudnn=False): tmp = input for i in xrange(group): tmp = fluid.layers.conv2d( @@ -22,7 +23,7 @@ def conv_bn_pool(input, padding=1, param_attr=param if param_0 is None else param_0, act=None, # LinearActivation - use_cudnn=True) + use_cudnn=use_cudnn) tmp = fluid.layers.batch_norm( input=tmp, act=act, @@ -35,13 +36,17 @@ def conv_bn_pool(input, pool_size=2, pool_type='max', pool_stride=2, - use_cudnn=True, + use_cudnn=use_cudnn, ceil_mode=True) return tmp -def ocr_convs(input, regularizer=None, gradient_clip=None, is_test=False): +def ocr_convs(input, + regularizer=None, + gradient_clip=None, + is_test=False, + use_cudnn=False): b = fluid.ParamAttr( regularizer=regularizer, gradient_clip=gradient_clip, @@ -56,12 +61,36 @@ def ocr_convs(input, regularizer=None, gradient_clip=None, is_test=False): initializer=fluid.initializer.Normal(0.0, 0.01)) tmp = input tmp = conv_bn_pool( - tmp, 2, [16, 16], param=w1, bias=b, param_0=w0, is_test=is_test) + tmp, + 2, [16, 16], + param=w1, + bias=b, + param_0=w0, + is_test=is_test, + use_cudnn=use_cudnn) - tmp = conv_bn_pool(tmp, 2, [32, 32], param=w1, bias=b, is_test=is_test) - tmp = conv_bn_pool(tmp, 2, [64, 64], param=w1, bias=b, is_test=is_test) tmp = conv_bn_pool( - tmp, 2, [128, 128], param=w1, bias=b, is_test=is_test, pooling=False) + tmp, + 2, [32, 32], + param=w1, + bias=b, + is_test=is_test, + use_cudnn=use_cudnn) + tmp = conv_bn_pool( + tmp, + 2, [64, 64], + param=w1, + bias=b, + is_test=is_test, + use_cudnn=use_cudnn) + tmp = conv_bn_pool( + tmp, + 2, [128, 128], + param=w1, + bias=b, + is_test=is_test, + pooling=False, + use_cudnn=use_cudnn) return tmp @@ -70,12 +99,14 @@ def encoder_net(images, rnn_hidden_size=200, regularizer=None, gradient_clip=None, - is_test=False): + is_test=False, + use_cudnn=False): conv_features = ocr_convs( images, regularizer=regularizer, gradient_clip=gradient_clip, - is_test=is_test) + is_test=is_test, + use_cudnn=use_cudnn) sliced_feature = fluid.layers.im2sequence( input=conv_features, stride=[1, 1], @@ -142,7 +173,11 @@ def ctc_train_net(images, label, args, num_classes): learning_rate_decay = None regularizer = fluid.regularizer.L2Decay(L2_RATE) - fc_out = encoder_net(images, num_classes, regularizer=regularizer) + fc_out = encoder_net( + images, + num_classes, + regularizer=regularizer, + use_cudnn=True if args.use_gpu else False) cost = fluid.layers.warpctc( input=fc_out, label=label, blank=num_classes, norm_by_times=True) sum_cost = fluid.layers.reduce_sum(cost) @@ -166,19 +201,18 @@ def ctc_train_net(images, label, args, num_classes): if args.average_window > 0: model_average = fluid.optimizer.ModelAverage( args.average_window, - params_grads, min_average_window=args.min_average_window, max_average_window=args.max_average_window) return sum_cost, error_evaluator, inference_program, model_average -def ctc_infer(images, num_classes): - fc_out = encoder_net(images, num_classes, is_test=True) +def ctc_infer(images, num_classes, use_cudnn): + fc_out = 
encoder_net(images, num_classes, is_test=True, use_cudnn=use_cudnn) return fluid.layers.ctc_greedy_decoder(input=fc_out, blank=num_classes) -def ctc_eval(images, label, num_classes): - fc_out = encoder_net(images, num_classes, is_test=True) +def ctc_eval(images, label, num_classes, use_cudnn): + fc_out = encoder_net(images, num_classes, is_test=True, use_cudnn=use_cudnn) decoded_out = fluid.layers.ctc_greedy_decoder( input=fc_out, blank=num_classes) diff --git a/fluid/ocr_recognition/ctc_reader.py b/fluid/ocr_recognition/ctc_reader.py index db05dbeae73b67b12aebacdc84a04d5b180d2132..9cbe310c9656d0501915281a5bc0f5236a42f6b1 100644 --- a/fluid/ocr_recognition/ctc_reader.py +++ b/fluid/ocr_recognition/ctc_reader.py @@ -25,7 +25,7 @@ class DataGenerator(object): def __init__(self): pass - def train_reader(self, img_root_dir, img_label_list, batchsize): + def train_reader(self, img_root_dir, img_label_list, batchsize, cycle): ''' Reader interface for training. @@ -35,6 +35,10 @@ class DataGenerator(object): :param img_label_list: The path of the file for training. :type img_label_list: str + :param cycle: If True, the reader cycles through the dataset when the + number of iterations exceeds dataset_size / batch_size. + :type cycle: bool + ''' img_label_lines = [] @@ -65,24 +69,29 @@ class DataGenerator(object): def reader(): sizes = len(img_label_lines) / batchsize - for i in range(sizes): - result = [] - sz = [0, 0] - for j in range(batchsize): - line = img_label_lines[i * batchsize + j] - # h, w, img_name, labels - items = line.split(' ') - - label = [int(c) for c in items[-1].split(',')] - img = Image.open(os.path.join(img_root_dir, items[ - 2])).convert('L') # convert to grayscale - if j == 0: - sz = img.size - img = img.resize((sz[0], sz[1])) - img = np.array(img) - 127.5 - img = img[np.newaxis, ...] - result.append([img, label]) - yield result + if sizes == 0: + raise ValueError('Batch size is bigger than the dataset size.') + while True: + for i in range(sizes): + result = [] + sz = [0, 0] + for j in range(batchsize): + line = img_label_lines[i * batchsize + j] + # h, w, img_name, labels + items = line.split(' ') + + label = [int(c) for c in items[-1].split(',')] + img = Image.open(os.path.join(img_root_dir, items[ + 2])).convert('L') # convert to grayscale + if j == 0: + sz = img.size + img = img.resize((sz[0], sz[1])) + img = np.array(img) - 127.5 + img = img[np.newaxis, ...] + result.append([img, label]) + yield result + if not cycle: + break return reader @@ -111,7 +120,7 @@ class DataGenerator(object): return reader - def infer_reader(self, img_root_dir=None, img_label_list=None): + def infer_reader(self, img_root_dir=None, img_label_list=None, cycle=False): '''A reader interface for inference. :param img_root_dir: The root path of the images for training. @@ -122,11 +131,15 @@ class DataGenerator(object): was None. If img_label_list was set to None, it will read image path from stdin. :type img_root_dir: str + + :param cycle: If True, the reader cycles through the dataset when the + number of iterations exceeds dataset_size / batch_size. + :type cycle: bool ''' def reader(): - if img_label_list is not None: - for line in open(img_label_list): + def yield_img_and_label(lines): + for line in lines: if img_root_dir is not None: # h, w, img_name, labels img_name = line.split(' ')[2] @@ -138,6 +151,16 @@ class DataGenerator(object): img = img[np.newaxis, ...] 
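# img now has a leading channel axis, giving shape (1, height, width) for a single-channel image.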
label = [int(c) for c in line.split(' ')[3].split(',')] yield img, label + + if img_label_list is not None: + lines = [] + with open(img_label_list) as f: + lines = f.readlines() + for img, label in yield_img_and_label(lines): + yield img, label + while cycle: + for img, label in yield_img_and_label(lines): + yield img, label else: while True: img_path = raw_input("Please input the path of image: ") @@ -161,14 +184,15 @@ def data_shape(): return DATA_SHAPE -def train(batch_size, train_images_dir=None, train_list_file=None): +def train(batch_size, train_images_dir=None, train_list_file=None, cycle=False): generator = DataGenerator() if train_images_dir is None: data_dir = download_data() train_images_dir = path.join(data_dir, TRAIN_DATA_DIR_NAME) if train_list_file is None: train_list_file = path.join(data_dir, TRAIN_LIST_FILE_NAME) - return generator.train_reader(train_images_dir, train_list_file, batch_size) + return generator.train_reader(train_images_dir, train_list_file, batch_size, + cycle) def test(batch_size=1, test_images_dir=None, test_list_file=None): @@ -182,10 +206,14 @@ def test(batch_size=1, test_images_dir=None, test_list_file=None): generator.test_reader(test_images_dir, test_list_file), batch_size) -def inference(infer_images_dir=None, infer_list_file=None): +def inference(batch_size=1, + infer_images_dir=None, + infer_list_file=None, + cycle=False): generator = DataGenerator() return paddle.batch( - generator.infer_reader(infer_images_dir, infer_list_file), 1) + generator.infer_reader(infer_images_dir, infer_list_file, cycle), + batch_size) def download_data(): diff --git a/fluid/ocr_recognition/ctc_train.py b/fluid/ocr_recognition/ctc_train.py index dc68cc2e2e9f6e98d2331ff926109d5df56d1df6..f9a5427240acb1dac6cc40ae58b263f08204169b 100644 --- a/fluid/ocr_recognition/ctc_train.py +++ b/fluid/ocr_recognition/ctc_train.py @@ -1,5 +1,6 @@ """Trainer for OCR CTC model.""" import paddle.fluid as fluid +import paddle.fluid.profiler as profiler from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data from crnn_ctc_model import ctc_train_net import ctc_reader @@ -14,7 +15,7 @@ parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable add_arg('batch_size', int, 32, "Minibatch size.") -add_arg('total_step', int, 720000, "Number of training iterations.") +add_arg('total_step', int, 720000, "The number of iterations. Zero or less means whole training set. More than 0 means the training set might be looped until # of iterations is reached.") add_arg('log_period', int, 1000, "Log period.") add_arg('save_model_period', int, 15000, "Save model period. '-1' means never saving the model.") add_arg('eval_period', int, 15000, "Evaluate period. '-1' means never evaluating the model.") @@ -25,6 +26,9 @@ add_arg('min_average_window',int, 10000, "Min average window.") add_arg('max_average_window',int, 12500, "Max average window. 
It is proposed to be set as the number of minibatches in a pass.") add_arg('average_window', float, 0.15, "Average window.") add_arg('parallel', bool, False, "Whether use parallel training.") +add_arg('profile', bool, False, "Whether to use profiling.") +add_arg('skip_batch_num', int, 0, "The number of first minibatches to skip as warm-up for better performance test.") +add_arg('skip_test', bool, False, "Whether to skip test phase.") # yapf: enable @@ -49,7 +53,8 @@ def train(args, data_reader=ctc_reader): train_reader = data_reader.train( args.batch_size, train_images_dir=train_images, - train_list_file=train_list) + train_list_file=train_list, + cycle=args.total_step > 0) test_reader = data_reader.test( test_images_dir=test_images, test_list_file=test_list) @@ -74,7 +79,7 @@ def train(args, data_reader=ctc_reader): error_evaluator.reset(exe) if args.parallel: train_exe = fluid.ParallelExecutor( - use_cuda=True, loss_name=sum_cost.name) + use_cuda=True if args.use_gpu else False, loss_name=sum_cost.name) fetch_vars = [sum_cost] + error_evaluator.metrics @@ -85,8 +90,8 @@ def train(args, data_reader=ctc_reader): feed=get_feeder_data(data, place)) results = [np.array(result).sum() for result in results] else: - results = exe.run(feed=get_feeder_data(data, place), - fetch_list=fetch_vars) + results = train_exe.run(feed=get_feeder_data(data, place), + fetch_list=fetch_vars) results = [result[0] for result in results] return results @@ -109,17 +114,29 @@ def train(args, data_reader=ctc_reader): print "Saved model to: %s/%s." % (args.save_model_dir, filename) iter_num = 0 - while True: + stop = False + while not stop: total_loss = 0.0 total_seq_error = 0.0 + batch_times = [] # train a pass for data in train_reader(): - iter_num += 1 - if iter_num > args.total_step: - return + if args.total_step > 0 and iter_num == args.total_step + args.skip_batch_num: + stop = True + break + if iter_num < args.skip_batch_num: + print("Warm-up iteration") + if iter_num == args.skip_batch_num: + profiler.reset_profiler() + start = time.time() results = train_one_batch(data) + batch_time = time.time() - start + fps = args.batch_size / batch_time + batch_times.append(batch_time) total_loss += results[0] total_seq_error += results[2] + + iter_num += 1 # training log if iter_num % args.log_period == 0: print "\nTime: %s; Iter[%d]; Avg Warp-CTC loss: %.3f; Avg seq err: %.3f" % ( @@ -131,7 +148,7 @@ def train(args, data_reader=ctc_reader): total_seq_error = 0.0 # evaluate - if iter_num % args.eval_period == 0: + if not args.skip_test and iter_num % args.eval_period == 0: if model_average: with model_average.apply(exe): test(iter_num) @@ -145,12 +162,35 @@ def train(args, data_reader=ctc_reader): save_model(args, exe, iter_num) else: save_model(args, exe, iter_num) + # Postprocess benchmark data + latencies = batch_times[args.skip_batch_num:] + latency_avg = np.average(latencies) + latency_pc99 = np.percentile(latencies, 99) + fpses = np.divide(args.batch_size, latencies) + fps_avg = np.average(fpses) + fps_pc99 = np.percentile(fpses, 1) + + # Benchmark output + print('\nTotal examples (incl. 
warm-up): %d' % + (iter_num * args.batch_size)) + print('average latency: %.5f s, 99pc latency: %.5f s' % (latency_avg, + latency_pc99)) + print('average fps: %.5f, fps for 99pc latency: %.5f' % (fps_avg, + fps_pc99)) def main(): args = parser.parse_args() print_arguments(args) - train(args, data_reader=ctc_reader) + if args.profile: + if args.use_gpu: + with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: + train(args, data_reader=ctc_reader) + else: + with profiler.profiler("CPU", sorted_key='total') as cpuprof: + train(args, data_reader=ctc_reader) + else: + train(args, data_reader=ctc_reader) if __name__ == "__main__": diff --git a/fluid/ocr_recognition/infer.py b/fluid/ocr_recognition/infer.py index 080e3f5f84efbb73e3c2381e809222fd2a90c416..154242c9e3ca8fea26f34b5cda0c2bac5a3d0ef1 100644 --- a/fluid/ocr_recognition/infer.py +++ b/fluid/ocr_recognition/infer.py @@ -1,5 +1,6 @@ import paddle.v2 as paddle import paddle.fluid as fluid +import paddle.fluid.profiler as profiler from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data from crnn_ctc_model import ctc_infer import numpy as np @@ -7,6 +8,7 @@ import ctc_reader import argparse import functools import os +import time parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) @@ -16,6 +18,10 @@ add_arg('input_images_dir', str, None, "The directory of images.") add_arg('input_images_list', str, None, "The list file of images.") add_arg('dict', str, None, "The dictionary. The result of inference will be index sequence if the dictionary was None.") add_arg('use_gpu', bool, True, "Whether use GPU to infer.") +add_arg('iterations', int, 0, "The number of iterations. Zero or less means whole test set. More than 0 means the test set might be looped until # of iterations is reached.") +add_arg('profile', bool, False, "Whether to use profiling.") +add_arg('skip_batch_num', int, 0, "The number of first minibatches to skip as warm-up for better performance test.") +add_arg('batch_size', int, 1, "The minibatch size.") # yapf: enable @@ -25,11 +31,14 @@ def inference(args, infer=ctc_infer, data_reader=ctc_reader): data_shape = data_reader.data_shape() # define network images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') - sequence = infer(images, num_classes) + sequence = infer( + images, num_classes, use_cudnn=True if args.use_gpu else False) # data reader infer_reader = data_reader.inference( + batch_size=args.batch_size, infer_images_dir=args.input_images_dir, - infer_list_file=args.input_images_list) + infer_list_file=args.input_images_list, + cycle=True if args.iterations > 0 else False) # prepare environment place = fluid.CPUPlace() if args.use_gpu: @@ -56,23 +65,67 @@ def inference(args, infer=ctc_infer, data_reader=ctc_reader): fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name) print "Init model from: %s." 
% args.model_path + batch_times = [] + iters = 0 for data in infer_reader(): + if args.iterations > 0 and iters == args.iterations + args.skip_batch_num: + break + if iters < args.skip_batch_num: + print("Warm-up iteration") + if iters == args.skip_batch_num: + profiler.reset_profiler() + + start = time.time() result = exe.run(fluid.default_main_program(), feed=get_feeder_data( data, place, need_label=False), fetch_list=[sequence], return_numpy=False) + batch_time = time.time() - start + fps = args.batch_size / batch_time + batch_times.append(batch_time) indexes = np.array(result[0]).flatten() if dict_map is not None: - print "result: %s" % ([dict_map[index] for index in indexes], ) + print "Iteration %d, latency: %.5f s, fps: %f, result: %s" % ( + iters, + batch_time, + fps, + [dict_map[index] for index in indexes], ) else: - print "result: %s" % (indexes, ) + print "Iteration %d, latency: %.5f s, fps: %f, result: %s" % ( + iters, + batch_time, + fps, + indexes, ) + + iters += 1 + + latencies = batch_times[args.skip_batch_num:] + latency_avg = np.average(latencies) + latency_pc99 = np.percentile(latencies, 99) + fpses = np.divide(args.batch_size, latencies) + fps_avg = np.average(fpses) + fps_pc99 = np.percentile(fpses, 1) + + # Benchmark output + print('\nTotal examples (incl. warm-up): %d' % (iters * args.batch_size)) + print('average latency: %.5f s, 99pc latency: %.5f s' % (latency_avg, + latency_pc99)) + print('average fps: %.5f, fps for 99pc latency: %.5f' % (fps_avg, fps_pc99)) def main(): args = parser.parse_args() print_arguments(args) - inference(args, data_reader=ctc_reader) + if args.profile: + if args.use_gpu: + with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: + inference(args, data_reader=ctc_reader) + else: + with profiler.profiler("CPU", sorted_key='total') as cpuprof: + inference(args, data_reader=ctc_reader) + else: + inference(args, data_reader=ctc_reader) if __name__ == "__main__": diff --git a/fluid/ocr_recognition/scripts/README.md b/fluid/ocr_recognition/scripts/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6320834e6c29fa24496527b265e2aa0d52475ad6 --- /dev/null +++ b/fluid/ocr_recognition/scripts/README.md @@ -0,0 +1,46 @@ +## Introduction +The scripts in this folder are example commands for starting training and +inference of the model; they can be customised further. + +## Running with MKL-DNN +To run training or inference with the MKL-DNN library, set the +`FLAGS_use_mkldnn=1` environment variable. + +## Prerequisites +There are no special requirements for running the training and inference. 
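+ +For example (the flag values below are illustrative, not tuned), MKL-DNN can also be enabled without the helper scripts by setting the flag inline: +```sh +FLAGS_use_mkldnn=1 python ../ctc_train.py --use_gpu False --parallel=False +```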
+ +## Training +To run training on *CPU*, please execute: + +```sh +source train.sh CPU +``` + +To run training on *CPU* with MKL-DNN, please execute: + +```sh +source train.sh MKLDNN +``` + +To run training on *GPU*, please execute: + +```sh +source train.sh GPU +``` + +## Inference +To perform inference on the trained model using *CPU*, please run: +```sh +source infer.sh CPU +``` + +To perform inference on the trained model using *CPU* with MKL-DNN, please run: +```sh +source infer.sh MKLDNN +``` + +To perform inference on the trained model using *GPU*, please run: + +```sh +source infer.sh GPU +``` diff --git a/fluid/ocr_recognition/scripts/infer.sh b/fluid/ocr_recognition/scripts/infer.sh new file mode 100644 index 0000000000000000000000000000000000000000..6273ad153157138b3b7ecafece461ee01eda2955 --- /dev/null +++ b/fluid/ocr_recognition/scripts/infer.sh @@ -0,0 +1,42 @@ +#!/bin/bash +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + +mode=$1 # GPU, CPU, MKLDNN +if [ "$mode" = "CPU" ]; then + use_gpu="False" + model_path="cpu_model" +elif [ "$mode" = "GPU" ]; then + use_gpu="True" + model_path="gpu_model" +elif [ "$mode" = "MKLDNN" ]; then + use_gpu="False" + model_path="mkldnn_model" + export FLAGS_use_mkldnn=1 +else + echo "Invalid mode provided. Please use one of {GPU, CPU, MKLDNN}" + exit 1 +fi + +ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` +if [ $ht -eq 1 ]; then # HT is OFF + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,0,0" + fi + if [ -z "$OMP_DYNAMIC" ]; then + export OMP_DYNAMIC="FALSE" + fi +else # HT is ON + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,1,0" + fi +fi + +python ../infer.py \ + --model_path $model_path/model_00001 \ + --input_images_list ~/.cache/paddle/dataset/ctc_data/data/test.list \ + --input_images_dir ~/.cache/paddle/dataset/ctc_data/data/test_images \ + --use_gpu $use_gpu \ + --batch_size 32 \ + --iterations 5 \ + --skip_batch_num 2 diff --git a/fluid/ocr_recognition/scripts/train.sh b/fluid/ocr_recognition/scripts/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..ceb7c06c0548e1d5ff90651d2fd8624288cc8804 --- /dev/null +++ b/fluid/ocr_recognition/scripts/train.sh @@ -0,0 +1,55 @@ +#!/bin/bash +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + +batch_size=32 +core_num=`lscpu |grep -m1 "CPU(s)"|awk -F':' '{print $2}'|xargs` +mode=$1 # GPU, CPU, MKLDNN +if [ "$mode" = "CPU" ]; then + if [ $core_num -gt $batch_size ]; then + echo "Batch size should be greater than or equal to the number of + available cores, when parallel mode is set to True." + fi + use_gpu="False" + save_model_dir="cpu_model" + parallel="True" +elif [ "$mode" = "GPU" ]; then + use_gpu="True" + save_model_dir="gpu_model" + parallel="True" +elif [ "$mode" = "MKLDNN" ]; then + if [ $core_num -gt $batch_size ]; then + echo "Batch size should be greater than or equal to the number of + available cores, when parallel mode is set to True." + fi + use_gpu="False" + save_model_dir="mkldnn_model" + parallel="False" + export FLAGS_use_mkldnn=1 +else + echo "Invalid mode provided. 
Please use one of {GPU, CPU, MKLDNN}" + exit 1 +fi + +ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` +if [ $ht -eq 1 ]; then # HT is OFF + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,0,0" + fi + if [ -z "$OMP_DYNAMIC" ]; then + export OMP_DYNAMIC="FALSE" + fi +else # HT is ON + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,1,0" + fi +fi + +python ../ctc_train.py \ + --use_gpu $use_gpu \ + --parallel $parallel \ + --batch_size $batch_size \ + --save_model_period 1 \ + --total_step 1 \ + --save_model_dir $save_model_dir +
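+ +# Note: '--total_step 1' together with '--save_model_period 1' makes this a quick +# smoke run; infer.sh then expects the resulting checkpoint at $save_model_dir/model_00001. +# For a real training run one might restore the defaults (the values below are the +# script defaults, shown for illustration): +# python ../ctc_train.py --use_gpu $use_gpu --parallel $parallel --batch_size $batch_size \ +#     --total_step 720000 --save_model_period 15000 --save_model_dir $save_model_dir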