Unverified commit 1a5d3925 authored by lujun, committed by GitHub

Merge pull request #1 from PaddlePaddle/develop

merge-local
.DS_Store
paddle/operators/check_t.save
paddle/operators/check_tensor.ls
paddle/operators/tensor.save
python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
*.DS_Store
*.vs
build/
build_doc/
*.user
.vscode
.idea
.project
.cproject
.pydevproject
.settings/
*.pyc
.*~
fluid/neural_machine_translation/transformer/deps
fluid/neural_machine_translation/transformer/train.data
fluid/neural_machine_translation/transformer/train.pkl
fluid/neural_machine_translation/transformer/train.sh
fluid/neural_machine_translation/transformer/train.tok.clean.bpe.32000.en-de
fluid/neural_machine_translation/transformer/vocab.bpe.32000.refined
CMakeSettings.json
Makefile
.test_env/
third_party/
*~
bazel-*
third_party/
build_*
# clion workspace.
cmake-build-*
model_test
\ No newline at end of file
......@@ -11,5 +11,16 @@ PaddlePaddle provides a rich set of computational units to enable users to adopt
- [legacy models](legacy): use PaddlePaddle's v2 APIs.
PaddlePaddle provides a rich set of computational units that let users solve a wide range of learning problems in a modular way. In this repo we show how to use PaddlePaddle to solve common machine learning tasks, offering several easy-to-learn and easy-to-use neural network models.
- [fluid models](fluid): use PaddlePaddle Fluid APIs. We especially recommend using the Fluid models.
- [legacy models](legacy): use PaddlePaddle's v2 APIs.
## License
This tutorial is contributed by [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) and licensed under the [Apache-2.0 license](LICENSE).
## License
This tutorial is contributed by [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) and licensed under the [Apache-2.0 license](LICENSE).
......@@ -25,16 +25,6 @@ def to_lodtensor(data, place):
return res
def lodtensor_to_ndarray(lod_tensor):
"""conver lodtensor to ndarray
"""
dims = lod_tensor._get_dims()
ret = np.zeros(shape=dims).astype('float32')
for i in xrange(np.product(dims)):
ret.ravel()[i] = lod_tensor.get_float_element(i)
return ret, lod_tensor.lod()
def split_infer_result(infer_seq, lod):
infer_batch = []
for i in xrange(0, len(lod[0]) - 1):
......
ThreadPool
build
post_latgen_faster_mapped.so
pybind11
aux.tar.gz
aux
data
checkpoints
......@@ -7,7 +7,8 @@ python -u ../../train.py --train_feature_lst data/train_feature.lst \
--checkpoints checkpoints \
--frame_dim 80 \
--class_num 3040 \
--print_per_batches 100 \
--infer_models '' \
--batch_size 64 \
--batch_size 16 \
--learning_rate 6.4e-5 \
--parallel
......@@ -5,14 +5,16 @@ from __future__ import print_function
import paddle.fluid as fluid
def stacked_lstmp_model(frame_dim,
def stacked_lstmp_model(feature,
label,
hidden_dim,
proj_dim,
stacked_num,
class_num,
parallel=False,
is_train=True):
""" The model for DeepASR. The main structure is composed of stacked
"""
The model for DeepASR. The main structure is composed of stacked
identical LSTMP (LSTM with recurrent projection) layers.
When running in training and validation phase, the feeding dictionary
......@@ -28,9 +30,6 @@ def stacked_lstmp_model(frame_dim,
is_train(bool): Run in training phase or not, default `True`.
class_dim(int): The number of output classes.
"""
# network configuration
def _net_conf(feature, label):
conv1 = fluid.layers.conv2d(
input=feature,
num_filters=32,
......@@ -73,35 +72,3 @@ def stacked_lstmp_model(frame_dim,
avg_cost = fluid.layers.mean(x=cost)
acc = fluid.layers.accuracy(input=prediction, label=label)
return prediction, avg_cost, acc
# data feeder
feature = fluid.layers.data(
name="feature",
shape=[-1, 3, 11, frame_dim],
dtype="float32",
lod_level=1)
label = fluid.layers.data(
name="label", shape=[-1, 1], dtype="int64", lod_level=1)
if parallel:
# When the execution place is specified to CUDAPlace, the program will
# run on all $CUDA_VISIBLE_DEVICES GPUs. Otherwise the program will
# run on all CPU devices.
places = fluid.layers.device.get_places()
pd = fluid.layers.ParallelDo(places)
with pd.do():
feat_ = pd.read_input(feature)
label_ = pd.read_input(label)
prediction, avg_cost, acc = _net_conf(feat_, label_)
for out in [prediction, avg_cost, acc]:
pd.write_output(out)
# get mean loss and acc through every devices.
prediction, avg_cost, acc = pd()
prediction.stop_gradient = True
avg_cost = fluid.layers.mean(x=avg_cost)
acc = fluid.layers.mean(x=acc)
else:
prediction, avg_cost, acc = _net_conf(feature, label)
return prediction, avg_cost, acc
......@@ -14,7 +14,6 @@ import data_utils.augmentor.trans_add_delta as trans_add_delta
import data_utils.augmentor.trans_splice as trans_splice
import data_utils.augmentor.trans_delay as trans_delay
import data_utils.async_data_reader as reader
from data_utils.util import lodtensor_to_ndarray
from model_utils.model import stacked_lstmp_model
......@@ -24,7 +23,8 @@ def parse_args():
'--batch_size',
type=int,
default=32,
help='The sequence number of a batch data. (default: %(default)d)')
        help='The number of sequences in a batch, i.e. the batch size per GPU. (default: %(default)d)'
)
parser.add_argument(
'--minimum_batch_size',
type=int,
......@@ -147,18 +147,26 @@ def train(args):
if args.infer_models != '' and not os.path.exists(args.infer_models):
os.mkdir(args.infer_models)
train_program = fluid.Program()
train_startup = fluid.Program()
with fluid.program_guard(train_program, train_startup):
with fluid.unique_name.guard():
py_train_reader = fluid.layers.py_reader(
capacity=10,
shapes=([-1, 3, 11, args.frame_dim], [-1, 1]),
dtypes=['float32', 'int64'],
lod_levels=[1, 1],
name='train_reader')
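            # Note: each feature sample is a [3, 11, frame_dim] cube paired with an
            # int64 label; lod_levels=[1, 1] marks both as variable-length sequences.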
feature, label = fluid.layers.read_file(py_train_reader)
prediction, avg_cost, accuracy = stacked_lstmp_model(
frame_dim=args.frame_dim,
feature=feature,
label=label,
hidden_dim=args.hidden_dim,
proj_dim=args.proj_dim,
stacked_num=args.stacked_num,
class_num=args.class_num,
parallel=args.parallel)
# program for test
test_program = fluid.default_main_program().clone()
#optimizer = fluid.optimizer.Momentum(learning_rate=args.learning_rate, momentum=0.9)
class_num=args.class_num)
# optimizer = fluid.optimizer.Momentum(learning_rate=args.learning_rate, momentum=0.9)
optimizer = fluid.optimizer.Adam(
learning_rate=fluid.layers.exponential_decay(
learning_rate=args.learning_rate,
......@@ -166,10 +174,45 @@ def train(args):
decay_rate=1 / 1.2,
staircase=True))
optimizer.minimize(avg_cost)
fluid.memory_optimize(train_program)
test_program = fluid.Program()
test_startup = fluid.Program()
with fluid.program_guard(test_program, test_startup):
with fluid.unique_name.guard():
py_test_reader = fluid.layers.py_reader(
capacity=10,
shapes=([-1, 3, 11, args.frame_dim], [-1, 1]),
dtypes=['float32', 'int64'],
lod_levels=[1, 1],
name='test_reader')
feature, label = fluid.layers.read_file(py_test_reader)
prediction, avg_cost, accuracy = stacked_lstmp_model(
feature=feature,
label=label,
hidden_dim=args.hidden_dim,
proj_dim=args.proj_dim,
stacked_num=args.stacked_num,
class_num=args.class_num)
test_program = test_program.clone(for_test=True)
place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
exe.run(train_startup)
exe.run(test_startup)
if args.parallel:
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_iteration_per_drop_scope = 10
train_exe = fluid.ParallelExecutor(
use_cuda=(args.device == 'GPU'),
loss_name=avg_cost.name,
exec_strategy=exec_strategy,
main_program=train_program)
test_exe = fluid.ParallelExecutor(
use_cuda=(args.device == 'GPU'),
main_program=test_program,
exec_strategy=exec_strategy,
share_vars_from=train_exe)
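        # share_vars_from lets the test executor reuse the parameters owned by the
        # training executor instead of keeping a separate copy.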
# resume training if initial model provided.
if args.init_model_path is not None:
......@@ -181,15 +224,24 @@ def train(args):
trans_splice.TransSplice(5, 5), trans_delay.TransDelay(5)
]
feature_t = fluid.LoDTensor()
label_t = fluid.LoDTensor()
# bind train_reader
train_data_reader = reader.AsyncDataReader(
args.train_feature_lst,
args.train_label_lst,
-1,
split_sentence_threshold=1024)
# validation
def test(exe):
# If test data not found, return invalid cost and accuracy
if not (os.path.exists(args.val_feature_lst) and
train_data_reader.set_transformers(ltrans)
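    # The provider below converts each raw batch into (feature, label) LoDTensors on
    # the CPU; py_reader pulls from it in the background once start() is called.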
def train_data_provider():
for data in train_data_reader.batch_iterator(args.batch_size,
args.minimum_batch_size):
yield batch_data_to_lod_tensors(args, data, fluid.CPUPlace())
py_train_reader.decorate_tensor_provider(train_data_provider)
if (os.path.exists(args.val_feature_lst) and
os.path.exists(args.val_label_lst)):
return -1.0, -1.0
# test data reader
test_data_reader = reader.AsyncDataReader(
args.val_feature_lst,
......@@ -197,86 +249,101 @@ def train(args):
-1,
split_sentence_threshold=1024)
test_data_reader.set_transformers(ltrans)
test_costs, test_accs = [], []
for batch_id, batch_data in enumerate(
test_data_reader.batch_iterator(args.batch_size,
args.minimum_batch_size)):
# load_data
(features, labels, lod, _) = batch_data
features = np.reshape(features, (-1, 11, 3, args.frame_dim))
features = np.transpose(features, (0, 2, 1, 3))
feature_t.set(features, place)
feature_t.set_lod([lod])
label_t.set(labels, place)
label_t.set_lod([lod])
cost, acc = exe.run(test_program,
feed={"feature": feature_t,
"label": label_t},
def test_data_provider():
for data in test_data_reader.batch_iterator(
args.batch_size, args.minimum_batch_size):
yield batch_data_to_lod_tensors(args, data, fluid.CPUPlace())
py_test_reader.decorate_tensor_provider(test_data_provider)
# validation
def test(exe):
# If test data not found, return invalid cost and accuracy
if not (os.path.exists(args.val_feature_lst) and
os.path.exists(args.val_label_lst)):
return -1.0, -1.0
batch_id = 0
test_costs = []
test_accs = []
while True:
if batch_id == 0:
py_test_reader.start()
try:
if args.parallel:
cost, acc = exe.run(
fetch_list=[avg_cost.name, accuracy.name],
return_numpy=False)
else:
cost, acc = exe.run(program=test_program,
fetch_list=[avg_cost, accuracy],
return_numpy=False)
test_costs.append(lodtensor_to_ndarray(cost)[0])
test_accs.append(lodtensor_to_ndarray(acc)[0])
sys.stdout.write('.')
sys.stdout.flush()
test_costs.append(np.array(cost)[0])
test_accs.append(np.array(acc)[0])
batch_id += 1
except fluid.core.EOFException:
py_test_reader.reset()
break
return np.mean(test_costs), np.mean(test_accs)
# train data reader
train_data_reader = reader.AsyncDataReader(
args.train_feature_lst,
args.train_label_lst,
-1,
split_sentence_threshold=1024)
train_data_reader.set_transformers(ltrans)
# train
for pass_id in xrange(args.pass_num):
pass_start_time = time.time()
for batch_id, batch_data in enumerate(
train_data_reader.batch_iterator(args.batch_size,
args.minimum_batch_size)):
# load_data
(features, labels, lod, name_lst) = batch_data
features = np.reshape(features, (-1, 11, 3, args.frame_dim))
features = np.transpose(features, (0, 2, 1, 3))
feature_t.set(features, place)
feature_t.set_lod([lod])
label_t.set(labels, place)
label_t.set_lod([lod])
batch_id = 0
while True:
if batch_id == 0:
py_train_reader.start()
to_print = batch_id > 0 and (batch_id % args.print_per_batches == 0)
outs = exe.run(fluid.default_main_program(),
feed={"feature": feature_t,
"label": label_t},
fetch_list=[avg_cost, accuracy] if to_print else [],
try:
if args.parallel:
outs = train_exe.run(
fetch_list=[avg_cost.name, accuracy.name]
if to_print else [],
return_numpy=False)
else:
outs = exe.run(program=train_program,
fetch_list=[avg_cost, accuracy]
if to_print else [],
return_numpy=False)
except fluid.core.EOFException:
py_train_reader.reset()
break
if to_print:
if args.parallel:
print("\nBatch %d, train cost: %f, train acc: %f" %
(batch_id, lodtensor_to_ndarray(outs[0])[0],
lodtensor_to_ndarray(outs[1])[0]))
(batch_id, np.mean(outs[0]), np.mean(outs[1])))
else:
print("\nBatch %d, train cost: %f, train acc: %f" % (
batch_id, np.array(outs[0])[0], np.array(outs[1])[0]))
# save the latest checkpoint
if args.checkpoints != '':
model_path = os.path.join(args.checkpoints,
"deep_asr.latest.checkpoint")
fluid.io.save_persistables(exe, model_path)
fluid.io.save_persistables(exe, model_path, train_program)
else:
sys.stdout.write('.')
sys.stdout.flush()
batch_id += 1
# run test
val_cost, val_acc = test(exe)
val_cost, val_acc = test(test_exe if args.parallel else exe)
# save checkpoint per pass
if args.checkpoints != '':
model_path = os.path.join(
args.checkpoints,
"deep_asr.pass_" + str(pass_id) + ".checkpoint")
fluid.io.save_persistables(exe, model_path)
fluid.io.save_persistables(exe, model_path, train_program)
# save inference model
if args.infer_models != '':
model_path = os.path.join(
args.infer_models,
"deep_asr.pass_" + str(pass_id) + ".infer.model")
fluid.io.save_inference_model(model_path, ["feature"],
[prediction], exe)
[prediction], exe, train_program)
# cal pass time
pass_end_time = time.time()
time_consumed = pass_end_time - pass_start_time
......@@ -285,6 +352,19 @@ def train(args):
(pass_id, time_consumed, val_cost, val_acc))
def batch_data_to_lod_tensors(args, batch_data, place):
features, labels, lod, name_lst = batch_data
features = np.reshape(features, (-1, 11, 3, args.frame_dim))
features = np.transpose(features, (0, 2, 1, 3))
feature_t = fluid.LoDTensor()
label_t = fluid.LoDTensor()
feature_t.set(features, place)
feature_t.set_lod([lod])
label_t.set(labels, place)
label_t.set_lod([lod])
return feature_t, label_t
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
......
......@@ -15,6 +15,7 @@ import detection_out
import normalize
import select
import crop
import power
import reduction
#custom layer import ends
......
""" a custom layer for 'power', maybe we should implement this in standard way.
more info can be found here: http://caffe.berkeleyvision.org/tutorial/layers/power.html
"""
from .register import register
def power_shape(input_shape, shape=None):
""" calculate the output shape of this layer using input shape
Args:
@input_shape (list of num): a list of number which represents the input shape
Returns:
@output_shape (list of num): a list of numbers represent the output shape
"""
return input_shape
def power_layer(input, name, power=1.0, scale=1.0, shift=0.0):
""" build a layer of type 'Power' using fluid
Args:
@input (variables): input fluid variable for this layer
@name (str): name for this layer
@power (float): parameter from caffe's Power layer
@scale (float): parameter from caffe's Power layer
@shift (float): parameter from caffe's Power layer
Returns:
output (variable): output variable for this layer
"""
import paddle.fluid as fluid
scale_out = fluid.layers.scale(
input, scale=scale, bias=shift, bias_after_scale=True)
output = fluid.layers.pow(scale_out, factor=power)
return output
register(kind='Power', shape=power_shape, layer=power_layer)
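# For reference, Caffe's Power layer computes y = (shift + scale * x) ** power
# elementwise, which is what the scale-then-pow pair above reproduces; e.g. with
# scale=2.0, shift=1.0, power=2.0 an input of 3.0 maps to (1 + 2 * 3) ** 2 = 49.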
......@@ -31,12 +31,12 @@ def priorbox_shape(input_shapes, min_size, max_size=None, aspect_ratio=None):
def priorbox_layer(inputs,
name,
min_size,
step,
max_size=None,
aspect_ratio=None,
flip=True,
variance=[0.1, 0.1, 0.2, 0.2],
flip=False,
clip=False,
variance=[],
step=0.0,
offset=0.5):
""" build a layer of type 'Priorbox' using fluid
......@@ -52,6 +52,8 @@ def priorbox_layer(inputs,
assert len(inputs) == 2, "invalid inputs for Priorbox[%s]" % (name)
input = inputs[0]
image = inputs[1]
    steps = tuple(step) if isinstance(step, (list, tuple)) else (step, step)
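    # fluid.layers.prior_box expects steps as a (step_w, step_h) pair, so a scalar
    # step from the Caffe prototxt is broadcast to both dimensions here.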
box, variance_ = fluid.layers.prior_box(
input,
image,
......@@ -60,7 +62,8 @@ def priorbox_layer(inputs,
aspect_ratio,
variance,
flip,
clip, (step, step),
clip,
steps,
offset,
min_max_aspect_ratios_order=True)
"""
......
......@@ -38,7 +38,7 @@ LAYER_DESCRIPTORS = {
'MultinomialLogisticLoss': shape_scalar,
'MVN': shape_not_implemented,
'Pooling': shape_pool,
'Power': shape_identity,
'Power': shape_power,
'ReLU': shape_identity,
'PReLU': shape_identity,
'Scale': shape_identity,
......
......@@ -280,8 +280,17 @@ class Network(object):
param_attr=fluid.ParamAttr(name=prefix + 'negslope'))
return output
def pool(self, pool_type, input, k_h, k_w, s_h, s_w, ceil_mode, padding,
name):
def pool(self,
pool_type,
input,
k_h,
k_w,
s_h,
s_w,
ceil_mode,
padding,
name,
exclusive=True):
# Get the number of channels in the input
in_hw = input.shape[2:]
k_hw = [k_h, k_w]
......@@ -295,7 +304,8 @@ class Network(object):
pool_stride=s_hw,
pool_padding=padding,
ceil_mode=ceil_mode,
pool_type=pool_type)
pool_type=pool_type,
exclusive=exclusive)
return output
@layer
......
......@@ -67,6 +67,10 @@ def shape_crop(node):
    raise KaffeError('crop function has been defined in custom_layers')
def shape_power(node):
    raise KaffeError('power function has been defined in custom_layers')
def shape_data(node):
if node.output_shape:
# Old-style input specification
......
......@@ -26,6 +26,7 @@ def add_arguments():
add_argument('dataset_path', str, None, "Cityscape dataset path.")
add_argument('verbose', bool, False, "Print mIoU for each step if verbose.")
add_argument('use_gpu', bool, True, "Whether use GPU or CPU.")
add_argument('num_classes', int, 19, "Number of classes.")
def mean_iou(pred, label):
......@@ -69,7 +70,7 @@ tp = fluid.Program()
batch_size = 1
reader.default_config['crop_size'] = -1
reader.default_config['shuffle'] = False
num_classes = 19
num_classes = args.num_classes
with fluid.program_guard(tp, sp):
img = fluid.layers.data(name='img', shape=[3, 0, 0], dtype='float32')
......@@ -84,7 +85,7 @@ tp = tp.clone(True)
fluid.memory_optimize(
tp,
print_log=False,
skip_opt_set=[pred.name, miou, out_wrong, out_correct],
skip_opt_set=set([pred.name, miou, out_wrong, out_correct]),
level=1)
place = fluid.CPUPlace()
......
......@@ -20,6 +20,11 @@ op_results = {}
default_epsilon = 1e-3
default_norm_type = 'bn'
default_group_number = 32
depthwise_use_cudnn = False
bn_regularizer = fluid.regularizer.L2DecayRegularizer(regularization_coeff=0.0)
depthwise_regularizer = fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.0)
@contextlib.contextmanager
......@@ -52,20 +57,39 @@ def append_op_result(result, name):
def conv(*args, **kargs):
kargs['param_attr'] = name_scope + 'weights'
if "xception" in name_scope:
init_std = 0.09
elif "logit" in name_scope:
init_std = 0.01
elif name_scope.endswith('depthwise/'):
init_std = 0.33
else:
init_std = 0.06
if name_scope.endswith('depthwise/'):
regularizer = depthwise_regularizer
else:
regularizer = None
kargs['param_attr'] = fluid.ParamAttr(
name=name_scope + 'weights',
regularizer=regularizer,
initializer=fluid.initializer.TruncatedNormal(
loc=0.0, scale=init_std))
if 'bias_attr' in kargs and kargs['bias_attr']:
kargs['bias_attr'] = name_scope + 'biases'
kargs['bias_attr'] = fluid.ParamAttr(
name=name_scope + 'biases',
regularizer=regularizer,
initializer=fluid.initializer.ConstantInitializer(value=0.0))
else:
kargs['bias_attr'] = False
kargs['name'] = name_scope + 'conv'
return append_op_result(fluid.layers.conv2d(*args, **kargs), 'conv')
def group_norm(input, G, eps=1e-5, param_attr=None, bias_attr=None):
helper = fluid.layer_helper.LayerHelper('group_norm', **locals())
N, C, H, W = input.shape
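    # If the requested group count G does not evenly divide the channel count C,
    # probe nearby values (G±1, G±2, ...) and fall back to the closest divisor.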
if C % G != 0:
print("group can not divide channle:", C, G)
# print "group can not divide channle:", C, G
for d in range(10):
for t in [d, -d]:
if G + t <= 0: continue
......@@ -73,29 +97,16 @@ def group_norm(input, G, eps=1e-5, param_attr=None, bias_attr=None):
G = G + t
break
if C % G == 0:
print("use group size:", G)
# print "use group size:", G
break
assert C % G == 0
param_shape = (G, )
x = input
x = fluid.layers.reshape(x, [N, G, C // G * H * W])
mean = fluid.layers.reduce_mean(x, dim=2, keep_dim=True)
x = x - mean
var = fluid.layers.reduce_mean(fluid.layers.square(x), dim=2, keep_dim=True)
x = x / fluid.layers.sqrt(var + eps)
scale = helper.create_parameter(
attr=helper.param_attr,
shape=param_shape,
dtype='float32',
default_initializer=fluid.initializer.Constant(1.0))
bias = helper.create_parameter(
attr=helper.bias_attr, shape=param_shape, dtype='float32', is_bias=True)
x = fluid.layers.elementwise_add(
fluid.layers.elementwise_mul(
x, scale, axis=1), bias, axis=1)
return fluid.layers.reshape(x, input.shape)
x = fluid.layers.group_norm(
input,
groups=G,
param_attr=param_attr,
bias_attr=bias_attr,
name=name_scope + 'group_norm')
return x
def bn(*args, **kargs):
......@@ -106,8 +117,10 @@ def bn(*args, **kargs):
*args,
epsilon=default_epsilon,
momentum=bn_momentum,
param_attr=name_scope + 'gamma',
bias_attr=name_scope + 'beta',
param_attr=fluid.ParamAttr(
name=name_scope + 'gamma', regularizer=bn_regularizer),
bias_attr=fluid.ParamAttr(
name=name_scope + 'beta', regularizer=bn_regularizer),
moving_mean_name=name_scope + 'moving_mean',
moving_variance_name=name_scope + 'moving_variance',
**kargs),
......@@ -119,8 +132,10 @@ def bn(*args, **kargs):
args[0],
default_group_number,
eps=default_epsilon,
param_attr=name_scope + 'gamma',
bias_attr=name_scope + 'beta'),
param_attr=fluid.ParamAttr(
name=name_scope + 'gamma', regularizer=bn_regularizer),
bias_attr=fluid.ParamAttr(
name=name_scope + 'beta', regularizer=bn_regularizer)),
'gn')
else:
raise "Unsupport norm type:" + default_norm_type
......@@ -143,7 +158,8 @@ def seq_conv(input, channel, stride, filter, dilation=1, act=None):
stride,
groups=input.shape[1],
padding=(filter // 2) * dilation,
dilation=dilation)
dilation=dilation,
use_cudnn=depthwise_use_cudnn)
input = bn(input)
if act: input = act(input)
with scope('pointwise'):
......
......@@ -13,6 +13,7 @@ import reader
import models
import time
def add_argument(name, type, default, help):
parser.add_argument('--' + name, default=default, type=type, help=help)
......@@ -32,15 +33,28 @@ def add_arguments():
add_argument('dataset_path', str, None, "Cityscape dataset path.")
add_argument('parallel', bool, False, "using ParallelExecutor.")
add_argument('use_gpu', bool, True, "Whether use GPU or CPU.")
add_argument('num_classes', int, 19, "Number of classes.")
def load_model():
myvars = [
x for x in tp.list_vars()
if isinstance(x, fluid.framework.Parameter) and x.name.find('logit') ==
-1
]
if args.init_weights_path.endswith('/'):
if args.num_classes == 19:
fluid.io.load_params(
exe, dirname=args.init_weights_path, main_program=tp)
else:
fluid.io.load_vars(exe, dirname=args.init_weights_path, vars=myvars)
else:
if args.num_classes == 19:
fluid.io.load_params(
exe, dirname="", filename=args.init_weights_path, main_program=tp)
exe, dirname=args.init_weights_path, main_program=tp)
else:
fluid.io.load_vars(
exe, dirname="", filename=args.init_weights_path, vars=myvars)
def save_model():
......@@ -80,6 +94,7 @@ args = parser.parse_args()
models.clean()
models.bn_momentum = 0.9997
models.dropout_keep_prop = 0.9
models.label_number = args.num_classes
deeplabv3p = models.deeplabv3p
sp = fluid.Program()
......@@ -89,7 +104,7 @@ batch_size = args.batch_size
image_shape = [crop_size, crop_size]
reader.default_config['crop_size'] = crop_size
reader.default_config['shuffle'] = True
num_classes = 19
num_classes = args.num_classes
weight_decay = 0.00004
base_lr = args.base_lr
......@@ -120,7 +135,7 @@ with fluid.program_guard(tp, sp):
retv = opt.minimize(loss_mean, startup_program=sp, no_grad_set=no_grad_set)
fluid.memory_optimize(
tp, print_log=False, skip_opt_set=[pred.name, loss_mean.name], level=1)
tp, print_log=False, skip_opt_set=set([pred.name, loss_mean.name]), level=1)
place = fluid.CPUPlace()
if args.use_gpu:
......@@ -155,8 +170,8 @@ for i, imgs, labels, names in batches:
if i % 100 == 0:
print("Model is saved to", args.save_weights_path)
save_model()
print("step {:d}, loss: {:.6f}, step_time_cost: {:.3f}" .format(i,
np.mean(retv[1]), end_time - prev_start_time))
print("step {:d}, loss: {:.6f}, step_time_cost: {:.3f}".format(
i, np.mean(retv[1]), end_time - prev_start_time))
print("Training done. Model is saved to", args.save_weights_path)
save_model()
......@@ -111,10 +111,10 @@ Evalutaion result is shown as below:
| Model | RoI function | Batch size | Max iteration | mAP |
| :--------------- | :--------: | :------------: | :------------------: |------: |
| [Fluid RoIPool minibatch padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_pool_minibatch_padding.tar.gz) | RoIPool | 8 | 180000 | 0.314 |
| [Fluid RoIPool no padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_pool_no_padding.tar.gz) | RoIPool | 8 | 180000 | 0.316 |
| [Fluid RoIAlign no padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_align_no_padding.tar.gz) | RoIAlign | 8 | 180000 | 0.345 |
| [Fluid RoIAlign no padding 2x](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_align_no_padding_2x.tar.gz) | RoIAlign | 8 | 360000 | 0.364 |
| [Fluid RoIPool minibatch padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_pool_minibatch_padding.tar.gz) | RoIPool | 8 | 180000 | 0.316 |
| [Fluid RoIPool no padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_pool_no_padding.tar.gz) | RoIPool | 8 | 180000 | 0.318 |
| [Fluid RoIAlign no padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_align_no_padding.tar.gz) | RoIAlign | 8 | 180000 | 0.348 |
| [Fluid RoIAlign no padding 2x](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_align_no_padding_2x.tar.gz) | RoIAlign | 8 | 360000 | 0.367 |
* Fluid RoIPool minibatch padding: Use RoIPool. Images in one batch are padded to the same size. This method is the same as Detectron.
* Fluid RoIPool no padding: Images without padding.
......
......@@ -105,10 +105,10 @@ Faster RCNN 目标检测模型
| 模型 | RoI处理方式 | 批量大小 | 迭代次数 | mAP |
| :--------------- | :--------: | :------------: | :------------------: |------: |
| [Fluid RoIPool minibatch padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_pool_minibatch_padding.tar.gz) | RoIPool | 8 | 180000 | 0.314 |
| [Fluid RoIPool no padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_pool_no_padding.tar.gz) | RoIPool | 8 | 180000 | 0.316 |
| [Fluid RoIAlign no padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_align_no_padding.tar.gz) | RoIAlign | 8 | 180000 | 0.345 |
| [Fluid RoIAlign no padding 2x](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_align_no_padding_2x.tar.gz) | RoIAlign | 8 | 360000 | 0.364 |
| [Fluid RoIPool minibatch padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_pool_minibatch_padding.tar.gz) | RoIPool | 8 | 180000 | 0.316 |
| [Fluid RoIPool no padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_pool_no_padding.tar.gz) | RoIPool | 8 | 180000 | 0.318 |
| [Fluid RoIAlign no padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_align_no_padding.tar.gz) | RoIAlign | 8 | 180000 | 0.348 |
| [Fluid RoIAlign no padding 2x](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_align_no_padding_2x.tar.gz) | RoIAlign | 8 | 360000 | 0.367 |
......
......@@ -23,29 +23,43 @@ from PIL import ImageFont
from config import cfg
def box_decoder(target_box, prior_box, prior_box_var):
proposals = np.zeros_like(target_box, dtype=np.float32)
prior_box_loc = np.zeros_like(prior_box, dtype=np.float32)
prior_box_loc[:, 0] = prior_box[:, 2] - prior_box[:, 0] + 1.
prior_box_loc[:, 1] = prior_box[:, 3] - prior_box[:, 1] + 1.
prior_box_loc[:, 2] = (prior_box[:, 2] + prior_box[:, 0]) / 2
prior_box_loc[:, 3] = (prior_box[:, 3] + prior_box[:, 1]) / 2
pred_bbox = np.zeros_like(target_box, dtype=np.float32)
for i in range(prior_box.shape[0]):
dw = np.minimum(prior_box_var[2] * target_box[i, 2::4], cfg.bbox_clip)
dh = np.minimum(prior_box_var[3] * target_box[i, 3::4], cfg.bbox_clip)
pred_bbox[i, 0::4] = prior_box_var[0] * target_box[
i, 0::4] * prior_box_loc[i, 0] + prior_box_loc[i, 2]
pred_bbox[i, 1::4] = prior_box_var[1] * target_box[
i, 1::4] * prior_box_loc[i, 1] + prior_box_loc[i, 3]
pred_bbox[i, 2::4] = np.exp(dw) * prior_box_loc[i, 0]
pred_bbox[i, 3::4] = np.exp(dh) * prior_box_loc[i, 1]
proposals[:, 0::4] = pred_bbox[:, 0::4] - pred_bbox[:, 2::4] / 2
proposals[:, 1::4] = pred_bbox[:, 1::4] - pred_bbox[:, 3::4] / 2
proposals[:, 2::4] = pred_bbox[:, 0::4] + pred_bbox[:, 2::4] / 2 - 1
proposals[:, 3::4] = pred_bbox[:, 1::4] + pred_bbox[:, 3::4] / 2 - 1
return proposals
def box_decoder(deltas, boxes, weights):
if boxes.shape[0] == 0:
return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)
boxes = boxes.astype(deltas.dtype, copy=False)
widths = boxes[:, 2] - boxes[:, 0] + 1.0
heights = boxes[:, 3] - boxes[:, 1] + 1.0
ctr_x = boxes[:, 0] + 0.5 * widths
ctr_y = boxes[:, 1] + 0.5 * heights
wx, wy, ww, wh = weights
dx = deltas[:, 0::4] * wx
dy = deltas[:, 1::4] * wy
dw = deltas[:, 2::4] * ww
dh = deltas[:, 3::4] * wh
# Prevent sending too large values into np.exp()
dw = np.minimum(dw, cfg.bbox_clip)
dh = np.minimum(dh, cfg.bbox_clip)
pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
pred_w = np.exp(dw) * widths[:, np.newaxis]
pred_h = np.exp(dh) * heights[:, np.newaxis]
pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
# x1
pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
# y1
pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
# x2 (note: "- 1" is correct; don't be fooled by the asymmetry)
pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1
# y2 (note: "- 1" is correct; don't be fooled by the asymmetry)
pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1
return pred_boxes
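# Worked example (illustrative only, with weights = (1.0, 1.0, 1.0, 1.0)): for an
# anchor box [0, 0, 9, 9] the width and height are 10 and the center is (5, 5);
# deltas [0.1, 0, 0, 0] move the center to x = 0.1 * 10 + 5 = 6 and keep
# w = h = exp(0) * 10 = 10, so the decoded box is [1, 0, 10, 9] after the
# half-size expansion and the trailing "- 1".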
def clip_tiled_boxes(boxes, im_shape):
......@@ -73,7 +87,6 @@ def get_nmsed_box(rpn_rois, confs, locs, class_nums, im_info,
variance_v = np.array(cfg.bbox_reg_weights)
confs_v = np.array(confs)
locs_v = np.array(locs)
rois = box_decoder(locs_v, rpn_rois_v, variance_v)
im_results = [[] for _ in range(len(lod) - 1)]
new_lod = [0]
for i in range(len(lod) - 1):
......@@ -81,9 +94,11 @@ def get_nmsed_box(rpn_rois, confs, locs, class_nums, im_info,
end = lod[i + 1]
if start == end:
continue
rois_n = rois[start:end, :]
locs_n = locs_v[start:end, :]
rois_n = rpn_rois_v[start:end, :]
rois_n = rois_n / im_info[i][2]
rois_n = clip_tiled_boxes(rois_n, im_info[i][:2])
rois_n = box_decoder(locs_n, rois_n, variance_v)
rois_n = clip_tiled_boxes(rois_n, im_info[i][:2] / im_info[i][2])
cls_boxes = [[] for _ in range(class_nums)]
scores_n = confs_v[start:end, :]
......
......@@ -59,7 +59,7 @@ python train.py \
--model=SE_ResNeXt50_32x4d \
--batch_size=32 \
--total_images=1281167 \
--class_dim=1000
--class_dim=1000 \
--image_shape=3,224,224 \
--model_save_dir=output/ \
--with_mem_opt=False \
......@@ -80,8 +80,11 @@ python train.py \
* **lr**: initialized learning rate. Default: 0.1.
* **pretrained_model**: model path for pretraining. Default: None.
* **checkpoint**: the checkpoint path to resume. Default: None.
* **model_category**: the category of models, ("models"|"models_name"). Default: "models".
Alternatively, you can start the training step by running ```run.sh```.
**data reader introduction:** Data reader is defined in ```reader.py```. In [training stage](#training-a-model), random crop and flipping are used, while center crop is used in [evaluation](#inference) and [inference](#inference) stages. Supported data augmentation includes:
**data reader introduction:** Data readers are defined in ```reader.py``` and ```reader_cv2.py```; using the CV2 reader can speed up data reading. In the [training stage](#training-a-model), random crop and flipping are used, while center crop is used in the [evaluation](#inference) and [inference](#inference) stages. Supported data augmentation includes:
* rotation
* color jitter
* random crop
......@@ -183,26 +186,33 @@ Test-12-score: [15.040644], class [386]
## Supported models and performances
Models fall into two categories: models with specified parameter names in the model definition, and models without specified parameter names. Generate a model with named parameters by setting ```model_category = models_name```.
Unless otherwise noted, models are trained starting from a learning rate of ```0.1```, which is decayed by a factor of ```0.1``` at each pre-defined epoch. The available top-1/top-5 validation accuracies on ImageNet 2012 are listed in the tables below. Pretrained models can be downloaded by clicking the corresponding model names.
|model | top-1/top-5 accuracy
|- | -:
|[AlexNet](http://paddle-imagenet-models.bj.bcebos.com/alexnet_model.tar) | 57.21%/79.72%
|VGG11 | -
|VGG13 | -
|VGG16 | -
|VGG19 | -
|GoogleNet | -
|InceptionV4 | -
|MobileNet | -
|[ResNet50](http://paddle-imagenet-models.bj.bcebos.com/resnet_50_model.tar) | 76.63%/93.10%
|ResNet101 | -
|ResNet152 | -
|[SE_ResNeXt50_32x4d](http://paddle-imagenet-models.bj.bcebos.com/se_resnext_50_model.tar) | 78.33%/93.96%
|SE_ResNeXt101_32x4d | -
|SE_ResNeXt152_32x4d | -
|DPN68 | -
|DPN92 | -
|DPN98 | -
|DPN107 | -
|DPN131 | -
- Released models: with specified parameter names
|model | top-1/top-5 accuracy(PIL)| top-1/top-5 accuracy(CV2) |
|- |:-: |:-:|
|[AlexNet](http://paddle-imagenet-models-name.bj.bcebos.com/AlexNet_pretrained.zip) | 56.71%/79.18% | 55.88%/78.65% |
|[VGG11](https://paddle-imagenet-models-name.bj.bcebos.com/VGG11_pretrained.zip) | 69.22%/89.09% | 69.01%/88.90% |
|[VGG13](https://paddle-imagenet-models-name.bj.bcebos.com/VGG13_pretrained.zip) | 70.14%/89.48% | 69.83%/89.13% |
|[VGG16](https://paddle-imagenet-models-name.bj.bcebos.com/VGG16_pretrained.zip) | 72.08%/90.63% | 71.65%/90.57% |
|[VGG19](https://paddle-imagenet-models-name.bj.bcebos.com/VGG19_pretrained.zip) | 72.56%/90.83% | 72.32%/90.98% |
|[MobileNetV1](http://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV1_pretrained.zip) | 70.91%/89.54% | 70.51%/89.35% |
|[ResNet50](http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_pretrained.zip) | 76.35%/92.80% | 76.22%/92.92% |
|[ResNet101](http://paddle-imagenet-models-name.bj.bcebos.com/ResNet101_pretrained.zip) | 77.49%/93.57% | 77.56%/93.64% |
|[ResNet152](https://paddle-imagenet-models-name.bj.bcebos.com/ResNet152_pretrained.zip) | 78.12%/93.93% | 77.92%/93.87% |
|[SE_ResNeXt50_32x4d](https://paddle-imagenet-models-name.bj.bcebos.com/SE_ResNext50_32x4d_pretrained.zip) | 78.50%/94.01% | 78.44%/93.96% |
|[SE_ResNeXt101_32x4d](https://paddle-imagenet-models-name.bj.bcebos.com/SE_ResNeXt101_32x4d_pretrained.zip) | 79.26%/94.22% | 79.12%/94.20% |
- Released models: without specified parameter names
|model | top-1/top-5 accuracy(PIL)| top-1/top-5 accuracy(CV2) |
|- |:-: |:-:|
|[ResNet152](http://paddle-imagenet-models.bj.bcebos.com/ResNet152_pretrained.zip) | 78.18%/93.93% | 78.11%/94.04% |
|[SE_ResNeXt50_32x4d](http://paddle-imagenet-models.bj.bcebos.com/se_resnext_50_model.tar) | 78.32%/93.96% | 77.58%/93.73% |
......@@ -58,7 +58,7 @@ python train.py \
--model=SE_ResNeXt50_32x4d \
--batch_size=32 \
--total_images=1281167 \
--class_dim=1000
--class_dim=1000 \
--image_shape=3,224,224 \
--model_save_dir=output/ \
--with_mem_opt=False \
......@@ -79,8 +79,9 @@ python train.py \
* **lr**: initialized learning rate. Default: 0.1.
* **pretrained_model**: model path for pretraining. Default: None.
* **checkpoint**: the checkpoint path to resume. Default: None.
* **model_category**: the category of models, ("models"|"models_name"). Default:"models".
**Data reader notes:** The data reader is defined in ```reader.py```. In the [training stage](#training-a-model), the default augmentation is random crop plus horizontal flip, while the default in the [evaluation](#inference) and [inference](#inference) stages is center crop. Currently supported augmentations are:
**Data reader notes:** The data readers are defined in ```reader.py``` and ```reader_cv2.py```. In general, the CV2 reader speeds up data reading, while the PIL reader (```reader.py```) yields relatively higher accuracy. In the [training stage](#training-a-model), the default augmentation is random crop plus horizontal flip, while the default in the [evaluation](#inference) and [inference](#inference) stages is center crop. Currently supported augmentations are:
* rotation
* color jitter
* random crop
......@@ -183,27 +184,30 @@ Test-12-score: [15.040644], class [386]
```
## Supported models and performances
Models include two categories: models with specified parameter names and models without specified parameter names. Set ```model_category = models_name``` to train the models with specified parameter names.
The tables list the network architectures supported under the "models" directory, together with the top-1/top-5 accuracies achieved by the trained models on the ImageNet-2012 validation set. Unless otherwise noted, models are trained with an initial learning rate of ```0.1```, decayed by ```0.1``` at each pre-defined epoch. Pretrained models can be downloaded by clicking the corresponding model names.
|model | top-1/top-5 accuracy
|- | -:
|[AlexNet](http://paddle-imagenet-models.bj.bcebos.com/alexnet_model.tar) | 57.21%/79.72%
|VGG11 | -
|VGG13 | -
|VGG16 | -
|VGG19 | -
|GoogleNet | -
|InceptionV4 | -
|MobileNet | -
|[ResNet50](http://paddle-imagenet-models.bj.bcebos.com/resnet_50_model.tar) | 76.63%/93.10%
|ResNet101 | -
|ResNet152 | -
|[SE_ResNeXt50_32x4d](http://paddle-imagenet-models.bj.bcebos.com/se_resnext_50_model.tar) | 78.33%/93.96%
|SE_ResNeXt101_32x4d | -
|SE_ResNeXt152_32x4d | -
|DPN68 | -
|DPN92 | -
|DPN98 | -
|DPN107 | -
|DPN131 | -
- Released models: with specified parameter names
|model | top-1/top-5 accuracy(PIL)| top-1/top-5 accuracy(CV2) |
|- |:-: |:-:|
|[AlexNet](http://paddle-imagenet-models-name.bj.bcebos.com/AlexNet_pretrained.zip) | 56.71%/79.18% | 55.88%/78.65% |
|[VGG11](https://paddle-imagenet-models-name.bj.bcebos.com/VGG11_pretrained.zip) | 69.22%/89.09% | 69.01%/88.90% |
|[VGG13](https://paddle-imagenet-models-name.bj.bcebos.com/VGG13_pretrained.zip) | 70.14%/89.48% | 69.83%/89.13% |
|[VGG16](https://paddle-imagenet-models-name.bj.bcebos.com/VGG16_pretrained.zip) | 72.08%/90.63% | 71.65%/90.57% |
|[VGG19](https://paddle-imagenet-models-name.bj.bcebos.com/VGG19_pretrained.zip) | 72.56%/90.83% | 72.32%/90.98% |
|[MobileNetV1](http://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV1_pretrained.zip) | 70.91%/89.54% | 70.51%/89.35% |
|[ResNet50](http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_pretrained.zip) | 76.35%/92.80% | 76.22%/92.92% |
|[ResNet101](http://paddle-imagenet-models-name.bj.bcebos.com/ResNet101_pretrained.zip) | 77.49%/93.57% | 77.56%/93.64% |
|[ResNet152](https://paddle-imagenet-models-name.bj.bcebos.com/ResNet152_pretrained.zip) | 78.12%/93.93% | 77.92%/93.87% |
|[SE_ResNeXt50_32x4d](https://paddle-imagenet-models-name.bj.bcebos.com/SE_ResNext50_32x4d_pretrained.zip) | 78.50%/94.01% | 78.44%/93.96% |
|[SE_ResNeXt101_32x4d](https://paddle-imagenet-models-name.bj.bcebos.com/SE_ResNeXt101_32x4d_pretrained.zip) | 79.26%/94.22% | 79.12%/94.20% |
- Released models: without specified parameter names
|model | top-1/top-5 accuracy(PIL)| top-1/top-5 accuracy(CV2) |
|- |:-: |:-:|
|[ResNet152](http://paddle-imagenet-models.bj.bcebos.com/ResNet152_pretrained.zip) | 78.18%/93.93% | 78.11%/94.04% |
|[SE_ResNeXt50_32x4d](http://paddle-imagenet-models.bj.bcebos.com/se_resnext_50_model.tar) | 78.32%/93.96% | 77.58%/93.73% |
......@@ -26,6 +26,7 @@ import six
import sys
sys.path.append("..")
import models
import utils
from reader import train, val
def parse_args():
......@@ -149,13 +150,15 @@ def get_model(args, is_train, main_prog, startup_prog):
lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
# NOTE: we put weight decay in layers config, and remove
# weight decay on bn layers, so don't add weight decay in
# optimizer config.
optimizer = fluid.optimizer.Momentum(
learning_rate=models.learning_rate.lr_warmup(
learning_rate=utils.learning_rate.lr_warmup(
fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
warmup_steps, start_lr, end_lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
momentum=0.9)
optimizer.minimize(avg_cost)
batched_reader = None
......@@ -175,6 +178,7 @@ def append_nccl2_prepare(trainer_id, startup_prog):
for ip in worker_ips.split(","):
worker_endpoints.append(':'.join([ip, port]))
current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
num_trainers = len(worker_endpoints)
config = fluid.DistributeTranspilerConfig()
config.mode = "nccl2"
......@@ -182,6 +186,7 @@ def append_nccl2_prepare(trainer_id, startup_prog):
t.transpile(trainer_id, trainers=','.join(worker_endpoints),
current_endpoint=current_endpoint,
startup_program=startup_prog)
return num_trainers, trainer_id
def dist_transpile(trainer_id, args, train_prog, startup_prog):
......@@ -281,12 +286,12 @@ def test_single(exe, test_args, args, test_prog):
def train_parallel(train_args, test_args, args, train_prog, test_prog,
startup_prog, nccl_id_var, num_trainers, trainer_id):
startup_prog, num_trainers, trainer_id):
over_all_start = time.time()
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
if nccl_id_var and trainer_id == 0:
#FIXME(wuyi): wait other trainer to start listening
if args.update_method == "nccl2" and trainer_id == 0:
#FIXME(typhoonzero): wait other trainer to start listening
time.sleep(30)
startup_exe = fluid.Executor(place)
......@@ -398,8 +403,8 @@ def main():
# the unique trainer id, starting from 0, needed by trainer
# only
nccl_id_var, num_trainers, trainer_id = (
None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0")))
num_trainers, trainer_id = (
1, int(os.getenv("PADDLE_TRAINER_ID", "0")))
train_prog = fluid.Program()
test_prog = fluid.Program()
......@@ -418,7 +423,7 @@ def main():
"Must configure correct environments to run dist train.")
all_args.extend([train_prog, test_prog, startup_prog])
if os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER":
all_args.extend([nccl_id_var, num_trainers, trainer_id])
all_args.extend([num_trainers, trainer_id])
train_parallel(*all_args)
elif os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
# start pserver with Executor
......@@ -431,10 +436,10 @@ def main():
all_args.extend([train_prog, test_prog, startup_prog])
if args.update_method == "nccl2":
nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(
num_trainers, trainer_id = append_nccl2_prepare(
trainer_id, startup_prog)
all_args.extend([nccl_id_var, num_trainers, trainer_id])
all_args.extend([num_trainers, trainer_id])
train_parallel(*all_args)
if __name__ == "__main__":
......
......@@ -7,11 +7,13 @@ import time
import sys
import paddle
import paddle.fluid as fluid
import models
import reader
#import models
import models_name as models
#import reader_cv2 as reader
import reader as reader
import argparse
import functools
from models.learning_rate import cosine_decay
from utils.learning_rate import cosine_decay
from utility import add_arguments, print_arguments
import math
......
from .alexnet import AlexNet
from .mobilenet import MobileNet
from .mobilenet_v2 import MobileNetV2
from .googlenet import GoogleNet
from .vgg import VGG11, VGG13, VGG16, VGG19
from .resnet import ResNet50, ResNet101, ResNet152
......@@ -7,4 +8,4 @@ from .resnet_dist import DistResNet
from .inception_v4 import InceptionV4
from .se_resnext import SE_ResNeXt50_32x4d, SE_ResNeXt101_32x4d, SE_ResNeXt152_32x4d
from .dpn import DPN68, DPN92, DPN98, DPN107, DPN131
import learning_rate
from .shufflenet_v2 import ShuffleNetV2_x0_5, ShuffleNetV2_x1_0, ShuffleNetV2_x1_5, ShuffleNetV2_x2_0
......@@ -5,8 +5,8 @@ import os
import numpy as np
import time
import sys
import math
import paddle.fluid as fluid
import math
__all__ = ["DPN", "DPN68", "DPN92", "DPN98", "DPN107", "DPN131"]
......@@ -62,7 +62,6 @@ class DPN(object):
pool_padding=1,
pool_type='max')
#conv2 - conv5
for gc in range(4):
bw = bws[gc]
inc = inc_sec[gc]
......
......@@ -13,7 +13,7 @@ train_parameters = {
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"epochs": [30, 70, 100],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
......
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
__all__ = ['MobileNetV2']
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class MobileNetV2():
def __init__(self):
self.params = train_parameters
def net(self, input, class_dim=1000, scale=1.0):
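        # Inverted-residual settings from the MobileNet V2 paper: each tuple is
        # (expansion factor t, output channels c, repeat count n, stride of the
        # first block s).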
bottleneck_params_list = [
(1, 16, 1, 1),
(6, 24, 2, 2),
(6, 32, 3, 2),
(6, 64, 4, 2),
(6, 96, 3, 1),
(6, 160, 3, 2),
(6, 320, 1, 1),
]
input = self.conv_bn_layer(
input,
num_filters=int(32 * scale),
filter_size=3,
stride=2,
padding=1,
if_act=True)
in_c = int(32 * scale)
for layer_setting in bottleneck_params_list:
t, c, n, s = layer_setting
input = self.invresi_blocks(
input=input,
in_c=in_c,
t=t,
c=int(c * scale),
n=n,
s=s, )
in_c = int(c * scale)
input = self.conv_bn_layer(
input=input,
num_filters=int(1280 * scale) if scale > 1.0 else 1280,
filter_size=1,
stride=1,
padding=0,
if_act=True)
input = fluid.layers.pool2d(
input=input,
pool_size=7,
pool_stride=1,
pool_type='avg',
global_pooling=True)
output = fluid.layers.fc(input=input,
size=class_dim,
act='softmax',
param_attr=ParamAttr(initializer=MSRA()))
return output
def conv_bn_layer(self,
input,
filter_size,
num_filters,
stride,
padding,
channels=None,
num_groups=1,
use_cudnn=True,
if_act=True):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
act=None,
use_cudnn=use_cudnn,
param_attr=ParamAttr(initializer=MSRA()),
bias_attr=False)
bn = fluid.layers.batch_norm(input=conv)
if if_act:
return fluid.layers.relu6(bn)
else:
return bn
def shortcut(self, input, data_residual):
return fluid.layers.elementwise_add(input, data_residual)
def inverted_residual_unit(self, input, num_in_filter, num_filters,
ifshortcut, stride, filter_size, padding,
expansion_factor):
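        # One inverted residual: 1x1 expansion -> depthwise 3x3 -> 1x1 linear
        # projection; the input is added back only when ifshortcut is True.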
num_expfilter = int(round(num_in_filter * expansion_factor))
channel_expand = self.conv_bn_layer(
input=input,
num_filters=num_expfilter,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True)
bottleneck_conv = self.conv_bn_layer(
input=channel_expand,
num_filters=num_expfilter,
filter_size=filter_size,
stride=stride,
padding=padding,
num_groups=num_expfilter,
if_act=True,
use_cudnn=False)
linear_out = self.conv_bn_layer(
input=bottleneck_conv,
num_filters=num_filters,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=False)
if ifshortcut:
out = self.shortcut(input=input, data_residual=linear_out)
return out
else:
return linear_out
def invresi_blocks(self, input, in_c, t, c, n, s):
first_block = self.inverted_residual_unit(
input=input,
num_in_filter=in_c,
num_filters=c,
ifshortcut=False,
stride=s,
filter_size=3,
padding=1,
expansion_factor=t)
last_residual_block = first_block
last_c = c
for i in range(1, n):
last_residual_block = self.inverted_residual_unit(
input=last_residual_block,
num_in_filter=last_c,
num_filters=c,
ifshortcut=True,
stride=1,
filter_size=3,
padding=1,
expansion_factor=t)
return last_residual_block
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
__all__ = [
'ShuffleNetV2', 'ShuffleNetV2_x0_5', 'ShuffleNetV2_x1_0',
'ShuffleNetV2_x1_5', 'ShuffleNetV2_x2_0'
]
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class ShuffleNetV2():
def __init__(self, scale=1.0):
self.params = train_parameters
self.scale = scale
def net(self, input, class_dim=1000):
scale = self.scale
stage_repeats = [4, 8, 4]
if scale == 0.5:
stage_out_channels = [-1, 24, 48, 96, 192, 1024]
elif scale == 1.0:
stage_out_channels = [-1, 24, 116, 232, 464, 1024]
elif scale == 1.5:
stage_out_channels = [-1, 24, 176, 352, 704, 1024]
elif scale == 2.0:
stage_out_channels = [-1, 24, 224, 488, 976, 2048]
else:
raise ValueError("""{} groups is not supported for
1x1 Grouped Convolutions""".format(num_groups))
#conv1
input_channel = stage_out_channels[1]
conv1 = self.conv_bn_layer(
input=input,
filter_size=3,
num_filters=input_channel,
padding=1,
stride=2)
pool1 = fluid.layers.pool2d(
input=conv1,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
conv = pool1
# bottleneck sequences
for idxstage in range(len(stage_repeats)):
numrepeat = stage_repeats[idxstage]
output_channel = stage_out_channels[idxstage + 2]
for i in range(numrepeat):
if i == 0:
conv = self.inverted_residual_unit(
input=conv,
num_filters=output_channel,
stride=2,
benchmodel=2)
else:
conv = self.inverted_residual_unit(
input=conv,
num_filters=output_channel,
stride=1,
benchmodel=1)
conv_last = self.conv_bn_layer(
input=conv,
filter_size=1,
num_filters=stage_out_channels[-1],
padding=0,
stride=1)
pool_last = fluid.layers.pool2d(
input=conv_last,
pool_size=7,
pool_stride=7,
pool_padding=0,
pool_type='avg')
output = fluid.layers.fc(input=pool_last,
size=class_dim,
act='softmax',
param_attr=ParamAttr(initializer=MSRA()))
return output
def conv_bn_layer(self,
input,
filter_size,
num_filters,
stride,
padding,
num_groups=1,
use_cudnn=True,
if_act=True):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
act=None,
use_cudnn=use_cudnn,
param_attr=ParamAttr(initializer=MSRA()),
bias_attr=False)
if if_act:
return fluid.layers.batch_norm(input=conv, act='relu')
else:
return fluid.layers.batch_norm(input=conv)
def channel_shuffle(self, x, groups):
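        # Channel shuffle (ShuffleNet): reshape to (N, groups, C/groups, H, W),
        # swap the group and channel axes, then flatten back so that later
        # branch/grouped convolutions see channels from every group.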
batchsize, num_channels, height, width = x.shape[0], x.shape[
1], x.shape[2], x.shape[3]
channels_per_group = num_channels // groups
# reshape
x = fluid.layers.reshape(
x=x, shape=[batchsize, groups, channels_per_group, height, width])
x = fluid.layers.transpose(x=x, perm=[0, 2, 1, 3, 4])
# flatten
x = fluid.layers.reshape(
x=x, shape=[batchsize, num_channels, height, width])
return x
def inverted_residual_unit(self, input, num_filters, stride, benchmodel):
assert stride in [1, 2], \
"supported stride are {} but your stride is {}".format([1,2], stride)
oup_inc = num_filters // 2
inp = input.shape[1]
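        # benchmodel == 1: stride-1 unit that splits the channels, transforms one
        # half and concats it back; benchmodel == 2: stride-2 downsampling unit
        # that transforms both branches, doubling the channel count.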
if benchmodel == 1:
x1, x2 = fluid.layers.split(
input,
num_or_sections=[input.shape[1] // 2, input.shape[1] // 2],
dim=1)
conv_pw = self.conv_bn_layer(
input=x2,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True)
conv_dw = self.conv_bn_layer(
input=conv_pw,
num_filters=oup_inc,
filter_size=3,
stride=stride,
padding=1,
num_groups=oup_inc,
if_act=False)
conv_linear = self.conv_bn_layer(
input=conv_dw,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True)
out = fluid.layers.concat([x1, conv_linear], axis=1)
else:
#branch1
conv_dw = self.conv_bn_layer(
input=input,
num_filters=inp,
filter_size=3,
stride=stride,
padding=1,
num_groups=inp,
if_act=False)
conv_linear_1 = self.conv_bn_layer(
input=conv_dw,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True)
#branch2
conv_pw = self.conv_bn_layer(
input=input,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True)
conv_dw = self.conv_bn_layer(
input=conv_pw,
num_filters=oup_inc,
filter_size=3,
stride=stride,
padding=1,
num_groups=oup_inc,
if_act=False)
conv_linear_2 = self.conv_bn_layer(
input=conv_dw,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True)
out = fluid.layers.concat([conv_linear_1, conv_linear_2], axis=1)
return self.channel_shuffle(out, 2)
def ShuffleNetV2_x0_5():
model = ShuffleNetV2(scale=0.5)
return model
def ShuffleNetV2_x1_0():
model = ShuffleNetV2(scale=1.0)
return model
def ShuffleNetV2_x1_5():
model = ShuffleNetV2(scale=1.5)
return model
def ShuffleNetV2_x2_0():
model = ShuffleNetV2(scale=2.0)
return model
from .alexnet import AlexNet
from .mobilenet import MobileNet
from .mobilenet_v2 import MobileNetV2
from .googlenet import GoogleNet
from .vgg import VGG11, VGG13, VGG16, VGG19
from .resnet import ResNet50, ResNet101, ResNet152
from .inception_v4 import InceptionV4
from .se_resnext import SE_ResNeXt50_32x4d, SE_ResNeXt101_32x4d, SE_ResNeXt152_32x4d
from .dpn import DPN68, DPN92, DPN98, DPN107, DPN131
from .shufflenet_v2 import ShuffleNetV2_x0_5, ShuffleNetV2_x1_0, ShuffleNetV2_x1_5, ShuffleNetV2_x2_0
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import math
__all__ = ['AlexNet']
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [40, 70, 100],
"steps": [0.01, 0.001, 0.0001, 0.00001]
}
}
class AlexNet():
def __init__(self):
self.params = train_parameters
def net(self, input, class_dim=1000):
stdv = 1.0 / math.sqrt(input.shape[1] * 11 * 11)
layer_name = [
"conv1", "conv2", "conv3", "conv4", "conv5", "fc6", "fc7", "fc8"
]
conv1 = fluid.layers.conv2d(
input=input,
num_filters=64,
filter_size=11,
stride=4,
padding=2,
groups=1,
act='relu',
bias_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[0] + "_offset"),
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[0] + "_weights"))
pool1 = fluid.layers.pool2d(
input=conv1,
pool_size=3,
pool_stride=2,
pool_padding=0,
pool_type='max')
stdv = 1.0 / math.sqrt(pool1.shape[1] * 5 * 5)
conv2 = fluid.layers.conv2d(
input=pool1,
num_filters=192,
filter_size=5,
stride=1,
padding=2,
groups=1,
act='relu',
bias_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[1] + "_offset"),
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[1] + "_weights"))
pool2 = fluid.layers.pool2d(
input=conv2,
pool_size=3,
pool_stride=2,
pool_padding=0,
pool_type='max')
stdv = 1.0 / math.sqrt(pool2.shape[1] * 3 * 3)
conv3 = fluid.layers.conv2d(
input=pool2,
num_filters=384,
filter_size=3,
stride=1,
padding=1,
groups=1,
act='relu',
bias_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[2] + "_offset"),
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[2] + "_weights"))
stdv = 1.0 / math.sqrt(conv3.shape[1] * 3 * 3)
conv4 = fluid.layers.conv2d(
input=conv3,
num_filters=256,
filter_size=3,
stride=1,
padding=1,
groups=1,
act='relu',
bias_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[3] + "_offset"),
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[3] + "_weights"))
stdv = 1.0 / math.sqrt(conv4.shape[1] * 3 * 3)
conv5 = fluid.layers.conv2d(
input=conv4,
num_filters=256,
filter_size=3,
stride=1,
padding=1,
groups=1,
act='relu',
bias_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[4] + "_offset"),
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[4] + "_weights"))
pool5 = fluid.layers.pool2d(
input=conv5,
pool_size=3,
pool_stride=2,
pool_padding=0,
pool_type='max')
drop6 = fluid.layers.dropout(x=pool5, dropout_prob=0.5)
stdv = 1.0 / math.sqrt(drop6.shape[1] * drop6.shape[2] *
drop6.shape[3] * 1.0)
fc6 = fluid.layers.fc(
input=drop6,
size=4096,
act='relu',
bias_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[5] + "_offset"),
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[5] + "_weights"))
drop7 = fluid.layers.dropout(x=fc6, dropout_prob=0.5)
stdv = 1.0 / math.sqrt(drop7.shape[1] * 1.0)
fc7 = fluid.layers.fc(
input=drop7,
size=4096,
act='relu',
bias_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[6] + "_offset"),
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[6] + "_weights"))
stdv = 1.0 / math.sqrt(fc7.shape[1] * 1.0)
out = fluid.layers.fc(
input=fc7,
size=class_dim,
act='softmax',
bias_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[7] + "_offset"),
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=layer_name[7] + "_weights"))
return out
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import time
import sys
import paddle.fluid as fluid
import math
from paddle.fluid.param_attr import ParamAttr
__all__ = ["DPN", "DPN68", "DPN92", "DPN98", "DPN107", "DPN131"]
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class DPN(object):
def __init__(self, layers=68):
self.params = train_parameters
self.layers = layers
def net(self, input, class_dim=1000):
# get network args
args = self.get_net_args(self.layers)
bws = args['bw']
inc_sec = args['inc_sec']
rs = args['bw']
k_r = args['k_r']
k_sec = args['k_sec']
G = args['G']
init_num_filter = args['init_num_filter']
init_filter_size = args['init_filter_size']
init_padding = args['init_padding']
## define Dual Path Network
# conv1
conv1_x_1 = fluid.layers.conv2d(
input=input,
num_filters=init_num_filter,
filter_size=init_filter_size,
stride=2,
padding=init_padding,
groups=1,
act=None,
bias_attr=False,
name="conv1",
param_attr=ParamAttr(name="conv1_weights"), )
conv1_x_1 = fluid.layers.batch_norm(
input=conv1_x_1,
act='relu',
is_test=False,
name="conv1_bn",
param_attr=ParamAttr(name='conv1_bn_scale'),
bias_attr=ParamAttr('conv1_bn_offset'),
moving_mean_name='conv1_bn_mean',
moving_variance_name='conv1_bn_variance', )
convX_x_x = fluid.layers.pool2d(
input=conv1_x_1,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max',
name="pool1")
#conv2 - conv5
match_list, num = [], 0
for gc in range(4):
bw = bws[gc]
inc = inc_sec[gc]
R = (k_r * bw) // rs[gc]
if gc == 0:
_type1 = 'proj'
_type2 = 'normal'
match = 1
else:
_type1 = 'down'
_type2 = 'normal'
match = match + k_sec[gc - 1]
match_list.append(match)
convX_x_x = self.dual_path_factory(
convX_x_x, R, R, bw, inc, G, _type1, name="dpn" + str(match))
for i_ly in range(2, k_sec[gc] + 1):
num += 1
if num in match_list:
num += 1
convX_x_x = self.dual_path_factory(
convX_x_x, R, R, bw, inc, G, _type2, name="dpn" + str(num))
conv5_x_x = fluid.layers.concat(convX_x_x, axis=1)
conv5_x_x = fluid.layers.batch_norm(
input=conv5_x_x,
act='relu',
is_test=False,
name="final_concat_bn",
param_attr=ParamAttr(name='final_concat_bn_scale'),
bias_attr=ParamAttr('final_concat_bn_offset'),
moving_mean_name='final_concat_bn_mean',
moving_variance_name='final_concat_bn_variance', )
pool5 = fluid.layers.pool2d(
input=conv5_x_x,
pool_size=7,
pool_stride=1,
pool_padding=0,
pool_type='avg', )
stdv = 0.01
param_attr = fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv))
fc6 = fluid.layers.fc(input=pool5,
size=class_dim,
act='softmax',
param_attr=param_attr,
name="fc6")
return fc6
def get_net_args(self, layers):
if layers == 68:
k_r = 128
G = 32
k_sec = [3, 4, 12, 3]
inc_sec = [16, 32, 32, 64]
bw = [64, 128, 256, 512]
r = [64, 64, 64, 64]
init_num_filter = 10
init_filter_size = 3
init_padding = 1
elif layers == 92:
k_r = 96
G = 32
k_sec = [3, 4, 20, 3]
inc_sec = [16, 32, 24, 128]
bw = [256, 512, 1024, 2048]
r = [256, 256, 256, 256]
init_num_filter = 64
init_filter_size = 7
init_padding = 3
elif layers == 98:
k_r = 160
G = 40
k_sec = [3, 6, 20, 3]
inc_sec = [16, 32, 32, 128]
bw = [256, 512, 1024, 2048]
r = [256, 256, 256, 256]
init_num_filter = 96
init_filter_size = 7
init_padding = 3
elif layers == 107:
k_r = 200
G = 50
k_sec = [4, 8, 20, 3]
inc_sec = [20, 64, 64, 128]
bw = [256, 512, 1024, 2048]
r = [256, 256, 256, 256]
init_num_filter = 128
init_filter_size = 7
init_padding = 3
elif layers == 131:
k_r = 160
G = 40
k_sec = [4, 8, 28, 3]
inc_sec = [16, 32, 32, 128]
bw = [256, 512, 1024, 2048]
r = [256, 256, 256, 256]
init_num_filter = 128
init_filter_size = 7
init_padding = 3
else:
raise NotImplementedError
net_arg = {
'k_r': k_r,
'G': G,
'k_sec': k_sec,
'inc_sec': inc_sec,
'bw': bw,
'r': r
}
net_arg['init_num_filter'] = init_num_filter
net_arg['init_filter_size'] = init_filter_size
net_arg['init_padding'] = init_padding
return net_arg
def dual_path_factory(self,
data,
num_1x1_a,
num_3x3_b,
num_1x1_c,
inc,
G,
_type='normal',
name=None):
kw = 3
kh = 3
pw = (kw - 1) // 2
ph = (kh - 1) // 2
# type
        if _type == 'proj':
            key_stride = 1
            has_proj = True
        if _type == 'down':
            key_stride = 2
            has_proj = True
        if _type == 'normal':
            key_stride = 1
            has_proj = False
# PROJ
if type(data) is list:
data_in = fluid.layers.concat([data[0], data[1]], axis=1)
else:
data_in = data
if has_proj:
c1x1_w = self.bn_ac_conv(
data=data_in,
num_filter=(num_1x1_c + 2 * inc),
kernel=(1, 1),
pad=(0, 0),
stride=(key_stride, key_stride),
name=name + "_match")
data_o1, data_o2 = fluid.layers.split(
c1x1_w,
num_or_sections=[num_1x1_c, 2 * inc],
dim=1,
name=name + "_match_conv_Slice")
else:
data_o1 = data[0]
data_o2 = data[1]
# MAIN
c1x1_a = self.bn_ac_conv(
data=data_in,
num_filter=num_1x1_a,
kernel=(1, 1),
pad=(0, 0),
name=name + "_conv1")
c3x3_b = self.bn_ac_conv(
data=c1x1_a,
num_filter=num_3x3_b,
kernel=(kw, kh),
pad=(pw, ph),
stride=(key_stride, key_stride),
num_group=G,
name=name + "_conv2")
c1x1_c = self.bn_ac_conv(
data=c3x3_b,
num_filter=(num_1x1_c + inc),
kernel=(1, 1),
pad=(0, 0),
name=name + "_conv3")
c1x1_c1, c1x1_c2 = fluid.layers.split(
c1x1_c,
num_or_sections=[num_1x1_c, inc],
dim=1,
name=name + "_conv3_Slice")
# OUTPUTS
summ = fluid.layers.elementwise_add(
x=data_o1, y=c1x1_c1, name=name + "_elewise")
dense = fluid.layers.concat(
[data_o2, c1x1_c2], axis=1, name=name + "_concat")
return [summ, dense]
def bn_ac_conv(self,
data,
num_filter,
kernel,
pad,
stride=(1, 1),
num_group=1,
name=None):
bn_ac = fluid.layers.batch_norm(
input=data,
act='relu',
is_test=False,
name=name + '.output.1',
param_attr=ParamAttr(name=name + '_bn_scale'),
bias_attr=ParamAttr(name + '_bn_offset'),
moving_mean_name=name + '_bn_mean',
moving_variance_name=name + '_bn_variance', )
bn_ac_conv = fluid.layers.conv2d(
input=bn_ac,
num_filters=num_filter,
filter_size=kernel,
stride=stride,
padding=pad,
groups=num_group,
act=None,
bias_attr=False,
param_attr=ParamAttr(name=name + "_weights"))
return bn_ac_conv
def DPN68():
model = DPN(layers=68)
return model
def DPN92():
    model = DPN(layers=92)
return model
def DPN98():
model = DPN(layers=98)
return model
def DPN107():
model = DPN(layers=107)
return model
def DPN131():
model = DPN(layers=131)
return model
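# --- Usage sketch (added illustration; not part of the original file) ---
# A minimal, hypothetical example of wiring one of the factory functions above
# into a fluid static graph; the layer names and class_dim are assumptions.
def _dpn_usage_example():
    image = fluid.layers.data(name='image', shape=[3, 224, 224], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    model = DPN92()  # any of DPN68 / DPN92 / DPN98 / DPN107 / DPN131
    out = model.net(image, class_dim=1000)
    cost = fluid.layers.cross_entropy(input=out, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    return avg_cost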
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = ['GoogleNet']
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 70, 100],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class GoogleNet():
def __init__(self):
self.params = train_parameters
def conv_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
channels = input.shape[1]
stdv = (3.0 / (filter_size**2 * channels))**0.5
param_attr = ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=name + "_weights")
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=act,
param_attr=param_attr,
bias_attr=False,
name=name)
return conv
def xavier(self, channels, filter_size, name):
stdv = (3.0 / (filter_size**2 * channels))**0.5
param_attr = ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=name + "_weights")
return param_attr
def inception(self,
input,
channels,
filter1,
filter3R,
filter3,
filter5R,
filter5,
proj,
name=None):
conv1 = self.conv_layer(
input=input,
num_filters=filter1,
filter_size=1,
stride=1,
act=None,
name="inception_" + name + "_1x1")
conv3r = self.conv_layer(
input=input,
num_filters=filter3R,
filter_size=1,
stride=1,
act=None,
name="inception_" + name + "_3x3_reduce")
conv3 = self.conv_layer(
input=conv3r,
num_filters=filter3,
filter_size=3,
stride=1,
act=None,
name="inception_" + name + "_3x3")
conv5r = self.conv_layer(
input=input,
num_filters=filter5R,
filter_size=1,
stride=1,
act=None,
name="inception_" + name + "_5x5_reduce")
conv5 = self.conv_layer(
input=conv5r,
num_filters=filter5,
filter_size=5,
stride=1,
act=None,
name="inception_" + name + "_5x5")
pool = fluid.layers.pool2d(
input=input,
pool_size=3,
pool_stride=1,
pool_padding=1,
pool_type='max')
convprj = fluid.layers.conv2d(
input=pool,
filter_size=1,
num_filters=proj,
stride=1,
padding=0,
name="inception_" + name + "_3x3_proj",
param_attr=ParamAttr(
name="inception_" + name + "_3x3_proj_weights"),
bias_attr=False)
cat = fluid.layers.concat(input=[conv1, conv3, conv5, convprj], axis=1)
cat = fluid.layers.relu(cat)
return cat
def net(self, input, class_dim=1000):
conv = self.conv_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act=None,
name="conv1")
pool = fluid.layers.pool2d(
input=conv, pool_size=3, pool_type='max', pool_stride=2)
conv = self.conv_layer(
input=pool,
num_filters=64,
filter_size=1,
stride=1,
act=None,
name="conv2_1x1")
conv = self.conv_layer(
input=conv,
num_filters=192,
filter_size=3,
stride=1,
act=None,
name="conv2_3x3")
pool = fluid.layers.pool2d(
input=conv, pool_size=3, pool_type='max', pool_stride=2)
ince3a = self.inception(pool, 192, 64, 96, 128, 16, 32, 32, "ince3a")
ince3b = self.inception(ince3a, 256, 128, 128, 192, 32, 96, 64,
"ince3b")
pool3 = fluid.layers.pool2d(
input=ince3b, pool_size=3, pool_type='max', pool_stride=2)
ince4a = self.inception(pool3, 480, 192, 96, 208, 16, 48, 64, "ince4a")
ince4b = self.inception(ince4a, 512, 160, 112, 224, 24, 64, 64,
"ince4b")
ince4c = self.inception(ince4b, 512, 128, 128, 256, 24, 64, 64,
"ince4c")
ince4d = self.inception(ince4c, 512, 112, 144, 288, 32, 64, 64,
"ince4d")
ince4e = self.inception(ince4d, 528, 256, 160, 320, 32, 128, 128,
"ince4e")
pool4 = fluid.layers.pool2d(
input=ince4e, pool_size=3, pool_type='max', pool_stride=2)
ince5a = self.inception(pool4, 832, 256, 160, 320, 32, 128, 128,
"ince5a")
ince5b = self.inception(ince5a, 832, 384, 192, 384, 48, 128, 128,
"ince5b")
pool5 = fluid.layers.pool2d(
input=ince5b, pool_size=7, pool_type='avg', pool_stride=7)
dropout = fluid.layers.dropout(x=pool5, dropout_prob=0.4)
out = fluid.layers.fc(input=dropout,
size=class_dim,
act='softmax',
param_attr=self.xavier(1024, 1, "out"),
name="out",
bias_attr=ParamAttr(name="out_offset"))
pool_o1 = fluid.layers.pool2d(
input=ince4a, pool_size=5, pool_type='avg', pool_stride=3)
conv_o1 = self.conv_layer(
input=pool_o1,
num_filters=128,
filter_size=1,
stride=1,
act=None,
name="conv_o1")
fc_o1 = fluid.layers.fc(input=conv_o1,
size=1024,
act='relu',
param_attr=self.xavier(2048, 1, "fc_o1"),
name="fc_o1",
bias_attr=ParamAttr(name="fc_o1_offset"))
dropout_o1 = fluid.layers.dropout(x=fc_o1, dropout_prob=0.7)
out1 = fluid.layers.fc(input=dropout_o1,
size=class_dim,
act='softmax',
param_attr=self.xavier(1024, 1, "out1"),
name="out1",
bias_attr=ParamAttr(name="out1_offset"))
pool_o2 = fluid.layers.pool2d(
input=ince4d, pool_size=5, pool_type='avg', pool_stride=3)
conv_o2 = self.conv_layer(
input=pool_o2,
num_filters=128,
filter_size=1,
stride=1,
act=None,
name="conv_o2")
fc_o2 = fluid.layers.fc(input=conv_o2,
size=1024,
act='relu',
param_attr=self.xavier(2048, 1, "fc_o2"),
name="fc_o2",
bias_attr=ParamAttr(name="fc_o2_offset"))
dropout_o2 = fluid.layers.dropout(x=fc_o2, dropout_prob=0.7)
out2 = fluid.layers.fc(input=dropout_o2,
size=class_dim,
act='softmax',
param_attr=self.xavier(1024, 1, "out2"),
name="out2",
bias_attr=ParamAttr(name="out2_offset"))
# last fc layer is "out"
return out, out1, out2
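# --- Usage sketch (added illustration; not part of the original file) ---
# net() returns the main softmax head plus two auxiliary heads. A common way to
# combine them during training is a weighted sum of cross-entropy losses; the
# 0.3 weights below follow the original GoogLeNet paper and are an assumption
# here, not something defined in this file.
def _googlenet_loss_example(image, label, class_dim=1000):
    out0, out1, out2 = GoogleNet().net(image, class_dim=class_dim)
    cost0 = fluid.layers.mean(x=fluid.layers.cross_entropy(input=out0, label=label))
    cost1 = fluid.layers.mean(x=fluid.layers.cross_entropy(input=out1, label=label))
    cost2 = fluid.layers.mean(x=fluid.layers.cross_entropy(input=out2, label=label))
    return cost0 + 0.3 * cost1 + 0.3 * cost2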
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import math
from paddle.fluid.param_attr import ParamAttr
__all__ = ['InceptionV4']
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class InceptionV4():
def __init__(self):
self.params = train_parameters
def net(self, input, class_dim=1000):
x = self.inception_stem(input)
for i in range(4):
x = self.inceptionA(x, name=str(i + 1))
x = self.reductionA(x)
for i in range(7):
x = self.inceptionB(x, name=str(i + 1))
x = self.reductionB(x)
for i in range(3):
x = self.inceptionC(x, name=str(i + 1))
pool = fluid.layers.pool2d(
input=x, pool_size=8, pool_type='avg', global_pooling=True)
drop = fluid.layers.dropout(x=pool, dropout_prob=0.2)
stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
out = fluid.layers.fc(
input=drop,
size=class_dim,
act='softmax',
param_attr=ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name="final_fc_weights"),
bias_attr=ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name="final_fc_offset"))
return out
def conv_bn_layer(self,
data,
num_filters,
filter_size,
stride=1,
padding=0,
groups=1,
act='relu',
name=None):
conv = fluid.layers.conv2d(
input=data,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False,
name=name)
bn_name = name + "_bn"
return fluid.layers.batch_norm(
input=conv,
act=act,
name=bn_name,
param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def inception_stem(self, data, name=None):
conv = self.conv_bn_layer(
data, 32, 3, stride=2, act='relu', name="conv1_3x3_s2")
conv = self.conv_bn_layer(conv, 32, 3, act='relu', name="conv2_3x3_s1")
conv = self.conv_bn_layer(
conv, 64, 3, padding=1, act='relu', name="conv3_3x3_s1")
pool1 = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_type='max')
conv2 = self.conv_bn_layer(
conv, 96, 3, stride=2, act='relu', name="inception_stem1_3x3_s2")
concat = fluid.layers.concat([pool1, conv2], axis=1)
conv1 = self.conv_bn_layer(
concat, 64, 1, act='relu', name="inception_stem2_3x3_reduce")
conv1 = self.conv_bn_layer(
conv1, 96, 3, act='relu', name="inception_stem2_3x3")
conv2 = self.conv_bn_layer(
concat, 64, 1, act='relu', name="inception_stem2_1x7_reduce")
conv2 = self.conv_bn_layer(
conv2,
64, (7, 1),
padding=(3, 0),
act='relu',
name="inception_stem2_1x7")
conv2 = self.conv_bn_layer(
conv2,
64, (1, 7),
padding=(0, 3),
act='relu',
name="inception_stem2_7x1")
conv2 = self.conv_bn_layer(
conv2, 96, 3, act='relu', name="inception_stem2_3x3_2")
concat = fluid.layers.concat([conv1, conv2], axis=1)
conv1 = self.conv_bn_layer(
concat, 192, 3, stride=2, act='relu', name="inception_stem3_3x3_s2")
pool1 = fluid.layers.pool2d(
input=concat, pool_size=3, pool_stride=2, pool_type='max')
concat = fluid.layers.concat([conv1, pool1], axis=1)
return concat
def inceptionA(self, data, name=None):
pool1 = fluid.layers.pool2d(
input=data, pool_size=3, pool_padding=1, pool_type='avg')
conv1 = self.conv_bn_layer(
pool1, 96, 1, act='relu', name="inception_a" + name + "_1x1")
conv2 = self.conv_bn_layer(
data, 96, 1, act='relu', name="inception_a" + name + "_1x1_2")
conv3 = self.conv_bn_layer(
data, 64, 1, act='relu', name="inception_a" + name + "_3x3_reduce")
conv3 = self.conv_bn_layer(
conv3,
96,
3,
padding=1,
act='relu',
name="inception_a" + name + "_3x3")
conv4 = self.conv_bn_layer(
data,
64,
1,
act='relu',
name="inception_a" + name + "_3x3_2_reduce")
conv4 = self.conv_bn_layer(
conv4,
96,
3,
padding=1,
act='relu',
name="inception_a" + name + "_3x3_2")
conv4 = self.conv_bn_layer(
conv4,
96,
3,
padding=1,
act='relu',
name="inception_a" + name + "_3x3_3")
concat = fluid.layers.concat([conv1, conv2, conv3, conv4], axis=1)
return concat
def reductionA(self, data, name=None):
pool1 = fluid.layers.pool2d(
input=data, pool_size=3, pool_stride=2, pool_type='max')
conv2 = self.conv_bn_layer(
data, 384, 3, stride=2, act='relu', name="reduction_a_3x3")
conv3 = self.conv_bn_layer(
data, 192, 1, act='relu', name="reduction_a_3x3_2_reduce")
conv3 = self.conv_bn_layer(
conv3, 224, 3, padding=1, act='relu', name="reduction_a_3x3_2")
conv3 = self.conv_bn_layer(
conv3, 256, 3, stride=2, act='relu', name="reduction_a_3x3_3")
concat = fluid.layers.concat([pool1, conv2, conv3], axis=1)
return concat
def inceptionB(self, data, name=None):
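        # The 7x7 receptive field is factorized into stacked 1x7 and 7x1
        # convolutions below, which is cheaper than a full 7x7 kernel.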
pool1 = fluid.layers.pool2d(
input=data, pool_size=3, pool_padding=1, pool_type='avg')
conv1 = self.conv_bn_layer(
pool1, 128, 1, act='relu', name="inception_b" + name + "_1x1")
conv2 = self.conv_bn_layer(
data, 384, 1, act='relu', name="inception_b" + name + "_1x1_2")
conv3 = self.conv_bn_layer(
data, 192, 1, act='relu', name="inception_b" + name + "_1x7_reduce")
conv3 = self.conv_bn_layer(
conv3,
224, (1, 7),
padding=(0, 3),
act='relu',
name="inception_b" + name + "_1x7")
conv3 = self.conv_bn_layer(
conv3,
256, (7, 1),
padding=(3, 0),
act='relu',
name="inception_b" + name + "_7x1")
conv4 = self.conv_bn_layer(
data,
192,
1,
act='relu',
name="inception_b" + name + "_7x1_2_reduce")
conv4 = self.conv_bn_layer(
conv4,
192, (1, 7),
padding=(0, 3),
act='relu',
name="inception_b" + name + "_1x7_2")
conv4 = self.conv_bn_layer(
conv4,
224, (7, 1),
padding=(3, 0),
act='relu',
name="inception_b" + name + "_7x1_2")
conv4 = self.conv_bn_layer(
conv4,
224, (1, 7),
padding=(0, 3),
act='relu',
name="inception_b" + name + "_1x7_3")
conv4 = self.conv_bn_layer(
conv4,
256, (7, 1),
padding=(3, 0),
act='relu',
name="inception_b" + name + "_7x1_3")
concat = fluid.layers.concat([conv1, conv2, conv3, conv4], axis=1)
return concat
def reductionB(self, data, name=None):
pool1 = fluid.layers.pool2d(
input=data, pool_size=3, pool_stride=2, pool_type='max')
conv2 = self.conv_bn_layer(
data, 192, 1, act='relu', name="reduction_b_3x3_reduce")
conv2 = self.conv_bn_layer(
conv2, 192, 3, stride=2, act='relu', name="reduction_b_3x3")
conv3 = self.conv_bn_layer(
data, 256, 1, act='relu', name="reduction_b_1x7_reduce")
conv3 = self.conv_bn_layer(
conv3,
256, (1, 7),
padding=(0, 3),
act='relu',
name="reduction_b_1x7")
conv3 = self.conv_bn_layer(
conv3,
320, (7, 1),
padding=(3, 0),
act='relu',
name="reduction_b_7x1")
conv3 = self.conv_bn_layer(
conv3, 320, 3, stride=2, act='relu', name="reduction_b_3x3_2")
concat = fluid.layers.concat([pool1, conv2, conv3], axis=1)
return concat
def inceptionC(self, data, name=None):
pool1 = fluid.layers.pool2d(
input=data, pool_size=3, pool_padding=1, pool_type='avg')
conv1 = self.conv_bn_layer(
pool1, 256, 1, act='relu', name="inception_c" + name + "_1x1")
conv2 = self.conv_bn_layer(
data, 256, 1, act='relu', name="inception_c" + name + "_1x1_2")
conv3 = self.conv_bn_layer(
data, 384, 1, act='relu', name="inception_c" + name + "_1x1_3")
conv3_1 = self.conv_bn_layer(
conv3,
256, (1, 3),
padding=(0, 1),
act='relu',
name="inception_c" + name + "_1x3")
conv3_2 = self.conv_bn_layer(
conv3,
256, (3, 1),
padding=(1, 0),
act='relu',
name="inception_c" + name + "_3x1")
conv4 = self.conv_bn_layer(
data, 384, 1, act='relu', name="inception_c" + name + "_1x1_4")
conv4 = self.conv_bn_layer(
conv4,
448, (1, 3),
padding=(0, 1),
act='relu',
name="inception_c" + name + "_1x3_2")
conv4 = self.conv_bn_layer(
conv4,
512, (3, 1),
padding=(1, 0),
act='relu',
name="inception_c" + name + "_3x1_2")
conv4_1 = self.conv_bn_layer(
conv4,
256, (1, 3),
padding=(0, 1),
act='relu',
name="inception_c" + name + "_1x3_3")
conv4_2 = self.conv_bn_layer(
conv4,
256, (3, 1),
padding=(1, 0),
act='relu',
name="inception_c" + name + "_3x1_3")
concat = fluid.layers.concat(
[conv1, conv2, conv3_1, conv3_2, conv4_1, conv4_2], axis=1)
return concat
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
__all__ = ['MobileNet']
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class MobileNet():
def __init__(self):
self.params = train_parameters
def net(self, input, class_dim=1000, scale=1.0):
# conv1: 112x112
input = self.conv_bn_layer(
input,
filter_size=3,
channels=3,
num_filters=int(32 * scale),
stride=2,
padding=1,
name="conv1")
# 56x56
input = self.depthwise_separable(
input,
num_filters1=32,
num_filters2=64,
num_groups=32,
stride=1,
scale=scale,
name="conv2_1")
input = self.depthwise_separable(
input,
num_filters1=64,
num_filters2=128,
num_groups=64,
stride=2,
scale=scale,
name="conv2_2")
# 28x28
input = self.depthwise_separable(
input,
num_filters1=128,
num_filters2=128,
num_groups=128,
stride=1,
scale=scale,
name="conv3_1")
input = self.depthwise_separable(
input,
num_filters1=128,
num_filters2=256,
num_groups=128,
stride=2,
scale=scale,
name="conv3_2")
# 14x14
input = self.depthwise_separable(
input,
num_filters1=256,
num_filters2=256,
num_groups=256,
stride=1,
scale=scale,
name="conv4_1")
input = self.depthwise_separable(
input,
num_filters1=256,
num_filters2=512,
num_groups=256,
stride=2,
scale=scale,
name="conv4_2")
# 14x14
for i in range(5):
input = self.depthwise_separable(
input,
num_filters1=512,
num_filters2=512,
num_groups=512,
stride=1,
scale=scale,
name="conv5" + "_" + str(i + 1))
# 7x7
input = self.depthwise_separable(
input,
num_filters1=512,
num_filters2=1024,
num_groups=512,
stride=2,
scale=scale,
name="conv5_6")
input = self.depthwise_separable(
input,
num_filters1=1024,
num_filters2=1024,
num_groups=1024,
stride=1,
scale=scale,
name="conv6")
input = fluid.layers.pool2d(
input=input,
pool_size=0,
pool_stride=1,
pool_type='avg',
global_pooling=True)
output = fluid.layers.fc(input=input,
size=class_dim,
act='softmax',
param_attr=ParamAttr(
initializer=MSRA(), name="fc7_weights"),
bias_attr=ParamAttr(name="fc7_offset"))
return output
def conv_bn_layer(self,
input,
filter_size,
num_filters,
stride,
padding,
channels=None,
num_groups=1,
act='relu',
use_cudnn=True,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
act=None,
use_cudnn=use_cudnn,
param_attr=ParamAttr(
initializer=MSRA(), name=name + "_weights"),
bias_attr=False)
bn_name = name + "_bn"
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def depthwise_separable(self,
input,
num_filters1,
num_filters2,
num_groups,
stride,
scale,
name=None):
depthwise_conv = self.conv_bn_layer(
input=input,
filter_size=3,
num_filters=int(num_filters1 * scale),
stride=stride,
padding=1,
num_groups=int(num_groups * scale),
use_cudnn=False,
name=name + "_dw")
pointwise_conv = self.conv_bn_layer(
input=depthwise_conv,
filter_size=1,
num_filters=int(num_filters2 * scale),
stride=1,
padding=0,
name=name + "_sep")
return pointwise_conv
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
__all__ = ['MobileNetV2']
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class MobileNetV2():
def __init__(self):
self.params = train_parameters
def net(self, input, class_dim=1000, scale=1.0):
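        # each tuple is (t, c, n, s): expansion factor, output channels,
        # number of repeated blocks, and stride of the first block in the group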
bottleneck_params_list = [
(1, 16, 1, 1),
(6, 24, 2, 2),
(6, 32, 3, 2),
(6, 64, 4, 2),
(6, 96, 3, 1),
(6, 160, 3, 2),
(6, 320, 1, 1),
]
#conv1
input = self.conv_bn_layer(
input,
num_filters=int(32 * scale),
filter_size=3,
stride=2,
padding=1,
if_act=True,
name='conv1_1')
# bottleneck sequences
i = 1
in_c = int(32 * scale)
for layer_setting in bottleneck_params_list:
t, c, n, s = layer_setting
i += 1
input = self.invresi_blocks(
input=input,
in_c=in_c,
t=t,
c=int(c * scale),
n=n,
s=s,
name='conv' + str(i))
in_c = int(c * scale)
#last_conv
input = self.conv_bn_layer(
input=input,
num_filters=int(1280 * scale) if scale > 1.0 else 1280,
filter_size=1,
stride=1,
padding=0,
if_act=True,
name='conv9')
input = fluid.layers.pool2d(
input=input,
pool_size=7,
pool_stride=1,
pool_type='avg',
global_pooling=True)
output = fluid.layers.fc(input=input,
size=class_dim,
act='softmax',
param_attr=ParamAttr(name='fc10_weights'),
bias_attr=ParamAttr(name='fc10_offset'))
return output
def conv_bn_layer(self,
input,
filter_size,
num_filters,
stride,
padding,
channels=None,
num_groups=1,
if_act=True,
name=None,
use_cudnn=True):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
act=None,
use_cudnn=use_cudnn,
param_attr=ParamAttr(name=name + '_weights'),
bias_attr=False)
bn_name = name + '_bn'
bn = fluid.layers.batch_norm(
input=conv,
param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
if if_act:
return fluid.layers.relu6(bn)
else:
return bn
def shortcut(self, input, data_residual):
return fluid.layers.elementwise_add(input, data_residual)
def inverted_residual_unit(self,
input,
num_in_filter,
num_filters,
ifshortcut,
stride,
filter_size,
padding,
expansion_factor,
name=None):
num_expfilter = int(round(num_in_filter * expansion_factor))
channel_expand = self.conv_bn_layer(
input=input,
num_filters=num_expfilter,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
name=name + '_expand')
bottleneck_conv = self.conv_bn_layer(
input=channel_expand,
num_filters=num_expfilter,
filter_size=filter_size,
stride=stride,
padding=padding,
num_groups=num_expfilter,
if_act=True,
name=name + '_dwise',
use_cudnn=False)
linear_out = self.conv_bn_layer(
input=bottleneck_conv,
num_filters=num_filters,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=False,
name=name + '_linear')
if ifshortcut:
out = self.shortcut(input=input, data_residual=linear_out)
return out
else:
return linear_out
def invresi_blocks(self, input, in_c, t, c, n, s, name=None):
first_block = self.inverted_residual_unit(
input=input,
num_in_filter=in_c,
num_filters=c,
ifshortcut=False,
stride=s,
filter_size=3,
padding=1,
expansion_factor=t,
name=name + '_1')
last_residual_block = first_block
last_c = c
for i in range(1, n):
last_residual_block = self.inverted_residual_unit(
input=last_residual_block,
num_in_filter=last_c,
num_filters=c,
ifshortcut=True,
stride=1,
filter_size=3,
padding=1,
expansion_factor=t,
name=name + '_' + str(i + 1))
return last_residual_block
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import math
from paddle.fluid.param_attr import ParamAttr
__all__ = ["ResNet", "ResNet50", "ResNet101", "ResNet152"]
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class ResNet():
def __init__(self, layers=50):
self.params = train_parameters
self.layers = layers
def net(self, input, class_dim=1000):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
num_filters = [64, 128, 256, 512]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu',
name="conv1")
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
if layers in [101, 152] and block == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
name=conv_name)
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
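        # classic uniform fan-in initialization: bound = 1 / sqrt(fan_in) of the fc layer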
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(input=pool,
size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv,
stdv)))
return out
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False,
name=name + '.conv2d.output.1')
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(
input=conv,
act=act,
name=bn_name + '.output.1',
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance', )
def shortcut(self, input, ch_out, stride, name):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
else:
return input
def bottleneck_block(self, input, num_filters, stride, name):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=1,
act='relu',
name=name + "_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu',
name=name + "_branch2b")
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters * 4,
filter_size=1,
act=None,
name=name + "_branch2c")
short = self.shortcut(
input, num_filters * 4, stride, name=name + "_branch1")
return fluid.layers.elementwise_add(
x=short, y=conv2, act='relu', name=name + ".add.output.5")
def ResNet50():
model = ResNet(layers=50)
return model
def ResNet101():
model = ResNet(layers=101)
return model
def ResNet152():
model = ResNet(layers=152)
return model
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import math
from paddle.fluid.param_attr import ParamAttr
__all__ = [
"SE_ResNeXt", "SE_ResNeXt50_32x4d", "SE_ResNeXt101_32x4d",
"SE_ResNeXt152_32x4d"
]
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"dropout_seed": None,
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [40, 80, 100],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class SE_ResNeXt():
def __init__(self, layers=50):
self.params = train_parameters
self.layers = layers
def net(self, input, class_dim=1000):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 6, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu',
name='conv1', )
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 101:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 23, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu',
name="conv1", )
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 152:
cardinality = 64
reduction_ratio = 16
depth = [3, 8, 36, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=3,
stride=2,
act='relu',
name='conv1')
conv = self.conv_bn_layer(
input=conv,
num_filters=64,
filter_size=3,
stride=1,
act='relu',
name='conv2')
conv = self.conv_bn_layer(
input=conv,
num_filters=128,
filter_size=3,
stride=1,
act='relu',
name='conv3')
conv = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_padding=1, \
pool_type='max')
n = 1 if layers == 50 or layers == 101 else 3
for block in range(len(depth)):
n += 1
for i in range(depth[block]):
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
cardinality=cardinality,
reduction_ratio=reduction_ratio,
name=str(n) + '_' + str(i + 1))
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
drop = fluid.layers.dropout(
x=pool, dropout_prob=0.5, seed=self.params['dropout_seed'])
stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
out = fluid.layers.fc(
input=drop,
size=class_dim,
act='softmax',
param_attr=ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name='fc6_weights'),
bias_attr=ParamAttr(name='fc6_offset'))
return out
def shortcut(self, input, ch_out, stride, name):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
filter_size = 1
return self.conv_bn_layer(
input, ch_out, filter_size, stride, name='conv' + name + '_prj')
else:
return input
def bottleneck_block(self,
input,
num_filters,
stride,
cardinality,
reduction_ratio,
name=None):
conv0 = self.conv_bn_layer(
input=input,
num_filters=num_filters,
filter_size=1,
act='relu',
name='conv' + name + '_x1')
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
groups=cardinality,
act='relu',
name='conv' + name + '_x2')
conv2 = self.conv_bn_layer(
input=conv1,
num_filters=num_filters * 2,
filter_size=1,
act=None,
name='conv' + name + '_x3')
scale = self.squeeze_excitation(
input=conv2,
num_channels=num_filters * 2,
reduction_ratio=reduction_ratio,
name='fc' + name)
short = self.shortcut(input, num_filters * 2, stride, name=name)
return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
bias_attr=False,
param_attr=ParamAttr(name=name + '_weights'), )
bn_name = name + "_bn"
return fluid.layers.batch_norm(
input=conv,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def squeeze_excitation(self,
input,
num_channels,
reduction_ratio,
name=None):
pool = fluid.layers.pool2d(
input=input, pool_size=0, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
squeeze = fluid.layers.fc(
input=pool,
size=num_channels // reduction_ratio,
act='relu',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=name + '_sqz_weights'),
bias_attr=ParamAttr(name=name + '_sqz_offset'))
stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
excitation = fluid.layers.fc(
input=squeeze,
size=num_channels,
act='sigmoid',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name=name + '_exc_weights'),
bias_attr=ParamAttr(name=name + '_exc_offset'))
scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
return scale
def SE_ResNeXt50_32x4d():
model = SE_ResNeXt(layers=50)
return model
def SE_ResNeXt101_32x4d():
model = SE_ResNeXt(layers=101)
return model
def SE_ResNeXt152_32x4d():
model = SE_ResNeXt(layers=152)
return model
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
__all__ = [
'ShuffleNetV2', 'ShuffleNetV2_x0_5', 'ShuffleNetV2_x1_0',
'ShuffleNetV2_x1_5', 'ShuffleNetV2_x2_0'
]
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class ShuffleNetV2():
def __init__(self, scale=1.0):
self.params = train_parameters
self.scale = scale
def net(self, input, class_dim=1000):
scale = self.scale
stage_repeats = [4, 8, 4]
if scale == 0.5:
stage_out_channels = [-1, 24, 48, 96, 192, 1024]
elif scale == 1.0:
stage_out_channels = [-1, 24, 116, 232, 464, 1024]
elif scale == 1.5:
stage_out_channels = [-1, 24, 176, 352, 704, 1024]
elif scale == 2.0:
stage_out_channels = [-1, 24, 224, 488, 976, 2048]
else:
raise ValueError("""{} groups is not supported for
1x1 Grouped Convolutions""".format(num_groups))
#conv1
input_channel = stage_out_channels[1]
conv1 = self.conv_bn_layer(
input=input,
filter_size=3,
num_filters=input_channel,
padding=1,
stride=2,
name='stage1_conv')
pool1 = fluid.layers.pool2d(
input=conv1,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
conv = pool1
# bottleneck sequences
for idxstage in range(len(stage_repeats)):
numrepeat = stage_repeats[idxstage]
output_channel = stage_out_channels[idxstage + 2]
for i in range(numrepeat):
if i == 0:
conv = self.inverted_residual_unit(
input=conv,
num_filters=output_channel,
stride=2,
benchmodel=2,
name=str(idxstage + 2) + '_' + str(i + 1))
else:
conv = self.inverted_residual_unit(
input=conv,
num_filters=output_channel,
stride=1,
benchmodel=1,
name=str(idxstage + 2) + '_' + str(i + 1))
conv_last = self.conv_bn_layer(
input=conv,
filter_size=1,
num_filters=stage_out_channels[-1],
padding=0,
stride=1,
name='conv5')
pool_last = fluid.layers.pool2d(
input=conv_last,
pool_size=7,
pool_stride=1,
pool_padding=0,
pool_type='avg')
output = fluid.layers.fc(input=pool_last,
size=class_dim,
act='softmax',
param_attr=ParamAttr(
initializer=MSRA(), name='fc6_weights'),
bias_attr=ParamAttr(name='fc6_offset'))
return output
def conv_bn_layer(self,
input,
filter_size,
num_filters,
stride,
padding,
num_groups=1,
use_cudnn=True,
if_act=True,
name=None):
# print(num_groups)
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
act=None,
use_cudnn=use_cudnn,
param_attr=ParamAttr(
initializer=MSRA(), name=name + '_weights'),
bias_attr=False)
bn_name = name + '_bn'
if if_act:
return fluid.layers.batch_norm(
input=conv,
act='relu',
param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
else:
return fluid.layers.batch_norm(
input=conv,
param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def channel_shuffle(self, x, groups):
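        # Shuffle channels across groups with reshape -> transpose -> reshape:
        # (N, g, C/g, H, W) is transposed to (N, C/g, g, H, W) and flattened back,
        # so channels from different groups end up interleaved.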
batchsize, num_channels, height, width = x.shape[0], x.shape[
1], x.shape[2], x.shape[3]
channels_per_group = num_channels // groups
# reshape
x = fluid.layers.reshape(
x=x, shape=[batchsize, groups, channels_per_group, height, width])
x = fluid.layers.transpose(x=x, perm=[0, 2, 1, 3, 4])
# flatten
x = fluid.layers.reshape(
x=x, shape=[batchsize, num_channels, height, width])
return x
def inverted_residual_unit(self,
input,
num_filters,
stride,
benchmodel,
name=None):
assert stride in [1, 2], \
"supported stride are {} but your stride is {}".format([1,2], stride)
oup_inc = num_filters // 2
inp = input.shape[1]
if benchmodel == 1:
x1, x2 = fluid.layers.split(
input,
num_or_sections=[input.shape[1] // 2, input.shape[1] // 2],
dim=1)
# x1 = input[:, :(input.shape[1]//2), :, :]
# x2 = input[:, (input.shape[1]//2):, :, :]
conv_pw = self.conv_bn_layer(
input=x2,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
name='stage_' + name + '_conv1')
conv_dw = self.conv_bn_layer(
input=conv_pw,
num_filters=oup_inc,
filter_size=3,
stride=stride,
padding=1,
num_groups=oup_inc,
if_act=False,
name='stage_' + name + '_conv2')
conv_linear = self.conv_bn_layer(
input=conv_dw,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
name='stage_' + name + '_conv3')
out = fluid.layers.concat([x1, conv_linear], axis=1)
else:
#branch1
conv_dw_1 = self.conv_bn_layer(
input=input,
num_filters=inp,
filter_size=3,
stride=stride,
padding=1,
num_groups=inp,
if_act=False,
name='stage_' + name + '_conv4')
conv_linear_1 = self.conv_bn_layer(
input=conv_dw_1,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
name='stage_' + name + '_conv5')
#branch2
conv_pw_2 = self.conv_bn_layer(
input=input,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
name='stage_' + name + '_conv1')
conv_dw_2 = self.conv_bn_layer(
input=conv_pw_2,
num_filters=oup_inc,
filter_size=3,
stride=stride,
padding=1,
num_groups=oup_inc,
if_act=False,
name='stage_' + name + '_conv2')
conv_linear_2 = self.conv_bn_layer(
input=conv_dw_2,
num_filters=oup_inc,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
name='stage_' + name + '_conv3')
out = fluid.layers.concat([conv_linear_1, conv_linear_2], axis=1)
return self.channel_shuffle(out, 2)
def ShuffleNetV2_x0_5():
model = ShuffleNetV2(scale=0.5)
return model
def ShuffleNetV2_x1_0():
model = ShuffleNetV2(scale=1.0)
return model
def ShuffleNetV2_x1_5():
model = ShuffleNetV2(scale=1.5)
return model
def ShuffleNetV2_x2_0():
model = ShuffleNetV2(scale=2.0)
return model
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.fluid as fluid
__all__ = ["VGGNet", "VGG11", "VGG13", "VGG16", "VGG19"]
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class VGGNet():
def __init__(self, layers=16):
self.params = train_parameters
self.layers = layers
def net(self, input, class_dim=1000):
layers = self.layers
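        # vgg_spec maps network depth to the number of 3x3 conv layers in each
        # of the five conv blocks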
vgg_spec = {
11: ([1, 1, 2, 2, 2]),
13: ([2, 2, 2, 2, 2]),
16: ([2, 2, 3, 3, 3]),
19: ([2, 2, 4, 4, 4])
}
assert layers in vgg_spec.keys(), \
"supported layers are {} but input layer is {}".format(vgg_spec.keys(), layers)
nums = vgg_spec[layers]
conv1 = self.conv_block(input, 64, nums[0], name="conv1_")
conv2 = self.conv_block(conv1, 128, nums[1], name="conv2_")
conv3 = self.conv_block(conv2, 256, nums[2], name="conv3_")
conv4 = self.conv_block(conv3, 512, nums[3], name="conv4_")
conv5 = self.conv_block(conv4, 512, nums[4], name="conv5_")
fc_dim = 4096
fc_name = ["fc6", "fc7", "fc8"]
fc1 = fluid.layers.fc(
input=conv5,
size=fc_dim,
act='relu',
param_attr=fluid.param_attr.ParamAttr(name=fc_name[0] + "_weights"),
bias_attr=fluid.param_attr.ParamAttr(name=fc_name[0] + "_offset"))
fc1 = fluid.layers.dropout(x=fc1, dropout_prob=0.5)
fc2 = fluid.layers.fc(
input=fc1,
size=fc_dim,
act='relu',
param_attr=fluid.param_attr.ParamAttr(name=fc_name[1] + "_weights"),
bias_attr=fluid.param_attr.ParamAttr(name=fc_name[1] + "_offset"))
fc2 = fluid.layers.dropout(x=fc2, dropout_prob=0.5)
out = fluid.layers.fc(
input=fc2,
size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(name=fc_name[2] + "_weights"),
bias_attr=fluid.param_attr.ParamAttr(name=fc_name[2] + "_offset"))
return out
def conv_block(self, input, num_filter, groups, name=None):
conv = input
for i in range(groups):
conv = fluid.layers.conv2d(
input=conv,
num_filters=num_filter,
filter_size=3,
stride=1,
padding=1,
act='relu',
param_attr=fluid.param_attr.ParamAttr(
name=name + str(i + 1) + "_weights"),
bias_attr=fluid.param_attr.ParamAttr(
name=name + str(i + 1) + "_offset"))
return fluid.layers.pool2d(
input=conv, pool_size=2, pool_type='max', pool_stride=2)
def VGG11():
model = VGGNet(layers=11)
return model
def VGG13():
model = VGGNet(layers=13)
return model
def VGG16():
model = VGGNet(layers=16)
return model
def VGG19():
model = VGGNet(layers=19)
return model
......@@ -169,7 +169,12 @@ def _reader_creator(file_list,
def train(data_dir=DATA_DIR):
file_list = os.path.join(data_dir, 'train_list.txt')
return _reader_creator(
file_list, 'train', shuffle=True, color_jitter=False, rotate=False, data_dir=data_dir)
file_list,
'train',
shuffle=True,
color_jitter=False,
rotate=False,
data_dir=data_dir)
def val(data_dir=DATA_DIR):
......
import os
import math
import random
import functools
import numpy as np
import paddle
import cv2
import io
random.seed(0)
np.random.seed(0)
DATA_DIM = 224
THREAD = 8
BUF_SIZE = 102400
DATA_DIR = 'data/ILSVRC2012'
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
def rotate_image(img):
""" rotate_image """
(h, w) = img.shape[:2]
center = (w / 2, h / 2)
angle = np.random.randint(-10, 11)
M = cv2.getRotationMatrix2D(center, angle, 1.0)
rotated = cv2.warpAffine(img, M, (w, h))
return rotated
def random_crop(img, size, scale=None, ratio=None):
""" random_crop """
scale = [0.08, 1.0] if scale is None else scale
ratio = [3. / 4., 4. / 3.] if ratio is None else ratio
aspect_ratio = math.sqrt(np.random.uniform(*ratio))
w = 1. * aspect_ratio
h = 1. / aspect_ratio
bound = min((float(img.shape[1]) / img.shape[0]) / (w**2),
(float(img.shape[0]) / img.shape[1]) / (h**2))
scale_max = min(scale[1], bound)
scale_min = min(scale[0], bound)
target_area = img.shape[0] * img.shape[1] * np.random.uniform(scale_min,
scale_max)
target_size = math.sqrt(target_area)
w = int(target_size * w)
h = int(target_size * h)
    i = np.random.randint(0, img.shape[0] - h + 1)
    j = np.random.randint(0, img.shape[1] - w + 1)
img = img[i:i + h, j:j + w, :]
resized = cv2.resize(img, (size, size))
return resized
def distort_color(img):
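    # color jitter is effectively disabled: the image is returned unchanged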
return img
def resize_short(img, target_size):
""" resize_short """
percent = float(target_size) / min(img.shape[0], img.shape[1])
resized_width = int(round(img.shape[1] * percent))
resized_height = int(round(img.shape[0] * percent))
resized = cv2.resize(img, (resized_width, resized_height))
return resized
def crop_image(img, target_size, center):
""" crop_image """
height, width = img.shape[:2]
size = target_size
    if center:
        w_start = (width - size) // 2
        h_start = (height - size) // 2
else:
w_start = np.random.randint(0, width - size + 1)
h_start = np.random.randint(0, height - size + 1)
w_end = w_start + size
h_end = h_start + size
img = img[h_start:h_end, w_start:w_end, :]
return img
def process_image(sample,
mode,
color_jitter,
rotate,
crop_size=224,
mean=None,
std=None):
""" process_image """
mean = [0.485, 0.456, 0.406] if mean is None else mean
std = [0.229, 0.224, 0.225] if std is None else std
img_path = sample[0]
img = cv2.imread(img_path)
if mode == 'train':
if rotate:
img = rotate_image(img)
if crop_size > 0:
img = random_crop(img, crop_size)
if color_jitter:
img = distort_color(img)
if np.random.randint(0, 2) == 1:
img = img[:, ::-1, :]
else:
if crop_size > 0:
img = resize_short(img, crop_size)
img = crop_image(img, target_size=crop_size, center=True)
img = img[:, :, ::-1].astype('float32').transpose((2, 0, 1)) / 255
img_mean = np.array(mean).reshape((3, 1, 1))
img_std = np.array(std).reshape((3, 1, 1))
img -= img_mean
img /= img_std
if mode == 'train' or mode == 'val':
return (img, sample[1])
elif mode == 'test':
return (img, )
def image_mapper(**kwargs):
""" image_mapper """
return functools.partial(process_image, **kwargs)
def _reader_creator(file_list,
mode,
shuffle=False,
color_jitter=False,
rotate=False,
data_dir=DATA_DIR):
def reader():
with open(file_list) as flist:
full_lines = [line.strip() for line in flist]
if shuffle:
                np.random.shuffle(full_lines)
if mode == 'train' and os.getenv('PADDLE_TRAINING_ROLE'):
                # distributed mode if the env var `PADDLE_TRAINING_ROLE` exists
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
trainer_count = int(os.getenv("PADDLE_TRAINERS", "1"))
per_node_lines = len(full_lines) // trainer_count
lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1)
* per_node_lines]
print(
"read images from %d, length: %d, lines length: %d, total: %d"
% (trainer_id * per_node_lines, per_node_lines, len(lines),
len(full_lines)))
else:
lines = full_lines
for line in lines:
if mode == 'train' or mode == 'val':
img_path, label = line.split()
img_path = img_path.replace("JPEG", "jpeg")
img_path = os.path.join(data_dir, img_path)
yield img_path, int(label)
elif mode == 'test':
img_path = os.path.join(DATA_DIR, line)
yield [img_path]
image_mapper = functools.partial(
process_image,
mode=mode,
color_jitter=color_jitter,
rotate=color_jitter,
crop_size=224)
reader = paddle.reader.xmap_readers(
image_mapper, reader, THREAD, BUF_SIZE, order=False)
return reader
def train(data_dir=DATA_DIR):
file_list = os.path.join(data_dir, 'train_list.txt')
return _reader_creator(
file_list,
'train',
shuffle=True,
color_jitter=False,
rotate=False,
data_dir=data_dir)
def val(data_dir=DATA_DIR):
file_list = os.path.join(data_dir, 'val_list.txt')
return _reader_creator(file_list, 'val', shuffle=False, data_dir=data_dir)
def test(data_dir=DATA_DIR):
file_list = os.path.join(data_dir, 'val_list.txt')
return _reader_creator(file_list, 'test', shuffle=False, data_dir=data_dir)
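# --- Usage sketch (added illustration; not part of the original reader module) ---
# The creators above yield one sample at a time; they are normally wrapped into
# batched readers before being handed to an executor. The batch size of 32 is an
# arbitrary example value.
def _reader_usage_example(data_dir=DATA_DIR):
    train_batches = paddle.batch(
        train(data_dir), batch_size=32, drop_last=True)
    val_batches = paddle.batch(val(data_dir), batch_size=32)
    return train_batches, val_batches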
#Hyperparameters config
python train.py \
--model=SE_ResNeXt50_32x4d \
--batch_size=32 \
--total_images=1281167 \
--class_dim=1000 \
--image_shape=3,224,224 \
--model_save_dir=output/ \
--with_mem_opt=False \
--lr_strategy=piecewise_decay \
--lr=0.1
# >log_SE_ResNeXt50_32x4d.txt 2>&1 &
#AlexNet:
#python train.py \
# --model=AlexNet \
# --batch_size=256 \
# --total_images=1281167 \
# --class_dim=1000 \
# --image_shape=3,224,224 \
# --model_save_dir=output/ \
# --with_mem_opt=False \
# --lr_strategy=piecewise_decay \
# --num_epochs=120 \
# --lr=0.01
#VGG11:
#python train.py \
# --model=VGG11 \
# --batch_size=512 \
# --total_images=1281167 \
# --class_dim=1000 \
# --image_shape=3,224,224 \
# --model_save_dir=output/ \
# --with_mem_opt=False \
# --lr_strategy=piecewise_decay \
# --num_epochs=120 \
# --lr=0.1
#MobileNet v1:
#python train.py \
# --model=MobileNet \
# --batch_size=256 \
# --total_images=1281167 \
# --class_dim=1000 \
# --image_shape=3,224,224 \
# --model_save_dir=output/ \
# --with_mem_opt=False \
# --lr_strategy=piecewise_decay \
# --num_epochs=120 \
# --lr=0.1
#ResNet50:
#python train.py \
# --model=ResNet50 \
# --batch_size=256 \
# --total_images=1281167 \
# --class_dim=1000 \
# --image_shape=3,224,224 \
# --model_save_dir=output/ \
# --with_mem_opt=False \
# --lr_strategy=piecewise_decay \
# --num_epochs=120 \
# --lr=0.1
#ResNet101:
#python train.py \
# --model=ResNet101 \
# --batch_size=256 \
# --total_images=1281167 \
# --class_dim=1000 \
# --image_shape=3,224,224 \
# --model_save_dir=output/ \
# --with_mem_opt=False \
# --lr_strategy=piecewise_decay \
# --num_epochs=120 \
# --lr=0.1
#ResNet152:
#python train.py \
# --model=ResNet152 \
# --batch_size=256 \
# --total_images=1281167 \
# --image_shape=3,224,224 \
# --lr_strategy=piecewise_decay \
# --lr=0.1 \
# --num_epochs=120 \
# --l2_decay=1e-4 \(TODO)
#SE_ResNeXt50:
#python train.py \
# --model=SE_ResNeXt50 \
# --batch_size=400 \
# --total_images=1281167 \
# --image_shape=3,224,224 \
# --lr_strategy=cosine_decay \
# --lr=0.1 \
# --num_epochs=200 \
# --l2_decay=12e-5 \(TODO)
#SE_ResNeXt101:
#python train.py \
# --model=SE_ResNeXt101 \
# --batch_size=400 \
# --total_images=1281167 \
# --image_shape=3,224,224 \
# --lr_strategy=cosine_decay \
# --lr=0.1 \
# --num_epochs=200 \
# --l2_decay=15e-5 \(TODO)
#VGG11:
#python train.py \
# --model=VGG11 \
# --batch_size=512 \
# --total_images=1281167 \
# --image_shape=3,224,224 \
# --lr_strategy=cosine_decay \
# --lr=0.1 \
# --num_epochs=90 \
# --l2_decay=2e-4 \(TODO)
#VGG13:
#python train.py
# --model=VGG13 \
# --batch_size=256 \
# --total_images=1281167 \
# --image_shape=3,224,224 \
# --lr_strategy=cosine_decay \
# --lr=0.01 \
# --num_epochs=90 \
# --l2_decay=3e-4 \(TODO)
......@@ -13,8 +13,13 @@ import paddle.dataset.flowers as flowers
import models
import reader
import argparse
from models.learning_rate import cosine_decay
import functools
import subprocess
import utils
from utils.learning_rate import cosine_decay
from utility import add_arguments, print_arguments
import models
import models_name
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
......@@ -34,20 +39,25 @@ add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate
add_arg('model', str, "SE_ResNeXt50_32x4d", "Set the network to use.")
add_arg('enable_ce', bool, False, "If set True, enable continuous evaluation job.")
add_arg('data_dir', str, "./data/ILSVRC2012", "The ImageNet dataset root dir.")
# yapf: enable
add_arg('model_category', str, "models", "Which model package to use, valid values: 'models', 'models_name'.")
# yapf: enable
model_list = [m for m in dir(models) if "__" not in m]
def set_models(model):
global models
if model == "models":
models = models
else:
models = models_name
def optimizer_setting(params):
ls = params["learning_strategy"]
if ls["name"] == "piecewise_decay":
if "total_images" not in params:
total_images = 1281167
else:
total_images = params["total_images"]
batch_size = ls["batch_size"]
step = int(total_images / batch_size + 1)
......@@ -60,6 +70,7 @@ def optimizer_setting(params):
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
elif ls["name"] == "cosine_decay":
if "total_images" not in params:
total_images = 1281167
......@@ -76,7 +87,29 @@ def optimizer_setting(params):
learning_rate=cosine_decay(
learning_rate=lr, step_each_epoch=step, epochs=num_epochs),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
regularization=fluid.regularizer.L2Decay(4e-5))
elif ls["name"] == "exponential_decay":
if "total_images" not in params:
total_images = 1281167
else:
total_images = params["total_images"]
batch_size = ls["batch_size"]
        step = int(total_images / batch_size + 1)
        lr = params["lr"]
        num_epochs = params["num_epochs"]
        learning_decay_rate_factor = ls["learning_decay_rate_factor"]
        num_epochs_per_decay = ls["num_epochs_per_decay"]
        NUM_GPUS = 1
        optimizer = fluid.optimizer.Momentum(
            learning_rate=fluid.layers.exponential_decay(
                learning_rate=lr * NUM_GPUS,
                decay_steps=step * num_epochs_per_decay / NUM_GPUS,
                decay_rate=learning_decay_rate_factor),
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(4e-5))
else:
lr = params["lr"]
optimizer = fluid.optimizer.Momentum(
......@@ -86,29 +119,16 @@ def optimizer_setting(params):
return optimizer
def net_config(image, label, model, args):
model_list = [m for m in dir(models) if "__" not in m]
assert args.model in model_list,"{} is not lists: {}".format(
args.model, model_list)
def train(args):
# parameters from arguments
class_dim = args.class_dim
model_name = args.model
checkpoint = args.checkpoint
pretrained_model = args.pretrained_model
with_memory_optimization = args.with_mem_opt
model_save_dir = args.model_save_dir
image_shape = [int(m) for m in args.image_shape.split(",")]
assert model_name in model_list, "{} is not in lists: {}".format(args.model,
model_list)
image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# model definition
model = models.__dict__[model_name]()
if args.enable_ce:
assert model_name == "SE_ResNeXt50_32x4d"
fluid.default_startup_program().random_seed = 1000
model.params["dropout_seed"] = 100
class_dim = 102
......@@ -132,9 +152,30 @@ def train(args):
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
test_program = fluid.default_main_program().clone(for_test=True)
return avg_cost, acc_top1, acc_top5
# parameters from model and arguments
def build_program(is_train, main_prog, startup_prog, args):
image_shape = [int(m) for m in args.image_shape.split(",")]
model_name = args.model
model_list = [m for m in dir(models) if "__" not in m]
assert model_name in model_list, "{} is not in lists: {}".format(args.model,
model_list)
model = models.__dict__[model_name]()
with fluid.program_guard(main_prog, startup_prog):
py_reader = fluid.layers.py_reader(
capacity=16,
shapes=[[-1] + image_shape, [-1, 1]],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
use_double_buffer=True)
with fluid.unique_name.guard():
image, label = fluid.layers.read_file(py_reader)
avg_cost, acc_top1, acc_top5 = net_config(image, label, model, args)
avg_cost.persistable = True
acc_top1.persistable = True
acc_top5.persistable = True
if is_train:
params = model.params
params["total_images"] = args.total_images
params["lr"] = args.lr
......@@ -142,32 +183,69 @@ def train(args):
params["learning_strategy"]["batch_size"] = args.batch_size
params["learning_strategy"]["name"] = args.lr_strategy
# initialize optimizer
optimizer = optimizer_setting(params)
opts = optimizer.minimize(avg_cost)
optimizer.minimize(avg_cost)
return py_reader, avg_cost, acc_top1, acc_top5
def train(args):
# parameters from arguments
model_name = args.model
checkpoint = args.checkpoint
pretrained_model = args.pretrained_model
with_memory_optimization = args.with_mem_opt
model_save_dir = args.model_save_dir
startup_prog = fluid.Program()
train_prog = fluid.Program()
test_prog = fluid.Program()
if args.enable_ce:
startup_prog.random_seed = 1000
train_prog.random_seed = 1000
train_py_reader, train_cost, train_acc1, train_acc5 = build_program(
is_train=True,
main_prog=train_prog,
startup_prog=startup_prog,
args=args)
test_py_reader, test_cost, test_acc1, test_acc5 = build_program(
is_train=False,
main_prog=test_prog,
startup_prog=startup_prog,
args=args)
test_prog = test_prog.clone(for_test=True)
if with_memory_optimization:
fluid.memory_optimize(fluid.default_main_program())
fluid.memory_optimize(train_prog)
fluid.memory_optimize(test_prog)
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
exe.run(startup_prog)
if checkpoint is not None:
fluid.io.load_persistables(exe, checkpoint)
fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)
if pretrained_model:
def if_exist(var):
return os.path.exists(os.path.join(pretrained_model, var.name))
fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
fluid.io.load_vars(
exe, pretrained_model, main_program=train_prog, predicate=if_exist)
train_batch_size = args.batch_size
test_batch_size = 16
visible_device = os.getenv('CUDA_VISIBLE_DEVICES')
if visible_device:
device_num = len(visible_device.split(','))
else:
device_num = subprocess.check_output(['nvidia-smi', '-L']).decode().count('\n')
train_batch_size = args.batch_size / device_num
test_batch_size = 8
if not args.enable_ce:
train_reader = paddle.batch(reader.train(), batch_size=train_batch_size)
train_reader = paddle.batch(
reader.train(), batch_size=train_batch_size, drop_last=True)
test_reader = paddle.batch(reader.val(), batch_size=test_batch_size)
else:
# use flowers dataset for CE and set use_xmap False to avoid disorder data
......@@ -176,26 +254,36 @@ def train(args):
random.seed(0)
np.random.seed(0)
train_reader = paddle.batch(
flowers.train(use_xmap=False), batch_size=train_batch_size)
flowers.train(use_xmap=False),
batch_size=train_batch_size,
drop_last=True)
test_reader = paddle.batch(
flowers.test(use_xmap=False), batch_size=test_batch_size)
feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
train_py_reader.decorate_paddle_reader(train_reader)
test_py_reader.decorate_paddle_reader(test_reader)
train_exe = fluid.ParallelExecutor(
use_cuda=True if args.use_gpu else False, loss_name=avg_cost.name)
main_program=train_prog,
use_cuda=bool(args.use_gpu),
loss_name=train_cost.name)
fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]
train_fetch_list = [train_cost.name, train_acc1.name, train_acc5.name]
test_fetch_list = [test_cost.name, test_acc1.name, test_acc5.name]
params = models.__dict__[args.model]().params
gpu = os.getenv("CUDA_VISIBLE_DEVICES") or ""
gpu_nums = len(gpu.split(","))
for pass_id in range(params["num_epochs"]):
train_py_reader.start()
train_info = [[], [], []]
test_info = [[], [], []]
train_time = []
for batch_id, data in enumerate(train_reader()):
batch_id = 0
try:
while True:
t1 = time.time()
loss, acc1, acc5 = train_exe.run(fetch_list, feed=feeder.feed(data))
loss, acc1, acc5 = train_exe.run(fetch_list=train_fetch_list)
t2 = time.time()
period = t2 - t1
loss = np.mean(np.array(loss))
......@@ -208,57 +296,63 @@ def train(args):
if batch_id % 10 == 0:
print("Pass {0}, trainbatch {1}, loss {2}, \
acc1 {3}, acc5 {4} time {5}"
.format(pass_id, \
batch_id, loss, acc1, acc5, \
.format(pass_id, batch_id, loss, acc1, acc5,
"%2.2f sec" % period))
sys.stdout.flush()
batch_id += 1
except fluid.core.EOFException:
train_py_reader.reset()
train_loss = np.array(train_info[0]).mean()
train_acc1 = np.array(train_info[1]).mean()
train_acc5 = np.array(train_info[2]).mean()
train_speed = np.array(train_time).mean() / train_batch_size
cnt = 0
for test_batch_id, data in enumerate(test_reader()):
test_py_reader.start()
test_batch_id = 0
try:
while True:
t1 = time.time()
loss, acc1, acc5 = exe.run(test_program,
fetch_list=fetch_list,
feed=feeder.feed(data))
loss, acc1, acc5 = exe.run(program=test_prog,
fetch_list=test_fetch_list)
t2 = time.time()
period = t2 - t1
loss = np.mean(loss)
acc1 = np.mean(acc1)
acc5 = np.mean(acc5)
test_info[0].append(loss * len(data))
test_info[1].append(acc1 * len(data))
test_info[2].append(acc5 * len(data))
cnt += len(data)
test_info[0].append(loss)
test_info[1].append(acc1)
test_info[2].append(acc5)
if test_batch_id % 10 == 0:
print("Pass {0},testbatch {1},loss {2}, \
acc1 {3},acc5 {4},time {5}"
.format(pass_id, \
test_batch_id, loss, acc1, acc5, \
.format(pass_id, test_batch_id, loss, acc1, acc5,
"%2.2f sec" % period))
sys.stdout.flush()
test_batch_id += 1
except fluid.core.EOFException:
test_py_reader.reset()
test_loss = np.sum(test_info[0]) / cnt
test_acc1 = np.sum(test_info[1]) / cnt
test_acc5 = np.sum(test_info[2]) / cnt
test_loss = np.array(test_info[0]).mean()
test_acc1 = np.array(test_info[1]).mean()
test_acc5 = np.array(test_info[2]).mean()
print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
"test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(pass_id, \
train_loss, train_acc1, train_acc5, test_loss, test_acc1, \
test_acc5))
"test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(
pass_id, train_loss, train_acc1, train_acc5, test_loss,
test_acc1, test_acc5))
sys.stdout.flush()
model_path = os.path.join(model_save_dir + '/' + model_name,
str(pass_id))
if not os.path.isdir(model_path):
os.makedirs(model_path)
fluid.io.save_persistables(exe, model_path)
fluid.io.save_persistables(exe, model_path, main_program=train_prog)
# This is for continuous evaluation only
if args.enable_ce and pass_id == args.num_epochs - 1:
if gpu_nums == 1:
if device_num == 1:
# Use the mean cost/acc for training
print("kpis train_cost %s" % train_loss)
print("kpis train_acc_top1 %s" % train_acc1)
......@@ -270,18 +364,24 @@ def train(args):
print("kpis train_speed %s" % train_speed)
else:
# Use the mean cost/acc for training
print("kpis train_cost_card%s %s" % (gpu_nums, train_loss))
print("kpis train_acc_top1_card%s %s" % (gpu_nums, train_acc1))
print("kpis train_acc_top5_card%s %s" % (gpu_nums, train_acc5))
print("kpis train_cost_card%s %s" % (device_num, train_loss))
print("kpis train_acc_top1_card%s %s" %
(device_num, train_acc1))
print("kpis train_acc_top5_card%s %s" %
(device_num, train_acc5))
# Use the mean cost/acc for testing
print("kpis test_cost_card%s %s" % (gpu_nums, test_loss))
print("kpis test_acc_top1_card%s %s" % (gpu_nums, test_acc1))
print("kpis test_acc_top5_card%s %s" % (gpu_nums, test_acc5))
print("kpis train_speed_card%s %s" % (gpu_nums, train_speed))
print("kpis test_cost_card%s %s" % (device_num, test_loss))
print("kpis test_acc_top1_card%s %s" % (device_num, test_acc1))
print("kpis test_acc_top5_card%s %s" % (device_num, test_acc5))
print("kpis train_speed_card%s %s" % (device_num, train_speed))
def main():
args = parser.parse_args()
models_now = args.model_category
assert models_now in ["models", "models_name"], "{} is not in lists: {}".format(
models_now, ["models", "models_name"])
set_models(models_now)
print_arguments(args)
train(args)
......
from .learning_rate import cosine_decay, lr_warmup
......@@ -27,8 +27,8 @@ def lr_warmup(learning_rate, warmup_steps, start_lr, end_lr):
Argument learning_rate can be float or a Variable
lr = lr + (warmup_rate * step / warmup_steps)
"""
assert(isinstance(end_lr, float))
assert(isinstance(start_lr, float))
assert (isinstance(end_lr, float))
assert (isinstance(start_lr, float))
linear_step = end_lr - start_lr
with fluid.default_main_program()._lr_schedule_guard():
lr = fluid.layers.tensor.create_global_var(
......@@ -42,7 +42,8 @@ def lr_warmup(learning_rate, warmup_steps, start_lr, end_lr):
with fluid.layers.control_flow.Switch() as switch:
with switch.case(global_step < warmup_steps):
decayed_lr = start_lr + linear_step * (global_step / warmup_steps)
decayed_lr = start_lr + linear_step * (global_step /
warmup_steps)
fluid.layers.tensor.assign(decayed_lr, lr)
with switch.default():
fluid.layers.tensor.assign(learning_rate, lr)
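# A minimal plain-Python sketch of the same piecewise schedule (added for
# illustration only, not part of this module): during the first warmup_steps
# the rate ramps linearly from start_lr to end_lr, after which the base
# learning_rate takes over. All arguments are assumed to be plain Python
# numbers rather than Fluid variables.
def warmup_lr_value(step, warmup_steps, start_lr, end_lr, learning_rate):
    """Return the learning rate used at a given global step."""
    if step < warmup_steps:
        return start_lr + (end_lr - start_lr) * float(step) / warmup_steps
    return learning_rate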
......
......@@ -80,7 +80,7 @@
During training, use the options `--train_images` and `--train_list` to point to the prepared `train_images` and `train_list`, respectively.
>**Note:** If neither `--train_images` nor `--train_list` is set (or they are set to None), ctc_reader.py automatically downloads the [sample data](http://paddle-ocr-data.bj.bcebos.com/data.tar.gz) and caches it under `$HOME/.cache/paddle/dataset/ctc_data/data/`.
>**Note:** If neither `--train_images` nor `--train_list` is set (or they are set to None), reader.py automatically downloads the [sample data](http://paddle-ocr-data.bj.bcebos.com/data.tar.gz) and caches it under `$HOME/.cache/paddle/dataset/ctc_data/data/`.
**B. Test set and evaluation set**
......@@ -119,17 +119,17 @@ data/test_images/00003.jpg
Train on a single GPU with the default data:
```
env CUDA_VISIBLE_DEVICES=0 python ctc_train.py
env CUDA_VISIBLE_DEVICES=0 python train.py
```
Train on CPU with the default data:
```
env OMP_NUM_THREADS=<num_of_physical_cores> python ctc_train.py --use_gpu False --parallel=False
env OMP_NUM_THREADS=<num_of_physical_cores> python train.py --use_gpu False --parallel=False
```
Train on multiple GPUs with the default data:
```
env CUDA_VISIBLE_DEVICES=0,1,2,3 python ctc_train.py --parallel=True
env CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --parallel=True
```
The `CTC model` is used by default; it can be switched to the `attention model` with the option `--model="attention"`.
......@@ -197,3 +197,10 @@ env CUDA_VISIBLE_DEVICE=0 python infer.py \
--model_path="models/model_00044_15000" \
--input_images_list="data/test.list"
```
## Pretrained models
|Model| Error rate|
|- |:-: |
|[ocr_ctc_params](https://drive.google.com/open?id=1gsg2ODO2_F2pswXwW5MXpf8RY8-BMRyZ) | 22.3% |
|[ocr_attention_params](https://drive.google.com/open?id=1Bx7-94mngyTaMA5kVjzYHDPAdXxOYbRm) | 15.8%|
......@@ -38,12 +38,10 @@ def infer(model_path, batch_size, test_data_file, vocab_file, target_file,
for data in test_data():
word = to_lodtensor([x[0] for x in data], place)
mark = to_lodtensor([x[1] for x in data], place)
target = to_lodtensor([x[2] for x in data], place)
crf_decode = exe.run(
inference_program,
feed={"word": word,
"mark": mark,
"target": target},
"mark": mark},
fetch_list=fetch_targets,
return_numpy=False)
lod_info = (crf_decode[0].lod())[0]
......
......@@ -61,9 +61,6 @@ def main(train_data_file,
avg_cost, feature_out, word, mark, target = ner_net(
word_dict_len, label_dict_len, parallel)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
sgd_optimizer.minimize(avg_cost)
crf_decode = fluid.layers.crf_decoding(
input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
......@@ -77,6 +74,8 @@ def main(train_data_file,
inference_program = fluid.default_main_program().clone(for_test=True)
test_fetch_list = [num_infer_chunks, num_label_chunks, num_correct_chunks]
sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
sgd_optimizer.minimize(avg_cost)
if "CE_MODE_X" not in os.environ:
train_reader = paddle.batch(
......@@ -135,7 +134,7 @@ def main(train_data_file,
" pass_f1_score:" + str(test_pass_f1_score))
save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id)
fluid.io.save_inference_model(save_dirname, ['word', 'mark', 'target'],
fluid.io.save_inference_model(save_dirname, ['word', 'mark'],
crf_decode, exe)
if "CE_MODE_X" in os.environ:
......
#!/bin/bash
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
cudaid=${text_matching_on_quora:=0} # use 0-th card as default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce | python _ce.py
cudaid=${text_matching_on_quora_m:=0,1,2,3} # use 0,1,2,3 card as default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce | python _ce.py
# this file is only used for continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi
from kpi import DurationKpi
each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.05, 0, actived=True)
train_avg_cost_card1_kpi = CostKpi('train_avg_cost_card1', 0.2, 0)
train_avg_acc_card1_kpi = CostKpi('train_avg_acc_card1', 0.02, 0)
each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.05, 0, actived=True)
train_avg_cost_card4_kpi = CostKpi('train_avg_cost_card4', 0.2, 0)
train_avg_acc_card4_kpi = CostKpi('train_avg_acc_card4', 0.02, 0)
tracking_kpis = [
each_pass_duration_card1_kpi,
train_avg_cost_card1_kpi,
train_avg_acc_card1_kpi,
each_pass_duration_card4_kpi,
train_avg_cost_card4_kpi,
train_avg_acc_card4_kpi,
]
def parse_log(log):
'''
This method should be implemented by model developers.
The suggestion:
each line in the log should be key, value, for example:
"
train_cost\t1.0
test_cost\t1.0
train_cost\t1.0
train_cost\t1.0
train_acc\t1.2
"
'''
for line in log.split('\n'):
fs = line.strip().split('\t')
print(fs)
if len(fs) == 3 and fs[0] == 'kpis':
kpi_name = fs[1]
kpi_value = float(fs[2])
yield kpi_name, kpi_value
def log_to_ce(log):
kpi_tracker = {}
for kpi in tracking_kpis:
kpi_tracker[kpi.name] = kpi
for (kpi_name, kpi_value) in parse_log(log):
print(kpi_name, kpi_value)
kpi_tracker[kpi_name].add_record(kpi_value)
kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
log = sys.stdin.read()
log_to_ce(log)
......@@ -21,6 +21,11 @@ import numpy as np
import time, datetime
import os, sys
def maybe_open(filepath):
if sys.version_info <= (3, 0): # for python2
return open(filepath, 'r')
else:
return open(filepath, 'r', encoding="utf-8")
def Glove840B_300D(filepath, keys=None):
"""
......@@ -33,7 +38,7 @@ def Glove840B_300D(filepath, keys=None):
print("please wait for a minute.")
start = time.time()
word2vec = {}
with open(filepath, "r") as f:
with maybe_open(filepath) as f:
for line in f:
if sys.version_info <= (3, 0): # for python2
line = line.decode('utf-8')
......
......@@ -68,8 +68,10 @@ def maybe_open(file_name):
" |- readme.txt\n"
" |- wordvec.txt\n")
raise RuntimeError(msg)
if sys.version_info <= (3, 0): # for python2
return open(file_name, 'r')
else:
return open(file_name, 'r', encoding="utf-8")
def tokenized_question_pairs(file_name):
......
......@@ -33,6 +33,7 @@ parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('--model_name', type=str, default='cdssmNet', help="Which model to train")
parser.add_argument('--config', type=str, default='cdssm_base', help="The global config setting")
parser.add_argument('--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
DATA_DIR = os.path.join(os.path.expanduser('~'), '.cache/paddle/dataset')
......@@ -139,6 +140,13 @@ def train_and_evaluate(train_reader,
else:
feeder = fluid.DataFeeder(feed_list=[q1, q2, mask1, mask2, label], place=place)
# only for ce
args = parser.parse_args()
if args.enable_ce:
SEED = 102
fluid.default_startup_program().random_seed = SEED
fluid.default_main_program().random_seed = SEED
# logging param info
for param in fluid.default_main_program().global_block().all_parameters():
print("param name: %s; param shape: %s" % (param.name, param.shape))
......@@ -167,8 +175,10 @@ def train_and_evaluate(train_reader,
metric_type=global_config.metric_type)
# start training
total_time = 0.0
print("[%s] Start Training" % time.asctime(time.localtime(time.time())))
for epoch_id in range(global_config.epoch_num):
data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
batch_id = 0
epoch_begin_time = time.time()
......@@ -177,8 +187,8 @@ def train_and_evaluate(train_reader,
feed=feeder.feed(data),
fetch_list=[cost, acc])
data_size = len(data)
total_acc += data_size * avg_acc_np
total_cost += data_size * avg_cost_np
total_acc += data_size * avg_acc_np[0]
total_cost += data_size * avg_cost_np[0]
data_count += data_size
if batch_id % 100 == 0:
print("[%s] epoch_id: %d, batch_id: %d, cost: %f, acc: %f" % (
......@@ -188,9 +198,10 @@ def train_and_evaluate(train_reader,
avg_cost_np,
avg_acc_np))
batch_id += 1
avg_cost = total_cost / data_count
avg_acc = total_acc / data_count
epoch_end_time = time.time()
total_time += epoch_end_time - epoch_begin_time
print("")
print("[%s] epoch_id: %d, train_avg_cost: %f, train_avg_acc: %f, epoch_time_cost: %f" % (
......@@ -198,6 +209,19 @@ def train_and_evaluate(train_reader,
epoch_id, avg_cost, avg_acc,
time.time() - epoch_begin_time))
# only for ce
if epoch_id == global_config.epoch_num - 1 and args.enable_ce:
            #Note: The following logs are used only for CE monitoring.
            #They can be ignored in other situations.
gpu_num = get_cards(args)
print("kpis\teach_pass_duration_card%s\t%s" % \
(gpu_num, total_time / (global_config.epoch_num)))
print("kpis\ttrain_avg_cost_card%s\t%s" %
(gpu_num, avg_cost))
print("kpis\ttrain_avg_acc_card%s\t%s" %
(gpu_num, avg_acc))
epoch_model = global_config.save_dirname + "/" + "epoch" + str(epoch_id)
fluid.io.save_inference_model(epoch_model, ["question1", "question2", "label"], acc, exe)
......@@ -267,5 +291,15 @@ def main():
use_cuda=global_config.use_cuda,
parallel=False)
def get_cards(args):
if args.enable_ce:
cards = os.environ.get('CUDA_VISIBLE_DEVICES')
num = len(cards.split(","))
return num
else:
return args.num_devices
if __name__ == "__main__":
main()
......@@ -74,3 +74,6 @@ python infer.py \
1. Use preprocess.py to process the training data and generate train.txt.
1. Split train.txt into one part per machine in the cluster and place one part on each machine.
1. Start the distributed training job with the command line from the `Distributed Train` section above.
## Run cluster training on PaddleCloud
If you are running cluster training on PaddleCloud, you can use the ```cloud.py``` script to help submit the job; the arguments needed by ```train.py``` can be passed through PaddleCloud environment variables.
\ No newline at end of file
......@@ -91,3 +91,6 @@ Note: The AUC value in the last log info is the total AUC for all test dataset.
1. Prepare dataset using preprocess.py.
1. Split the train.txt to trainer_num parts and put them on the machines.
1. Run training with the cluster train using the command in `Distributed Train` above.
## Train on PaddleCloud
If you want to run this training on PaddleCloud, you can use the ```cloud.py``` script; the arguments in ```train.py``` can be set through PaddleCloud environment variables.
\ No newline at end of file
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ======================================================================
#
# Copyright (c) 2017 Baidu.com, Inc. All Rights Reserved
#
# ======================================================================
"""this file is only for PaddleCloud"""
import os
import logging
import paddle.fluid.contrib.utils.hdfs_utils as hdfs_utils
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("cloud")
logger.setLevel(logging.INFO)
def run():
cmd = "python -u train.py "
cmd += " --train_data_path %s " % "data/train.txt"
cmd += " --test_data_path %s " % "data/test.txt"
if os.getenv("BATCH_SIZE", ""):
cmd += " --batch_size %s " % os.getenv("BATCH_SIZE")
if os.getenv("EMBEDDING_SIZE", ""):
cmd += " --embedding_size %s " % os.getenv("EMBEDDING_SIZE")
if os.getenv("NUM_PASSES", ""):
cmd += " --num_passes %s " % os.getenv("NUM_PASSES")
if os.getenv("MODEL_OUTPUT_DIR", ""):
cmd += " --model_output_dir %s " % os.getenv("MODEL_OUTPUT_DIR")
if os.getenv("SPARSE_FEATURE_DIM", ""):
cmd += " --sparse_feature_dim %s " % os.getenv("SPARSE_FEATURE_DIM")
if os.getenv("ASYNC_MODE", ""):
cmd += " --async_mode "
if os.getenv("NO_SPLIT_VAR", ""):
cmd += " --no_split_var "
is_local = int(os.getenv("PADDLE_IS_LOCAL", "1"))
if is_local:
cmd += " --is_local 1 "
cmd += " --cloud_train 0 "
else:
cmd += " --is_local 0 "
cmd += " --cloud_train 1 "
trainer_id = int(os.environ["PADDLE_TRAINER_ID"])
trainers = int(os.environ["PADDLE_TRAINERS"])
training_role = os.environ["PADDLE_TRAINING_ROLE"]
port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
eplist = []
for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist)
current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
if training_role == "PSERVER":
cmd += " --role pserver "
else:
cmd += " --role trainer "
cmd += " --endpoints %s " % pserver_endpoints
cmd += " --current_endpoint %s " % current_endpoint
cmd += " --trainer_id %s " % trainer_id
cmd += " --trainers %s " % trainers
logging.info("run cluster commands: {}".format(cmd))
exit(os.system(cmd))
def download():
hadoop_home = os.getenv("HADOOP_HOME")
configs = {}
configs["fs.default.name"] = os.getenv("DATA_FS_NAME")
configs["hadoop.job.ugi"] = os.getenv("DATA_FS_UGI")
client = hdfs_utils.HDFSClient(hadoop_home, configs)
local_train_data_dir = os.getenv("TRAIN_DATA_LOCAL", "data")
hdfs_train_data_dir = os.getenv("TRAIN_DATA_HDFS", "")
downloads = hdfs_utils.multi_download(client, hdfs_train_data_dir, local_train_data_dir, 0, 1, multi_processes=1)
print(downloads)
for d in downloads:
base_dir = os.path.dirname(d)
tar_cmd = "tar -zxvf {} -C {}".format(d, base_dir)
        print(tar_cmd)
for d in downloads:
base_dir = os.path.dirname(d)
tar_cmd = "tar -zxvf {} -C {}".format(d, base_dir)
logging.info("DOWNLOAD DATA: {}, AND TAR IT: {}".format(d, tar_cmd))
os.system(tar_cmd)
def env_declar():
logging.info("******** Rename Cluster Env to PaddleFluid Env ********")
if os.environ["TRAINING_ROLE"] == "PSERVER" or os.environ["PADDLE_IS_LOCAL"] == "0":
os.environ["PADDLE_TRAINING_ROLE"] = os.environ["TRAINING_ROLE"]
os.environ["PADDLE_PSERVER_PORT"] = os.environ["PADDLE_PORT"]
os.environ["PADDLE_PSERVER_IPS"] = os.environ["PADDLE_PSERVERS"]
os.environ["PADDLE_TRAINERS"] = os.environ["PADDLE_TRAINERS_NUM"]
os.environ["PADDLE_CURRENT_IP"] = os.environ["POD_IP"]
os.environ["PADDLE_TRAINER_ID"] = os.environ["PADDLE_TRAINER_ID"]
os.environ["CPU_NUM"] = os.getenv("CPU_NUM", "12")
os.environ["NUM_THREADS"] = os.getenv("NUM_THREADS", "12")
logging.info("Content-Type: text/plain\n\n")
for key in os.environ.keys():
logging.info("%30s %s \n" % (key, os.environ[key]))
logging.info("****** Rename Cluster Env to PaddleFluid Env END ******")
if __name__ == '__main__':
env_declar()
if os.getenv("NEED_CUSTOM_DOWNLOAD", ""):
if os.environ["PADDLE_TRAINING_ROLE"] == "PSERVER":
logging.info("PSERVER do not need to download datas")
else:
logging.info("NEED_CUSTOM_DOWNLOAD is True, will download train data with hdfs_utils")
download()
run()
......@@ -3,14 +3,108 @@ import math
dense_feature_dim = 13
def ctr_dnn_model(embedding_size, sparse_feature_dim):
dense_input = fluid.layers.data(
name="dense_input", shape=[dense_feature_dim], dtype='float32')
def ctr_deepfm_model(factor_size, sparse_feature_dim, dense_feature_dim, sparse_input):
def dense_fm_layer(input, emb_dict_size, factor_size, fm_param_attr):
"""
dense_fm_layer
"""
first_order = fluid.layers.fc(input=input, size=1)
emb_table = fluid.layers.create_parameter(shape=[emb_dict_size, factor_size],
dtype='float32', attr=fm_param_attr)
input_mul_factor = fluid.layers.matmul(input, emb_table)
input_mul_factor_square = fluid.layers.square(input_mul_factor)
input_square = fluid.layers.square(input)
factor_square = fluid.layers.square(emb_table)
input_square_mul_factor_square = fluid.layers.matmul(input_square, factor_square)
second_order = 0.5 * (input_mul_factor_square - input_square_mul_factor_square)
return first_order, second_order
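    # The FM layers in this model (dense above, sparse below) rely on the
    # standard factorization-machine identity for the pairwise interaction term:
    #   sum_{i<j} <v_i, v_j> x_i x_j
    #       = 0.5 * sum_f [ (sum_i v_{i,f} x_i)^2 - sum_i (v_{i,f} x_i)^2 ]
    # which is why second_order is computed as 0.5 * ((x V)^2 - (x^2)(V^2));
    # this reduces the cost of the interaction term from O(n^2 * k) to O(n * k).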
def sparse_fm_layer(input, emb_dict_size, factor_size, fm_param_attr):
"""
sparse_fm_layer
"""
first_embeddings = fluid.layers.embedding(
input=input, dtype='float32', size=[emb_dict_size, 1], is_sparse=True)
first_order = fluid.layers.sequence_pool(input=first_embeddings, pool_type='sum')
nonzero_embeddings = fluid.layers.embedding(
input=input, dtype='float32', size=[emb_dict_size, factor_size],
param_attr=fm_param_attr, is_sparse=True)
summed_features_emb = fluid.layers.sequence_pool(input=nonzero_embeddings, pool_type='sum')
summed_features_emb_square = fluid.layers.square(summed_features_emb)
squared_features_emb = fluid.layers.square(nonzero_embeddings)
squared_sum_features_emb = fluid.layers.sequence_pool(
input=squared_features_emb, pool_type='sum')
second_order = 0.5 * (summed_features_emb_square - squared_sum_features_emb)
return first_order, second_order
dense_input = fluid.layers.data(name="dense_input", shape=[dense_feature_dim], dtype='float32')
sparse_input_ids = [
fluid.layers.data(
name="C" + str(i), shape=[1], lod_level=1, dtype='int64')
for i in range(1, 27)
]
fluid.layers.data(name="C" + str(i), shape=[1], lod_level=1, dtype='int64')
for i in range(1, 27)]
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
datas = [dense_input] + sparse_input_ids + [label]
py_reader = fluid.layers.create_py_reader_by_data(capacity=64,
feed_list=datas,
name='py_reader',
use_double_buffer=True)
words = fluid.layers.read_file(py_reader)
sparse_fm_param_attr = fluid.param_attr.ParamAttr(name="SparseFeatFactors",
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(sparse_feature_dim)))
dense_fm_param_attr = fluid.param_attr.ParamAttr(name="DenseFeatFactors",
initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(dense_feature_dim)))
sparse_fm_first, sparse_fm_second = sparse_fm_layer(
sparse_input, sparse_feature_dim, factor_size, sparse_fm_param_attr)
dense_fm_first, dense_fm_second = dense_fm_layer(
dense_input, dense_feature_dim, factor_size, dense_fm_param_attr)
def embedding_layer(input):
"""embedding_layer"""
emb = fluid.layers.embedding(
input=input, dtype='float32', size=[sparse_feature_dim, factor_size],
param_attr=sparse_fm_param_attr, is_sparse=True)
return fluid.layers.sequence_pool(input=emb, pool_type='average')
sparse_embed_seq = list(map(embedding_layer, sparse_input_ids))
concated = fluid.layers.concat(sparse_embed_seq + [dense_input], axis=1)
fc1 = fluid.layers.fc(input=concated, size=400, act='relu',
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(concated.shape[1]))))
fc2 = fluid.layers.fc(input=fc1, size=400, act='relu',
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc1.shape[1]))))
fc3 = fluid.layers.fc(input=fc2, size=400, act='relu',
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc2.shape[1]))))
predict = fluid.layers.fc(
input=[sparse_fm_first, sparse_fm_second, dense_fm_first, dense_fm_second, fc3],
size=2,
act="softmax",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1 / math.sqrt(fc3.shape[1]))))
cost = fluid.layers.cross_entropy(input=predict, label=words[-1])
avg_cost = fluid.layers.reduce_sum(cost)
accuracy = fluid.layers.accuracy(input=predict, label=words[-1])
auc_var, batch_auc_var, auc_states = \
fluid.layers.auc(input=predict, label=words[-1], num_thresholds=2 ** 12, slide_steps=20)
return avg_cost, auc_var, batch_auc_var, py_reader
def ctr_dnn_model(embedding_size, sparse_feature_dim):
def embedding_layer(input):
return fluid.layers.embedding(
......@@ -20,27 +114,46 @@ def ctr_dnn_model(embedding_size, sparse_feature_dim):
# if you want to set is_distributed to True
is_distributed=False,
size=[sparse_feature_dim, embedding_size],
param_attr=fluid.ParamAttr(name="SparseFeatFactors", initializer=fluid.initializer.Uniform()))
param_attr=fluid.ParamAttr(name="SparseFeatFactors",
initializer=fluid.initializer.Uniform()))
sparse_embed_seq = map(embedding_layer, sparse_input_ids)
concated = fluid.layers.concat(sparse_embed_seq + [dense_input], axis=1)
dense_input = fluid.layers.data(
name="dense_input", shape=[dense_feature_dim], dtype='float32')
sparse_input_ids = [
fluid.layers.data(name="C" + str(i), shape=[1], lod_level=1, dtype='int64')
for i in range(1, 27)]
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
datas = [dense_input] + sparse_input_ids + [label]
py_reader = fluid.layers.create_py_reader_by_data(capacity=64,
feed_list=datas,
name='py_reader',
use_double_buffer=True)
words = fluid.layers.read_file(py_reader)
sparse_embed_seq = list(map(embedding_layer, words[1:-1]))
concated = fluid.layers.concat(sparse_embed_seq + words[0:1], axis=1)
fc1 = fluid.layers.fc(input=concated, size=400, act='relu',
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1/math.sqrt(concated.shape[1]))))
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(concated.shape[1]))))
fc2 = fluid.layers.fc(input=fc1, size=400, act='relu',
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1/math.sqrt(fc1.shape[1]))))
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc1.shape[1]))))
fc3 = fluid.layers.fc(input=fc2, size=400, act='relu',
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1/math.sqrt(fc2.shape[1]))))
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc2.shape[1]))))
predict = fluid.layers.fc(input=fc3, size=2, act='softmax',
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1/math.sqrt(fc3.shape[1]))))
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
data_list = [dense_input] + sparse_input_ids + [label]
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
scale=1 / math.sqrt(fc3.shape[1]))))
cost = fluid.layers.cross_entropy(input=predict, label=label)
cost = fluid.layers.cross_entropy(input=predict, label=words[-1])
avg_cost = fluid.layers.reduce_sum(cost)
accuracy = fluid.layers.accuracy(input=predict, label=label)
auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict, label=label, num_thresholds=2**12, slide_steps=20)
accuracy = fluid.layers.accuracy(input=predict, label=words[-1])
auc_var, batch_auc_var, auc_states = \
fluid.layers.auc(input=predict, label=words[-1], num_thresholds=2 ** 12, slide_steps=20)
return avg_cost, data_list, auc_var, batch_auc_var
return avg_cost, auc_var, batch_auc_var, py_reader
......@@ -51,7 +51,7 @@ class CategoryDictGenerator:
return res
def dicts_sizes(self):
return map(len, self.dicts)
return list(map(len, self.dicts))
class ContinuousFeatureGenerator:
......@@ -61,8 +61,8 @@ class ContinuousFeatureGenerator:
def __init__(self, num_feature):
self.num_feature = num_feature
self.min = [sys.maxint] * num_feature
self.max = [-sys.maxint] * num_feature
self.min = [sys.maxsize] * num_feature
self.max = [-sys.maxsize] * num_feature
def build(self, datafile, continous_features):
with open(datafile, 'r') as f:
......
......@@ -5,14 +5,18 @@ import logging
import os
import time
# disable gpu training for this example
os.environ["CUDA_VISIBLE_DEVICES"] = ""
import numpy as np
import paddle
import paddle.fluid as fluid
import reader
from network_conf import ctr_dnn_model
from multiprocessing import cpu_count
# disable gpu training for this example
os.environ["CUDA_VISIBLE_DEVICES"] = ""
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(message)s')
......@@ -107,7 +111,7 @@ def parse_args():
return parser.parse_args()
def train_loop(args, train_program, data_list, loss, auc_var, batch_auc_var,
def train_loop(args, train_program, py_reader, loss, auc_var, batch_auc_var,
trainer_num, trainer_id):
dataset = reader.CriteoDataset(args.sparse_feature_dim)
train_reader = paddle.batch(
......@@ -115,28 +119,56 @@ def train_loop(args, train_program, data_list, loss, auc_var, batch_auc_var,
dataset.train([args.train_data_path], trainer_num, trainer_id),
buf_size=args.batch_size * 100),
batch_size=args.batch_size)
place = fluid.CPUPlace()
feeder = fluid.DataFeeder(feed_list=data_list, place=place)
data_name_list = [var.name for var in data_list]
py_reader.decorate_paddle_reader(train_reader)
data_name_list = []
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exec_strategy = fluid.ExecutionStrategy()
build_strategy = fluid.BuildStrategy()
if os.getenv("NUM_THREADS", ""):
exec_strategy.num_threads = int(os.getenv("NUM_THREADS"))
cpu_num = int(os.environ.get('CPU_NUM', cpu_count()))
build_strategy.reduce_strategy = \
fluid.BuildStrategy.ReduceStrategy.Reduce if cpu_num > 1 \
else fluid.BuildStrategy.ReduceStrategy.AllReduce
pe = fluid.ParallelExecutor(
use_cuda=False,
loss_name=loss.name,
main_program=train_program,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
exe.run(fluid.default_startup_program())
for pass_id in range(args.num_passes):
pass_start = time.time()
for batch_id, data in enumerate(train_reader()):
loss_val, auc_val, batch_auc_val = exe.run(
train_program,
feed=feeder.feed(data),
fetch_list=[loss, auc_var, batch_auc_var]
)
batch_id = 0
py_reader.start()
try:
while True:
loss_val, auc_val, batch_auc_val = pe.run(fetch_list=[loss.name, auc_var.name, batch_auc_var.name])
loss_val = np.mean(loss_val)
auc_val = np.mean(auc_val)
batch_auc_val = np.mean(batch_auc_val)
logger.info("TRAIN --> pass: {} batch: {} loss: {} auc: {}, batch_auc: {}"
.format(pass_id, batch_id, loss_val/args.batch_size, auc_val, batch_auc_val))
if batch_id % 1000 == 0 and batch_id != 0:
model_dir = args.model_output_dir + '/batch-' + str(batch_id)
if args.trainer_id == 0:
fluid.io.save_inference_model(model_dir, data_name_list, [loss, auc_var], exe)
batch_id += 1
except fluid.core.EOFException:
py_reader.reset()
print("pass_id: %d, pass_time_cost: %f" % (pass_id, time.time() - pass_start))
model_dir = args.model_output_dir + '/pass-' + str(pass_id)
if args.trainer_id == 0:
fluid.io.save_inference_model(model_dir, data_name_list, [loss, auc_var], exe)
......@@ -148,7 +180,7 @@ def train():
if not os.path.isdir(args.model_output_dir):
os.mkdir(args.model_output_dir)
loss, data_list, auc_var, batch_auc_var = ctr_dnn_model(args.embedding_size, args.sparse_feature_dim)
loss, auc_var, batch_auc_var, py_reader = ctr_dnn_model(args.embedding_size, args.sparse_feature_dim)
optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
optimizer.minimize(loss)
if args.cloud_train:
......@@ -166,11 +198,10 @@ def train():
args.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
args.is_local = bool(int(os.getenv("PADDLE_IS_LOCAL", 0)))
if args.is_local:
logger.info("run local training")
main_program = fluid.default_main_program()
train_loop(args, main_program, data_list, loss, auc_var, batch_auc_var, 1, 0)
train_loop(args, main_program, py_reader, loss, auc_var, batch_auc_var, 1, 0)
else:
logger.info("run dist training")
t = fluid.DistributeTranspiler()
......@@ -185,7 +216,7 @@ def train():
elif args.role == "trainer" or args.role == "TRAINER":
logger.info("run trainer")
train_prog = t.get_trainer_program()
train_loop(args, train_prog, data_list, loss, auc_var, batch_auc_var,
train_loop(args, train_prog, py_reader, loss, auc_var, batch_auc_var,
args.trainers, args.trainer_id)
else:
raise ValueError(
......
......@@ -126,6 +126,8 @@ CPU 环境
python train.py --train_dir train_data/
```
Note that when running a single-machine multi-card job (--parallel 1) in a CPU environment, batch_size should be larger than the number of CPU cores.
## Customizing the network structure
The network structure can be adjusted in the `network` function of [net.py](./net.py); the current structure is as follows:
......
......@@ -78,6 +78,6 @@ if __name__ == "__main__":
test_dir, "", batch_size=batch_size,
buffer_size=1000, word_freq_threshold=0, is_train=False)
for epoch in xrange(start_index, last_index + 1):
for epoch in range(start_index, last_index + 1):
epoch_path = model_dir + "/epoch_" + str(epoch)
infer(test_reader=test_reader, use_cuda=use_cuda, model_path=epoch_path)
......@@ -80,6 +80,6 @@ if __name__ == "__main__":
test_dir, "", vocab_tag_path, batch_size=1,
neg_size=0, buffer_size=1000, is_train=False)
for epoch in xrange(start_index, last_index + 1):
for epoch in range(start_index, last_index + 1):
epoch_path = model_dir + "/epoch_" + str(epoch)
infer(test_reader=test_reader, vocab_tag=vocab_tag, use_cuda=False, model_path=epoch_path)
# Skip-gram based word2vector model
## Introduction
## Environment
PaddlePaddle Fluid needs to be installed first.
## Dataset
The dataset comes from the 1 Billion Word Language Model Benchmark (http://www.statmt.org/lm-benchmark).
Download the dataset:
```bash
cd data && ./download.sh && cd ..
```
## Model
This example implements a skip-gram word2vector model.
## Data preparation
Preprocess the data to generate a dictionary.
```bash
python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
```
## Training
The command line options for training can be listed with `python train.py -h`.
### Local training:
```bash
export CPU_NUM=1
python train.py \
--train_data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled \
--dict_path data/1-billion_dict \
--with_hs --with_nce --is_local \
2>&1 | tee train.log
```
### Distributed training
Launch a distributed training job with 2 trainers and 2 pservers on the local machine. In the distributed setting, the training data is sharded by trainer id so that the trainers' data does not overlap, which improves training efficiency.
```bash
sh cluster_train.sh
```
## Inference
In infer.py, the `build_test_case` method constructs some test cases to evaluate the quality of the word embedding:
We feed in a test case (we currently use the analogical-reasoning task: find structures of the form A - B = C - D; we compute A - B + D and look for the nearest C by cosine distance, excluding candidates that are A, B, or D when computing accuracy), then compute the cosine similarity between this query and every word in the embedding and print the topK results (K is set by the --rank_num parameter, default 4).
For example:
For: boy - girl + aunt = uncle
0 nearest aunt:0.89
1 nearest uncle:0.70
2 nearest grandmother:0.67
3 nearest father:0.64
You can also add your own tests in the `build_test_case` method by following the given examples.
To run test cases from test files, please download the test files into the 'test' directory.
Each test case in the files has the following structure:
`word1 word2 word3 word4`
so it can be built into `word1 - word2 + word3 = word4`.
Inference during training:
```bash
python infer.py --infer_during_train 2>&1 | tee infer.log
```
Offline inference with a specific model:
```bash
python infer.py --infer_once --model_output_dir ./models/[specific model directory] 2>&1 | tee infer.log
```
## Run cluster training on Baidu Cloud
1. Follow the steps in [train_on_baidu_cloud](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/user_guides/howto/training/train_on_baidu_cloud_cn.rst) to set up a CPU cluster on Baidu Cloud.
1. Use preprocess.py to process the training data and generate train.txt.
1. Split train.txt into one part per machine in the cluster and place one part on each machine.
1. Start the distributed training job with the command line from the `Distributed training` section above.
# Skip-Gram Word2Vec Model
## Introduction
## Environment
You should install PaddlePaddle Fluid first.
## Dataset
The training data comes from the 1 Billion Word Language Model Benchmark (http://www.statmt.org/lm-benchmark).
Download dataset:
```bash
cd data && ./download.sh && cd ..
```
## Model
This example implements a skip-gram word2vector model.
## Data Preprocessing method
Preprocess the training data to generate a word dict.
```bash
python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
```
## Train
The command line options for training can be listed by `python train.py -h`.
### Local Train:
We set CPU_NUM=1 as the default number of CPU cores for local execution:
```bash
export CPU_NUM=1 && \
python train.py \
--train_data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled \
--dict_path data/1-billion_dict \
--with_hs --with_nce --is_local \
2>&1 | tee train.log
```
### Distributed Train
Run a distributed training job with 2 pservers and 2 trainers on a single machine.
In the distributed setting, the training data is split by trainer_id so that the
trainers' training data does not overlap (see the sharding sketch after the command below).
```bash
sh cluster_train.sh
```
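
The sketch below illustrates how this kind of per-trainer sharding can be done, assuming the reader is an ordinary Python generator; `shard_reader` is an illustrative helper, not a function from this repository (the actual split lives in the example's own reader, which is constructed with trainer_id and trainer_num):

```python
def shard_reader(reader, trainer_id, trainer_num):
    """Round-robin split: trainer i keeps samples i, i + trainer_num, ..."""
    def _reader():
        for idx, sample in enumerate(reader()):
            if idx % trainer_num == trainer_id:
                yield sample
    return _reader

# Example: with 2 trainers, trainer 0 sees samples 0, 2, 4, ...
# and trainer 1 sees samples 1, 3, 5, ..., so the shards never overlap.
```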
## Infer
In infer.py we construct some test cases in the `build_test_case` method to evaluate the quality of the word embedding:
We feed in a test case (we currently use the analogical-reasoning task: find structures of the form A - B = C - D; we compute A - B + D and look for the nearest C by cosine distance, excluding candidates that are A, B, or D when computing accuracy), then compute the cosine similarity between this query and every word in the embedding and print the topK results (K is set by the --rank_num parameter, default 4).
For example:
For: boy - girl + aunt = uncle
0 nearest aunt: 0.89
1 nearest uncle: 0.70
2 nearest grandmother: 0.67
3 nearest father:0.64
You can also add your own tests by mimicking the examples given in the `build_test_case` method.
To run test cases from test files, please download the test files into the 'test' directory.
Each test case in the files has the following structure:
`word1 word2 word3 word4`
so we can build it into `word1 - word2 + word3 = word4`
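
As a rough, self-contained illustration of this evaluation (a sketch only, not the code in infer.py; it assumes `emb` is a NumPy matrix of word vectors and `word_to_id` / `id_to_word` are the vocabulary maps):

```python
import numpy as np

def analogy_topk(emb, word_to_id, id_to_word, a, b, d, k=4):
    """Rank words by cosine similarity to emb[a] - emb[b] + emb[d],
    excluding a, b and d themselves from the candidates."""
    query = emb[word_to_id[a]] - emb[word_to_id[b]] + emb[word_to_id[d]]
    # cosine similarity of the query against every row of the embedding matrix
    sims = emb.dot(query) / (
        np.linalg.norm(emb, axis=1) * np.linalg.norm(query) + 1e-8)
    exclude = {word_to_id[a], word_to_id[b], word_to_id[d]}
    ranked = [i for i in np.argsort(-sims) if i not in exclude]
    return [(id_to_word[i], float(sims[i])) for i in ranked[:k]]

# analogy_topk(emb, word_to_id, id_to_word, "boy", "girl", "aunt")
# should place "uncle" near the top for a good embedding.
```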
Inference during training:
```bash
python infer.py --infer_during_train 2>&1 | tee infer.log
```
Use a model for offline prediction:
```bash
python infer.py --infer_once --model_output_dir ./models/[specific models file directory] 2>&1 | tee infer.log
```
## Train on Baidu Cloud
1. Please prepare some CPU machines on Baidu Cloud following the steps in [train_on_baidu_cloud](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/user_guides/howto/training/train_on_baidu_cloud_cn.rst)
1. Prepare dataset using preprocess.py.
1. Split the train.txt to trainer_num parts and put them on the machines.
1. Run training with the cluster train using the command in `Distributed Train` above.
#!/bin/bash
echo "WARNING: This script only for run PaddlePaddle Fluid on one node..."
echo ""
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
export PADDLE_PSERVER_PORTS=36001,36002
export PADDLE_PSERVER_PORT_ARRAY=(36001 36002)
export PADDLE_PSERVERS=2
export PADDLE_IP=127.0.0.1
export PADDLE_TRAINERS=2
export CPU_NUM=2
export NUM_THREADS=2
export PADDLE_SYNC_MODE=TRUE
export PADDLE_IS_LOCAL=0
export FLAGS_rpc_deadline=3000000
export GLOG_logtostderr=1
export TRAIN_DATA=data/enwik8
export DICT_PATH=data/enwik8_dict
export IS_SPARSE="--is_sparse"
echo "Start PSERVER ..."
for((i=0;i<$PADDLE_PSERVERS;i++))
do
cur_port=${PADDLE_PSERVER_PORT_ARRAY[$i]}
echo "PADDLE WILL START PSERVER "$cur_port
GLOG_v=0 PADDLE_TRAINING_ROLE=PSERVER CUR_PORT=$cur_port PADDLE_TRAINER_ID=$i python -u train.py $IS_SPARSE &> pserver.$i.log &
done
echo "Start TRAINER ..."
for((i=0;i<$PADDLE_TRAINERS;i++))
do
echo "PADDLE WILL START Trainer "$i
GLOG_v=0 PADDLE_TRAINER_ID=$i PADDLE_TRAINING_ROLE=TRAINER python -u train.py $IS_SPARSE --train_data_path $TRAIN_DATA --dict_path $DICT_PATH &> trainer.$i.log &
done
\ No newline at end of file
#!/bin/bash
wget http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
tar -zxvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
import time
import os
import paddle.fluid as fluid
import numpy as np
from Queue import PriorityQueue, Empty
import logging
import argparse
import preprocess
from sklearn.metrics.pairwise import cosine_similarity
word_to_id = dict()
id_to_word = dict()
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
def parse_args():
parser = argparse.ArgumentParser(
description="PaddlePaddle Word2vec infer example")
parser.add_argument(
'--dict_path',
type=str,
default='./data/1-billion_dict',
help="The path of training dataset")
parser.add_argument(
'--model_output_dir',
type=str,
default='models',
help="The path for model to store (with infer_once please set specify dir to models) (default: models)"
)
parser.add_argument(
'--rank_num',
type=int,
default=4,
help="find rank_num-nearest result for test (default: 4)")
parser.add_argument(
'--infer_once',
action='store_true',
required=False,
default=False,
help='if using infer_once, (default: False)')
parser.add_argument(
'--infer_during_train',
action='store_true',
required=False,
default=True,
help='if using infer_during_train, (default: True)')
parser.add_argument(
'--test_acc',
action='store_true',
required=False,
default=True,
help='if using test_files , (default: True)')
parser.add_argument(
'--test_files_dir',
type=str,
default='test',
help="The path for test_files) (default: test)")
parser.add_argument(
'--test_batch_size',
type=int,
default=1000,
help="test used batch size (default: 1000)")
return parser.parse_args()
def BuildWord_IdMap(dict_path):
with open(dict_path + "_word_to_id_", 'r') as f:
for line in f:
word_to_id[line.split(' ')[0]] = int(line.split(' ')[1])
id_to_word[int(line.split(' ')[1])] = line.split(' ')[0]
def inference_prog(): # just to create program for test
fluid.layers.create_parameter(
shape=[1, 1], dtype='float32', name="embeding")
def build_test_case_from_file(args, emb):
logger.info("test files dir: {}".format(args.test_files_dir))
current_list = os.listdir(args.test_files_dir)
logger.info("test files list: {}".format(current_list))
test_cases = list()
test_labels = list()
exclude_lists = list()
for file_dir in current_list:
with open(args.test_files_dir + "/" + file_dir, 'r') as f:
count = 0
for line in f:
if count == 0:
pass
elif ':' in line:
logger.info("{}".format(line))
pass
else:
line = preprocess.strip_lines(line, word_to_id)
test_case = emb[word_to_id[line.split()[0]]] - emb[
word_to_id[line.split()[1]]] + emb[word_to_id[
line.split()[2]]]
test_case_desc = line.split()[0] + " - " + line.split()[
1] + " + " + line.split()[2] + " = " + line.split()[3]
test_cases.append([test_case, test_case_desc])
test_labels.append(word_to_id[line.split()[3]])
exclude_lists.append([
word_to_id[line.split()[0]],
word_to_id[line.split()[1]], word_to_id[line.split()[2]]
])
count += 1
return test_cases, test_labels, exclude_lists
def build_small_test_case(emb):
emb1 = emb[word_to_id['boy']] - emb[word_to_id['girl']] + emb[word_to_id[
'aunt']]
desc1 = "boy - girl + aunt = uncle"
label1 = word_to_id["uncle"]
emb2 = emb[word_to_id['brother']] - emb[word_to_id['sister']] + emb[
word_to_id['sisters']]
desc2 = "brother - sister + sisters = brothers"
label2 = word_to_id["brothers"]
emb3 = emb[word_to_id['king']] - emb[word_to_id['queen']] + emb[word_to_id[
'woman']]
desc3 = "king - queen + woman = man"
label3 = word_to_id["man"]
emb4 = emb[word_to_id['reluctant']] - emb[word_to_id['reluctantly']] + emb[
word_to_id['slowly']]
desc4 = "reluctant - reluctantly + slowly = slow"
label4 = word_to_id["slow"]
emb5 = emb[word_to_id['old']] - emb[word_to_id['older']] + emb[word_to_id[
'deeper']]
desc5 = "old - older + deeper = deep"
label5 = word_to_id["deep"]
return [[emb1, desc1], [emb2, desc2], [emb3, desc3], [emb4, desc4],
[emb5, desc5]], [label1, label2, label3, label4, label5]
def build_test_case(args, emb):
if args.test_acc:
return build_test_case_from_file(args, emb)
else:
return build_small_test_case(emb)
def inference_test(scope, model_dir, args):
BuildWord_IdMap(args.dict_path)
logger.info("model_dir is: {}".format(model_dir + "/"))
emb = np.array(scope.find_var("embeding").get_tensor())
logger.info("inference result: ====================")
test_cases = list()
test_labels = list()
exclude_lists = list()
if args.test_acc:
test_cases, test_labels, exclude_lists = build_test_case(args, emb)
else:
test_cases, test_labels = build_test_case(args, emb)
exclude_lists = [[-1]]
accual_rank = 1 if args.test_acc else args.rank_num
correct_num = 0
for i in range(len(test_labels)):
pq = None
if args.test_acc:
pq = topK(
accual_rank,
emb,
test_cases[i][0],
exclude_lists[i],
is_acc=True)
else:
            pq = topK(
accual_rank,
emb,
test_cases[i][0],
exclude_lists[0],
is_acc=False)
logger.info("Test result for {}".format(test_cases[i][1]))
for j in range(accual_rank):
pq_tmps = pq.get()
if (j == accual_rank - 1) and (
pq_tmps.id == test_labels[i]
): # if the nearest word is what we want
correct_num += 1
logger.info("{} nearest is {}, rate is {}".format(
accual_rank - j, id_to_word[pq_tmps.id], pq_tmps.priority))
    acc = float(correct_num) / len(test_labels)
    logger.info("Test acc is: {}, there are {} / {}".format(acc, correct_num,
                                                            len(test_labels)))
class PQ_Entry(object):
def __init__(self, cos_similarity, id):
self.priority = cos_similarity
self.id = id
def __cmp__(self, other):
return cmp(self.priority, other.priority)
def topK(k, emb, test_emb, exclude_list, is_acc=False):
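    # A bounded PriorityQueue holds at most k candidates; PQ_Entry orders by
    # cosine similarity, so whenever the queue overflows the least-similar
    # entry is popped and what survives are the top-k matches for test_emb.
    # When is_acc is set, ids in exclude_list (the A, B, D words of the
    # analogy) are skipped before scoring.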
pq = PriorityQueue(k + 1)
while not pq.empty():
try:
pq.get(False)
except Empty:
continue
pq.task_done()
if len(emb) <= k:
for i in range(len(emb)):
x = cosine_similarity([emb[i]], [test_emb])
pq.put(PQ_Entry(x, i))
return pq
for i in range(len(emb)):
if is_acc and (i in exclude_list):
pass
else:
x = cosine_similarity([emb[i]], [test_emb])
pq_e = PQ_Entry(x, i)
if pq.full():
pq.get()
pq.put(pq_e)
pq.get()
return pq
def infer_during_train(args):
model_file_list = list()
exe = fluid.Executor(fluid.CPUPlace())
Scope = fluid.Scope()
inference_prog()
solved_new = True
while True:
time.sleep(60)
current_list = os.listdir(args.model_output_dir)
# logger.info("current_list is : {}".format(current_list))
# logger.info("model_file_list is : {}".format(model_file_list))
if set(model_file_list) == set(current_list):
if solved_new:
solved_new = False
logger.info("No New models created")
pass
else:
solved_new = True
increment_models = list()
for f in current_list:
if f not in model_file_list:
increment_models.append(f)
logger.info("increment_models is : {}".format(increment_models))
for model in increment_models:
model_dir = args.model_output_dir + "/" + model
if os.path.exists(model_dir + "/_success"):
logger.info("using models from " + model_dir)
with fluid.scope_guard(Scope):
fluid.io.load_persistables(
executor=exe, dirname=model_dir + "/")
inference_test(Scope, model_dir, args)
model_file_list = current_list
def infer_once(args):
# check models file has already been finished
if os.path.exists(args.model_output_dir + "/_success"):
logger.info("using models from " + args.model_output_dir)
exe = fluid.Executor(fluid.CPUPlace())
Scope = fluid.Scope()
inference_prog()
with fluid.scope_guard(Scope):
fluid.io.load_persistables(
executor=exe, dirname=args.model_output_dir + "/")
inference_test(Scope, args.model_output_dir, args)
if __name__ == '__main__':
args = parse_args()
# while setting infer_once please specify the dir to models file with --model_output_dir
if args.infer_once:
infer_once(args)
elif args.infer_during_train:
infer_during_train(args)
else:
pass
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
neural network for word2vec
"""
from __future__ import print_function
import math
import numpy as np
import paddle.fluid as fluid
def skip_gram_word2vec(dict_size,
word_frequencys,
embedding_size,
max_code_length=None,
with_hsigmoid=False,
with_nce=True,
is_sparse=False):
def nce_layer(input, label, embedding_size, num_total_classes,
num_neg_samples, sampler, word_frequencys, sample_weight):
w_param_name = "nce_w"
b_param_name = "nce_b"
w_param = fluid.default_main_program().global_block().create_parameter(
shape=[num_total_classes, embedding_size],
dtype='float32',
name=w_param_name)
b_param = fluid.default_main_program().global_block().create_parameter(
shape=[num_total_classes, 1], dtype='float32', name=b_param_name)
cost = fluid.layers.nce(input=input,
label=label,
num_total_classes=num_total_classes,
sampler=sampler,
custom_dist=word_frequencys,
sample_weight=sample_weight,
param_attr=fluid.ParamAttr(name=w_param_name),
bias_attr=fluid.ParamAttr(name=b_param_name),
num_neg_samples=num_neg_samples,
is_sparse=is_sparse)
return cost
def hsigmoid_layer(input, label, path_table, path_code, non_leaf_num,
is_sparse):
if non_leaf_num is None:
non_leaf_num = dict_size
cost = fluid.layers.hsigmoid(
input=input,
label=label,
num_classes=non_leaf_num,
path_table=path_table,
path_code=path_code,
is_custom=True,
is_sparse=is_sparse)
return cost
datas = []
input_word = fluid.layers.data(name="input_word", shape=[1], dtype='int64')
predict_word = fluid.layers.data(
name='predict_word', shape=[1], dtype='int64')
datas.append(input_word)
datas.append(predict_word)
if with_hsigmoid:
path_table = fluid.layers.data(
name='path_table',
shape=[max_code_length if max_code_length else 40],
dtype='int64')
path_code = fluid.layers.data(
name='path_code',
shape=[max_code_length if max_code_length else 40],
dtype='int64')
datas.append(path_table)
datas.append(path_code)
py_reader = fluid.layers.create_py_reader_by_data(
capacity=64, feed_list=datas, name='py_reader', use_double_buffer=True)
words = fluid.layers.read_file(py_reader)
emb = fluid.layers.embedding(
input=words[0],
is_sparse=is_sparse,
size=[dict_size, embedding_size],
param_attr=fluid.ParamAttr(
name='embeding',
initializer=fluid.initializer.Normal(scale=1 /
math.sqrt(dict_size))))
cost, cost_nce, cost_hs = None, None, None
if with_nce:
cost_nce = nce_layer(emb, words[1], embedding_size, dict_size, 5,
"uniform", word_frequencys, None)
cost = cost_nce
if with_hsigmoid:
cost_hs = hsigmoid_layer(emb, words[1], words[2], words[3], dict_size,
is_sparse)
cost = cost_hs
if with_nce and with_hsigmoid:
cost = fluid.layers.elementwise_add(cost_nce, cost_hs)
avg_cost = fluid.layers.reduce_mean(cost)
return avg_cost, py_reader
# -*- coding: utf-8 -*
import re
import six
import argparse
prog = re.compile("[^a-z ]", flags=0)
word_count = dict()
def parse_args():
parser = argparse.ArgumentParser(
description="Paddle Fluid word2 vector preprocess")
parser.add_argument(
'--data_path',
type=str,
required=True,
help="The path of training dataset")
parser.add_argument(
'--dict_path',
type=str,
default='./dict',
help="The path of generated dict")
parser.add_argument(
'--freq',
type=int,
default=5,
help="If the word count is less then freq, it will be removed from dict")
parser.add_argument(
'--is_local',
action='store_true',
required=False,
default=False,
help='Local train or not, (default: False)')
parser.add_argument(
'--with_other_dict',
action='store_true',
required=False,
default=False,
help='Using third party provided dict , (default: False)')
parser.add_argument(
'--other_dict_path',
type=str,
default='',
help='The path for third party provided dict (default: '
')')
return parser.parse_args()
def text_strip(text):
return prog.sub("", text)
# users can self-define their own strip rules by modifing this method
def strip_lines(line, vocab=word_count):
return _replace_oov(vocab, native_to_unicode(line))
# Shameless copy from Tensorflow https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/text_encoder.py
def _replace_oov(original_vocab, line):
"""Replace out-of-vocab words with "<UNK>".
This maintains compatibility with published results.
Args:
original_vocab: a set of strings (The standard vocabulary for the dataset)
line: a unicode string - a space-delimited sequence of words.
Returns:
a unicode string - a space-delimited sequence of words.
"""
return u" ".join([
word if word in original_vocab else u"<UNK>" for word in line.split()
])
# Shameless copy from Tensorflow https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/text_encoder.py
# Unicode utility functions that work with Python 2 and 3
def native_to_unicode(s):
if _is_unicode(s):
return s
try:
return _to_unicode(s)
except UnicodeDecodeError:
res = _to_unicode(s, ignore_errors=True)
        print("Ignoring Unicode error, outputting: %s" % res)
return res
def _is_unicode(s):
if six.PY2:
if isinstance(s, unicode):
return True
else:
if isinstance(s, str):
return True
return False
def _to_unicode(s, ignore_errors=False):
if _is_unicode(s):
return s
error_mode = "ignore" if ignore_errors else "strict"
return s.decode("utf-8", errors=error_mode)
def build_Huffman(word_count, max_code_length):
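    # Builds a Huffman tree over word frequencies for hierarchical softmax:
    # words are sorted by ascending count and the two least-frequent nodes are
    # merged repeatedly. For every word, word_code holds the 0/1 branch bits on
    # its root-to-leaf path and word_point holds the indices of the internal
    # (non-leaf) nodes on that path; these become the path_code / path_table
    # inputs of the hsigmoid layer. max_code_length caps the path length.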
MAX_CODE_LENGTH = max_code_length
sorted_by_freq = sorted(word_count.items(), key=lambda x: x[1])
count = list()
vocab_size = len(word_count)
parent = [-1] * 2 * vocab_size
code = [-1] * MAX_CODE_LENGTH
point = [-1] * MAX_CODE_LENGTH
binary = [-1] * 2 * vocab_size
word_code_len = dict()
word_code = dict()
word_point = dict()
i = 0
for a in range(vocab_size):
count.append(word_count[sorted_by_freq[a][0]])
for a in range(vocab_size):
word_point[sorted_by_freq[a][0]] = [-1] * MAX_CODE_LENGTH
word_code[sorted_by_freq[a][0]] = [-1] * MAX_CODE_LENGTH
for k in range(vocab_size):
count.append(1e15)
pos1 = vocab_size - 1
pos2 = vocab_size
min1i = 0
min2i = 0
b = 0
for r in range(vocab_size):
if pos1 >= 0:
if count[pos1] < count[pos2]:
min1i = pos1
pos1 = pos1 - 1
else:
min1i = pos2
pos2 = pos2 + 1
else:
min1i = pos2
pos2 = pos2 + 1
if pos1 >= 0:
if count[pos1] < count[pos2]:
min2i = pos1
pos1 = pos1 - 1
else:
min2i = pos2
pos2 = pos2 + 1
else:
min2i = pos2
pos2 = pos2 + 1
count[vocab_size + r] = count[min1i] + count[min2i]
#record the parent of left and right child
parent[min1i] = vocab_size + r
parent[min2i] = vocab_size + r
binary[min1i] = 0 #left branch has code 0
binary[min2i] = 1 #right branch has code 1
for a in range(vocab_size):
b = a
i = 0
while True:
code[i] = binary[b]
point[i] = b
i = i + 1
b = parent[b]
if b == vocab_size * 2 - 2:
break
word_code_len[sorted_by_freq[a][0]] = i
word_point[sorted_by_freq[a][0]][0] = vocab_size - 2
for k in range(i):
word_code[sorted_by_freq[a][0]][i - k - 1] = code[k]
# only non-leaf nodes will be count in
if point[k] - vocab_size >= 0:
word_point[sorted_by_freq[a][0]][i - k] = point[k] - vocab_size
return word_point, word_code, word_code_len
def preprocess(args):
"""
    preprocess the data, generate the dictionary and save it into dict_path.
:param data_path: the input data path.
:param dict_path: the generated dict path. the data in dict is "word count"
:param freq:
:return:
"""
# word to count
if args.with_other_dict:
with open(args.other_dict_path, 'r') as f:
for line in f:
word_count[native_to_unicode(line.strip())] = 1
if args.is_local:
for i in range(1, 100):
with open(args.data_path + "/news.en-000{:0>2d}-of-00100".format(
i)) as f:
for line in f:
line = strip_lines(line)
words = line.split()
for item in words:
if item in word_count:
word_count[item] = word_count[item] + 1
else:
word_count[native_to_unicode('<UNK>')] += 1
# with open(args.data_path + "/tmp.txt") as f:
# for line in f:
# print("line before strip is: {}".format(line))
# line = strip_lines(line, word_count)
# print("line after strip is: {}".format(line))
# words = line.split()
# print("words after split is: {}".format(words))
# for item in words:
# if item in word_count:
# word_count[item] = word_count[item] + 1
# else:
# word_count[item] = 1
item_to_remove = []
for item in word_count:
if word_count[item] <= args.freq:
item_to_remove.append(item)
for item in item_to_remove:
del word_count[item]
path_table, path_code, word_code_len = build_Huffman(word_count, 40)
with open(args.dict_path, 'w+') as f:
for k, v in word_count.items():
f.write(k.encode("utf-8") + " " + str(v).encode("utf-8") + '\n')
with open(args.dict_path + "_ptable", 'w+') as f2:
for pk, pv in path_table.items():
f2.write(
pk.encode("utf-8") + "\t" + ' '.join((str(x).encode("utf-8")
for x in pv)) + '\n')
with open(args.dict_path + "_pcode", 'w+') as f3:
for pck, pcv in path_code.items():
f3.write(
pck.encode("utf-8") + "\t" + ' '.join((str(x).encode("utf-8")
for x in pcv)) + '\n')
if __name__ == "__main__":
preprocess(parse_args())
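# Example invocation (illustrative; flag spellings are assumed to mirror the
# attribute names used above and the default 1-billion-word corpus layout):
#   python preprocess.py \
#       --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled \
#       --dict_path ./data/1-billion_dict --freq 5 --is_local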
# -*- coding: utf-8 -*-
import numpy as np
import preprocess
import logging
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
class Word2VecReader(object):
def __init__(self,
dict_path,
data_path,
filelist,
trainer_id,
trainer_num,
window_size=5):
self.window_size_ = window_size
self.data_path_ = data_path
self.filelist = filelist
self.num_non_leaf = 0
self.word_to_id_ = dict()
self.id_to_word = dict()
self.word_to_path = dict()
self.word_to_code = dict()
self.trainer_id = trainer_id
self.trainer_num = trainer_num
word_all_count = 0
word_counts = []
word_id = 0
with open(dict_path, 'r') as f:
for line in f:
line = line.decode(encoding='UTF-8')
word, count = line.split()[0], int(line.split()[1])
self.word_to_id_[word] = word_id
self.id_to_word[word_id] = word #build id to word dict
word_id += 1
word_counts.append(count)
word_all_count += count
with open(dict_path + "_word_to_id_", 'w+') as f6:
for k, v in self.word_to_id_.items():
f6.write(
k.encode("utf-8") + " " + str(v).encode("utf-8") + '\n')
self.dict_size = len(self.word_to_id_)
self.word_frequencys = [
float(count) / word_all_count for count in word_counts
]
print("dict_size = " + str(
self.dict_size)) + " word_all_count = " + str(word_all_count)
with open(dict_path + "_ptable", 'r') as f2:
for line in f2:
self.word_to_path[line.split("\t")[0]] = np.fromstring(
line.split('\t')[1], dtype=int, sep=' ')
self.num_non_leaf = np.fromstring(
line.split('\t')[1], dtype=int, sep=' ')[0]
print("word_ptable dict_size = " + str(len(self.word_to_path)))
with open(dict_path + "_pcode", 'r') as f3:
for line in f3:
line = line.decode(encoding='UTF-8')
self.word_to_code[line.split("\t")[0]] = np.fromstring(
line.split('\t')[1], dtype=int, sep=' ')
print("word_pcode dict_size = " + str(len(self.word_to_code)))
def get_context_words(self, words, idx, window_size):
"""
Get the context word list of target word.
words: the words of the current line
idx: input word index
window_size: window size
"""
target_window = np.random.randint(1, window_size + 1)
        # keep in mind that there may not be enough words before the target word.
start_point = idx - target_window if (idx - target_window) > 0 else 0
end_point = idx + target_window
# context words of the target word
targets = set(words[start_point:idx] + words[idx + 1:end_point + 1])
return list(targets)
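        # Illustrative example: with window_size=2, words=[10, 11, 12, 13, 14]
        # and idx=2, a drawn target_window of 2 gives words[0:2] + words[3:5],
        # i.e. the context candidates {10, 11, 13, 14} (the set() drops duplicates).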
def train(self, with_hs):
def _reader():
for file in self.filelist:
with open(self.data_path_ + "/" + file, 'r') as f:
logger.info("running data in {}".format(self.data_path_ +
"/" + file))
count = 1
for line in f:
if self.trainer_id == count % self.trainer_num:
line = preprocess.strip_lines(line)
word_ids = [
self.word_to_id_[word] for word in line.split()
if word in self.word_to_id_
]
for idx, target_id in enumerate(word_ids):
context_word_ids = self.get_context_words(
word_ids, idx, self.window_size_)
for context_id in context_word_ids:
yield [target_id], [context_id]
else:
pass
count += 1
def _reader_hs():
for file in self.filelist:
with open(self.data_path_ + "/" + file, 'r') as f:
logger.info("running data in {}".format(self.data_path_ +
"/" + file))
count = 1
for line in f:
if self.trainer_id == count % self.trainer_num:
line = preprocess.strip_lines(line)
word_ids = [
self.word_to_id_[word] for word in line.split()
if word in self.word_to_id_
]
for idx, target_id in enumerate(word_ids):
context_word_ids = self.get_context_words(
word_ids, idx, self.window_size_)
for context_id in context_word_ids:
yield [target_id], [context_id], [
self.word_to_code[self.id_to_word[
target_id]]
], [
self.word_to_path[self.id_to_word[
target_id]]
]
else:
pass
count += 1
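        # _reader yields ([target_id], [context_id]) pairs for the plain
        # skip-gram / NCE path, while _reader_hs additionally yields the Huffman
        # code and path of the target word for the hierarchical-sigmoid branch.
        # Lines are sharded across trainers via `trainer_id == count % trainer_num`.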
if not with_hs:
return _reader
else:
return _reader_hs
if __name__ == "__main__":
window_size = 10
reader = Word2VecReader("data/enwik9_dict", "data/enwik9", window_size)
i = 0
for x, y in reader.train()():
print("x: " + str(x))
print("y: " + str(y))
print("\n")
if i == 10:
exit(0)
i += 1
from __future__ import print_function
import argparse
import logging
import os
import time
import numpy as np
# disable gpu training for this example
os.environ["CUDA_VISIBLE_DEVICES"] = ""
import paddle
import paddle.fluid as fluid
from paddle.fluid.executor import global_scope
import reader
from network_conf import skip_gram_word2vec
from infer import inference_test
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
def parse_args():
parser = argparse.ArgumentParser(
description="PaddlePaddle Word2vec example")
parser.add_argument(
'--train_data_path',
type=str,
default='./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled',
help="The path of training dataset")
parser.add_argument(
'--dict_path',
type=str,
default='./data/1-billion_dict',
help="The path of data dict")
parser.add_argument(
'--test_data_path',
type=str,
default='./data/text8',
help="The path of testing dataset")
parser.add_argument(
'--batch_size',
type=int,
default=100,
help="The size of mini-batch (default:100)")
parser.add_argument(
'--num_passes',
type=int,
default=10,
help="The number of passes to train (default: 10)")
parser.add_argument(
'--model_output_dir',
type=str,
default='models',
help='The path for model to store (default: models)')
parser.add_argument(
'--embedding_size',
type=int,
default=64,
        help='The dimension of the word embedding (default: 64)')
parser.add_argument(
'--with_hs',
action='store_true',
required=False,
default=False,
help='using hierarchical sigmoid, (default: False)')
parser.add_argument(
'--with_nce',
action='store_true',
required=False,
default=False,
        help='using negative sampling, (default: False)')
parser.add_argument(
'--max_code_length',
type=int,
default=40,
help='max code length used by hierarchical sigmoid, (default: 40)')
parser.add_argument(
'--is_sparse',
action='store_true',
required=False,
default=False,
        help='use sparse updates for the embedding and nce layers (default: False)')
parser.add_argument(
'--with_Adam',
action='store_true',
required=False,
default=False,
help='Using Adam as optimizer or not, (default: False)')
parser.add_argument(
'--is_local',
action='store_true',
required=False,
default=False,
help='Local train or not, (default: False)')
parser.add_argument(
'--with_speed',
action='store_true',
required=False,
default=False,
        help='print training speed or not (default: False)')
parser.add_argument(
'--with_infer_test',
action='store_true',
required=False,
default=False,
        help='Do inference every 1000 batches (default: False)')
parser.add_argument(
'--rank_num',
type=int,
default=4,
help="find rank_num-nearest result for test (default: 4)")
return parser.parse_args()
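# Example local run (illustrative; CPU_NUM must be exported because train_loop
# reads it, and the data/dict paths must already exist):
#   CPU_NUM=4 python train.py --is_local --with_hs --is_sparse \
#       --train_data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled \
#       --dict_path ./data/1-billion_dict --batch_size 100 --num_passes 10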
def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.train((args.with_hs or (not args.with_nce))),
buf_size=args.batch_size * 100),
batch_size=args.batch_size)
py_reader.decorate_paddle_reader(train_reader)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
exec_strategy = fluid.ExecutionStrategy()
print("CPU_NUM:" + str(os.getenv("CPU_NUM")))
exec_strategy.num_threads = int(os.getenv("CPU_NUM"))
build_strategy = fluid.BuildStrategy()
if int(os.getenv("CPU_NUM")) > 1:
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
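    # With more than one worker thread, the Reduce strategy above shards gradient
    # aggregation across devices instead of all-reducing every gradient, which
    # tends to suit the sparse embedding updates used by this model.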
train_exe = fluid.ParallelExecutor(
use_cuda=False,
loss_name=loss.name,
main_program=train_program,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
profile_state = "CPU"
profiler_step = 0
profiler_step_start = 20
profiler_step_end = 30
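    # The fluid profiler is switched on only for the short window of batches
    # between profiler_step_start and profiler_step_end, and its summary is
    # written to trainer_profile.log.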
for pass_id in range(args.num_passes):
epoch_start = time.time()
py_reader.start()
batch_id = 0
start = time.clock()
try:
while True:
if profiler_step == profiler_step_start:
fluid.profiler.start_profiler(profile_state)
loss_val = train_exe.run(fetch_list=[loss.name])
loss_val = np.mean(loss_val)
                if profiler_step == profiler_step_end:
                    fluid.profiler.stop_profiler('total', 'trainer_profile.log')
                profiler_step += 1
if batch_id % 50 == 0:
logger.info(
"TRAIN --> pass: {} batch: {} loss: {} reader queue:{}".
format(pass_id, batch_id,
loss_val.mean() / args.batch_size,
py_reader.queue.size()))
if args.with_speed:
if batch_id % 1000 == 0 and batch_id != 0:
elapsed = (time.clock() - start)
start = time.clock()
samples = 1001 * args.batch_size * int(
os.getenv("CPU_NUM"))
logger.info("Time used: {}, Samples/Sec: {}".format(
elapsed, samples / elapsed))
                # run inference every 1000 batches when --with_infer_test is set
if args.with_infer_test:
if batch_id % 1000 == 0 and batch_id != 0:
model_dir = args.model_output_dir + '/batch-' + str(
batch_id)
inference_test(global_scope(), model_dir, args)
if batch_id % 500000 == 0 and batch_id != 0:
model_dir = args.model_output_dir + '/batch-' + str(
batch_id)
fluid.io.save_persistables(executor=exe, dirname=model_dir)
with open(model_dir + "/_success", 'w+') as f:
f.write(str(batch_id))
batch_id += 1
except fluid.core.EOFException:
py_reader.reset()
epoch_end = time.time()
logger.info("Epoch: {0}, Train total expend: {1} ".format(
pass_id, epoch_end - epoch_start))
model_dir = args.model_output_dir + '/pass-' + str(pass_id)
if trainer_id == 0:
fluid.io.save_persistables(executor=exe, dirname=model_dir)
with open(model_dir + "/_success", 'w+') as f:
f.write(str(pass_id))
def GetFileList(data_path):
return os.listdir(data_path)
def train(args):
if not os.path.isdir(args.model_output_dir):
os.mkdir(args.model_output_dir)
filelist = GetFileList(args.train_data_path)
word2vec_reader = None
if args.is_local or os.getenv("PADDLE_IS_LOCAL", "1") == "1":
word2vec_reader = reader.Word2VecReader(
args.dict_path, args.train_data_path, filelist, 0, 1)
else:
trainer_id = int(os.environ["PADDLE_TRAINER_ID"])
trainers = int(os.environ["PADDLE_TRAINERS"])
word2vec_reader = reader.Word2VecReader(args.dict_path,
args.train_data_path, filelist,
                                                trainer_id, trainers)
logger.info("dict_size: {}".format(word2vec_reader.dict_size))
loss, py_reader = skip_gram_word2vec(
word2vec_reader.dict_size,
word2vec_reader.word_frequencys,
args.embedding_size,
args.max_code_length,
args.with_hs,
args.with_nce,
is_sparse=args.is_sparse)
optimizer = None
if args.with_Adam:
optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
else:
optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
optimizer.minimize(loss)
# do local training
if args.is_local or os.getenv("PADDLE_IS_LOCAL", "1") == "1":
logger.info("run local training")
main_program = fluid.default_main_program()
with open("local.main.proto", "w") as f:
f.write(str(main_program))
train_loop(args, main_program, word2vec_reader, py_reader, loss, 0)
# do distribute training
else:
logger.info("run dist training")
trainer_id = int(os.environ["PADDLE_TRAINER_ID"])
trainers = int(os.environ["PADDLE_TRAINERS"])
training_role = os.environ["PADDLE_TRAINING_ROLE"]
port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
eplist = []
for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist)
current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
config = fluid.DistributeTranspilerConfig()
config.slice_var_up = False
t = fluid.DistributeTranspiler(config=config)
t.transpile(
trainer_id,
pservers=pserver_endpoints,
trainers=trainers,
sync_mode=True)
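        # The transpiler rewrites the single-process program into role-specific
        # programs: get_pserver_program()/get_startup_program() below serve the
        # parameter servers and get_trainer_program() serves the trainers;
        # sync_mode=True keeps the trainers' updates synchronous.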
if training_role == "PSERVER":
logger.info("run pserver")
prog = t.get_pserver_program(current_endpoint)
startup = t.get_startup_program(
current_endpoint, pserver_program=prog)
with open("pserver.main.proto.{}".format(os.getenv("CUR_PORT")),
"w") as f:
f.write(str(prog))
exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup)
exe.run(prog)
elif training_role == "TRAINER":
logger.info("run trainer")
train_prog = t.get_trainer_program()
with open("trainer.main.proto.{}".format(trainer_id), "w") as f:
f.write(str(train_prog))
train_loop(args, train_prog, word2vec_reader, py_reader, loss,
trainer_id)
def env_declar():
print("******** Rename Cluster Env to PaddleFluid Env ********")
print("Content-Type: text/plain\n\n")
for key in os.environ.keys():
print("%30s %s \n" % (key, os.environ[key]))
if os.environ["TRAINING_ROLE"] == "PSERVER" or os.environ[
"PADDLE_IS_LOCAL"] == "0":
os.environ["PADDLE_TRAINING_ROLE"] = os.environ["TRAINING_ROLE"]
os.environ["PADDLE_PSERVER_PORT"] = os.environ["PADDLE_PORT"]
os.environ["PADDLE_PSERVER_IPS"] = os.environ["PADDLE_PSERVERS"]
os.environ["PADDLE_TRAINERS"] = os.environ["PADDLE_TRAINERS_NUM"]
os.environ["PADDLE_CURRENT_IP"] = os.environ["POD_IP"]
os.environ["PADDLE_TRAINER_ID"] = os.environ["PADDLE_TRAINER_ID"]
    # set the worker thread count to the number of CPU cores (hard-coded to 12 here)
os.environ["CPU_NUM"] = "12"
print("Content-Type: text/plain\n\n")
for key in os.environ.keys():
print("%30s %s \n" % (key, os.environ[key]))
print("****** Rename Cluster Env to PaddleFluid Env END ******")
if __name__ == '__main__':
args = parse_args()
if args.is_local:
pass
else:
env_declar()
train(args)