diff --git a/.gitignore b/.gitignore
index dde3895fc112ad34a839b2fed9210ac2288a959b..9492cff0cb9500079955856eedac883e39b522a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .DS_Store
 *.pyc
+.*~
diff --git a/fluid/image_classification/caffe2fluid/README.md b/fluid/image_classification/caffe2fluid/README.md
index 5f565afe0c33db291092faeac632da3d51f95613..6aba34b9cafbd87b3474575fcbcee65819769c2f 100644
--- a/fluid/image_classification/caffe2fluid/README.md
+++ b/fluid/image_classification/caffe2fluid/README.md
@@ -18,19 +18,19 @@ This tool is used to convert a Caffe model to a Fluid model
 ### Tested models
-- Lenet on mnist dataset
+- LeNet

 - ResNets:(ResNet-50, ResNet-101, ResNet-152)
-    model addr: `https://onedrive.live.com/?authkey=%21AAFW2-FVoxeVRck&id=4006CBB8476FF777%2117887&cid=4006CBB8476FF777`_
+[model addr](https://onedrive.live.com/?authkey=%21AAFW2-FVoxeVRck&id=4006CBB8476FF777%2117887&cid=4006CBB8476FF777)

 - GoogleNet:
-    model addr: `https://gist.github.com/jimmie33/7ea9f8ac0da259866b854460f4526034`_
+[model addr](https://gist.github.com/jimmie33/7ea9f8ac0da259866b854460f4526034)

 - VGG:
-    model addr: `https://gist.github.com/ksimonyan/211839e770f7b538e2d8`_
+[model addr](https://gist.github.com/ksimonyan/211839e770f7b538e2d8)

 - AlexNet:
-    model addr: `https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet`_
+[model addr](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet)

 ### Notes
 Some of this code comes from here: https://github.com/ethereon/caffe-tensorflow
diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py b/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py
new file mode 100644
index 0000000000000000000000000000000000000000..07d4ed1af50a803aee206da6c7582d079a1a1dca
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py
@@ -0,0 +1,85 @@
+#!/usr/bin/python
+
+#
+#a tool to compare tensors in two files or two directories
+#
+
+import sys
+import os
+
+
+def walk_dir(rootdir):
+    for subdir, dirs, files in os.walk(rootdir):
+        for file in files:
+            yield file
+
+
+def calc_diff(f1, f2):
+    import numpy as np
+
+    d1 = np.load(f1).flatten()
+    d2 = np.load(f2).flatten()
+
+    d1_num = reduce(lambda x, y: x * y, d1.shape)
+    d2_num = reduce(lambda x, y: x * y, d2.shape)
+    if d1_num != d2_num:
+        print(d1.shape)
+        print(d2.shape)
+        assert (d1_num == d2_num), "their shapes are not consistent"
+
+    try:
+        df = np.abs(d1 - d2)
+        max_df = np.max(df)
+        sq_df = np.mean(df * df)
+        return max_df, sq_df
+    except Exception as e:
+        return -1.0, -1.0
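+
+#editor's note -- a minimal usage sketch of calc_diff (file names are hypothetical):
+#
+#   max_df, sq_df = calc_diff('results/conv1.caffe.npy', 'results/conv1.paddle.npy')
+#
+#max_df is the largest elementwise absolute difference and sq_df is the mean
+#squared difference; both should be close to zero when the two models agree.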
+
+
+def compare(path1, path2):
+    def diff(f1, f2):
+        max_df, sq_df = calc_diff(f1, f2)
+        print('compare %s <=> %s with result[max_df:%.4e, sq_df:%.4e]' %
+              (f1, f2, max_df, sq_df))
+        assert (max_df < 1e-5), \
+                'max_df is too large with value[%.6e]' % (max_df)
+        assert (sq_df < 1e-10), \
+                'sq_df is too large with value[%.6e]' % (sq_df)
+
+    if os.path.exists(path1) is False:
+        print('not found %s' % (path1))
+        return 1
+    elif os.path.exists(path2) is False:
+        print('not found %s' % (path2))
+        return 1
+
+    if path1.find('.npy') > 0 and path2.find('.npy') > 0:
+        diff(path1, path2)
+        return 0
+
+    for f in walk_dir(path2):
+        if f.find('.npy') < 0:
+            continue
+
+        f1 = os.path.join(path1, f)
+        f2 = os.path.join(path2, f)
+        diff(f1, f2)
+
+    print('all checks passed')
+    return 0
+
+
+if __name__ == "__main__":
+    if len(sys.argv) == 1:
+        path1 = 'lenet.tf/results'
+        path2 = 'lenet.paddle/results'
+    elif len(sys.argv) == 3:
+        path1 = sys.argv[1]
+        path2 = sys.argv[2]
+    else:
+        print('usage:')
+        print(' %s [path1] [path2]' % (sys.argv[0]))
+        exit(1)
+
+    print('comparing intermediate results in %s and %s' % (path1, path2))
+    exit(compare(path1, path2))
diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/diff.sh b/fluid/image_classification/caffe2fluid/examples/imagenet/diff.sh
new file mode 100644
index 0000000000000000000000000000000000000000..af72caea536d6b6c3d1027e7d1327af52a6ceda6
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/examples/imagenet/diff.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+#
+#function:
+#   a tool to check the per-layer numerical difference between the results of a caffe model and its converted paddle model
+#
+#howto:
+#   bash diff.sh resnet50    #when this finishes, the precision difference is printed
+#
+#notes:
+#   0, inference with caffe requires pycaffe to be installed
+#   1, prepare your caffe model in 'models.caffe/', eg: 'models.caffe/resnet101/resnet101.[prototxt|caffemodel]'
+#   2, the converted paddle model will be put in 'models'
+#   3, results of layers will be stored in 'results/${model_name}.[paddle|caffe]'
+#   4, only the last layer will be checked by default
+
+model_name="resnet50"
+results_root="results/"
+
+if [[ -n $1 ]];then
+    if [ $1 = "-h" ];then
+        echo "usage:"
+        echo "  bash $0 [model_name]"
+        echo "  eg:bash $0 resnet50"
+        exit 0
+    fi
+    model_name=$1
+fi
+
+mkdir -p $results_root
+
+model_prototxt="models.caffe/$model_name/${model_name}.prototxt"
+model_caffemodel="models.caffe/${model_name}/${model_name}.caffemodel"
+
+#1, dump layers' results from paddle
+paddle_results="$results_root/${model_name}.paddle"
+rm -rf $paddle_results
+rm -rf "results.paddle"
+bash run.sh $model_name ./models.caffe/$model_name ./models/$model_name
+if [[ $? -ne 0 ]] || [[ ! -e "results.paddle" ]];then
+    echo "not found paddle's results, maybe the model conversion failed"
+    exit 1
+fi
+mv results.paddle $paddle_results
+
+#2, dump layers' results from caffe
+caffe_results="$results_root/${model_name}.caffe"
+rm -rf $caffe_results
+rm -rf "results.caffe"
+cfpython ./infer.py caffe $model_prototxt $model_caffemodel $paddle_results/data.npy
+if [[ $? -ne 0 ]] || [[ ! -e "results.caffe" ]];then
+    echo "not found caffe's results, maybe inference with caffe failed"
+    exit 1
+fi
+mv results.caffe $caffe_results
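+
+#editor's note: step 4 below checks only the last layer (via 'tail -n1');
+#a sketch for comparing every layer instead would drop the tail:
+#   for i in $(cat ".layer_names");do
+#       python compare.py $caffe_results/${i}.npy $paddle_results/${i}.npy
+#   done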
-e "results.caffe" ]];then + echo "not found caffe's results, maybe failed to do inference with caffe" + exit 1 +fi +mv results.caffe $caffe_results + +#3, extract layer names +cat $model_prototxt | grep name | perl -ne 'if(/^\s*name:\s+\"([^\"]+)/){ print $1."\n";}' >.layer_names + +#4, compare one by one +for i in $(cat ".layer_names" | tail -n1);do + echo "process $i" + python compare.py $caffe_results/${i}.npy $paddle_results/${i}.npy +done diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py b/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py index ec594199be5a3e7a33c9673b1d5497c95f20d946..bb75caa9e7364465042c5c88f471e8f6f5137237 100644 --- a/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py +++ b/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py @@ -10,8 +10,11 @@ import os import sys import inspect import numpy as np -import paddle.v2 as paddle -import paddle.v2.fluid as fluid + + +def import_fluid(): + import paddle.fluid as fluid + return fluid def load_data(imgfile, shape): @@ -52,8 +55,10 @@ def build_model(net_file, net_name): print(e) return None - input_name = 'data' - input_shape = MyNet.input_shapes()[input_name] + fluid = import_fluid() + inputs_dict = MyNet.input_shapes() + input_name = inputs_dict.keys()[0] + input_shape = inputs_dict[input_name] images = fluid.layers.data(name='image', shape=input_shape, dtype='float32') #label = fluid.layers.data(name='label', shape=[1], dtype='int64') @@ -64,7 +69,7 @@ def build_model(net_file, net_name): def dump_results(results, names, root): if os.path.exists(root) is False: - os.path.mkdir(root) + os.mkdir(root) for i in range(len(names)): n = names[i] @@ -73,9 +78,12 @@ def dump_results(results, names, root): np.save(filename + '.npy', res) -def infer(net_file, net_name, model_file, imgfile, debug=False): +def infer(net_file, net_name, model_file, imgfile, debug=True): """ do inference using a model which consist 'xxx.py' and 'xxx.npy' """ + + fluid = import_fluid() + #1, build model net, input_shape = build_model(net_file, net_name) prediction = net.get_output() @@ -109,34 +117,79 @@ def infer(net_file, net_name, model_file, imgfile, debug=False): fetch_list=fetch_list_var) if debug is True: - dump_path = 'results.layers' + dump_path = 'results.paddle' dump_results(results, fetch_list_name, dump_path) - print('all results dumped to [%s]' % (dump_path)) + print('all result of layers dumped to [%s]' % (dump_path)) else: result = results[0] print('predicted class:', np.argmax(result)) + return 0 + + +def caffe_infer(prototxt, caffemodel, datafile): + """ do inference using pycaffe for debug, + all intermediate results will be dumpped to 'results.caffe' + """ + import caffe + + net = caffe.Net(prototxt, caffemodel, caffe.TEST) + input_layer = net.blobs.keys()[0] + print('got name of input layer is:%s' % (input_layer)) + input_shape = list(net.blobs[input_layer].data.shape[1:]) + + if '.npy' in datafile: + np_images = np.load(datafile) + else: + np_images = load_data(datafile, input_shape) + + inputs = {input_layer: np_images} + net.forward_all(**inputs) + + results = [] + names = [] + for k, v in net.blobs.items(): + k = k.rstrip('_output') + k = k.replace('/', '_') + names.append(k) + results.append(v.data.copy()) + + dump_path = 'results.caffe' + dump_results(results, names, dump_path) + print('all result of layers dumped to [%s]' % (dump_path)) + return 0 + if __name__ == "__main__": """ maybe more convenient to use 'run.sh' to call this tool """ net_file = 
+
 
 if __name__ == "__main__":
     """ maybe more convenient to use 'run.sh' to call this tool """
     net_file = 'models/resnet50/resnet50.py'
     weight_file = 'models/resnet50/resnet50.npy'
-    imgfile = 'data/65.jpeg'
+    datafile = 'data/65.jpeg'
     net_name = 'ResNet50'
 
     argc = len(sys.argv)
-    if argc == 5:
+    if argc > 1 and sys.argv[1] == 'caffe':
+        if len(sys.argv) != 5:
+            print('usage:')
+            print('\tpython %s caffe [prototxt] [caffemodel] [datafile]' %
+                  (sys.argv[0]))
+            sys.exit(1)
+        prototxt = sys.argv[2]
+        caffemodel = sys.argv[3]
+        datafile = sys.argv[4]
+        sys.exit(caffe_infer(prototxt, caffemodel, datafile))
+    elif argc == 5:
         net_file = sys.argv[1]
         weight_file = sys.argv[2]
-        imgfile = sys.argv[3]
+        datafile = sys.argv[3]
         net_name = sys.argv[4]
     elif argc > 1:
         print('usage:')
-        print('\tpython %s [net_file] [weight_file] [imgfile] [net_name]' %
+        print('\tpython %s [net_file] [weight_file] [datafile] [net_name]' %
               (sys.argv[0]))
         print('\teg:python %s %s %s %s %s' % (sys.argv[0], net_file,
-              weight_file, imgfile, net_name))
+              weight_file, datafile, net_name))
         sys.exit(1)
 
-    infer(net_file, net_name, weight_file, imgfile)
+    infer(net_file, net_name, weight_file, datafile)
diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh b/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh
index 7a1a5ebd7c0a5090c00a0c8ca6b0e11b110967dc..ff3cc4ac44a8ccaeb0b33f1bcdbc46886fb7d7e9 100644
--- a/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh
+++ b/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh
@@ -3,7 +3,7 @@
 #function:
 #   a tool used to:
 #      1, convert a caffe model
-#      2, do inference using this model
+#      2, do inference (only in fluid) using this model
 #
 #usage:
 #   bash run.sh resnet50 ./models.caffe/resnet50 ./models/resnet50
@@ -65,7 +65,12 @@ if [[ -z $only_convert ]];then
         PYTHON=`which python`
     fi
     imgfile="data/65.jpeg"
-    net_name=`grep "name" $proto_file | head -n1 | perl -ne 'if(/\"([^\"]+)\"/){ print $1."\n";}'`
+    #FIXME: only the first 'name' line of the prototxt is used as the network
+    #name, which may not always be correct
+    net_name=`grep "name" $proto_file | head -n1 | perl -ne 'if(/^\s*name\s*:\s*\"([^\"]+)\"/){ print $1."\n";}'`
+    if [[ -z $net_name ]];then
+        net_name="MyNet"
+    fi
    $PYTHON ./infer.py $net_file $weight_file $imgfile $net_name
    ret=$?
fi
diff --git a/fluid/image_classification/caffe2fluid/kaffe/graph.py b/fluid/image_classification/caffe2fluid/kaffe/graph.py
index 5387f441852b8a318a41898ee0b62b4903ccdabb..c6fdada6e78c8fbeb98604033e4cb77995555ce9 100644
--- a/fluid/image_classification/caffe2fluid/kaffe/graph.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/graph.py
@@ -52,7 +52,10 @@ class Graph(object):
     def __init__(self, nodes=None, name=None):
         self.nodes = nodes or []
         self.node_lut = {node.name: node for node in self.nodes}
-        self.name = name
+        if name is None or name == '':
+            self.name = 'MyNet'
+        else:
+            self.name = name
 
     def add_node(self, node):
         self.nodes.append(node)
diff --git a/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py b/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py
index fd6a71cb6acbfffe2aed1d3680fb91c8c85dc3d3..ac5ecf1d4491efb5043502824514498f79ab4db0 100644
--- a/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py
@@ -4,7 +4,7 @@ import numpy as np
 
 
 def import_fluid():
-    import paddle.v2.fluid as fluid
+    import paddle.fluid as fluid
     return fluid
 
 
@@ -64,7 +64,7 @@ class Network(object):
         if os.path.isdir(data_path):
             assert (exe is not None), \
                 'must provide an executor to load a fluid model'
-            fluid.io.load_persistables_if_exist(executor=exe, dirname=data_path)
+            fluid.io.load_persistables(executor=exe, dirname=data_path)
             return True
 
         #load model from a npy file
@@ -161,56 +161,28 @@ class Network(object):
             output = fluid.layers.relu(x=input)
         return output
 
-    def _adjust_pad_if_needed(self, i_hw, k_hw, s_hw, p_hw):
-        #adjust the padding if needed
-        i_h, i_w = i_hw
-        k_h, k_w = k_hw
-        s_h, s_w = s_hw
-        p_h, p_w = p_hw
-
-        def is_consistent(i, k, s, p):
-            o = i + 2 * p - k
-            if o % s == 0:
-                return True
-            else:
-                return False
-
-        real_p_h = 0
-        real_p_w = 0
-        if is_consistent(i_h, k_h, s_h, p_h) is False:
-            real_p_h = int(k_h / 2)
-
-        if is_consistent(i_w, k_w, s_w, p_w) is False:
-            real_p_w = int(k_w / 2)
-
-        return [real_p_h, real_p_w]
-
     def pool(self, pool_type, input, k_h, k_w, s_h, s_w, name, padding):
         # Get the spatial dimensions (H, W) of the input
         in_hw = input.shape[2:]
         k_hw = [k_h, k_w]
         s_hw = [s_h, s_w]
 
-        if padding is None:
-            #fix bug about the difference between conv and pool
-            #more info: https://github.com/BVLC/caffe/issues/1318
-            padding = self._adjust_pad_if_needed(in_hw, k_hw, s_hw, [0, 0])
-
         fluid = import_fluid()
         output = fluid.layers.pool2d(
             input=input,
             pool_size=k_hw,
             pool_stride=s_hw,
             pool_padding=padding,
+            ceil_mode=True,
             pool_type=pool_type)
         return output
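+
+    #editor's note on ceil_mode=True: caffe computes pooling output sizes with
+    #ceil while fluid's pool2d defaults to floor, so converted models could
+    #lose a row/column. Worked example for i=112, k=3, s=2, p=0:
+    #   caffe: ceil((112 - 3) / 2.0) + 1 = 56    floor: (112 - 3) // 2 + 1 = 55
+    #see https://github.com/BVLC/caffe/issues/1318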
 
     @layer
-    def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
+    def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=[0, 0]):
         return self.pool('max', input, k_h, k_w, s_h, s_w, name, padding)
 
     @layer
-    def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
+    def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=[0, 0]):
         return self.pool('avg', input, k_h, k_w, s_h, s_w, name, padding)
 
     @layer
@@ -258,7 +230,12 @@ class Network(object):
         return output
 
     @layer
-    def batch_normalization(self, input, name, scale_offset=True, relu=False):
+    def batch_normalization(self,
+                            input,
+                            name,
+                            scale_offset=True,
+                            eps=1e-5,
+                            relu=False):
         # NOTE: Currently, only inference is supported
         fluid = import_fluid()
         prefix = name + '_'
@@ -276,7 +253,7 @@ class Network(object):
             bias_attr=bias_attr,
             moving_mean_name=mean_name,
             moving_variance_name=variance_name,
-            epsilon=1e-5,
+            epsilon=eps,
             act='relu' if relu is True else None)
         return output
diff --git a/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py b/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py
index 4d7ec49a39199bb1415f830d88f89e93a4b95266..3697529971fa6ca01d1703375243d16f0a0c1edd 100644
--- a/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py
@@ -142,7 +142,13 @@ class TensorFlowMapper(NodeMapper):
 
     def map_batch_norm(self, node):
         scale_offset = len(node.data) == 4
-        kwargs = {} if scale_offset else {'scale_offset': False}
+
+        #this default value comes from caffe's param in batch_norm
+        default_eps = 1e-5
+        kwargs = {'scale_offset': scale_offset}
+        if node.parameters.eps != default_eps:
+            kwargs['eps'] = node.parameters.eps
+
         return MaybeActivated(
             node, default=False)('batch_normalization', **kwargs)
 
@@ -236,7 +242,7 @@ class TensorFlowEmitter(object):
         func_def = self.statement('@classmethod')
         func_def += self.statement('def convert(cls, npy_model, fluid_path):')
         self.indent()
-        func_def += self.statement('import paddle.v2.fluid as fluid')
+        func_def += self.statement('fluid = import_fluid()')
         for l in codes:
             func_def += self.statement(l)
         return '\n' + func_def
diff --git a/fluid/image_classification/se_resnext.py b/fluid/image_classification/se_resnext.py
index b1adf0baba8a987ae1a971e148375c6a0730d860..573c6bec5bdc3c08e9503e46f6e09fad2cb09707 100644
--- a/fluid/image_classification/se_resnext.py
+++ b/fluid/image_classification/se_resnext.py
@@ -1,10 +1,5 @@
-import os
-import numpy as np
-import time
-import sys
 import paddle.v2 as paddle
 import paddle.fluid as fluid
-import reader
 
 
 def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
@@ -124,164 +119,3 @@ def SE_ResNeXt(input, class_dim, infer=False, layers=50):
     drop = pool
     out = fluid.layers.fc(input=drop, size=class_dim, act='softmax')
     return out
-
-
-def train(learning_rate,
-          batch_size,
-          num_passes,
-          init_model=None,
-          model_save_dir='model',
-          parallel=True,
-          use_nccl=True,
-          lr_strategy=None,
-          layers=50):
-    class_dim = 1000
-    image_shape = [3, 224, 224]
-
-    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if parallel:
-        places = fluid.layers.get_places()
-        pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
-
-        with pd.do():
-            image_ = pd.read_input(image)
-            label_ = pd.read_input(label)
-            out = SE_ResNeXt(input=image_, class_dim=class_dim, layers=layers)
-            cost = fluid.layers.cross_entropy(input=out, label=label_)
-            avg_cost = fluid.layers.mean(x=cost)
-            acc_top1 = fluid.layers.accuracy(input=out, label=label_, k=1)
-            acc_top5 = fluid.layers.accuracy(input=out, label=label_, k=5)
-            pd.write_output(avg_cost)
-            pd.write_output(acc_top1)
-            pd.write_output(acc_top5)
-
-        avg_cost, acc_top1, acc_top5 = pd()
-        avg_cost = fluid.layers.mean(x=avg_cost)
-        acc_top1 = fluid.layers.mean(x=acc_top1)
-        acc_top5 = fluid.layers.mean(x=acc_top5)
-    else:
-        out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
-        cost = fluid.layers.cross_entropy(input=out, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
-        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
-
-    if lr_strategy is None:
-        optimizer = fluid.optimizer.Momentum(
-            learning_rate=learning_rate,
-            momentum=0.9,
-            regularization=fluid.regularizer.L2Decay(1e-4))
-    else:
-        bd = lr_strategy["bd"]
-        lr = lr_strategy["lr"]
lr_strategy["lr"] - optimizer = fluid.optimizer.Momentum( - learning_rate=fluid.layers.piecewise_decay( - boundaries=bd, values=lr), - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) - - opts = optimizer.minimize(avg_cost) - fluid.memory_optimize(fluid.default_main_program()) - - inference_program = fluid.default_main_program().clone() - with fluid.program_guard(inference_program): - inference_program = fluid.io.get_inference_program( - [avg_cost, acc_top1, acc_top5]) - - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - if init_model is not None: - fluid.io.load_persistables(exe, init_model) - - train_reader = paddle.batch(reader.train(), batch_size=batch_size) - test_reader = paddle.batch(reader.test(), batch_size=batch_size) - feeder = fluid.DataFeeder(place=place, feed_list=[image, label]) - - for pass_id in range(num_passes): - train_info = [[], [], []] - test_info = [[], [], []] - for batch_id, data in enumerate(train_reader()): - t1 = time.time() - loss, acc1, acc5 = exe.run( - fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost, acc_top1, acc_top5]) - t2 = time.time() - period = t2 - t1 - train_info[0].append(loss[0]) - train_info[1].append(acc1[0]) - train_info[2].append(acc5[0]) - if batch_id % 10 == 0: - print("Pass {0}, trainbatch {1}, loss {2}, \ - acc1 {3}, acc5 {4} time {5}" - .format(pass_id, \ - batch_id, loss[0], acc1[0], acc5[0], \ - "%2.2f sec" % period)) - sys.stdout.flush() - - train_loss = np.array(train_info[0]).mean() - train_acc1 = np.array(train_info[1]).mean() - train_acc5 = np.array(train_info[2]).mean() - for data in test_reader(): - t1 = time.time() - loss, acc1, acc5 = exe.run( - inference_program, - feed=feeder.feed(data), - fetch_list=[avg_cost, acc_top1, acc_top5]) - t2 = time.time() - period = t2 - t1 - test_info[0].append(loss[0]) - test_info[1].append(acc1[0]) - test_info[2].append(acc5[0]) - if batch_id % 10 == 0: - print("Pass {0},testbatch {1},loss {2}, \ - acc1 {3},acc5 {4},time {5}" - .format(pass_id, \ - batch_id, loss[0], acc1[0], acc5[0], \ - "%2.2f sec" % period)) - sys.stdout.flush() - - test_loss = np.array(test_info[0]).mean() - test_acc1 = np.array(test_info[1]).mean() - test_acc5 = np.array(test_info[2]).mean() - - print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \ - test_loss {4}, test_acc1 {5}, test_acc5 {6}" - .format(pass_id, \ - train_loss, train_acc1, train_acc5, test_loss, test_acc1, \ - test_acc5)) - sys.stdout.flush() - - model_path = os.path.join(model_save_dir, str(pass_id)) - if not os.path.isdir(model_path): - os.makedirs(model_path) - fluid.io.save_persistables(exe, model_path) - - -if __name__ == '__main__': - epoch_points = [30, 60, 90] - total_images = 1281167 - batch_size = 256 - step = int(total_images / batch_size + 1) - bd = [e * step for e in epoch_points] - lr = [0.1, 0.01, 0.001, 0.0001] - - lr_strategy = {"bd": bd, "lr": lr} - - use_nccl = True - # layers: 50, 152 - layers = 50 - - train( - learning_rate=0.1, - batch_size=batch_size, - num_passes=120, - init_model=None, - parallel=True, - use_nccl=True, - lr_strategy=lr_strategy, - layers=layers) diff --git a/fluid/image_classification/train.py b/fluid/image_classification/train.py new file mode 100644 index 0000000000000000000000000000000000000000..f402c87d49862fd844d8cf36c6eb52f3e21895b3 --- /dev/null +++ b/fluid/image_classification/train.py @@ -0,0 +1,311 @@ +import os +import numpy as np +import time +import sys +import paddle.v2 as paddle 
+import paddle.fluid as fluid
+from se_resnext import SE_ResNeXt
+import reader
+
+import argparse
+import functools
+from utility import add_arguments, print_arguments
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('batch_size',   int,  256,  "Minibatch size.")
+add_arg('num_layers',   int,  50,   "How many layers for SE-ResNeXt model.")
+add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.")
+add_arg('parallel_exe', bool, True, "Whether to use ParallelExecutor to train or not.")
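+
+#editor's note -- example invocation using the flags defined above (the values
+#shown are the defaults):
+#   python train.py --batch_size=256 --num_layers=50 --with_mem_opt=True --parallel_exe=True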
+
+def train_parallel_do(args,
+                      learning_rate,
+                      batch_size,
+                      num_passes,
+                      init_model=None,
+                      model_save_dir='model',
+                      parallel=True,
+                      use_nccl=True,
+                      lr_strategy=None,
+                      layers=50):
+    class_dim = 1000
+    image_shape = [3, 224, 224]
+
+    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if parallel:
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
+
+        with pd.do():
+            image_ = pd.read_input(image)
+            label_ = pd.read_input(label)
+            out = SE_ResNeXt(input=image_, class_dim=class_dim, layers=layers)
+            cost = fluid.layers.cross_entropy(input=out, label=label_)
+            avg_cost = fluid.layers.mean(x=cost)
+            acc_top1 = fluid.layers.accuracy(input=out, label=label_, k=1)
+            acc_top5 = fluid.layers.accuracy(input=out, label=label_, k=5)
+            pd.write_output(avg_cost)
+            pd.write_output(acc_top1)
+            pd.write_output(acc_top5)
+
+        avg_cost, acc_top1, acc_top5 = pd()
+        avg_cost = fluid.layers.mean(x=avg_cost)
+        acc_top1 = fluid.layers.mean(x=acc_top1)
+        acc_top5 = fluid.layers.mean(x=acc_top5)
+    else:
+        out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
+        cost = fluid.layers.cross_entropy(input=out, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+
+    if lr_strategy is None:
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+    else:
+        bd = lr_strategy["bd"]
+        lr = lr_strategy["lr"]
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+
+    inference_program = fluid.default_main_program().clone(for_test=True)
+
+    opts = optimizer.minimize(avg_cost)
+    if args.with_mem_opt:
+        fluid.memory_optimize(fluid.default_main_program())
+        fluid.memory_optimize(inference_program)
+
+    place = fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    if init_model is not None:
+        fluid.io.load_persistables(exe, init_model)
+
+    train_reader = paddle.batch(reader.train(), batch_size=batch_size)
+    test_reader = paddle.batch(reader.test(), batch_size=batch_size)
+    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
+
+    for pass_id in range(num_passes):
+        train_info = [[], [], []]
+        test_info = [[], [], []]
+        for batch_id, data in enumerate(train_reader()):
+            t1 = time.time()
+            loss, acc1, acc5 = exe.run(
+                fluid.default_main_program(),
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost, acc_top1, acc_top5])
+            t2 = time.time()
+            period = t2 - t1
+            train_info[0].append(loss[0])
+            train_info[1].append(acc1[0])
+            train_info[2].append(acc5[0])
+            if batch_id % 10 == 0:
+                print("Pass {0}, trainbatch {1}, loss {2}, \
+                    acc1 {3}, acc5 {4} time {5}"
+                      .format(pass_id, \
+                       batch_id, loss[0], acc1[0], acc5[0], \
+                       "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        train_loss = np.array(train_info[0]).mean()
+        train_acc1 = np.array(train_info[1]).mean()
+        train_acc5 = np.array(train_info[2]).mean()
+        for batch_id, data in enumerate(test_reader()):
+            t1 = time.time()
+            loss, acc1, acc5 = exe.run(
+                inference_program,
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost, acc_top1, acc_top5])
+            t2 = time.time()
+            period = t2 - t1
+            test_info[0].append(loss[0])
+            test_info[1].append(acc1[0])
+            test_info[2].append(acc5[0])
+            if batch_id % 10 == 0:
+                print("Pass {0},testbatch {1},loss {2}, \
+                    acc1 {3},acc5 {4},time {5}"
+                      .format(pass_id, \
+                       batch_id, loss[0], acc1[0], acc5[0], \
+                       "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        test_loss = np.array(test_info[0]).mean()
+        test_acc1 = np.array(test_info[1]).mean()
+        test_acc5 = np.array(test_info[2]).mean()
+
+        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \
+            test_loss {4}, test_acc1 {5}, test_acc5 {6}"
+              .format(pass_id, \
+               train_loss, train_acc1, train_acc5, test_loss, test_acc1, \
+               test_acc5))
+        sys.stdout.flush()
+
+        model_path = os.path.join(model_save_dir, str(pass_id))
+        if not os.path.isdir(model_path):
+            os.makedirs(model_path)
+        fluid.io.save_persistables(exe, model_path)
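+
+#editor's note: train_parallel_do above replicates the network per device with
+#fluid.layers.ParallelDo and averages the per-device outputs inside the program
+#itself, while train_parallel_exe below builds the program once and lets
+#fluid.ParallelExecutor run it on all GPUs; its fetches appear to come back as
+#per-device arrays, hence the np.mean reductions after each run.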
+
+def train_parallel_exe(args,
+                       learning_rate,
+                       batch_size,
+                       num_passes,
+                       init_model=None,
+                       model_save_dir='model',
+                       parallel=True,
+                       use_nccl=True,
+                       lr_strategy=None,
+                       layers=50):
+    class_dim = 1000
+    image_shape = [3, 224, 224]
+
+    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
+    cost = fluid.layers.cross_entropy(input=out, label=label)
+    acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+    acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    test_program = fluid.default_main_program().clone(for_test=True)
+
+    if lr_strategy is None:
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+    else:
+        bd = lr_strategy["bd"]
+        lr = lr_strategy["lr"]
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+
+    opts = optimizer.minimize(avg_cost)
+
+    if args.with_mem_opt:
+        fluid.memory_optimize(fluid.default_main_program())
+        fluid.memory_optimize(test_program)
+
+    place = fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    if init_model is not None:
+        fluid.io.load_persistables(exe, init_model)
+
+    train_reader = paddle.batch(reader.train(), batch_size=batch_size)
+    test_reader = paddle.batch(reader.test(), batch_size=batch_size)
+    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
+
+    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
+    test_exe = fluid.ParallelExecutor(
+        use_cuda=True,
+        main_program=test_program,
+        share_vars_from=train_exe)
+
+    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]
+
+    for pass_id in range(num_passes):
+        train_info = [[], [], []]
+        test_info = [[], [], []]
+        for batch_id, data in enumerate(train_reader()):
+            t1 = time.time()
+            loss, acc1, acc5 = train_exe.run(
+                fetch_list,
+                feed_dict=feeder.feed(data))
+            t2 = time.time()
+            period = t2 - t1
+            loss = np.mean(np.array(loss))
+            acc1 = np.mean(np.array(acc1))
+            acc5 = np.mean(np.array(acc5))
+            train_info[0].append(loss)
+            train_info[1].append(acc1)
+            train_info[2].append(acc5)
+            if batch_id % 10 == 0:
+                print("Pass {0}, trainbatch {1}, loss {2}, \
+                    acc1 {3}, acc5 {4} time {5}"
+                      .format(pass_id, \
+                       batch_id, loss, acc1, acc5, \
+                       "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        train_loss = np.array(train_info[0]).mean()
+        train_acc1 = np.array(train_info[1]).mean()
+        train_acc5 = np.array(train_info[2]).mean()
+        for batch_id, data in enumerate(test_reader()):
+            t1 = time.time()
+            loss, acc1, acc5 = test_exe.run(
+                fetch_list,
+                feed_dict=feeder.feed(data))
+            t2 = time.time()
+            period = t2 - t1
+            loss = np.mean(np.array(loss))
+            acc1 = np.mean(np.array(acc1))
+            acc5 = np.mean(np.array(acc5))
+            test_info[0].append(loss)
+            test_info[1].append(acc1)
+            test_info[2].append(acc5)
+            if batch_id % 10 == 0:
+                print("Pass {0},testbatch {1},loss {2}, \
+                    acc1 {3},acc5 {4},time {5}"
+                      .format(pass_id, \
+                       batch_id, loss, acc1, acc5, \
+                       "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        test_loss = np.array(test_info[0]).mean()
+        test_acc1 = np.array(test_info[1]).mean()
+        test_acc5 = np.array(test_info[2]).mean()
+
+        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \
+            test_loss {4}, test_acc1 {5}, test_acc5 {6}"
+              .format(pass_id, \
+               train_loss, train_acc1, train_acc5, test_loss, test_acc1, \
+               test_acc5))
+        sys.stdout.flush()
+
+        model_path = os.path.join(model_save_dir, str(pass_id))
+        if not os.path.isdir(model_path):
+            os.makedirs(model_path)
+        fluid.io.save_persistables(exe, model_path)
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    print_arguments(args)
+
+    epoch_points = [30, 60, 90]
+    total_images = 1281167
+    batch_size = args.batch_size
+    step = int(total_images / batch_size + 1)
+    bd = [e * step for e in epoch_points]
+    lr = [0.1, 0.01, 0.001, 0.0001]
+
+    lr_strategy = {"bd": bd, "lr": lr}
+
+    use_nccl = True
+    # layers: 50, 152
+    layers = args.num_layers
+    method = train_parallel_exe if args.parallel_exe else train_parallel_do
+    method(args,
+           learning_rate=0.1,
+           batch_size=batch_size,
+           num_passes=120,
+           init_model=None,
+           parallel=True,
+           use_nccl=True,
+           lr_strategy=lr_strategy,
+           layers=layers)
diff --git a/fluid/image_classification/utility.py b/fluid/image_classification/utility.py
new file mode 100644
index 0000000000000000000000000000000000000000..506e6007ceb9059caf1163befb6ff594d67b547a
--- /dev/null
+++ b/fluid/image_classification/utility.py
@@ -0,0 +1,62 @@
+"""Contains common utility functions."""
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import distutils.util
+import numpy as np
+from paddle.fluid import core
+
+
+def print_arguments(args):
+    """Print argparse's arguments.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="John", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    """
+    print("-----------  Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).iteritems()):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Add argparse's argument.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        add_argument("name", str, "John", "User name.", parser)
+        args = parser.parse_args()
+    """
+    type = distutils.util.strtobool if type == bool else type
+    argparser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
diff --git a/fluid/policy_gradient/brain.py b/fluid/policy_gradient/brain.py
index 9c7041e698b0cdc03d614ffdef7d779f4f632327..ad556902f1f2d9b40e9ce8905373541decffa642 100644
--- a/fluid/policy_gradient/brain.py
+++ b/fluid/policy_gradient/brain.py
@@ -30,15 +30,12 @@ class PolicyGradient:
         acts = fluid.layers.data(name='acts', shape=[1], dtype='int64')
         vt = fluid.layers.data(name='vt', shape=[1], dtype='float32')
         # fc1
-        fc1 = fluid.layers.fc(
-            input=obs,
-            size=10,
-            act="tanh"  # tanh activation
-        )
+        fc1 = fluid.layers.fc(input=obs, size=10, act="tanh")  # tanh activation
         # fc2
         self.all_act_prob = fluid.layers.fc(input=fc1,
                                             size=self.n_actions,
                                             act="softmax")
+        self.inference_program = fluid.default_main_program().clone()
         # to maximize total reward (log_p * R) is to minimize -(log_p * R)
         neg_log_prob = fluid.layers.cross_entropy(
             input=self.all_act_prob,
@@ -52,10 +49,9 @@ class PolicyGradient:
         self.exe.run(fluid.default_startup_program())
 
     def choose_action(self, observation):
-        prob_weights = self.exe.run(
-            fluid.default_main_program().prune(self.all_act_prob),
-            feed={"obs": observation[np.newaxis, :]},
-            fetch_list=[self.all_act_prob])
+        prob_weights = self.exe.run(self.inference_program,
+                                    feed={"obs": observation[np.newaxis, :]},
+                                    fetch_list=[self.all_act_prob])
         prob_weights = np.array(prob_weights[0])
         action = np.random.choice(
             range(prob_weights.shape[1]),