Commit e246cde7 authored by frankwhzhang

Merge branch 'develop' of https://github.com/PaddlePaddle/models into develop

......@@ -21,6 +21,7 @@ import reduction
custom_layers = get_registered_layers()
def set_args(f, params, node=None):
""" set args for function 'f' using the parameters in node.layer.parameters
......
......@@ -18,12 +18,14 @@ def crop_shape(input_shape, shape=None):
assert len(input_shape) == 2, "the number of crop's inputs must be 2"
return input_shape[1]
    elif shape is not None:
        assert len(shape) == len(
            input_shape.shape), "input_shape differs from output_shape"
        return shape
else:
raise Exception,"crop_shape input error"
raise Exception, "crop_shape input error"
return None
def crop_layer(input, name, shape=None, axis=2, offset=None):
""" build a layer of type 'Crop' using fluid
......@@ -46,23 +48,28 @@ def crop_layer(input, name, shape=None, axis=2, offset=None):
output_shape = input[1].shape
input_tensor = input[0]
    elif shape is not None:
        assert len(shape) == len(
            input.shape), "input_shape differs from output_shape"
        input_shape = input.shape
        output_shape = shape
        input_tensor = input
else:
raise Exception,"crop_layer input error"
raise Exception, "crop_layer input error"
assert len(output_shape) == len(input_shape), "input_shape is diff with output_shape"
assert len(output_shape) == len(
input_shape), "input_shape is diff with output_shape"
if axis < 0:
axis += len(input_shape)
if offset is not None:
        assert (len(input_shape) - axis) == len(
            offset), "invalid offset[%s] in crop layer" % (str(offset))
offset = [0] * axis + offset
import paddle.fluid as fluid
    output = fluid.layers.crop(
        input_tensor, shape=output_shape, offsets=offset, name=name)
return output
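# A minimal usage sketch of crop_layer (the variable names and sizes here are
# illustrative assumptions, not part of the converter): crop input[0] to the
# spatial shape of input[1], mirroring Caffe's two-bottom Crop layer.
#
#     import paddle.fluid as fluid
#     x = fluid.layers.data(name='x', shape=[3, 224, 224])      # (-1, 3, 224, 224)
#     ref = fluid.layers.data(name='ref', shape=[3, 200, 200])  # (-1, 3, 200, 200)
#     y = crop_layer([x, ref], name='crop1', axis=2, offset=[8, 8])
#     # y follows ref's shape from axis 2 on, cropped starting at offset (8, 8)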
......
""" A custom layer for 'detectionout' used in 'SSD' model to produce outputs
    Note: Paddle's implementation of 'detectionout' applies 'flatten' and 'softmax' ops to the 'conf' input,
        while Caffe's implementation does not.
"""
from .register import register
......
......@@ -18,7 +18,7 @@ def reduction_shape(input_shape, axis=0):
axis += len(input_shape) + 1
assert axis <= len(input_shape), 'invalid axis[%d] error' % (axis)
return input_shape[0:axis]
......@@ -35,27 +35,33 @@ def reduction_layer(input, name, axis=0, operation=1, coeff=1.0):
Returns:
output (variable): output variable for this layer
"""
    assert operation >= 1 and operation <= 4, "invalid reduction operation [%s]" % (
        operation)
input_len = len(input.shape)
if axis < 0:
axis += input_len + 1
    dim = list(range(input_len))
import paddle.fluid as fluid
    if operation == 1:  ## operation = SUM
        output = fluid.layers.reduce_sum(
            input, dim=dim[axis:], keep_dim=False, name=name)
    elif operation == 2:  ## operation = ASUM
        absout = fluid.layers.abs(input)
        output = fluid.layers.reduce_sum(
            absout, dim=dim[axis:], keep_dim=False, name=name)
    elif operation == 3:  ## operation = SUMSQ
        powout = fluid.layers.pow(x=input, factor=2.0)
        output = fluid.layers.reduce_sum(
            powout, dim=dim[axis:], keep_dim=False, name=name)
    else:  ## operation = MEAN
        output = fluid.layers.reduce_mean(
            input, dim=dim[axis:], keep_dim=False, name=name)
mulout = fluid.layers.scale(x=output, scale=coeff)
return mulout
register(kind='Reduction', shape=reduction_shape, layer=reduction_layer)
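# A quick reference for the four Caffe ReductionParameter ops handled above,
# on a hand-computed toy case (illustrative only):
#
#     x = [[1., -2.], [3., -4.]], axis=1, coeff=0.5
#     SUM   -> 0.5 * [1 + (-2), 3 + (-4)]       = [-0.5, -0.5]
#     ASUM  -> 0.5 * [|1| + |-2|, |3| + |-4|]   = [ 1.5,  3.5]
#     SUMSQ -> 0.5 * [1 + 4, 9 + 16]            = [ 2.5, 12.5]
#     MEAN  -> 0.5 * [(1 - 2) / 2, (3 - 4) / 2] = [-0.25, -0.25]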
......@@ -23,10 +23,13 @@ def layer(op):
else:
layer_input = list(self.terminals)
self.layer_reverse_trace[name] = layer_input
# Perform the operation and get the output.
layer_output = op(self, layer_input, *args, **kwargs)
# Add to layer LUT.
self.layers[name] = layer_output
self.var2name[layer_output.name] = (name, layer_output)
# This output is now the input for the next layer.
self.feed(layer_output)
# Return self for chained calls.
......@@ -49,12 +52,31 @@ class Network(object):
self.paddle_env = None
self.output_names = []
self.name_trace = None
self.layer_reverse_trace = {}
self.var2name = {}
self.setup()
def setup(self):
'''Construct the network. '''
raise NotImplementedError('Must be implemented by the subclass.')
    def locate_ancestor(self, v, which=[0], ancestor_level=1):
        """ find an ancestor of node 'v' (a fluid variable) by walking
            'ancestor_level' steps back along the layer trace
        """
ancestor = None
which = which * ancestor_level
name = self.var2name[v.name][0]
for i in range(ancestor_level):
v = self.layer_reverse_trace[name]
if type(v) is list:
ancestor = self.var2name[v[which[i]].name]
else:
ancestor = self.var2name[v.name]
name = ancestor[0]
return ancestor
def load(self, data_path, exe=None, place=None, ignore_missing=False):
'''Load network weights.
data_path: The path to the numpy-serialized network weights
......@@ -316,7 +338,8 @@ class Network(object):
s_w,
ceil_mode,
padding,
            name=self.get_unique_output_name(name, 'avg_pool'),
            exclusive=False)
@layer
def sigmoid(self, input, name):
......@@ -395,17 +418,35 @@ class Network(object):
return output
@layer
    def softmax(self, input, axis=2, name=None):
        fluid = import_fluid()
        shape = input.shape
        dims = len(shape)
        axis = axis + dims if axis < 0 else axis
        need_transpose = False
        if axis + 1 != dims:
            need_transpose = True
        if need_transpose:
            # move the softmax axis to the last position
            order = list(range(dims))
            order.remove(axis)
            order.append(axis)
            input = fluid.layers.transpose(
                input,
                perm=order,
                name=self.get_unique_output_name(name, 'transpose'))
        output = fluid.layers.softmax(
            input, name=self.get_unique_output_name(name, 'softmax'))
        if need_transpose:
            # apply the inverse permutation to restore the original layout
            order = list(range(dims))
            order.insert(axis, order.pop(-1))
            output = fluid.layers.transpose(
                output,
                perm=order,
                name=self.get_unique_output_name(name, 'transpose'))
        return output
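    # Why the restore step must apply the true inverse permutation: moving an
    # axis to the end and repeating the same move is only an involution when
    # axis == dims - 2. A quick numpy check (illustrative only):
    #
    #     import numpy as np
    #     dims, axis = 4, 1
    #     x = np.random.rand(2, 5, 3, 4)
    #     fwd = list(range(dims)); fwd.remove(axis); fwd.append(axis)
    #     inv = list(range(dims)); inv.insert(axis, inv.pop(-1))
    #     assert np.array_equal(x.transpose(fwd).transpose(inv), x)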
@layer
......@@ -502,6 +543,13 @@ class Network(object):
def custom_layer(self, inputs, kind, name, *args, **kwargs):
""" make custom layer
"""
        # FIXME: work around the API difference between Caffe's and Paddle's
        # 'DetectionOutput': rewire 'conf' to skip the extra 'softmax' and
        # 'flatten' ops that Paddle applies internally (see the note in the
        # custom layer's docstring)
if kind == "DetectionOutput":
conf_var = inputs[1]
real_conf_var = self.locate_ancestor(conf_var, ancestor_level=2)
inputs[1] = real_conf_var[1]
name = self.get_unique_output_name(name, kind)
layer_factory = self.custom_layer_factory()
return layer_factory(kind, inputs, name, *args, **kwargs)
......@@ -156,7 +156,7 @@ class PaddleMapper(NodeMapper):
return MaybeActivated(node)('fc', node.parameters.num_output)
def map_softmax(self, node):
        return PaddleNode('softmax', node.parameters.axis)
def map_lrn(self, node):
params = node.parameters
......
......@@ -62,9 +62,11 @@ def shape_identity(node):
def shape_scalar(node):
return make_tensor(1, 1, 1, 1)
def shape_crop(node):
    raise KaffeError('crop function has been defined in custom_layers')
def shape_data(node):
if node.output_shape:
# Old-style input specification
......
......@@ -166,7 +166,7 @@ def xception_block(input,
filters = check(filters, repeat_number)
strides = check(strides, repeat_number)
data = input
    results = []
for i in range(repeat_number):
with scope('separable_conv' + str(i + 1)):
if not activation_fn_in_separable_conv:
......@@ -185,9 +185,9 @@ def xception_block(input,
filters[i],
dilation=dilation,
act=relu)
            results.append(data)
if not has_skip:
        return append_op_result(data, 'xception_block'), results
if skip_conv:
with scope('shortcut'):
skip = bn(
......@@ -195,7 +195,7 @@ def xception_block(input,
input, channels[-1], 1, strides[-1], groups=1, padding=0))
else:
skip = input
    return append_op_result(data + skip, 'xception_block'), results
def entry_flow(data):
......@@ -209,10 +209,10 @@ def entry_flow(data):
with scope("block1"):
data, _ = xception_block(data, 128, [1, 1, 2])
with scope("block2"):
        data, results = xception_block(data, 256, [1, 1, 2])
with scope("block3"):
data, _ = xception_block(data, 728, [1, 1, 2])
    return data, results[1]
def middle_flow(data):
......
......@@ -90,21 +90,11 @@ To train the model, [cocoapi](https://github.com/cocodataset/cocoapi) is needed.
* Use momentum optimizer with momentum=0.9.
* Weight decay is 0.0001.
* In the first 500 iterations, the learning rate increases linearly from 0.00333 to 0.01. Then the learning rate is decayed at iterations 120000 and 160000 with multipliers 0.1 and 0.01, and the maximum iteration is 180000. We also released a 2x model, which trains for 360000 iterations with the learning rate decayed at 240000 and 320000. These configurations can be set by max_iter and lr_steps in config.py (see the sketch after this list).
* In non-basic convolutional layers, set the learning rate of bias to twice the global learning rate.
* In basic convolutional layers, the parameters of the affine layers and the res body are not updated.
* Training runs on 8 NVIDIA Tesla V100 GPUs and takes about 40 hours in total.
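The warm-up and piecewise-decay schedule described above can be written out as follows. This is a minimal sketch: the constants mirror config.py, but the helper name `lr_at_iter` is illustrative and not part of the code base.
```python
def lr_at_iter(it, base_lr=0.01, warm_up_iter=500, warm_up_factor=1. / 3,
               lr_steps=(120000, 160000), lr_gamma=0.1):
    """Learning rate at iteration `it`: linear warm-up, then piecewise decay."""
    if it < warm_up_iter:
        # linear warm-up from base_lr * warm_up_factor (~0.00333) to base_lr (0.01)
        start = base_lr * warm_up_factor
        return start + (base_lr - start) * it / float(warm_up_iter)
    lr = base_lr
    for step in lr_steps:
        if it >= step:
            lr *= lr_gamma  # overall multipliers 0.1, then 0.01
    return lr
```
For the 2x schedule, pass `lr_steps=(240000, 320000)` and stop at 360000 iterations.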
## Evaluation
Evaluation measures the performance of a trained model. This sample provides `eval_coco_map.py`, which uses a COCO-specific mAP metric defined by the [COCO committee](http://cocodataset.org/#detections-eval).
......@@ -118,20 +108,18 @@ Evaluation is to evaluate the performance of a trained model. This sample provid
- Set ```export CUDA_VISIBLE_DEVICES=0``` to specify one GPU for evaluation.
Evaluation results are shown below:
| Model | RoI function | Batch size | Max iteration | mAP |
| :--------------- | :--------: | :------------: | :------------------: |------: |
| [Fluid RoIPool minibatch padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_pool_minibatch_padding.tar.gz) | RoIPool | 8 | 180000 | 0.314 |
| [Fluid RoIPool no padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_pool_no_padding.tar.gz) | RoIPool | 8 | 180000 | 0.316 |
| [Fluid RoIAlign no padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_align_no_padding.tar.gz) | RoIAlign | 8 | 180000 | 0.345 |
| [Fluid RoIAlign no padding 2x](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_align_no_padding_2x.tar.gz) | RoIAlign | 8 | 360000 | 0.364 |
* Fluid RoIPool minibatch padding: use RoIPool; images in one batch are padded to the same size. This matches detectron.
* Fluid RoIPool no padding: use RoIPool; images are not padded.
* Fluid RoIAlign no padding: use RoIAlign; images are not padded.
* Fluid RoIAlign no padding 2x: use RoIAlign; images are not padded; trained for 360000 iterations with the learning rate decayed at 240000 and 320000.
## Inference and Visualization
......
......@@ -81,20 +81,10 @@ Faster RCNN object detection model
* When RPN selects anchors, rpn\_fg\_fraction=0.5, rpn\_positive\_overlap=0.7, rpn\_negative\_overlap=0.3.
**Training strategy:**
* Train Faster RCNN with the momentum optimizer, momentum=0.9.
* Weight decay is 0.0001. In the first 500 iterations the learning rate increases linearly from 0.00333 to 0.01; it is then decayed with multipliers 0.1 and 0.01 at iterations 120000 and 160000, and the maximum iteration is 180000. We also provide a 2x model trained for 360000 iterations with the learning rate decayed at 240000 and 320000, other parameters unchanged; the maximum iteration and the learning rate schedule can be set by max_iter and lr_steps in config.py.
* In non-basic convolutional layers, the learning rate of bias is twice the global learning rate.
* In basic convolutional layers, the parameters of affine\_layers and the res2 layer are not updated.
* Training on 8 NVIDIA Tesla V100 GPUs in parallel takes about 40 hours in total.
......@@ -111,24 +101,21 @@ Faster RCNN training loss
- Set `export CUDA_VISIBLE_DEVICES=0` to evaluate on a single GPU.
The table below shows the evaluation results:
| Model | RoI function | Batch size | Max iteration | mAP |
| :--------------- | :--------: | :------------: | :------------------: |------: |
| [Fluid RoIPool minibatch padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_pool_minibatch_padding.tar.gz) | RoIPool | 8 | 180000 | 0.314 |
| [Fluid RoIPool no padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_pool_no_padding.tar.gz) | RoIPool | 8 | 180000 | 0.316 |
| [Fluid RoIAlign no padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_align_no_padding.tar.gz) | RoIAlign | 8 | 180000 | 0.345 |
| [Fluid RoIAlign no padding 2x](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_align_no_padding_2x.tar.gz) | RoIAlign | 8 | 360000 | 0.364 |
* Fluid RoIPool minibatch padding: use RoIPool; images in one batch are padded to the same size. This matches detectron.
* Fluid RoIPool no padding: use RoIPool; images are not padded.
* Fluid RoIAlign no padding: use RoIAlign; images are not padded.
* Fluid RoIAlign no padding 2x: use RoIAlign; images are not padded; trained for 360000 iterations with the learning rate decayed at 240000 and 320000.
## Inference and Visualization
......
......@@ -163,15 +163,17 @@ _C.spatial_scale = 1. / 16.
# derived learning rate to get the final learning rate.
_C.learning_rate = 0.01
# maximum number of iterations, 1x: 180000, 2x: 360000
_C.max_iter = 180000
#_C.max_iter = 360000
# warm up to learning rate
_C.warm_up_iter = 500
_C.warm_up_factor = 1. / 3.
# lr steps_with_decay, 1x: [120000, 160000], 2x: [240000, 320000]
_C.lr_steps = [120000, 160000]
#_C.lr_steps = [240000, 320000]
_C.lr_gamma = 0.1
# L2 regularization hyperparameter
......
# Simple Baselines for Human Pose Estimation in Fluid
## Introduction
This is a simple demonstration of a re-implementation in [PaddlePaddle.Fluid](http://www.paddlepaddle.org/en) of the paper [Simple Baselines for Human Pose Estimation and Tracking](https://arxiv.org/abs/1804.06208) (ECCV'18) from MSRA.
![demo](demo.gif)
> **Video in Demo**: *Bruno Mars - That’s What I Like [Official Video]*.
## Requirements
- Python == 2.7
- PaddlePaddle >= 1.0
- opencv-python >= 3.3
- tqdm >= 4.25
## Environment
The code is developed and tested with 4 Tesla K40 GPU cards on CentOS, with CUDA-9.2/8.0 and cuDNN-7.1 installed.
## Known Issues
- The model does not converge with a large batch\_size (e.g. 32) on Tesla P40 / V100 / P100 GPU cards, because PaddlePaddle uses cuDNN's batch normalization. Reducing batch\_size to 1 image per card during training eases this problem, but the resulting performance has not been verified. The issue can be tracked [here](https://github.com/PaddlePaddle/Paddle/issues/14580).
## Results on MPII Val
| Arch | Head | Shoulder | Elbow | Wrist | Hip | Knee | Ankle | Mean | Mean@0.1| Models |
| ---- |:----:|:--------:|:-----:|:-----:|:---:|:----:|:-----:|:----:|:-------:|:------:|
| 384x384\_pose\_resnet\_50 in PyTorch | 96.658 | 95.754 | 89.790 | 84.614 | 88.523 | 84.666 | 79.287 | 89.066 | 38.046 | - |
| 384x384\_pose\_resnet\_50 in Fluid | 96.248 | 95.346 | 89.807 | 84.873 | 88.298 | 83.679 | 78.649 | 88.767 | 37.374 | [`link`](http://paddlemodels.bj.bcebos.com/pose/pose-resnet-50-384x384-mpii.tar.gz) |
### Notes:
- Flip test is used.
- We did not search hard for the best model; validation simply uses the last saved checkpoint.
## Getting Started
### Prepare Datasets and Pretrained Models
- Follow the [instruction](https://github.com/Microsoft/human-pose-estimation.pytorch#data-preparation) to prepare the datasets.
- Download the pretrained ResNet-50 model in PaddlePaddle.Fluid on ImageNet from [Model Zoo](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification#supported-models-and-performances).
```bash
wget http://paddle-imagenet-models.bj.bcebos.com/resnet_50_model.tar
```
Then, put them in the folder `pretrained` under the root directory of this repo, so that the layout looks like:
```
${THIS REPO ROOT}
`-- pretrained
`-- resnet_50
|-- 115
`-- data
`-- coco
|-- annotations
|-- images
`-- mpii
|-- annot
|-- images
```
### Install [COCOAPI](https://github.com/cocodataset/cocoapi)
```bash
# COCOAPI=/path/to/clone/cocoapi
git clone https://github.com/cocodataset/cocoapi.git $COCOAPI
cd $COCOAPI/PythonAPI
# if cython is not installed
pip install Cython
# Install into global site-packages
make install
# Alternatively, if you do not have permissions or prefer
# not to install the COCO API into global site-packages
python2 setup.py install --user
```
### Perform Validating
Download the checkpoint of Pose-ResNet-50 trained on the MPII dataset from [here](http://paddlemodels.bj.bcebos.com/pose/pose-resnet-50-384x384-mpii.tar.gz). Extract it into the folder `checkpoints` under the root directory of this repo, then run
```bash
python2 val.py --dataset 'mpii' --checkpoint 'checkpoints/pose-resnet-50-384x384-mpii'
```
### Perform Training
```bash
python2 train.py --dataset 'mpii' # or coco
```
**Note**: Configurations for training are aggregated in `lib/mpii_reader.py` and `lib/coco_reader.py`.
### Perform Test on Images
Put the images into the folder `test` under the root directory of this repo, then run
```bash
python2 test.py --checkpoint 'checkpoints/pose-resnet-50-384x384-mpii'
```
If there are multiple persons in an image, detectors such as [Faster R-CNN](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/faster_rcnn), [SSD](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/object_detection) or others should be used first to crop them out, because the simple baseline for human pose estimation is a top-down method.
## Reference
- Simple Baselines for Human Pose Estimation and Tracking in PyTorch [`code`](https://github.com/Microsoft/human-pose-estimation.pytorch#data-preparation)
## License
This code is released under the Apache License 2.0.
# Copyright (c) 2018-present, Baidu, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
"""Libs for data reader."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import shutil
import cv2
import numpy as np
def visualize(cfg, filename, data_numpy, input, joints, target):
"""
:param cfg: global configurations for dataset
:param filename: the name of image file
:param data_numpy: original numpy image data
:param input: input tensor [b, c, h, w]
:param joints: [num_joints, 3]
:param target: target tensor [b, c, h, w]
"""
TMPDIR = cfg.TMPDIR
NUM_JOINTS = cfg.NUM_JOINTS
    if os.path.exists(TMPDIR):
        shutil.rmtree(TMPDIR)
    os.mkdir(TMPDIR)
f = open(os.path.join(TMPDIR, filename), 'w')
f.close()
cv2.imwrite(os.path.join(TMPDIR, 'flip.jpg'), data_numpy)
cv2.imwrite(os.path.join(TMPDIR, 'input.jpg'), input)
for i in range(NUM_JOINTS):
cv2.imwrite(os.path.join(TMPDIR, 'target_{}.jpg'.format(i)), cv2.applyColorMap(
np.uint8(np.expand_dims(target[i], 2)*255.), cv2.COLORMAP_JET))
cv2.circle(input, (int(joints[i, 0]), int(joints[i, 1])), 5, [170, 255, 0], -1)
cv2.imwrite(os.path.join(TMPDIR, 'input_kps.jpg'), input)
def generate_target(cfg, joints, joints_vis):
"""
:param joints: [num_joints, 3]
:param joints_vis: [num_joints, 3]
:return: target, target_weight(1: visible, 0: invisible)
"""
NUM_JOINTS = cfg.NUM_JOINTS
TARGET_TYPE = cfg.TARGET_TYPE
HEATMAP_SIZE = cfg.HEATMAP_SIZE
IMAGE_SIZE = cfg.IMAGE_SIZE
SIGMA = cfg.SIGMA
target_weight = np.ones((NUM_JOINTS, 1), dtype=np.float32)
target_weight[:, 0] = joints_vis[:, 0]
assert TARGET_TYPE == 'gaussian', \
'Only support gaussian map now!'
if TARGET_TYPE == 'gaussian':
target = np.zeros((NUM_JOINTS,
HEATMAP_SIZE[1],
HEATMAP_SIZE[0]),
dtype=np.float32)
tmp_size = SIGMA * 3
for joint_id in range(NUM_JOINTS):
feat_stride = np.array(IMAGE_SIZE) / np.array(HEATMAP_SIZE)
mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5)
mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5)
# Check that any part of the gaussian is in-bounds
ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]
br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]
if ul[0] >= HEATMAP_SIZE[0] or ul[1] >= HEATMAP_SIZE[1] \
or br[0] < 0 or br[1] < 0:
# If not, just return the image as is
target_weight[joint_id] = 0
continue
# Generate gaussian
size = 2 * tmp_size + 1
x = np.arange(0, size, 1, np.float32)
y = x[:, np.newaxis]
x0 = y0 = size // 2
# The gaussian is not normalized, we want the center value to equal 1
g = np.exp(- ((x - x0) ** 2 + (y - y0) ** 2) / (2 * SIGMA ** 2))
# Usable gaussian range
g_x = max(0, -ul[0]), min(br[0], HEATMAP_SIZE[0]) - ul[0]
g_y = max(0, -ul[1]), min(br[1], HEATMAP_SIZE[1]) - ul[1]
# Image range
img_x = max(0, ul[0]), min(br[0], HEATMAP_SIZE[0])
img_y = max(0, ul[1]), min(br[1], HEATMAP_SIZE[1])
v = target_weight[joint_id]
if v > 0.5:
target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \
g[g_y[0]:g_y[1], g_x[0]:g_x[1]]
return target, target_weight
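if __name__ == '__main__':
    # Minimal self-check of generate_target; the toy config below is an
    # illustrative stand-in for the dataset configs defined in the readers.
    class _ToyCfg(object):
        NUM_JOINTS = 2
        TARGET_TYPE = 'gaussian'
        HEATMAP_SIZE = [96, 96]
        IMAGE_SIZE = [384, 384]
        SIGMA = 3
    joints = np.array([[192., 192., 0.], [10., 10., 0.]])
    joints_vis = np.ones((2, 3), dtype=np.float32)
    target, target_weight = generate_target(_ToyCfg(), joints, joints_vis)
    # feat_stride is 384 / 96 = 4, so a visible joint becomes a gaussian
    # peaking near joints / 4 on its 96x96 heatmap
    assert target.shape == (2, 96, 96) and target_weight.shape == (2, 1)
    assert target[0].argmax() == 48 * 96 + 48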
# Copyright (c) 2018-present, Baidu, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
"""Data reader for COCO dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import functools
import numpy as np
import cv2
import random
from utils.transforms import fliplr_joints
from utils.transforms import get_affine_transform
from utils.transforms import affine_transform
from lib.base_reader import visualize, generate_target
from pycocotools.coco import COCO
# NOTE
# -- COCO Datatset --
# "keypoints":
# {
# 0: "nose",
# 1: "left_eye",
# 2: "right_eye",
# 3: "left_ear",
# 4: "right_ear",
# 5: "left_shoulder",
# 6: "right_shoulder",
# 7: "left_elbow",
# 8: "right_elbow",
# 9: "left_wrist",
# 10: "right_wrist",
# 11: "left_hip",
# 12: "right_hip",
# 13: "left_knee",
# 14: "right_knee",
# 15: "left_ankle",
# 16: "right_ankle"
# },
#
# "skeleton":
# [
# [16,14],[14,12],[17,15],[15,13],[12,13],[6,12],[7,13], [6,7],[6,8],
# [7,9],[8,10],[9,11],[2,3],[1,2],[1,3],[2,4],[3,5],[4,6],[5,7]
# ]
class Config:
"""Configurations for COCO dataset.
"""
DEBUG = False
TMPDIR = 'tmp_fold_for_debug'
# For reader
BUF_SIZE = 102400
THREAD = 1 if DEBUG else 8 # have to be larger than 0
# Fixed infos of dataset
DATAROOT = 'data/coco'
IMAGEDIR = 'images'
NUM_JOINTS = 17
FLIP_PAIRS = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]]
PARENT_IDS = None
# CFGS
SCALE_FACTOR = 0.3
ROT_FACTOR = 40
FLIP = True
TARGET_TYPE = 'gaussian'
SIGMA = 3
IMAGE_SIZE = [288, 384]
HEATMAP_SIZE = [72, 96]
ASPECT_RATIO = IMAGE_SIZE[0] * 1.0 / IMAGE_SIZE[1]
MEAN = [0.485, 0.456, 0.406]
STD = [0.229, 0.224, 0.225]
PIXEL_STD = 200
cfg = Config()
def _box2cs(box):
x, y, w, h = box[:4]
return _xywh2cs(x, y, w, h)
def _xywh2cs(x, y, w, h):
center = np.zeros((2), dtype=np.float32)
center[0] = x + w * 0.5
center[1] = y + h * 0.5
if w > cfg.ASPECT_RATIO * h:
h = w * 1.0 / cfg.ASPECT_RATIO
elif w < cfg.ASPECT_RATIO * h:
w = h * cfg.ASPECT_RATIO
scale = np.array(
[w * 1.0 / cfg.PIXEL_STD, h * 1.0 / cfg.PIXEL_STD],
dtype=np.float32)
if center[0] != -1:
scale = scale * 1.25
return center, scale
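# Worked example (numbers are illustrative): for the box [50, 30, 100, 50]
# given as (x, y, w, h), the center is (100.0, 55.0). Since w > ASPECT_RATIO * h
# (100 > 0.75 * 50), h is raised to w / ASPECT_RATIO = 133.33 to keep the crop
# aspect ratio; the scale is expressed in PIXEL_STD (200 px) units and then
# inflated by 1.25:
#
#     center, scale = _box2cs([50, 30, 100, 50])
#     # center -> [100., 55.], scale -> [0.625, 0.8333]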
def _select_data(db):
db_selected = []
for rec in db:
num_vis = 0
joints_x = 0.0
joints_y = 0.0
for joint, joint_vis in zip(
rec['joints_3d'], rec['joints_3d_vis']):
if joint_vis[0] <= 0:
continue
num_vis += 1
joints_x += joint[0]
joints_y += joint[1]
if num_vis == 0:
continue
joints_x, joints_y = joints_x / num_vis, joints_y / num_vis
area = rec['scale'][0] * rec['scale'][1] * (cfg.PIXEL_STD**2)
joints_center = np.array([joints_x, joints_y])
bbox_center = np.array(rec['center'])
diff_norm2 = np.linalg.norm((joints_center-bbox_center), 2)
ks = np.exp(-1.0*(diff_norm2**2) / ((0.2)**2*2.0*area))
metric = (0.2 / 16) * num_vis + 0.45 - 0.2 / 16
if ks > metric:
db_selected.append(rec)
print('=> num db: {}'.format(len(db)))
print('=> num selected db: {}'.format(len(db_selected)))
return db_selected
def _load_coco_keypoint_annotation(image_set_index, coco, _coco_ind_to_class_ind, image_set):
"""Ground truth bbox and keypoints.
"""
print('generating coco gt_db...')
gt_db = []
for index in image_set_index:
im_ann = coco.loadImgs(index)[0]
width = im_ann['width']
height = im_ann['height']
annIds = coco.getAnnIds(imgIds=index, iscrowd=False)
objs = coco.loadAnns(annIds)
# Sanitize bboxes
valid_objs = []
for obj in objs:
x, y, w, h = obj['bbox']
x1 = np.max((0, x))
y1 = np.max((0, y))
x2 = np.min((width - 1, x1 + np.max((0, w - 1))))
y2 = np.min((height - 1, y1 + np.max((0, h - 1))))
if obj['area'] > 0 and x2 >= x1 and y2 >= y1:
obj['clean_bbox'] = [x1, y1, x2-x1, y2-y1]
valid_objs.append(obj)
objs = valid_objs
rec = []
for obj in objs:
cls = _coco_ind_to_class_ind[obj['category_id']]
if cls != 1:
continue
# Ignore objs without keypoints annotation
if max(obj['keypoints']) == 0:
continue
joints_3d = np.zeros((cfg.NUM_JOINTS, 3), dtype=np.float)
joints_3d_vis = np.zeros((cfg.NUM_JOINTS, 3), dtype=np.float)
for ipt in range(cfg.NUM_JOINTS):
joints_3d[ipt, 0] = obj['keypoints'][ipt * 3 + 0]
joints_3d[ipt, 1] = obj['keypoints'][ipt * 3 + 1]
joints_3d[ipt, 2] = 0
t_vis = obj['keypoints'][ipt * 3 + 2]
if t_vis > 1:
t_vis = 1
joints_3d_vis[ipt, 0] = t_vis
joints_3d_vis[ipt, 1] = t_vis
joints_3d_vis[ipt, 2] = 0
center, scale = _box2cs(obj['clean_bbox'][:4])
rec.append({
'image': os.path.join(cfg.DATAROOT, cfg.IMAGEDIR, image_set+'2017', '%012d.jpg' % index),
'center': center,
'scale': scale,
'joints_3d': joints_3d,
'joints_3d_vis': joints_3d_vis,
'filename': '%012d.jpg' % index,
'imgnum': 0,
})
gt_db.extend(rec)
return gt_db
def data_augmentation(sample, is_train):
image_file = sample['image']
filename = sample['filename'] if 'filename' in sample else ''
joints = sample['joints_3d']
joints_vis = sample['joints_3d_vis']
c = sample['center']
s = sample['scale']
# score = sample['score'] if 'score' in sample else 1
# imgnum = sample['imgnum'] if 'imgnum' in sample else ''
r = 0
data_numpy = cv2.imread(
image_file, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
if is_train:
sf = cfg.SCALE_FACTOR
rf = cfg.ROT_FACTOR
s = s * np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf)
r = np.clip(np.random.randn()*rf, -rf*2, rf*2) \
if random.random() <= 0.6 else 0
if cfg.FLIP and random.random() <= 0.5:
data_numpy = data_numpy[:, ::-1, :]
joints, joints_vis = fliplr_joints(
joints, joints_vis, data_numpy.shape[1], cfg.FLIP_PAIRS)
c[0] = data_numpy.shape[1] - c[0] - 1
trans = get_affine_transform(c, s, r, cfg.IMAGE_SIZE)
input = cv2.warpAffine(
data_numpy,
trans,
(int(cfg.IMAGE_SIZE[0]), int(cfg.IMAGE_SIZE[1])),
flags=cv2.INTER_LINEAR)
for i in range(cfg.NUM_JOINTS):
if joints_vis[i, 0] > 0.0:
joints[i, 0:2] = affine_transform(joints[i, 0:2], trans)
# Numpy target
target, target_weight = generate_target(cfg, joints, joints_vis)
if cfg.DEBUG:
visualize(cfg, filename, data_numpy, input.copy(), joints, target)
# Normalization
input = input.astype('float32').transpose((2, 0, 1)) / 255
input -= np.array(cfg.MEAN).reshape((3, 1, 1))
input /= np.array(cfg.STD).reshape((3, 1, 1))
if is_train:
return input, target, target_weight
else:
return input, target, target_weight, c, s
# Create a reader
def _reader_creator(root, image_set, shuffle=False, is_train=False, use_gt_bbox=False):
def reader():
if image_set in ['train', 'val']:
file_name = os.path.join(root, 'annotations', 'person_keypoints_'+image_set+'2017.json')
elif image_set in ['test', 'test-dev']:
file_name = os.path.join(root, 'annotations', 'image_info_'+image_set+'2017.json')
else:
raise ValueError("The dataset '{}' is not supported".format(image_set))
# Load annotations
coco = COCO(file_name)
# Deal with class names
cats = [cat['name']
for cat in coco.loadCats(coco.getCatIds())]
classes = ['__background__'] + cats
print('=> classes: {}'.format(classes))
num_classes = len(classes)
_class_to_ind = dict(zip(classes, range(num_classes)))
_class_to_coco_ind = dict(zip(cats, coco.getCatIds()))
_coco_ind_to_class_ind = dict([(_class_to_coco_ind[cls],
_class_to_ind[cls])
for cls in classes[1:]])
# Load image file names
image_set_index = coco.getImgIds()
num_images = len(image_set_index)
print('=> num_images: {}'.format(num_images))
if is_train or use_gt_bbox:
gt_db = _load_coco_keypoint_annotation(
image_set_index, coco, _coco_ind_to_class_ind, image_set)
gt_db = _select_data(gt_db)
if shuffle:
random.shuffle(gt_db)
for db in gt_db:
yield db
mapper = functools.partial(data_augmentation, is_train=is_train)
return reader, mapper
def train():
reader, mapper = _reader_creator(cfg.DATAROOT, 'train', shuffle=True, is_train=True)
def pop():
for i, x in enumerate(reader()):
yield mapper(x)
return pop
def valid():
reader, mapper = _reader_creator(cfg.DATAROOT, 'val', shuffle=False, is_train=False)
def pop():
for i, x in enumerate(reader()):
yield mapper(x)
return pop
# Copyright (c) 2018-present, Baidu, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
"""Data reader for MPII."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import random
import functools
import json
import numpy as np
import cv2
from utils.transforms import fliplr_joints
from utils.transforms import get_affine_transform
from utils.transforms import affine_transform
from lib.base_reader import visualize, generate_target
class Config:
"""Configurations for MPII dataset.
"""
DEBUG = False
TMPDIR = 'tmp_fold_for_debug'
# For reader
BUF_SIZE = 102400
THREAD = 1 if DEBUG else 8 # have to be larger than 0
# Fixed infos of dataset
DATAROOT = 'data/mpii'
IMAGEDIR = 'images'
NUM_JOINTS = 16
FLIP_PAIRS = [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14], [12, 13]]
PARENT_IDS = [1, 2, 6, 6, 3, 4, 6, 6, 7, 8, 11, 12, 7, 7, 13, 14]
# CFGS
SCALE_FACTOR = 0.3
ROT_FACTOR = 40
FLIP = True
TARGET_TYPE = 'gaussian'
SIGMA = 3
IMAGE_SIZE = [384, 384]
HEATMAP_SIZE = [96, 96]
MEAN = [0.485, 0.456, 0.406]
STD = [0.229, 0.224, 0.225]
cfg = Config()
def data_augmentation(sample, is_train):
image_file = sample['image']
filename = sample['filename'] if 'filename' in sample else ''
joints = sample['joints_3d']
joints_vis = sample['joints_3d_vis']
c = sample['center']
s = sample['scale']
score = sample['score'] if 'score' in sample else 1
# imgnum = sample['imgnum'] if 'imgnum' in sample else ''
r = 0
data_numpy = cv2.imread(
image_file, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
if is_train:
sf = cfg.SCALE_FACTOR
rf = cfg.ROT_FACTOR
s = s * np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf)
r = np.clip(np.random.randn()*rf, -rf*2, rf*2) \
if random.random() <= 0.6 else 0
if cfg.FLIP and random.random() <= 0.5:
data_numpy = data_numpy[:, ::-1, :]
joints, joints_vis = fliplr_joints(
joints, joints_vis, data_numpy.shape[1], cfg.FLIP_PAIRS)
c[0] = data_numpy.shape[1] - c[0] - 1
trans = get_affine_transform(c, s, r, cfg.IMAGE_SIZE)
input = cv2.warpAffine(
data_numpy,
trans,
(int(cfg.IMAGE_SIZE[0]), int(cfg.IMAGE_SIZE[1])),
flags=cv2.INTER_LINEAR)
for i in range(cfg.NUM_JOINTS):
if joints_vis[i, 0] > 0.0:
joints[i, 0:2] = affine_transform(joints[i, 0:2], trans)
# Numpy target
target, target_weight = generate_target(cfg, joints, joints_vis)
if cfg.DEBUG:
visualize(cfg, filename, data_numpy, input.copy(), joints, target)
# Normalization
input = input.astype('float32').transpose((2, 0, 1)) / 255
input -= np.array(cfg.MEAN).reshape((3, 1, 1))
input /= np.array(cfg.STD).reshape((3, 1, 1))
if is_train:
return input, target, target_weight
else:
return input, target, target_weight, c, s, score
def test_data_augmentation(sample):
image_file = sample['image']
filename = sample['filename'] if 'filename' in sample else ''
file_id = int(filename.split('.')[0].split('_')[1])
input = cv2.imread(
image_file, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
input = cv2.resize(input, (int(cfg.IMAGE_SIZE[0]), int(cfg.IMAGE_SIZE[1])))
# Normalization
input = input.astype('float32').transpose((2, 0, 1)) / 255
input -= np.array(cfg.MEAN).reshape((3, 1, 1))
input /= np.array(cfg.STD).reshape((3, 1, 1))
return input, file_id
# Create a reader
def _reader_creator(root, image_set, shuffle=False, is_train=False):
def reader():
if image_set != 'test':
file_name = os.path.join(root, 'annot', image_set+'.json')
with open(file_name) as anno_file:
anno = json.load(anno_file)
print('=> load {} samples of {} dataset'.format(len(anno), image_set))
if shuffle:
random.shuffle(anno)
for a in anno:
image_name = a['image']
c = np.array(a['center'], dtype=np.float)
s = np.array([a['scale'], a['scale']], dtype=np.float)
# Adjust center/scale slightly to avoid cropping limbs
if c[0] != -1:
c[1] = c[1] + 15 * s[1]
s = s * 1.25
# MPII uses matlab format, index is based 1,
# we should first convert to 0-based index
c = c - 1
joints_3d = np.zeros((cfg.NUM_JOINTS, 3), dtype=np.float)
joints_3d_vis = np.zeros((cfg.NUM_JOINTS, 3), dtype=np.float)
joints = np.array(a['joints'])
joints[:, 0:2] = joints[:, 0:2] - 1
joints_vis = np.array(a['joints_vis'])
assert len(joints) == cfg.NUM_JOINTS, \
'joint num diff: {} vs {}'.format(len(joints), cfg.NUM_JOINTS)
joints_3d[:, 0:2] = joints[:, 0:2]
joints_3d_vis[:, 0] = joints_vis[:]
joints_3d_vis[:, 1] = joints_vis[:]
yield dict(
image = os.path.join(cfg.DATAROOT, cfg.IMAGEDIR, image_name),
center = c,
scale = s,
joints_3d = joints_3d,
joints_3d_vis = joints_3d_vis,
filename = image_name,
test_mode = False,
imagenum = 0)
else:
fold = 'test'
for img_name in os.listdir(fold):
yield dict(image = os.path.join(fold, img_name),
filename = img_name)
    if image_set != 'test':
        mapper = functools.partial(data_augmentation, is_train=is_train)
    else:
        mapper = functools.partial(test_data_augmentation)
return reader, mapper
def train():
reader, mapper = _reader_creator(cfg.DATAROOT, 'train', shuffle=True, is_train=True)
def pop():
for i, x in enumerate(reader()):
yield mapper(x)
return pop
def valid():
reader, mapper = _reader_creator(cfg.DATAROOT, 'valid', shuffle=False, is_train=False)
def pop():
for i, x in enumerate(reader()):
yield mapper(x)
return pop
def test():
reader, mapper = _reader_creator(cfg.DATAROOT, 'test')
def pop():
for i, x in enumerate(reader()):
yield mapper(x)
return pop
# Copyright (c) 2018-present, Baidu, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
"""Functions for building network."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
__all__ = ["ResNet", "ResNet50", "ResNet101", "ResNet152"]
# Global parameters
BN_MOMENTUM = 0.1
class ResNet():
def __init__(self, layers=50, kps_num=16, test_mode=False):
"""
        :param layers: int, the number of network layers
        :param kps_num: int, the number of keypoints, in accordance with the dataset
        :param test_mode: bool, if True, only output heatmaps are returned, no loss
        :return: loss, output heatmaps
"""
self.k = kps_num
self.layers = layers
self.test_mode = test_mode
def net(self, input, target=None, target_weight=None):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
num_filters = [64, 128, 256, 512]
conv = self.conv_bn_layer(
input=input, num_filters=64, filter_size=7, stride=2, act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1)
conv = fluid.layers.conv2d_transpose(
input=conv, num_filters=256,
filter_size=4,
padding=1,
stride=2,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Normal(0., 0.001)),
act=None,
bias_attr=False)
conv = fluid.layers.batch_norm(input=conv, act='relu', momentum=BN_MOMENTUM)
conv = fluid.layers.conv2d_transpose(
input=conv, num_filters=256,
filter_size=4,
padding=1,
stride=2,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Normal(0., 0.001)),
act=None,
bias_attr=False)
conv = fluid.layers.batch_norm(input=conv, act='relu', momentum=BN_MOMENTUM)
conv = fluid.layers.conv2d_transpose(
input=conv, num_filters=256,
filter_size=4,
padding=1,
stride=2,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Normal(0., 0.001)),
act=None,
bias_attr=False)
conv = fluid.layers.batch_norm(input=conv, act='relu', momentum=BN_MOMENTUM)
out = fluid.layers.conv2d(
input=conv,
num_filters=self.k,
filter_size=1,
stride=1,
padding=0,
act=None,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Normal(0., 0.001)))
if self.test_mode:
return out
else:
loss = self.calc_loss(out, target, target_weight)
return loss, out
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Normal(0., 0.001)),
act=None,
bias_attr=False)
return fluid.layers.batch_norm(input=conv, act=act, momentum=BN_MOMENTUM)
def shortcut(self, input, ch_out, stride):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
return self.conv_bn_layer(input, ch_out, 1, stride)
else:
return input
def calc_loss(self, heatmap, target, target_weight):
_, c, h, w = heatmap.shape
x = fluid.layers.reshape(heatmap, (-1, self.k, h*w))
y = fluid.layers.reshape(target, (-1, self.k, h*w))
w = fluid.layers.reshape(target_weight, (-1, self.k))
x = fluid.layers.split(x, num_or_sections=self.k, dim=1)
y = fluid.layers.split(y, num_or_sections=self.k, dim=1)
w = fluid.layers.split(w, num_or_sections=self.k, dim=1)
_list = []
for idx in range(self.k):
_tmp = fluid.layers.scale(x=x[idx] - y[idx], scale=1.)
_tmp = _tmp * _tmp
_tmp = fluid.layers.reduce_mean(_tmp, dim=2)
_list.append(_tmp * w[idx])
_loss = fluid.layers.concat(_list, axis=0)
_loss = fluid.layers.reduce_mean(_loss)
return 0.5 * _loss
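    # Numpy reference for calc_loss (illustrative): the value above equals
    #
    #     0.5 * mean over (batch, joint) of
    #               target_weight[b, j] * mean over pixels of (heatmap - target)^2
    #
    # i.e. a per-joint heatmap MSE, masked by joint visibility.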
def bottleneck_block(self, input, num_filters, stride):
conv0 = self.conv_bn_layer(
input=input, num_filters=num_filters, filter_size=1, act='relu')
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu')
conv2 = self.conv_bn_layer(
input=conv1, num_filters=num_filters * 4, filter_size=1, act=None)
short = self.shortcut(input, num_filters * 4, stride)
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def ResNet50():
model = ResNet(layers=50)
return model
def ResNet101():
model = ResNet(layers=101)
return model
def ResNet152():
model = ResNet(layers=152)
return model
# Copyright (c) 2018-present, Baidu, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
"""Functions for inference."""
import os
import argparse
import functools
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from tqdm import tqdm
from lib import pose_resnet
from utils.transforms import flip_back
from utils.utility import *
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 32, "Minibatch size.")
add_arg('dataset', str, 'mpii', "Dataset")
add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
add_arg('num_epochs', int, 140, "Number of epochs.")
add_arg('total_images', int, 144406, "Training image number.")
add_arg('kp_dim', int, 16, "Class number.")
add_arg('model_save_dir', str, "output", "Model save directory")
add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.")
add_arg('pretrained_model', str, None, "Whether to use pretrained model.")
add_arg('checkpoint', str, None, "Whether to resume checkpoint.")
add_arg('lr', float, 0.001, "Set learning rate.")
add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
add_arg('flip_test', bool, True, "Flip test")
add_arg('shift_heatmap', bool, True, "Shift heatmap")
add_arg('post_process', bool, False, "post process")
# yapf: enable
FLIP_PAIRS = [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14], [12, 13]]
def test(args):
if args.dataset == 'coco':
import lib.coco_reader as reader
IMAGE_SIZE = [288, 384]
# HEATMAP_SIZE = [72, 96]
FLIP_PAIRS = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]]
args.kp_dim = 17
args.total_images = 144406 # 149813
elif args.dataset == 'mpii':
import lib.mpii_reader as reader
IMAGE_SIZE = [384, 384]
# HEATMAP_SIZE = [96, 96]
FLIP_PAIRS = [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14], [12, 13]]
args.kp_dim = 16
args.total_images = 2958 # validation
else:
raise ValueError('The dataset {} is not supported yet.'.format(args.dataset))
print_arguments(args)
# Image and target
image = layers.data(name='image', shape=[3, IMAGE_SIZE[1], IMAGE_SIZE[0]], dtype='float32')
file_id = layers.data(name='file_id', shape=[1,], dtype='int')
# Build model
model = pose_resnet.ResNet(layers=50, kps_num=args.kp_dim, test_mode=True)
# Output
output = model.net(input=image, target=None, target_weight=None)
# Parameters from model and arguments
params = {}
params["total_images"] = args.total_images
params["lr"] = args.lr
params["num_epochs"] = args.num_epochs
params["learning_strategy"] = {}
params["learning_strategy"]["batch_size"] = args.batch_size
params["learning_strategy"]["name"] = args.lr_strategy
if args.with_mem_opt:
fluid.memory_optimize(fluid.default_main_program(),
skip_opt_set=[output.name])
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
args.pretrained_model = './pretrained/resnet_50/115'
if args.pretrained_model:
def if_exist(var):
exist_flag = os.path.exists(os.path.join(args.pretrained_model, var.name))
return exist_flag
fluid.io.load_vars(exe, args.pretrained_model, predicate=if_exist)
if args.checkpoint is not None:
fluid.io.load_persistables(exe, args.checkpoint)
# Dataloader
test_reader = paddle.batch(reader.test(), batch_size=args.batch_size)
feeder = fluid.DataFeeder(place=place, feed_list=[image, file_id])
test_exe = fluid.ParallelExecutor(
use_cuda=True if args.use_gpu else False,
main_program=fluid.default_main_program().clone(for_test=False),
loss_name=None)
fetch_list = [image.name, output.name]
for batch_id, data in tqdm(enumerate(test_reader())):
num_images = len(data)
file_ids = []
for i in range(num_images):
file_ids.append(data[i][1])
input_image, out_heatmaps = test_exe.run(
fetch_list=fetch_list,
feed=feeder.feed(data))
        if args.flip_test:
            # Flip all the images in the same batch
data_fliped = []
for i in range(num_images):
data_fliped.append((
data[i][0][:, :, ::-1],
data[i][1]))
# Inference again
_, output_flipped = test_exe.run(
fetch_list=fetch_list,
feed=feeder.feed(data_fliped))
# Flip back
output_flipped = flip_back(output_flipped, FLIP_PAIRS)
# Feature is not aligned, shift flipped heatmap for higher accuracy
if args.shift_heatmap:
output_flipped[:, :, :, 1:] = \
output_flipped.copy()[:, :, :, 0:-1]
# Aggregate
out_heatmaps = (out_heatmaps + output_flipped) * 0.5
save_predict_results(input_image, out_heatmaps, file_ids, fold_name='results')
if __name__ == '__main__':
args = parser.parse_args()
test(args)
# Copyright (c) 2018-present, Baidu, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
"""Functions for training."""
import os
import numpy as np
import cv2
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import argparse
import functools
from lib import pose_resnet
from utils.utility import *
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 32, "Minibatch size.")
add_arg('dataset', str, 'mpii', "Dataset")
add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
add_arg('num_epochs', int, 140, "Number of epochs.")
add_arg('total_images', int, 144406, "Training image number.")
add_arg('kp_dim', int, 16, "Class number.")
add_arg('model_save_dir', str, "output", "Model save directory")
add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.")
add_arg('pretrained_model', str, None, "Whether to use pretrained model.")
add_arg('checkpoint', str, None, "Whether to resume checkpoint.")
add_arg('lr', float, 0.001, "Set learning rate.")
add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
# yapf: enable
def optimizer_setting(args, params):
lr_drop_ratio = 0.1
ls = params["learning_strategy"]
if ls["name"] == "piecewise_decay":
total_images = params["total_images"]
batch_size = ls["batch_size"]
step = int(total_images / batch_size + 1)
ls['epochs'] = [91, 121]
print('=> LR will be dropped at the epoch of {}'.format(ls['epochs']))
bd = [step * e for e in ls["epochs"]]
base_lr = params["lr"]
        lr = [base_lr * (lr_drop_ratio**i) for i in range(len(bd) + 1)]
# AdamOptimizer
optimizer = paddle.fluid.optimizer.AdamOptimizer(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr))
else:
lr = params["lr"]
optimizer = fluid.optimizer.Momentum(
learning_rate=lr,
momentum=0.9,
regularization=fluid.regularizer.L2Decay(0.0005))
return optimizer
def train(args):
if args.dataset == 'coco':
import lib.coco_reader as reader
IMAGE_SIZE = [288, 384]
HEATMAP_SIZE = [72, 96]
args.kp_dim = 17
args.total_images = 144406 # 149813
elif args.dataset == 'mpii':
import lib.mpii_reader as reader
IMAGE_SIZE = [384, 384]
HEATMAP_SIZE = [96, 96]
args.kp_dim = 16
args.total_images = 22246
else:
raise ValueError('The dataset {} is not supported yet.'.format(args.dataset))
print_arguments(args)
# Image and target
image = layers.data(name='image', shape=[3, IMAGE_SIZE[1], IMAGE_SIZE[0]], dtype='float32')
target = layers.data(name='target', shape=[args.kp_dim, HEATMAP_SIZE[1], HEATMAP_SIZE[0]], dtype='float32')
target_weight = layers.data(name='target_weight', shape=[args.kp_dim, 1], dtype='float32')
# Build model
model = pose_resnet.ResNet(layers=50, kps_num=args.kp_dim)
# Output
loss, output = model.net(input=image, target=target, target_weight=target_weight)
# Parameters from model and arguments
params = {}
params["total_images"] = args.total_images
params["lr"] = args.lr
params["num_epochs"] = args.num_epochs
params["learning_strategy"] = {}
params["learning_strategy"]["batch_size"] = args.batch_size
params["learning_strategy"]["name"] = args.lr_strategy
# Initialize optimizer
optimizer = optimizer_setting(args, params)
optimizer.minimize(loss)
if args.with_mem_opt:
fluid.memory_optimize(fluid.default_main_program(),
skip_opt_set=[loss.name, output.name, target.name])
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
args.pretrained_model = './pretrained/resnet_50/115'
if args.pretrained_model:
def if_exist(var):
exist_flag = os.path.exists(os.path.join(args.pretrained_model, var.name))
return exist_flag
fluid.io.load_vars(exe, args.pretrained_model, predicate=if_exist)
if args.checkpoint is not None:
fluid.io.load_persistables(exe, args.checkpoint)
# Dataloader
train_reader = paddle.batch(reader.train(), batch_size=args.batch_size)
feeder = fluid.DataFeeder(place=place, feed_list=[image, target, target_weight])
train_exe = fluid.ParallelExecutor(
use_cuda=True if args.use_gpu else False, loss_name=loss.name)
fetch_list = [image.name, loss.name, output.name]
for pass_id in range(params["num_epochs"]):
for batch_id, data in enumerate(train_reader()):
current_lr = np.array(paddle.fluid.global_scope().find_var('learning_rate').get_tensor())
input_image, loss, out_heatmaps = train_exe.run(
fetch_list, feed=feeder.feed(data))
loss = np.mean(np.array(loss))
            print('Epoch [{:3d}] Batch [{:4d}] LR: {:.10f} '
                  'Loss = {:.5f}'.format(
                      pass_id, batch_id, current_lr[0], loss))
if batch_id % 10 == 0:
save_batch_heatmaps(input_image, out_heatmaps, file_name='visualization@train.jpg', normalize=True)
model_path = os.path.join(args.model_save_dir + '/' + 'simplebase-{}'.format(args.dataset),
str(pass_id))
if not os.path.isdir(model_path):
os.makedirs(model_path)
fluid.io.save_persistables(exe, model_path)
if __name__ == '__main__':
args = parser.parse_args()
train(args)
# Copyright (c) 2018-present, Baidu, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
#
# Based on
# ------------------------------------------------------------------------------
# https://github.com/Microsoft/human-pose-estimation.pytorch
# Copyright (c) Microsoft
# Licensed under the MIT License.
# Written by Bin Xiao (Bin.Xiao@microsoft.com)
# ------------------------------------------------------------------------------
"""Transforms functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import cv2
def flip_back(output_flipped, matched_parts):
"""
    :param output_flipped: numpy.ndarray(batch_size, num_joints, height, width)
"""
assert output_flipped.ndim == 4,\
'output_flipped should be [batch_size, num_joints, height, width]'
output_flipped = output_flipped[:, :, :, ::-1]
for pair in matched_parts:
tmp = output_flipped[:, pair[0], :, :].copy()
output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
output_flipped[:, pair[1], :, :] = tmp
return output_flipped
def fliplr_joints(joints, joints_vis, width, matched_parts):
"""Flip coords.
"""
# Flip horizontal
joints[:, 0] = width - joints[:, 0] - 1
# Change left-right parts
for pair in matched_parts:
joints[pair[0], :], joints[pair[1], :] = \
joints[pair[1], :], joints[pair[0], :].copy()
joints_vis[pair[0], :], joints_vis[pair[1], :] = \
joints_vis[pair[1], :], joints_vis[pair[0], :].copy()
return joints*joints_vis, joints_vis
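# Quick sanity check (illustrative): flipping heatmaps twice restores the
# original, since the width reversal and the left-right joint swaps are both
# involutions acting on different axes.
#
#     heatmaps = np.random.rand(1, 16, 96, 96).astype(np.float32)
#     pairs = [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14], [12, 13]]
#     assert np.allclose(flip_back(flip_back(heatmaps, pairs), pairs), heatmaps)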
def transform_preds(coords, center, scale, output_size):
target_coords = np.zeros(coords.shape)
trans = get_affine_transform(center, scale, 0, output_size, inv=1)
for p in range(coords.shape[0]):
target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans)
return target_coords
def get_affine_transform(center,
scale,
rot,
output_size,
shift=np.array([0, 0], dtype=np.float32),
inv=0):
if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
print(scale)
scale = np.array([scale, scale])
scale_tmp = scale * 200.0
src_w = scale_tmp[0]
dst_w = output_size[0]
dst_h = output_size[1]
rot_rad = np.pi * rot / 180
src_dir = get_dir([0, src_w * -0.5], rot_rad)
dst_dir = np.array([0, dst_w * -0.5], np.float32)
src = np.zeros((3, 2), dtype=np.float32)
dst = np.zeros((3, 2), dtype=np.float32)
src[0, :] = center + scale_tmp * shift
src[1, :] = center + src_dir + scale_tmp * shift
dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
src[2:, :] = get_3rd_point(src[0, :], src[1, :])
dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])
if inv:
trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
else:
trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
return trans
def affine_transform(pt, t):
new_pt = np.array([pt[0], pt[1], 1.]).T
new_pt = np.dot(t, new_pt)
return new_pt[:2]
def get_3rd_point(a, b):
direct = a - b
return b + np.array([-direct[1], direct[0]], dtype=np.float32)
def get_dir(src_point, rot_rad):
sn, cs = np.sin(rot_rad), np.cos(rot_rad)
src_result = [0, 0]
src_result[0] = src_point[0] * cs - src_point[1] * sn
src_result[1] = src_point[0] * sn + src_point[1] * cs
return src_result
def crop(img, center, scale, output_size, rot=0):
trans = get_affine_transform(center, scale, rot, output_size)
dst_img = cv2.warpAffine(img,
trans,
(int(output_size[0]), int(output_size[1])),
flags=cv2.INTER_LINEAR)
return dst_img
# Copyright (c) 2018-present, Baidu, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
"""Utility functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import distutils.util
import numpy as np
import cv2
from pathlib import Path
def print_arguments(args):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
print("----------- Configuration Arguments -----------")
    for arg, value in sorted(vars(args).items()):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def get_max_preds(batch_heatmaps):
"""Get predictions from score maps.
heatmaps: numpy.ndarray([batch_size, num_joints, height, width])
"""
assert isinstance(batch_heatmaps, np.ndarray), \
'batch_heatmaps should be numpy.ndarray'
assert batch_heatmaps.ndim == 4, 'batch_images should be 4-ndim'
batch_size = batch_heatmaps.shape[0]
num_joints = batch_heatmaps.shape[1]
width = batch_heatmaps.shape[3]
heatmaps_reshaped = batch_heatmaps.reshape((batch_size, num_joints, -1))
idx = np.argmax(heatmaps_reshaped, 2)
maxvals = np.amax(heatmaps_reshaped, 2)
maxvals = maxvals.reshape((batch_size, num_joints, 1))
idx = idx.reshape((batch_size, num_joints, 1))
preds = np.tile(idx, (1, 1, 2)).astype(np.float32)
preds[:, :, 0] = (preds[:, :, 0]) % width
preds[:, :, 1] = np.floor((preds[:, :, 1]) / width)
pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2))
pred_mask = pred_mask.astype(np.float32)
preds *= pred_mask
return preds, maxvals
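Since the heatmaps are flattened before the argmax, the flat peak index is decoded back into coordinates as idx % width for x and idx // width for y. A tiny hedged example with a hypothetical 1x1x2x3 batch:
```
import numpy as np
hm = np.array([[[[0.1, 0.2, 0.3],
                 [0.4, 0.5, 0.9]]]])
preds, maxvals = get_max_preds(hm)
print(preds)    # [[[2., 1.]]] -- (x, y) of the peak
print(maxvals)  # [[[0.9]]]
```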
def affine_transform(pt, t):
new_pt = np.array([pt[0], pt[1], 1.]).T
new_pt = np.dot(t, new_pt)
return new_pt[:2]
def get_3rd_point(a, b):
direct = a - b
return b + np.array([-direct[1], direct[0]], dtype=np.float32)
def get_dir(src_point, rot_rad):
sn, cs = np.sin(rot_rad), np.cos(rot_rad)
src_result = [0, 0]
src_result[0] = src_point[0] * cs - src_point[1] * sn
src_result[1] = src_point[0] * sn + src_point[1] * cs
return src_result
def crop(img, center, scale, output_size, rot=0):
trans = get_affine_transform(center, scale, rot, output_size)
dst_img = cv2.warpAffine(img,
trans,
(int(output_size[0]), int(output_size[1])),
flags=cv2.INTER_LINEAR)
return dst_img
def get_affine_transform(center,
scale,
rot,
output_size,
shift=np.array([0, 0], dtype=np.float32),
inv=0):
    if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
        scale = np.array([scale, scale])
scale_tmp = scale * 200.0
src_w = scale_tmp[0]
dst_w = output_size[0]
dst_h = output_size[1]
rot_rad = np.pi * rot / 180
src_dir = get_dir([0, src_w * -0.5], rot_rad)
dst_dir = np.array([0, dst_w * -0.5], np.float32)
src = np.zeros((3, 2), dtype=np.float32)
dst = np.zeros((3, 2), dtype=np.float32)
src[0, :] = center + scale_tmp * shift
src[1, :] = center + src_dir + scale_tmp * shift
dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
src[2:, :] = get_3rd_point(src[0, :], src[1, :])
dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])
if inv:
trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
else:
trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
return trans
def transform_preds(coords, center, scale, output_size):
target_coords = np.zeros(coords.shape)
trans = get_affine_transform(center, scale, 0, output_size, inv=1)
for p in range(coords.shape[0]):
target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans)
return target_coords
def get_final_preds(args, batch_heatmaps, center, scale):
coords, maxvals = get_max_preds(batch_heatmaps)
heatmap_height = batch_heatmaps.shape[2]
heatmap_width = batch_heatmaps.shape[3]
# Post-processing
if args.post_process:
for n in range(coords.shape[0]):
for p in range(coords.shape[1]):
hm = batch_heatmaps[n][p]
px = int(math.floor(coords[n][p][0] + 0.5))
py = int(math.floor(coords[n][p][1] + 0.5))
if 1 < px < heatmap_width-1 and 1 < py < heatmap_height-1:
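                    # Nudge the peak a quarter pixel toward the higher of its two neighbours (sub-pixel refinement).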
diff = np.array([hm[py][px+1] - hm[py][px-1],
hm[py+1][px]-hm[py-1][px]])
coords[n][p] += np.sign(diff) * .25
preds = coords.copy()
# Transform back
for i in range(coords.shape[0]):
preds[i] = transform_preds(coords[i], center[i], scale[i],
[heatmap_width, heatmap_height])
return preds, maxvals
def calc_dists(preds, target, normalize):
preds = preds.astype(np.float32)
target = target.astype(np.float32)
dists = np.zeros((preds.shape[1], preds.shape[0]))
for n in range(preds.shape[0]):
for c in range(preds.shape[1]):
if target[n, c, 0] > 1 and target[n, c, 1] > 1:
normed_preds = preds[n, c, :] / normalize[n]
normed_targets = target[n, c, :] / normalize[n]
dists[c, n] = np.linalg.norm(normed_preds - normed_targets)
else:
dists[c, n] = -1
return dists
def dist_acc(dists, thr=0.5):
"""Return percentage below threshold while ignoring values with a -1.
"""
dist_cal = np.not_equal(dists, -1)
num_dist_cal = dist_cal.sum()
if num_dist_cal > 0:
return np.less(dists[dist_cal], thr).sum() * 1.0 / num_dist_cal
else:
return -1
def accuracy(output, target, hm_type='gaussian', thr=0.5):
"""
Calculate accuracy according to PCK,
but uses ground truth heatmap rather than x,y locations
First value to be returned is average accuracy across 'idxs',
followed by individual accuracies
"""
idx = list(range(output.shape[1]))
norm = 1.0
if hm_type == 'gaussian':
pred, _ = get_max_preds(output)
target, _ = get_max_preds(target)
h = output.shape[2]
w = output.shape[3]
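        # Distances are normalised by one tenth of the heatmap size, so the 0.5 threshold in dist_acc corresponds to the usual PCK tolerance.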
norm = np.ones((pred.shape[0], 2)) * np.array([h, w]) / 10
dists = calc_dists(pred, target, norm)
acc = np.zeros((len(idx) + 1))
avg_acc = 0
cnt = 0
for i in range(len(idx)):
acc[i + 1] = dist_acc(dists[idx[i]])
if acc[i + 1] >= 0:
avg_acc = avg_acc + acc[i + 1]
cnt += 1
avg_acc = avg_acc / cnt if cnt != 0 else 0
if cnt != 0:
acc[0] = avg_acc
return acc, avg_acc, cnt, pred
def save_batch_heatmaps(batch_image, batch_heatmaps, file_name, normalize=True):
"""
:param batch_image: [batch_size, channel, height, width]
    :param batch_heatmaps: [batch_size, num_joints, height, width]
:param file_name: saved file name
"""
if normalize:
min = np.array(batch_image.min(), dtype=np.float)
max = np.array(batch_image.max(), dtype=np.float)
batch_image = np.add(batch_image, -min)
batch_image = np.divide(batch_image, max - min + 1e-5)
batch_size, num_joints, \
heatmap_height, heatmap_width = batch_heatmaps.shape
grid_image = np.zeros((batch_size*heatmap_height,
(num_joints+1)*heatmap_width,
3),
dtype=np.uint8)
preds, maxvals = get_max_preds(batch_heatmaps)
for i in range(batch_size):
image = batch_image[i] * 255
image = image.clip(0, 255).astype(np.uint8)
image = image.transpose(1, 2, 0)
heatmaps = batch_heatmaps[i] * 255
heatmaps = heatmaps.clip(0, 255).astype(np.uint8)
resized_image = cv2.resize(image,
(int(heatmap_width), int(heatmap_height)))
height_begin = heatmap_height * i
height_end = heatmap_height * (i + 1)
for j in range(num_joints):
cv2.circle(resized_image,
(int(preds[i][j][0]), int(preds[i][j][1])),
1, [0, 0, 255], 1)
heatmap = heatmaps[j, :, :]
colored_heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
masked_image = colored_heatmap*0.7 + resized_image*0.3
cv2.circle(masked_image,
(int(preds[i][j][0]), int(preds[i][j][1])),
1, [0, 0, 255], 1)
width_begin = heatmap_width * (j+1)
width_end = heatmap_width * (j+2)
grid_image[height_begin:height_end, width_begin:width_end, :] = \
masked_image
grid_image[height_begin:height_end, 0:heatmap_width, :] = resized_image
cv2.imwrite(file_name, grid_image)
def save_predict_results(batch_image, batch_heatmaps, file_ids, fold_name, normalize=True):
"""
:param batch_image: [batch_size, channel, height, width]
    :param batch_heatmaps: [batch_size, num_joints, height, width]
:param fold_name: saved files in this folder
"""
save_dir = Path('./{}'.format(fold_name))
try:
save_dir.mkdir()
except OSError:
pass
if normalize:
min = np.array(batch_image.min(), dtype=np.float)
max = np.array(batch_image.max(), dtype=np.float)
batch_image = np.add(batch_image, -min)
batch_image = np.divide(batch_image, max - min + 1e-5)
batch_size, num_joints, \
heatmap_height, heatmap_width = batch_heatmaps.shape
    # e.g. preds: (32, 16, 2), maxvals: (32, 16, 1)
preds, maxvals = get_max_preds(batch_heatmaps)
    # inner/outer keypoint marker colours (BGR)
    icolor = (255, 137, 0)
    ocolor = (138, 255, 0)
for i in range(batch_size):
image = batch_image[i] * 255
image = image.clip(0, 255).astype(np.uint8)
image = image.transpose(1, 2, 0)
image = cv2.resize(image, (384, 384))
file_id = file_ids[i]
imgname = save_dir.joinpath('rendered_{}.png'.format(str(file_id).zfill(7)))
for j in range(num_joints):
x, y = preds[i][j]
cv2.circle(image, (int(x * 4), int(y * 4)), 3, icolor, -1, 16)
cv2.circle(image, (int(x * 4), int(y * 4)), 6, ocolor, 1, 16)
cv2.imwrite(str(imgname), image)
# Clean format output
def print_name_value(name_value, full_arch_name):
names = name_value.keys()
values = name_value.values()
num_values = len(name_value)
results = []
for value in values:
results.append('| {:.3f}'.format(value))
print(
'| Arch ' +
' '.join(['| {}'.format(name) for name in names]) +
' |'
)
print('|---' * (num_values+1) + '|')
print('| ' + 'SIMPLEBASE RESNET50 ' + ' '.join(results) + ' |')
class AverageMeter(object):
"""Computes and stores the average and current value.
"""
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count if self.count != 0 else 0
# Copyright (c) 2018-present, Baidu, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
"""Functions for validation."""
import os
import argparse
import functools
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from collections import OrderedDict
from scipy.io import loadmat, savemat
from lib import pose_resnet
from utils.transforms import flip_back
from utils.utility import *
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 32, "Minibatch size.")
add_arg('dataset', str, 'mpii', "Dataset")
add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
add_arg('num_epochs', int, 140, "Number of epochs.")
add_arg('total_images', int, 144406, "Training image number.")
add_arg('kp_dim', int, 16, "Class number.")
add_arg('model_save_dir', str, "output", "Model save directory")
add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.")
add_arg('pretrained_model', str, None, "Whether to use pretrained model.")
add_arg('checkpoint', str, None, "Whether to resume checkpoint.")
add_arg('lr', float, 0.001, "Set learning rate.")
add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
add_arg('flip_test', bool, True, "Flip test")
add_arg('shift_heatmap', bool, True, "Shift heatmap")
add_arg('post_process', bool, True, "Post process")
# yapf: enable
def valid(args):
if args.dataset == 'coco':
import lib.coco_reader as reader
IMAGE_SIZE = [288, 384]
HEATMAP_SIZE = [72, 96]
FLIP_PAIRS = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]]
args.kp_dim = 17
args.total_images = 144406 # 149813
elif args.dataset == 'mpii':
import lib.mpii_reader as reader
IMAGE_SIZE = [384, 384]
HEATMAP_SIZE = [96, 96]
FLIP_PAIRS = [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14], [12, 13]]
args.kp_dim = 16
args.total_images = 2958 # validation
else:
raise ValueError('The dataset {} is not supported yet.'.format(args.dataset))
print_arguments(args)
# Image and target
image = layers.data(name='image', shape=[3, IMAGE_SIZE[1], IMAGE_SIZE[0]], dtype='float32')
target = layers.data(name='target', shape=[args.kp_dim, HEATMAP_SIZE[1], HEATMAP_SIZE[0]], dtype='float32')
target_weight = layers.data(name='target_weight', shape=[args.kp_dim, 1], dtype='float32')
center = layers.data(name='center', shape=[2,], dtype='float32')
scale = layers.data(name='scale', shape=[2,], dtype='float32')
score = layers.data(name='score', shape=[1,], dtype='float32')
# Build model
model = pose_resnet.ResNet(layers=50, kps_num=args.kp_dim)
# Output
loss, output = model.net(input=image, target=target, target_weight=target_weight)
# Parameters from model and arguments
params = {}
params["total_images"] = args.total_images
params["lr"] = args.lr
params["num_epochs"] = args.num_epochs
params["learning_strategy"] = {}
params["learning_strategy"]["batch_size"] = args.batch_size
params["learning_strategy"]["name"] = args.lr_strategy
if args.with_mem_opt:
fluid.memory_optimize(fluid.default_main_program(),
skip_opt_set=[loss.name, output.name, target.name])
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
args.pretrained_model = './pretrained/resnet_50/115'
if args.pretrained_model:
def if_exist(var):
exist_flag = os.path.exists(os.path.join(args.pretrained_model, var.name))
return exist_flag
fluid.io.load_vars(exe, args.pretrained_model, predicate=if_exist)
if args.checkpoint is not None:
fluid.io.load_persistables(exe, args.checkpoint)
# Dataloader
valid_reader = paddle.batch(reader.valid(), batch_size=args.batch_size)
feeder = fluid.DataFeeder(place=place, feed_list=[image, target, target_weight, center, scale, score])
valid_exe = fluid.ParallelExecutor(
use_cuda=True if args.use_gpu else False,
main_program=fluid.default_main_program().clone(for_test=False),
loss_name=loss.name)
fetch_list = [image.name, loss.name, output.name, target.name]
# For validation
acc = AverageMeter()
idx = 0
num_samples = args.total_images
all_preds = np.zeros((num_samples, args.kp_dim, 3),
dtype=np.float32)
all_boxes = np.zeros((num_samples, 6))
for batch_id, data in enumerate(valid_reader()):
num_images = len(data)
centers = []
scales = []
scores = []
for i in range(num_images):
centers.append(data[i][3])
scales.append(data[i][4])
scores.append(data[i][5])
input_image, loss, out_heatmaps, target_heatmaps = valid_exe.run(
fetch_list=fetch_list,
feed=feeder.feed(data))
if args.flip_test:
            # Flip all the images in the same batch
            data_flipped = []
for i in range(num_images):
# Input, target, target_weight, c, s, score
                data_flipped.append((
# np.flip(input_image, 3)[i],
data[i][0][:, :, ::-1],
data[i][1],
data[i][2],
data[i][3],
data[i][4],
data[i][5]))
# Inference again
_, _, output_flipped, _ = valid_exe.run(
fetch_list=fetch_list,
                feed=feeder.feed(data_flipped))
# Flip back
output_flipped = flip_back(output_flipped, FLIP_PAIRS)
# Feature is not aligned, shift flipped heatmap for higher accuracy
if args.shift_heatmap:
output_flipped[:, :, :, 1:] = \
output_flipped.copy()[:, :, :, 0:-1]
# Aggregate
# out_heatmaps.shape: size[b, args.kp_dim, 96, 96]
out_heatmaps = (out_heatmaps + output_flipped) * 0.5
loss = np.mean(np.array(loss))
# Accuracy
_, avg_acc, cnt, pred = accuracy(out_heatmaps, target_heatmaps)
acc.update(avg_acc, cnt)
# Current center, scale, score
centers = np.array(centers)
scales = np.array(scales)
scores = np.array(scores)
preds, maxvals = get_final_preds(
args, out_heatmaps, centers, scales)
all_preds[idx:idx + num_images, :, 0:2] = preds[:, :, 0:2]
all_preds[idx:idx + num_images, :, 2:3] = maxvals
# Double check this all_boxes parts
all_boxes[idx:idx + num_images, 0:2] = centers[:, 0:2]
all_boxes[idx:idx + num_images, 2:4] = scales[:, 0:2]
all_boxes[idx:idx + num_images, 4] = np.prod(scales*200, 1)
all_boxes[idx:idx + num_images, 5] = scores
# image_path.extend(meta['image'])
idx += num_images
        print('Batch [{:4d}] '
'Loss = {:.5f} '
'Acc = {:.5f}'.format(batch_id, loss, acc.avg))
if batch_id % 10 == 0:
save_batch_heatmaps(input_image, out_heatmaps, file_name='visualization@val.jpg', normalize=True)
# Evaluate
args.DATAROOT = 'data/mpii'
args.TEST_SET = 'valid'
output_dir = ''
filenames = []
imgnums = []
image_path = []
name_values, perf_indicator = mpii_evaluate(
args, all_preds, output_dir, all_boxes, image_path,
filenames, imgnums)
print_name_value(name_values, perf_indicator)
def mpii_evaluate(cfg, preds, output_dir, *args, **kwargs):
# Convert 0-based index to 1-based index
preds = preds[:, :, 0:2] + 1.0
if output_dir:
pred_file = os.path.join(output_dir, 'pred.mat')
savemat(pred_file, mdict={'preds': preds})
if 'test' in cfg.TEST_SET:
return {'Null': 0.0}, 0.0
SC_BIAS = 0.6
threshold = 0.5
gt_file = os.path.join(cfg.DATAROOT,
'annot',
'gt_{}.mat'.format(cfg.TEST_SET))
gt_dict = loadmat(gt_file)
dataset_joints = gt_dict['dataset_joints']
jnt_missing = gt_dict['jnt_missing']
pos_gt_src = gt_dict['pos_gt_src']
headboxes_src = gt_dict['headboxes_src']
pos_pred_src = np.transpose(preds, [1, 2, 0])
head = np.where(dataset_joints == 'head')[1][0]
lsho = np.where(dataset_joints == 'lsho')[1][0]
lelb = np.where(dataset_joints == 'lelb')[1][0]
lwri = np.where(dataset_joints == 'lwri')[1][0]
lhip = np.where(dataset_joints == 'lhip')[1][0]
lkne = np.where(dataset_joints == 'lkne')[1][0]
lank = np.where(dataset_joints == 'lank')[1][0]
rsho = np.where(dataset_joints == 'rsho')[1][0]
relb = np.where(dataset_joints == 'relb')[1][0]
rwri = np.where(dataset_joints == 'rwri')[1][0]
rkne = np.where(dataset_joints == 'rkne')[1][0]
rank = np.where(dataset_joints == 'rank')[1][0]
rhip = np.where(dataset_joints == 'rhip')[1][0]
jnt_visible = 1 - jnt_missing
uv_error = pos_pred_src - pos_gt_src
uv_err = np.linalg.norm(uv_error, axis=1)
headsizes = headboxes_src[1, :, :] - headboxes_src[0, :, :]
headsizes = np.linalg.norm(headsizes, axis=0)
headsizes *= SC_BIAS
scale = np.multiply(headsizes, np.ones((len(uv_err), 1)))
scaled_uv_err = np.divide(uv_err, scale)
scaled_uv_err = np.multiply(scaled_uv_err, jnt_visible)
jnt_count = np.sum(jnt_visible, axis=1)
less_than_threshold = np.multiply((scaled_uv_err <= threshold),
jnt_visible)
PCKh = np.divide(100.*np.sum(less_than_threshold, axis=1), jnt_count)
# Save
rng = np.arange(0, 0.5+0.01, 0.01)
pckAll = np.zeros((len(rng), cfg.kp_dim))
for r in range(len(rng)):
threshold = rng[r]
less_than_threshold = np.multiply(scaled_uv_err <= threshold,
jnt_visible)
pckAll[r, :] = np.divide(100.*np.sum(less_than_threshold, axis=1),
jnt_count)
PCKh = np.ma.array(PCKh, mask=False)
PCKh.mask[6:8] = True
jnt_count = np.ma.array(jnt_count, mask=False)
jnt_count.mask[6:8] = True
jnt_ratio = jnt_count / np.sum(jnt_count).astype(np.float64)
name_value = [
('Head', PCKh[head]),
('Shoulder', 0.5 * (PCKh[lsho] + PCKh[rsho])),
('Elbow', 0.5 * (PCKh[lelb] + PCKh[relb])),
('Wrist', 0.5 * (PCKh[lwri] + PCKh[rwri])),
('Hip', 0.5 * (PCKh[lhip] + PCKh[rhip])),
('Knee', 0.5 * (PCKh[lkne] + PCKh[rkne])),
('Ankle', 0.5 * (PCKh[lank] + PCKh[rank])),
('Mean', np.sum(PCKh * jnt_ratio)),
('Mean@0.1', np.sum(pckAll[11, :] * jnt_ratio))
]
name_value = OrderedDict(name_value)
return name_value, name_value['Mean']
# TODO: coco_evaluate()
if __name__ == '__main__':
args = parser.parse_args()
valid(args)
......@@ -17,87 +17,62 @@Running sample code in this directory requires PaddlePaddle Fluid v0.14.0 and la
## Data preparation
Caltech-UCSD Birds 200 (CUB-200) is an image dataset of 200 bird species. We use it to conduct the metric learning experiments. More details of this dataset can be found at its [official website](http://www.vision.caltech.edu/visipedia/CUB-200.html). First of all, preparation of CUB-200 data can be done as:
The Stanford Online Products (SOP) dataset contains 120,053 images of 22,634 products downloaded from eBay.com. We use it to conduct the metric learning experiments. For training, 59,551 images of 11,318 classes are used, and 11,316 classes (60,502 images) are held out for testing. First of all, preparation of SOP data can be done as:
```
cd data/
sh download_cub200.sh
```
The script ```data/split.py``` is used to split the train/valid sets. In our settings, we use images from the first 100 classes (001-100) as training data while the other 100 classes are validation data. After the splitting, there are two label files which contain train and validation image labels respectively:
* *CUB200_train.txt*: label file of the CUB-200 training set, with fields in each line separated by ```SPACE```, like:
```
current_path/images/097.Orchard_Oriole/Orchard_Oriole_0021_2432168643.jpg 97
current_path/images/097.Orchard_Oriole/Orchard_Oriole_0022_549995638.jpg 97
current_path/images/097.Orchard_Oriole/Orchard_Oriole_0034_2244771004.jpg 97
current_path/images/097.Orchard_Oriole/Orchard_Oriole_0010_2501839798.jpg 97
current_path/images/097.Orchard_Oriole/Orchard_Oriole_0008_491860362.jpg 97
current_path/images/097.Orchard_Oriole/Orchard_Oriole_0015_2545116359.jpg 97
...
```
* *CUB200_val.txt*: label file of the CUB-200 validation set, with fields in each line separated by ```SPACE```, like:
```
current_path/images/154.Red_eyed_Vireo/Red_eyed_Vireo_0029_59210443.jpg 154
current_path/images/154.Red_eyed_Vireo/Red_eyed_Vireo_0021_2693953672.jpg 154
current_path/images/154.Red_eyed_Vireo/Red_eyed_Vireo_0016_2917350638.jpg 154
current_path/images/154.Red_eyed_Vireo/Red_eyed_Vireo_0027_2503540454.jpg 154
current_path/images/154.Red_eyed_Vireo/Red_eyed_Vireo_0026_2502710393.jpg 154
current_path/images/154.Red_eyed_Vireo/Red_eyed_Vireo_0022_2693134681.jpg 154
...
sh download_sop.sh
```
## Training metric learning models
To train a metric learning model, one needs to set the neural network as the backbone and the metric loss function to optimize. One example of training with triplet loss using ResNet-50 is shown below:
To train a metric learning model, one needs to set the neural network as the backbone and the metric loss function to optimize. We first train the model using softmax or [arcmargin](https://arxiv.org/abs/1801.07698) loss, and then fine-tune it using another metric learning loss, such as triplet, [quadruplet](https://arxiv.org/abs/1710.00478) or [eml](https://arxiv.org/abs/1212.6094) loss. One example of training using arcmargin loss is shown below (a hedged numpy sketch of the arcmargin logit adjustment follows the parameter list):
```
python train.py \
python train_elem.py \
--model=ResNet50 \
--lr=0.001 \
--num_epochs=120 \
--train_batch_size=256 \
--test_batch_size=50 \
--lr=0.01 \
--total_iter_num=30000 \
--use_gpu=True \
--train_batch_size=20 \
--test_batch_size=20 \
--loss_name=tripletloss \
--model_save_dir="output_tripletloss"
--pretrained_model=${path_to_pretrain_imagenet_model} \
--model_save_dir=${output_model_path} \
--loss_name=arcmargin \
--arc_scale=80.0 \
--arc_margin=0.15 \
--arc_easy_margin=False
```
**parameter introduction:**
* **model**: name of the model to use. Default: "SE_ResNeXt50_32x4d".
* **num_epochs**: the number of epochs. Default: 120.
* **batch_size**: the size of each mini-batch. Default: 256.
* **model**: name of the model to use. Default: "ResNet50".
* **train_batch_size**: the size of each training mini-batch. Default: 256.
* **test_batch_size**: the size of each testing mini-batch. Default: 50.
* **lr**: initialized learning rate. Default: 0.01.
* **total_iter_num**: total number of training iterations. Default: 30000.
* **use_gpu**: whether to use GPU or not. Default: True.
* **model_save_dir**: the directory to save trained model. Default: "output".
* **lr**: initialized learning rate. Default: 0.1.
* **pretrained_model**: model path for pretraining. Default: None.
**training log:** the log of training ResNet-50 with triplet loss looks like:
```
Pass 0, trainbatch 0, lr 9.99999974738e-05, loss_metric 0.0700866878033, loss_cls 5.23635625839, acc1 0.0, acc5 0.100000008941, time 0.16 sec
Pass 0, trainbatch 10, lr 9.99999974738e-05, loss_metric 0.0752244070172, loss_cls 5.30303478241, acc1 0.0, acc5 0.100000008941, time 0.14 sec
Pass 0, trainbatch 20, lr 9.99999974738e-05, loss_metric 0.0840565115213, loss_cls 5.41880941391, acc1 0.0, acc5 0.0333333350718, time 0.14 sec
Pass 0, trainbatch 30, lr 9.99999974738e-05, loss_metric 0.0698839947581, loss_cls 5.35385560989, acc1 0.0, acc5 0.0333333350718, time 0.14 sec
Pass 0, trainbatch 40, lr 9.99999974738e-05, loss_metric 0.0596057735384, loss_cls 5.34744024277, acc1 0.0, acc5 0.0, time 0.14 sec
Pass 0, trainbatch 50, lr 9.99999974738e-05, loss_metric 0.067836754024, loss_cls 5.37124729156, acc1 0.0, acc5 0.0333333350718, time 0.14 sec
Pass 0, trainbatch 60, lr 9.99999974738e-05, loss_metric 0.0637686774135, loss_cls 5.47412204742, acc1 0.0, acc5 0.0333333350718, time 0.14 sec
Pass 0, trainbatch 70, lr 9.99999974738e-05, loss_metric 0.0772982165217, loss_cls 5.38295936584, acc1 0.0, acc5 0.0, time 0.14 sec
Pass 0, trainbatch 80, lr 9.99999974738e-05, loss_metric 0.0861896127462, loss_cls 5.41250753403, acc1 0.0, acc5 0.0, time 0.14 sec
Pass 0, trainbatch 90, lr 9.99999974738e-05, loss_metric 0.0653102770448, loss_cls 5.53133153915, acc1 0.0, acc5 0.0, time 0.14 sec
...
```
* **model_save_dir**: the directory to save trained model. Default: "output".
* **loss_name**: loss used for training the model. Default: "softmax".
* **arc_scale**: parameter of arcmargin loss. Default: 80.0.
* **arc_margin**: parameter of arcmargin loss. Default: 0.15.
* **arc_easy_margin**: parameter of arcmargin loss. Default: False.
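Conceptually, arcmargin (ArcFace) normalizes both the features and the class weights, then adds the angular margin to the target class before scaling, i.e. the target logit cos(θ) becomes s·cos(θ+m). A minimal numpy sketch of that logit adjustment — a hypothetical helper, ignoring the easy-margin branch, not the repository's actual fluid implementation (see ```losses/arcmarginloss.py```) — might look like:
```
import numpy as np

def arcmargin_logits(cosine, label, m=0.15, s=80.0):
    # cosine: [N, C] cosine similarities between normalized features and class weights
    # label:  [N] integer class ids
    theta = np.arccos(np.clip(cosine, -1.0, 1.0))
    out = cosine.copy()
    rows = np.arange(len(label))
    out[rows, label] = np.cos(theta[rows, label] + m)  # add angular margin to the target class
    return out * s  # scale before softmax / cross-entropy
```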
## Finetuning
Finetuning is to finetune model weights in a specific task by loading pretrained weights. After initializing ```path_to_pretrain_model```, one can finetune a model as:
Finetuning is to finetune model weights in a specific task by loading pretrained weights. After training the model using softmax or arcmargin loss, one can fine-tune it using triplet, quadruplet or eml loss. One example of finetuning using eml loss is shown below:
```
python train.py \
python train_pair.py \
--model=ResNet50 \
--pretrained_model=${path_to_pretrain_model} \
--lr=0.001 \
--num_epochs=120 \
--train_batch_size=160 \
--test_batch_size=50 \
--lr=0.0001 \
--total_iter_num=100000 \
--use_gpu=True \
--train_batch_size=20 \
--test_batch_size=20 \
--loss_name=tripletloss \
--model_save_dir="output_tripletloss"
--pretrained_model=${path_to_pretrain_arcmargin_model} \
--model_save_dir=${output_model_path} \
--loss_name=eml \
--samples_each_class=2
```
## Evaluation
......@@ -105,58 +80,26 @@ Evaluation is to evaluate the performance of a trained model. One can download [
```
python eval.py \
--model=ResNet50 \
--batch_size=50 \
--pretrained_model=${path_to_pretrain_model} \
--batch_size=30 \
--loss_name=tripletloss
```
According to the evaluation configuration, the output log looks like:
```
testbatch 0, loss 17.0384693146, recall 0.133333333333, time 0.08 sec
testbatch 10, loss 15.4248628616, recall 0.2, time 0.07 sec
testbatch 20, loss 19.3986873627, recall 0.0666666666667, time 0.07 sec
testbatch 30, loss 19.8149013519, recall 0.166666666667, time 0.07 sec
testbatch 40, loss 18.7500724792, recall 0.0333333333333, time 0.07 sec
testbatch 50, loss 15.1477527618, recall 0.166666666667, time 0.07 sec
testbatch 60, loss 21.6039619446, recall 0.0666666666667, time 0.07 sec
testbatch 70, loss 16.3203811646, recall 0.1, time 0.08 sec
testbatch 80, loss 17.3300457001, recall 0.133333333333, time 0.14 sec
testbatch 90, loss 17.9943237305, recall 0.0333333333333, time 0.07 sec
testbatch 100, loss 20.4538421631, recall 0.1, time 0.07 sec
End test, test_loss 18.2126255035, test recall 0.573597359736
...
```
## Inference
Inference is used to get prediction scores or image features based on trained models.
```
python infer.py --model=ResNet50 \
--pretrained_model=${path_to_pretrain_model}
```
The output contains the learned feature for each test sample:
```
Test-0-feature: [0.1551965 0.48882252 0.3528545 ... 0.35809007 0.6210782 0.34474897]
Test-1-feature: [0.26215672 0.71406883 0.36118034 ... 0.4711366 0.6783772 0.26591945]
Test-2-feature: [0.26164916 0.46013424 0.38381338 ... 0.47984493 0.5830286 0.22124235]
Test-3-feature: [0.22502825 0.44153655 0.29287377 ... 0.45510024 0.81386226 0.21451607]
Test-4-feature: [0.27748746 0.49068335 0.28269237 ... 0.47356504 0.73254013 0.22317657]
Test-5-feature: [0.17743547 0.5232162 0.35012805 ... 0.38921246 0.80238944 0.26693743]
Test-6-feature: [0.18314484 0.4294481 0.37652573 ... 0.4795592 0.7446839 0.24178651]
Test-7-feature: [0.25836483 0.49866533 0.3469289 ... 0.38316026 0.56015515 0.22388287]
Test-8-feature: [0.30613047 0.5200348 0.2847372 ... 0.5700768 0.76645917 0.26504722]
Test-9-feature: [0.3305695 0.46257797 0.27108437 ... 0.42891273 0.5112956 0.26442713]
Test-10-feature: [0.16024818 0.46871603 0.32608703 ... 0.3341719 0.6876993 0.26097256]
Test-11-feature: [0.37611157 0.6006333 0.3023942 ... 0.4729057 0.53841203 0.19621202]
Test-12-feature: [0.17515017 0.41597834 0.45567667 ... 0.45650777 0.5987687 0.25734115]
...
python infer.py \
--model=ResNet50 \
--batch_size=1 \
--pretrained_model=${path_to_pretrain_model}
```
## Performances
For comparison, metric learning models with different neural networks and loss functions are trained using the corresponding empirical parameters. Recall@Rank-1 is used as the evaluation metric, and the performance is listed in the table below. Pretrained models can be downloaded by clicking the related model names. A short sketch of the Recall@Rank-1 computation follows the table.
|model | ResNet50 | SE-ResNeXt-50
|pretrain model | softmax | arcmargin
|- | - | -:
|[triplet loss]() | 57.36% | 51.62%
|[eml loss]() | 58.84% | 52.94%
|[quadruplet loss]() | 62.67% | 56.40%
|without finetuning | 77.42% | 78.11%
|fine-tuned with triplet | 78.37% | 79.21%
|fine-tuned with quadruplet | 78.10% | 79.59%
|fine-tuned with eml | 79.32% | 80.11%
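Recall@Rank-1 here means: for each test image, find its nearest neighbour in the embedding space (excluding itself) and count it as correct when the neighbour shares the query's label. The ```recall_topk``` helper further down implements this; the following is only a hedged numpy sketch under that definition:
```
import numpy as np

def recall_at_1(features, labels):
    labels = np.asarray(labels)
    f = features / np.linalg.norm(features, axis=1, keepdims=True)
    sq = np.sum(f ** 2, axis=1)
    d = sq[:, None] + sq[None, :] - 2.0 * f.dot(f.T)  # pairwise squared distances
    np.fill_diagonal(d, np.inf)                       # a query cannot match itself
    nn = np.argmin(d, axis=1)                         # nearest neighbour per query
    return float(np.mean(labels[nn] == labels))
```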
# this file is only used for continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi
# NOTE: kpi.py should be shared across models in some way!
train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
test_recall_kpi = AccKpi('test_recall', 0.02, 0, actived=True)
tracking_kpis = [
train_cost_kpi,
test_recall_kpi,
]
def parse_log(log):
    '''
    This method should be implemented by model developers.
    The suggestion:
    each line in the log should be "kpis\t<kpi_name>\t<kpi_value>", for example:
    "
    kpis\ttrain_cost\t1.0
    kpis\ttest_recall\t0.5
    "
    '''
for line in log.split('\n'):
fs = line.strip().split('\t')
print(fs)
if len(fs) == 3 and fs[0] == 'kpis':
kpi_name = fs[1]
kpi_value = float(fs[2])
yield kpi_name, kpi_value
def log_to_ce(log):
kpi_tracker = {}
for kpi in tracking_kpis:
kpi_tracker[kpi.name] = kpi
for (kpi_name, kpi_value) in parse_log(log):
print(kpi_name, kpi_value)
kpi_tracker[kpi_name].add_record(kpi_value)
kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
log = sys.stdin.read()
log_to_ce(log)
wget http://www.vision.caltech.edu/visipedia-data/CUB-200/images.tgz
tar zxf images.tgz
find images|grep jpg|grep -v "\._" > list.txt
python split.py
rm -rf images.tgz list.txt
wget ftp://cs.stanford.edu/cs/cvgl/Stanford_Online_Products.zip
unzip Stanford_Online_Products.zip
input = open("list.txt", "r").readlines()
fout_train = open("CUB200_train.txt", "w")
fout_valid = open("CUB200_val.txt", "w")
for i, item in enumerate(input):
label = item.strip().split("/")[-2].split(".")[0]
label = int(label)
if label <= 100:
fout = fout_train
else:
fout = fout_valid
fout.write(item.strip() + " " + str(label) + "\n")
fout_train.close()
fout_valid.close()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import time
import sys
import math
import time
import argparse
import functools
import numpy as np
import paddle
import paddle.fluid as fluid
import models
import argparse
import functools
from losses import tripletloss
from losses import quadrupletloss
from losses import emlloss
from losses.metrics import recall_topk
import reader
from utility import add_arguments, print_arguments
import math
from utility import fmt_time, recall_topk
# yapf: disable
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg('batch_size', int, 120, "Minibatch size.")
add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
add_arg('image_shape', str, "3,224,224", "Input image size.")
add_arg('with_mem_opt', bool, False, "Whether to use memory optimization or not.")
add_arg('pretrained_model', str, None, "Whether to use pretrained model.")
add_arg('model', str, "SE_ResNeXt50_32x4d", "Set the network to use.")
add_arg('loss_name', str, "emlloss", "Loss name.")
add_arg('model', str, "ResNet50", "Set the network to use.")
add_arg('embedding_size', int, 0, "Embedding size.")
add_arg('batch_size', int, 10, "Minibatch size.")
add_arg('image_shape', str, "3,224,224", "Input image size.")
add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
add_arg('with_mem_opt', bool, False, "Whether to use memory optimization or not.")
add_arg('pretrained_model', str, None, "Whether to use pretrained model.")
# yapf: enable
model_list = [m for m in dir(models) if "__" not in m]
......@@ -34,8 +36,6 @@ def eval(args):
model_name = args.model
pretrained_model = args.pretrained_model
with_memory_optimization = args.with_mem_opt
loss_name = args.loss_name
image_shape = [int(m) for m in args.image_shape.split(",")]
assert model_name in model_list, "{} is not in lists: {}".format(args.model,
......@@ -46,19 +46,8 @@ def eval(args):
# model definition
model = models.__dict__[model_name]()
out = model.net(input=image, class_dim=200)
if loss_name == "tripletloss":
metricloss = tripletloss()
cost = metricloss.loss(out[0])
elif loss_name == "quadrupletloss":
metricloss = quadrupletloss()
cost = metricloss.loss(out[0])
elif loss_name == "emlloss":
metricloss = emlloss()
cost = metricloss.loss(out[0])
avg_cost = fluid.layers.mean(x=cost)
out = model.net(input=image, embedding_size=args.embedding_size)
test_program = fluid.default_main_program().clone(for_test=True)
if with_memory_optimization:
......@@ -75,39 +64,29 @@ def eval(args):
fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
test_reader = paddle.batch(metricloss.test_reader, batch_size=args.batch_size)
test_reader = paddle.batch(reader.test(args), batch_size=args.batch_size, drop_last=False)
feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
fetch_list = [avg_cost.name, out[0].name]
fetch_list = [out.name]
test_info = [[]]
f = []
l = []
f, l = [], []
for batch_id, data in enumerate(test_reader()):
if len(data) < args.batch_size:
continue
t1 = time.time()
loss, feas = exe.run(test_program,
fetch_list=fetch_list,
feed=feeder.feed(data))
[feas] = exe.run(test_program, fetch_list=fetch_list, feed=feeder.feed(data))
label = np.asarray([x[1] for x in data])
f.append(feas)
l.append(label)
t2 = time.time()
period = t2 - t1
loss = np.mean(np.array(loss))
test_info[0].append(loss)
if batch_id % 20 == 0:
print("testbatch {0}, loss {1}, time {2}".format( \
batch_id, loss, "%2.2f sec" % period))
print("[%s] testbatch %d, time %2.2f sec" % \
(fmt_time(), batch_id, period))
test_loss = np.array(test_info[0]).mean()
f = np.vstack(f)
l = np.hstack(l)
recall = recall_topk(f, l, k=1)
print("End test, test_loss {0}, test recall {1}".format( \
test_loss, recall))
print("[%s] End test %d, test_recall %.5f" % (fmt_time(), len(f), recall))
sys.stdout.flush()
......
""" tools for processing images
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import cv2
import math
import random
import functools
import numpy as np
#random.seed(0)
def rotate_image(img):
""" rotate_image """
(h, w) = img.shape[:2]
center = (w // 2, h // 2)
angle = random.randint(-10, 10)
M = cv2.getRotationMatrix2D(center, angle, 1.0)
rotated = cv2.warpAffine(img, M, (w, h))
return rotated
def random_crop(img, size, scale=None, ratio=None):
""" random_crop """
scale = [0.08, 1.0] if scale is None else scale
ratio = [3. / 4., 4. / 3.] if ratio is None else ratio
aspect_ratio = math.sqrt(random.uniform(*ratio))
w = 1. * aspect_ratio
h = 1. / aspect_ratio
bound = min((float(img.shape[1]) / img.shape[0]) / (w ** 2),
(float(img.shape[0]) / img.shape[1]) / (h ** 2))
scale_max = min(scale[1], bound)
scale_min = min(scale[0], bound)
target_area = img.shape[0] * img.shape[1] * random.uniform(scale_min,
scale_max)
target_size = math.sqrt(target_area)
w = int(target_size * w)
h = int(target_size * h)
i = random.randint(0, img.shape[0] - h)
j = random.randint(0, img.shape[1] - w)
img = img[i:i+h, j:j+w, :]
resized = cv2.resize(img, (size, size), interpolation=cv2.INTER_LANCZOS4)
return resized
def distort_color(img):
return img
def resize_short(img, target_size):
""" resize_short """
percent = float(target_size) / min(img.shape[0], img.shape[1])
resized_width = int(round(img.shape[1] * percent))
resized_height = int(round(img.shape[0] * percent))
resized = cv2.resize(img, (resized_width, resized_height), interpolation=cv2.INTER_LANCZOS4)
return resized
def crop_image(img, target_size, center):
""" crop_image """
height, width = img.shape[:2]
size = target_size
if center == True:
w_start = (width - size) // 2
h_start = (height - size) // 2
else:
w_start = random.randint(0, width - size)
h_start = random.randint(0, height - size)
w_end = w_start + size
h_end = h_start + size
img = img[h_start:h_end, w_start:w_end, :]
return img
def process_image(sample, mode, color_jitter, rotate,
crop_size=224, mean=None, std=None):
""" process_image """
mean = [0.485, 0.456, 0.406] if mean is None else mean
std = [0.229, 0.224, 0.225] if std is None else std
image_name = sample[0]
img = cv2.imread(image_name) # BGR mode, but need RGB mode
if mode == 'train':
if rotate:
img = rotate_image(img)
if crop_size > 0:
img = random_crop(img, crop_size)
if color_jitter:
img = distort_color(img)
if random.randint(0, 1) == 1:
img = img[:, ::-1, :]
else:
if crop_size > 0:
img = resize_short(img, crop_size)
img = crop_image(img, target_size=crop_size, center=True)
img = img[:, :, ::-1].astype('float32').transpose((2, 0, 1)) / 255
img_mean = np.array(mean).reshape((3, 1, 1))
img_std = np.array(std).reshape((3, 1, 1))
img -= img_mean
img /= img_std
if mode == 'train' or mode == 'val':
return (img, sample[1])
elif mode == 'test':
return (img, )
def image_mapper(**kwargs):
""" image_mapper """
return functools.partial(process_image, **kwargs)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import time
import sys
import math
import time
import argparse
import functools
import numpy as np
import paddle
import paddle.fluid as fluid
import models
import argparse
import functools
from losses import tripletloss
import reader
from utility import add_arguments, print_arguments
import math
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 1, "Minibatch size.")
add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
add_arg('image_shape', str, "3,224,224", "Input image size.")
add_arg('with_mem_opt', bool, False, "Whether to use memory optimization or not.")
add_arg('pretrained_model', str, None, "Whether to use pretrained model.")
add_arg('model', str, "SE_ResNeXt50_32x4d", "Set the network to use.")
add_arg('model', str, "ResNet50", "Set the network to use.")
add_arg('embedding_size', int, 0, "Embedding size.")
add_arg('batch_size', int, 1, "Minibatch size.")
add_arg('image_shape', str, "3,224,224", "Input image size.")
add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
add_arg('with_mem_opt', bool, False, "Whether to use memory optimization or not.")
add_arg('pretrained_model', str, None, "Whether to use pretrained model.")
# yapf: enable
model_list = [m for m in dir(models) if "__" not in m]
......@@ -39,7 +44,8 @@ def infer(args):
# model definition
model = models.__dict__[model_name]()
out = model.net(input=image, class_dim=200)
out = model.net(input=image, embedding_size=args.embedding_size)
test_program = fluid.default_main_program().clone(for_test=True)
if with_memory_optimization:
......@@ -56,15 +62,13 @@ def infer(args):
fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
infer_reader = paddle.batch(tripletloss().infer_reader, batch_size=args.batch_size)
infer_reader = paddle.batch(reader.infer(args), batch_size=args.batch_size, drop_last=False)
feeder = fluid.DataFeeder(place=place, feed_list=[image])
fetch_list = [out[0].name]
fetch_list = [out.name]
for batch_id, data in enumerate(infer_reader()):
result = exe.run(test_program,
fetch_list=fetch_list,
feed=feeder.feed(data))
result = exe.run(test_program, fetch_list=fetch_list, feed=feeder.feed(data))
result = result[0][0].reshape(-1)
print("Test-{0}-feature: {1}".format(batch_id, result))
sys.stdout.flush()
......
from .tripletloss import tripletloss
from .quadrupletloss import quadrupletloss
from .emlloss import emlloss
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from .softmaxloss import SoftmaxLoss
from .arcmarginloss import ArcMarginLoss
from .tripletloss import TripletLoss
from .quadrupletloss import QuadrupletLoss
from .emlloss import EmlLoss
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle.fluid as fluid
class ArcMarginLoss():
def __init__(self, class_dim, margin=0.15, scale=80.0, easy_margin=False):
self.class_dim = class_dim
self.margin = margin
self.scale = scale
self.easy_margin = easy_margin
def loss(self, input, label):
out = self.arc_margin_product(input, label, self.class_dim, self.margin, self.scale, self.easy_margin)
#loss = fluid.layers.softmax_with_cross_entropy(logits=out, label=label)
out = fluid.layers.softmax(input=out)
loss = fluid.layers.cross_entropy(input=out, label=label)
return loss, out
def arc_margin_product(self, input, label, out_dim, m, s, easy_margin=False):
#input = fluid.layers.l2_normalize(input, axis=1)
input_norm = fluid.layers.sqrt(fluid.layers.reduce_sum(fluid.layers.square(input), dim=1))
input = fluid.layers.elementwise_div(input, input_norm, axis=0)
weight = fluid.layers.create_parameter(
shape=[out_dim, input.shape[1]],
dtype='float32',
name='weight_norm',
attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Xavier()))
#weight = fluid.layers.l2_normalize(weight, axis=1)
weight_norm = fluid.layers.sqrt(fluid.layers.reduce_sum(fluid.layers.square(weight), dim=1))
weight = fluid.layers.elementwise_div(weight, weight_norm, axis=0)
weight = fluid.layers.transpose(weight, perm = [1, 0])
cosine = fluid.layers.mul(input, weight)
sine = fluid.layers.sqrt(1.0 - fluid.layers.square(cosine) + 1e-6)
cos_m = math.cos(m)
sin_m = math.sin(m)
phi = cosine * cos_m - sine * sin_m
th = math.cos(math.pi - m)
mm = math.sin(math.pi - m) * m
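        # If theta + m would exceed pi (i.e. cosine <= cos(pi - m)), fall back to the linear penalty cosine - mm to keep the logit monotonic.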
if easy_margin:
phi = self.paddle_where_more_than(cosine, 0, phi, cosine)
else:
phi = self.paddle_where_more_than(cosine, th, phi, cosine-mm)
one_hot = fluid.layers.one_hot(input=label, depth=out_dim)
output = fluid.layers.elementwise_mul(one_hot, phi) + fluid.layers.elementwise_mul((1.0 - one_hot), cosine)
output = output * s
return output
def paddle_where_more_than(self, target, limit, x, y):
mask = fluid.layers.cast(x=(target>limit), dtype='float32')
output = fluid.layers.elementwise_mul(mask, x) + fluid.layers.elementwise_mul((1.0 - mask), y)
return output
import numpy as np
def recall_topk(fea, lab, k = 1):
fea = np.array(fea)
fea = fea.reshape(fea.shape[0], -1)
n = np.sqrt(np.sum(fea**2, 1)).reshape(-1, 1)
fea = fea/n
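    # Pairwise squared Euclidean distances via ||x||^2 + ||y||^2 - 2*x.y; the diagonal is pushed to a huge value so a sample cannot match itself.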
a = np.sum(fea ** 2, 1).reshape(-1, 1)
b = a.T
ab = np.dot(fea, fea.T)
d = a + b - 2*ab
d = d + np.eye(len(fea)) * 1e8
sorted_index = np.argsort(d, 1)
res = 0
for i in range(len(fea)):
pred = lab[sorted_index[i][0]]
if lab[i] == pred:
res += 1.0
res = res/len(fea)
return res
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import subprocess
import os
def get_gpu_num():
visibledevice = os.getenv('CUDA_VISIBLE_DEVICES')
if visibledevice:
devicenum = len(visibledevice.split(','))
else:
devicenum = subprocess.check_output(
[str.encode('nvidia-smi'), str.encode('-L')]).decode('utf-8').count('\n')
return devicenum
import numpy as np
import paddle
import paddle.fluid as fluid
def generate_index(batch_size, samples_each_class):
a = np.arange(0, batch_size * batch_size)
a = a.reshape(-1, batch_size)
a = np.arange(0, batch_size * batch_size) # N*N x 1
a = a.reshape(-1, batch_size) # N x N
steps = batch_size // samples_each_class
res = []
for i in range(batch_size):
......@@ -72,7 +46,3 @@ def calculate_order_dist_matrix(feature, batch_size, samples_each_class):
d = fluid.layers.gather(d, index=index_var)
d = fluid.layers.reshape(d, shape=[-1, batch_size])
return d
import os
import math
import random
import functools
import numpy as np
import paddle
from PIL import Image, ImageEnhance
random.seed(0)
DATA_DIM = 224
THREAD = 8
BUF_SIZE = 1024000
DATA_DIR = "./data/"
TRAIN_LIST = './data/CUB200_train.txt'
TEST_LIST = './data/CUB200_val.txt'
#DATA_DIR = "./data/CUB200/"
#TRAIN_LIST = './data/CUB200/CUB200_train.txt'
#TEST_LIST = './data/CUB200/CUB200_val.txt'
train_data = {}
test_data = {}
train_list = open(TRAIN_LIST, "r").readlines()
train_image_list = []
for i, item in enumerate(train_list):
path, label = item.strip().split()
label = int(label) - 1
train_image_list.append((path, label))
if label not in train_data:
train_data[label] = []
train_data[label].append(path)
test_list = open(TEST_LIST, "r").readlines()
test_image_list = []
infer_image_list = []
for i, item in enumerate(test_list):
path, label = item.strip().split()
label = int(label) - 1
test_image_list.append((path, label))
infer_image_list.append(path)
if label not in test_data:
test_data[label] = []
test_data[label].append(path)
print("train_data size:", len(train_data))
print("test_data size:", len(test_data))
print("test_data image number:", len(test_image_list))
random.shuffle(test_image_list)
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
def resize_short(img, target_size):
percent = float(target_size) / min(img.size[0], img.size[1])
resized_width = int(round(img.size[0] * percent))
resized_height = int(round(img.size[1] * percent))
img = img.resize((resized_width, resized_height), Image.BILINEAR)
return img
def Scale(img, size):
w, h = img.size
if (w <= h and w == size) or (h <= w and h == size):
return img
if w < h:
ow = size
oh = int(size * h / w)
return img.resize((ow, oh), Image.BILINEAR)
else:
oh = size
ow = int(size * w / h)
return img.resize((ow, oh), Image.BILINEAR)
def CenterCrop(img, size):
w, h = img.size
th, tw = int(size), int(size)
x1 = int(round((w - tw) / 2.))
y1 = int(round((h - th) / 2.))
return img.crop((x1, y1, x1 + tw, y1 + th))
def crop_image(img, target_size, center):
width, height = img.size
size = target_size
if center == True:
w_start = (width - size) / 2
h_start = (height - size) / 2
else:
w_start = random.randint(0, width - size)
h_start = random.randint(0, height - size)
w_end = w_start + size
h_end = h_start + size
img = img.crop((w_start, h_start, w_end, h_end))
return img
def RandomResizedCrop(img, size):
for attempt in range(10):
area = img.size[0] * img.size[1]
target_area = random.uniform(0.08, 1.0) * area
aspect_ratio = random.uniform(3. / 4, 4. / 3)
w = int(round(math.sqrt(target_area * aspect_ratio)))
h = int(round(math.sqrt(target_area / aspect_ratio)))
if random.random() < 0.5:
w, h = h, w
if w <= img.size[0] and h <= img.size[1]:
x1 = random.randint(0, img.size[0] - w)
y1 = random.randint(0, img.size[1] - h)
img = img.crop((x1, y1, x1 + w, y1 + h))
assert(img.size == (w, h))
return img.resize((size, size), Image.BILINEAR)
w = min(img.size[0], img.size[1])
i = (img.size[1] - w) // 2
j = (img.size[0] - w) // 2
img = img.crop((i, j, i+w, j+w))
img = img.resize((size, size), Image.BILINEAR)
return img
def random_crop(img, size, scale=[0.08, 1.0], ratio=[3. / 4., 4. / 3.]):
aspect_ratio = math.sqrt(random.uniform(*ratio))
w = 1. * aspect_ratio
h = 1. / aspect_ratio
bound = min((float(img.size[0]) / img.size[1]) / (w**2),
(float(img.size[1]) / img.size[0]) / (h**2))
scale_max = min(scale[1], bound)
scale_min = min(scale[0], bound)
target_area = img.size[0] * img.size[1] * random.uniform(scale_min,
scale_max)
target_size = math.sqrt(target_area)
w = int(target_size * w)
h = int(target_size * h)
i = random.randint(0, img.size[0] - w)
j = random.randint(0, img.size[1] - h)
img = img.crop((i, j, i + w, j + h))
img = img.resize((size, size), Image.BILINEAR)
return img
def rotate_image(img):
angle = random.randint(-10, 10)
img = img.rotate(angle)
return img
def distort_color(img):
def random_brightness(img, lower=0.8, upper=1.2):
e = random.uniform(lower, upper)
return ImageEnhance.Brightness(img).enhance(e)
def random_contrast(img, lower=0.8, upper=1.2):
e = random.uniform(lower, upper)
return ImageEnhance.Contrast(img).enhance(e)
def random_color(img, lower=0.8, upper=1.2):
e = random.uniform(lower, upper)
return ImageEnhance.Color(img).enhance(e)
ops = [random_brightness, random_contrast, random_color]
random.shuffle(ops)
img = ops[0](img)
img = ops[1](img)
img = ops[2](img)
return img
def process_image_imagepath(sample, mode, color_jitter, rotate):
imgpath = sample[0]
img = Image.open(imgpath)
if mode == 'train':
if rotate: img = rotate_image(img)
img = RandomResizedCrop(img, DATA_DIM)
else:
img = Scale(img, 256)
img = CenterCrop(img, DATA_DIM)
if mode == 'train':
if color_jitter:
img = distort_color(img)
if random.randint(0, 1) == 1:
img = img.transpose(Image.FLIP_LEFT_RIGHT)
if img.mode != 'RGB':
img = img.convert('RGB')
img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
img -= img_mean
img /= img_std
if mode in ['train', 'test']:
return img, sample[1]
elif mode == 'infer':
return [img]
def eml_iterator(data,
mode,
batch_size,
samples_each_class,
iter_size,
shuffle=False,
color_jitter=False,
rotate=False):
def reader():
labs = list(data.keys())
lab_num = len(labs)
ind = list(range(0, lab_num))
assert batch_size % samples_each_class == 0, "batch_size % samples_each_class != 0"
num_class = batch_size // samples_each_class
for i in range(iter_size):
random.shuffle(ind)
for n in range(num_class):
lab_ind = ind[n]
label = labs[lab_ind]
data_list = data[label]
random.shuffle(data_list)
for s in range(samples_each_class):
path = DATA_DIR + data_list[s]
yield path, label
mapper = functools.partial(
process_image_imagepath, mode=mode, color_jitter=color_jitter, rotate=rotate)
return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE, order=True)
def quadruplet_iterator(data,
mode,
class_num,
samples_each_class,
iter_size,
shuffle=False,
color_jitter=False,
rotate=False):
def reader():
labs = list(data.keys())
lab_num = len(labs)
ind = list(range(0, lab_num))
for i in range(iter_size):
random.shuffle(ind)
ind_sample = ind[:class_num]
for ind_i in ind_sample:
lab = labs[ind_i]
data_list = data[lab]
data_ind = list(range(0, len(data_list)))
random.shuffle(data_ind)
anchor_ind = data_ind[:samples_each_class]
for anchor_ind_i in anchor_ind:
anchor_path = DATA_DIR + data_list[anchor_ind_i]
yield anchor_path, lab
mapper = functools.partial(
process_image_imagepath, mode=mode, color_jitter=color_jitter, rotate=rotate)
return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE, order=True)
def triplet_iterator(data,
mode,
batch_size,
iter_size,
shuffle=False,
color_jitter=False,
rotate=False):
def reader():
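        # Yield images in (anchor, positive, negative) order; TripletLoss later reshapes the batch to [-1, 3, fea_dim] accordingly.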
labs = list(data.keys())
lab_num = len(labs)
ind = list(range(0, lab_num))
for i in range(iter_size):
random.shuffle(ind)
ind_pos, ind_neg = ind[:2]
lab_pos = labs[ind_pos]
pos_data_list = data[lab_pos]
data_ind = list(range(0, len(pos_data_list)))
random.shuffle(data_ind)
anchor_ind, pos_ind = data_ind[:2]
lab_neg = labs[ind_neg]
neg_data_list = data[lab_neg]
neg_ind = random.randint(0, len(neg_data_list) - 1)
anchor_path = DATA_DIR + pos_data_list[anchor_ind]
yield anchor_path, lab_pos
pos_path = DATA_DIR + pos_data_list[pos_ind]
yield pos_path, lab_pos
neg_path = DATA_DIR + neg_data_list[neg_ind]
yield neg_path, lab_neg
mapper = functools.partial(
process_image_imagepath, mode=mode, color_jitter=color_jitter, rotate=rotate)
return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE, order=True)
def image_iterator(data,
mode,
shuffle=False,
color_jitter=False,
rotate=False):
def test_reader():
for i in range(len(data)):
path, label = data[i]
path = DATA_DIR + path
yield path, label
def infer_reader():
for i in range(len(data)):
path = data[i]
path = DATA_DIR + path
yield [path]
if mode == "test":
mapper = functools.partial(
process_image_imagepath, mode=mode, color_jitter=color_jitter, rotate=rotate)
return paddle.reader.xmap_readers(mapper, test_reader, THREAD, BUF_SIZE)
elif mode == "infer":
mapper = functools.partial(
process_image_imagepath, mode=mode, color_jitter=color_jitter, rotate=rotate)
return paddle.reader.xmap_readers(mapper, infer_reader, THREAD, BUF_SIZE)
def eml_train(batch_size, samples_each_class):
return eml_iterator(train_data, 'train', batch_size, samples_each_class, iter_size = 100, \
shuffle=True, color_jitter=False, rotate=False)
def quadruplet_train(class_num, samples_each_class):
return quadruplet_iterator(train_data, 'train', class_num, samples_each_class, iter_size=100, \
shuffle=True, color_jitter=False, rotate=False)
def triplet_train(batch_size):
assert(batch_size % 3 == 0)
return triplet_iterator(train_data, 'train', batch_size, iter_size = batch_size//3 * 100, \
shuffle=True, color_jitter=False, rotate=False)
def test():
return image_iterator(test_image_list, "test", shuffle=False)
def infer():
return image_iterator(infer_image_list, "infer", shuffle=False)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import numpy as np
import paddle.fluid as fluid
from . import datareader as reader
from .metrics import calculate_order_dist_matrix
from .metrics import get_gpu_num
from utility import get_gpu_num
from .commonfunc import calculate_order_dist_matrix
class emlloss():
class EmlLoss():
def __init__(self, train_batch_size = 40, samples_each_class=2):
num_gpus = get_gpu_num()
self.samples_each_class = samples_each_class
self.train_batch_size = train_batch_size
num_gpus = get_gpu_num()
assert(train_batch_size % num_gpus == 0)
self.cal_loss_batch_size = train_batch_size // num_gpus
assert(self.cal_loss_batch_size % samples_each_class == 0)
class_num = train_batch_size // samples_each_class
self.train_reader = reader.eml_train(train_batch_size, samples_each_class)
self.test_reader = reader.test()
def surrogate_function(self, beta, theta, bias):
x = theta * fluid.layers.exp(bias)
......@@ -41,7 +40,10 @@ class emlloss():
def loss(self, input):
samples_each_class = self.samples_each_class
batch_size = self.cal_loss_batch_size
batch_size = self.cal_loss_batch_size
#input = fluid.layers.l2_normalize(input, axis=1)
#input_norm = fluid.layers.sqrt(fluid.layers.reduce_sum(fluid.layers.square(input), dim=1))
#input = fluid.layers.elementwise_div(input, input_norm, axis=0)
d = calculate_order_dist_matrix(input, self.cal_loss_batch_size, self.samples_each_class)
ignore, pos, neg = fluid.layers.split(d, num_or_sections= [1,
samples_each_class-1, batch_size-samples_each_class], dim=1)
......
import numpy as np
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from utility import get_gpu_num
from .commonfunc import calculate_order_dist_matrix
class QuadrupletLoss():
def __init__(self,
train_batch_size = 80,
samples_each_class = 2,
margin = 0.1):
self.margin = margin
self.samples_each_class = samples_each_class
self.train_batch_size = train_batch_size
num_gpus = get_gpu_num()
assert(train_batch_size % num_gpus == 0)
self.cal_loss_batch_size = train_batch_size // num_gpus
assert(self.cal_loss_batch_size % samples_each_class == 0)
def loss(self, input):
#input = fluid.layers.l2_normalize(input, axis=1)
input_norm = fluid.layers.sqrt(fluid.layers.reduce_sum(fluid.layers.square(input), dim=1))
input = fluid.layers.elementwise_div(input, input_norm, axis=0)
samples_each_class = self.samples_each_class
batch_size = self.cal_loss_batch_size
margin = self.margin
d = calculate_order_dist_matrix(input, self.cal_loss_batch_size, self.samples_each_class)
ignore, pos, neg = fluid.layers.split(d, num_or_sections= [1,
samples_each_class-1, batch_size-samples_each_class], dim=1)
ignore.stop_gradient = True
pos_max = fluid.layers.reduce_max(pos)
neg_min = fluid.layers.reduce_min(neg)
pos_max = fluid.layers.sqrt(pos_max)
neg_min = fluid.layers.sqrt(neg_min)
#pos_max = fluid.layers.sqrt(pos_max + 1e-6)
#neg_min = fluid.layers.sqrt(neg_min + 1e-6)
loss = fluid.layers.relu(pos_max - neg_min + margin)
return loss
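# calculate_order_dist_matrix comes from losses/commonfunc.py, which is not
# part of this diff. A minimal NumPy sketch of the layout the split above
# assumes (an assumption for illustration, not the actual implementation):
# row i holds the squared distances from sample i to the whole batch,
# reordered so that column 0 is the self-distance (the 'ignore' slice), the
# next samples_each_class-1 columns are same-class distances ('pos'), and
# the remaining columns are distances to other classes ('neg'); same-class
# samples are contiguous in the batch.
import numpy as np

def order_dist_matrix_ref(feats, batch_size, samples_each_class):
    d = ((feats[:, None, :] - feats[None, :, :]) ** 2).sum(-1)
    out = np.empty_like(d)
    for i in range(batch_size):
        cls = i // samples_each_class
        same = [j for j in range(cls * samples_each_class,
                                 (cls + 1) * samples_each_class) if j != i]
        rest = [j for j in range(batch_size) if j != i and j not in same]
        out[i] = d[i, [i] + same + rest]
    return out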
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle.fluid as fluid
class SoftmaxLoss():
def __init__(self, class_dim):
self.class_dim = class_dim
def loss(self, input, label):
out = self.fc_product(input, self.class_dim)
loss = fluid.layers.cross_entropy(input=out, label=label)
return loss, out
def fc_product(self, input, out_dim):
stdv = 1.0 / math.sqrt(input.shape[1] * 1.0)
out = fluid.layers.fc(input=input,
size=out_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv,
stdv)))
return out
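# Usage sketch (variable names hypothetical): `embedding` is the backbone
# output and `label` an int64 tensor of shape [-1, 1]; loss() returns both
# the cross-entropy cost and the softmax output used for accuracy metrics.
#   softmax_loss = SoftmaxLoss(class_dim=11318)
#   cost, logit = softmax_loss.loss(embedding, label)
#   avg_cost = fluid.layers.mean(x=cost)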
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
class TripletLoss():
def __init__(self, margin=0.1):
self.margin = margin
def loss(self, input):
margin = self.margin
fea_dim = input.shape[1] # number of channels
#input = fluid.layers.l2_normalize(input, axis=1)
input_norm = fluid.layers.sqrt(fluid.layers.reduce_sum(fluid.layers.square(input), dim=1))
input = fluid.layers.elementwise_div(input, input_norm, axis=0)
output = fluid.layers.reshape(input, shape = [-1, 3, fea_dim])
anchor, positive, negative = fluid.layers.split(output, num_or_sections = 3, dim = 1)
anchor = fluid.layers.reshape(anchor, shape = [-1, fea_dim])
......@@ -23,7 +26,7 @@ class tripletloss():
a_n = fluid.layers.square(anchor - negative)
a_p = fluid.layers.reduce_sum(a_p, dim = 1)
a_n = fluid.layers.reduce_sum(a_n, dim = 1)
a_p = fluid.layers.sqrt(a_p)
a_n = fluid.layers.sqrt(a_n)
#a_p = fluid.layers.sqrt(a_p + 1e-6)
#a_n = fluid.layers.sqrt(a_n + 1e-6)
loss = fluid.layers.relu(a_p + margin - a_n)
return loss
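# Batch layout assumed by the reshape/split above: the triplet reader emits
# flat triplets, so a batch of size 3*k stacks as
#   [a_0, p_0, n_0, a_1, p_1, n_1, ...]
# reshape(input, [-1, 3, fea_dim]) regroups each triplet along dim 1, and
# split(..., num_or_sections=3, dim=1) peels off the anchor, positive and
# negative slices; this is also why the triplet readers assert
# batch_size % 3 == 0.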
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from .resnet_embedding import ResNet50
from .resnet_embedding import ResNet101
from .resnet_embedding import ResNet152
import paddle
import paddle.fluid as fluid
import math
from paddle.fluid.param_attr import ParamAttr
__all__ = ["ResNet", "ResNet50", "ResNet101", "ResNet152"]
......@@ -22,7 +23,7 @@ class ResNet():
self.params = train_parameters
self.layers = layers
def net(self, input, embedding_size=256):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
......@@ -37,7 +38,7 @@ class ResNet():
num_filters = [64, 128, 256, 512]
conv = self.conv_bn_layer(
input=input, num_filters=64, filter_size=7, stride=2, act='relu', name="conv1")
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
......@@ -47,21 +48,26 @@ class ResNet():
for block in range(len(depth)):
for i in range(depth[block]):
if layers in [101, 152] and block == 2:
if i == 0:
conv_name="res"+str(block+2)+"a"
else:
conv_name="res"+str(block+2)+"b"+str(i)
else:
conv_name="res"+str(block+2)+chr(97+i)
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1, name=conv_name)
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
if embedding_size > 0:
embedding = fluid.layers.fc(input=pool, size=embedding_size)
return embedding
else:
return pool
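# With embedding_size > 0 the network appends a linear projection on top of
# the global average pooling and returns that embedding; with
# embedding_size == 0 (the default in the new train scripts) it returns the
# raw pooled features and leaves any further projection to the loss head
# (e.g. the fc inside SoftmaxLoss).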
def conv_bn_layer(self,
input,
......@@ -69,7 +75,8 @@ class ResNet():
filter_size,
stride=1,
groups=1,
act=None,
name=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
......@@ -78,31 +85,44 @@ class ResNet():
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False,
name=name + '.conv2d.output.1')
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
return fluid.layers.batch_norm(input=conv,
act=act,
name=bn_name+'.output.1',
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance',)
def shortcut(self, input, ch_out, stride, name):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
else:
return input
def bottleneck_block(self, input, num_filters, stride, name):
conv0 = self.conv_bn_layer(
input=input, num_filters=num_filters, filter_size=1, act='relu', name=name+"_branch2a")
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu',
name=name+"_branch2b")
conv2 = self.conv_bn_layer(
input=conv1, num_filters=num_filters * 4, filter_size=1, act=None, name=name+"_branch2c")
short = self.shortcut(input, num_filters * 4, stride, name=name + "_branch1")
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu', name=name+".add.output.5")
def ResNet50():
......
import paddle
import paddle.fluid as fluid
import math
__all__ = ["SE_ResNeXt", "SE_ResNeXt50_32x4d", "SE_ResNeXt101_32x4d", "SE_ResNeXt152_32x4d"]
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class SE_ResNeXt():
def __init__(self, layers = 50):
self.params = train_parameters
self.layers = layers
def net(self, input, class_dim = 1000):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 6, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input, num_filters=64, filter_size=7, stride=2, act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 101:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 23, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input, num_filters=64, filter_size=7, stride=2, act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 152:
cardinality = 64
reduction_ratio = 16
depth = [3, 8, 36, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input, num_filters=64, filter_size=3, stride=2, act='relu')
conv = self.conv_bn_layer(
input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
conv = self.conv_bn_layer(
input=conv, num_filters=128, filter_size=3, stride=1, act='relu')
conv = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_padding=1, \
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
cardinality=cardinality,
reduction_ratio=reduction_ratio)
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
drop = fluid.layers.dropout(x=pool, dropout_prob=0.5)
stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
out = fluid.layers.fc(input=drop,
size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv,
stdv)))
return pool, out
def shortcut(self, input, ch_out, stride):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
filter_size = 1
return self.conv_bn_layer(input, ch_out, filter_size, stride)
else:
return input
def bottleneck_block(self, input, num_filters, stride, cardinality, reduction_ratio):
conv0 = self.conv_bn_layer(
input=input, num_filters=num_filters, filter_size=1, act='relu')
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
groups=cardinality,
act='relu')
conv2 = self.conv_bn_layer(
input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
scale = self.squeeze_excitation(
input=conv2,
num_channels=num_filters * 2,
reduction_ratio=reduction_ratio)
short = self.shortcut(input, num_filters * 2, stride)
return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
def conv_bn_layer(self, input, num_filters, filter_size, stride=1, groups=1,
act=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(input=conv, act=act)
def squeeze_excitation(self, input, num_channels, reduction_ratio):
pool = fluid.layers.pool2d(
input=input, pool_size=0, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
squeeze = fluid.layers.fc(input=pool,
size=num_channels // reduction_ratio,
act='relu',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv,
stdv)))
stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
excitation = fluid.layers.fc(input=squeeze,
size=num_channels,
act='sigmoid',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(
-stdv, stdv)))
scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
return scale
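# Squeeze-and-Excitation in brief: global average pooling squeezes each
# channel to a scalar, an fc+relu bottleneck of num_channels/reduction_ratio
# units followed by an fc+sigmoid produces per-channel gates in (0, 1), and
# elementwise_mul(..., axis=0) broadcasts those gates over H and W to
# rescale the input feature map channel by channel.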
def SE_ResNeXt50_32x4d():
model = SE_ResNeXt(layers = 50)
return model
def SE_ResNeXt101_32x4d():
model = SE_ResNeXt(layers = 101)
return model
def SE_ResNeXt152_32x4d():
model = SE_ResNeXt(layers = 152)
return model
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import math
import random
import functools
import numpy as np
import paddle
from imgtool import process_image
random.seed(0)
DATA_DIR = "./data/Stanford_Online_Products/"
TRAIN_LIST = './data/Stanford_Online_Products/Ebay_train.txt'
VAL_LIST = './data/Stanford_Online_Products/Ebay_test.txt'
def init_sop(mode):
if mode == 'train':
train_data = {}
train_image_list = []
train_list = open(TRAIN_LIST, "r").readlines()
for i, item in enumerate(train_list):
items = item.strip().split()
if items[0] == 'image_id':
continue
path = items[3]
label = int(items[1]) - 1
train_image_list.append((path, label))
if label not in train_data:
train_data[label] = []
train_data[label].append(path)
random.shuffle(train_image_list)
print("{} dataset size: {}".format(mode, len(train_data)))
return train_data, train_image_list
else:
val_data = {}
val_image_list = []
test_image_list = []
val_list = open(VAL_LIST, "r").readlines()
for i, item in enumerate(val_list):
items = item.strip().split()
if items[0] == 'image_id':
continue
path = items[3]
label = int(items[1])
val_image_list.append((path, label))
test_image_list.append(path)
if label not in val_data:
val_data[label] = []
val_data[label].append(path)
print("{} dataset size: {}".format(mode, len(val_data)))
if mode == 'val':
return val_data, val_image_list
else:
return test_image_list
def common_iterator(data, settings):
batch_size = settings.train_batch_size
samples_each_class = settings.samples_each_class
assert (batch_size % samples_each_class == 0)
class_num = batch_size // samples_each_class
def train_iterator():
labs = list(data.keys())
lab_num = len(labs)
ind = list(range(0, lab_num))
while True:
random.shuffle(ind)
ind_sample = ind[:class_num]
for ind_i in ind_sample:
lab = labs[ind_i]
data_list = data[lab]
data_ind = list(range(0, len(data_list)))
random.shuffle(data_ind)
anchor_ind = data_ind[:samples_each_class]
for anchor_ind_i in anchor_ind:
anchor_path = DATA_DIR + data_list[anchor_ind_i]
yield anchor_path, lab
return train_iterator
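# Layout produced above: each batch of train_batch_size samples holds
# class_num randomly chosen classes with samples_each_class consecutive
# samples per class. The pair losses (quadruplet/eml) rely on exactly this
# grouping when they carve the distance matrix into pos/neg columns, so the
# reader must preserve this order downstream.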
def triplet_iterator(data, settings):
batch_size = settings.train_batch_size
assert (batch_size % 3 == 0)
def train_iterator():
labs = list(data.keys())
lab_num = len(labs)
ind = list(range(0, lab_num))
while True:
random.shuffle(ind)
ind_pos, ind_neg = ind[:2]
lab_pos = labs[ind_pos]
pos_data_list = data[lab_pos]
data_ind = list(range(0, len(pos_data_list)))
random.shuffle(data_ind)
anchor_ind, pos_ind = data_ind[:2]
lab_neg = labs[ind_neg]
neg_data_list = data[lab_neg]
neg_ind = random.randint(0, len(neg_data_list) - 1)
anchor_path = DATA_DIR + pos_data_list[anchor_ind]
yield anchor_path, lab_pos
pos_path = DATA_DIR + pos_data_list[pos_ind]
yield pos_path, lab_pos
neg_path = DATA_DIR + neg_data_list[neg_ind]
yield neg_path, lab_neg
return train_iterator
def arcmargin_iterator(data, settings):
def train_iterator():
while True:
for items in data:
path, label = items
path = DATA_DIR + path
yield path, label
return train_iterator
def image_iterator(data, mode):
def val_iterator():
for items in data:
path, label = items
path = DATA_DIR + path
yield path, label
def test_iterator():
for item in data:
path = item
path = DATA_DIR + path
yield [path]
if mode == 'val':
return val_iterator
else:
return test_iterator
def createreader(settings, mode):
def metric_reader():
if mode == 'train':
train_data, train_image_list = init_sop('train')
loss_name = settings.loss_name
if loss_name in ["softmax", "arcmargin"]:
return arcmargin_iterator(train_image_list, settings)()
elif loss_name == 'triplet':
return triplet_iterator(train_data, settings)()
else:
return common_iterator(train_data, settings)()
elif mode == 'val':
val_data, val_image_list = init_sop('val')
return image_iterator(val_image_list, 'val')()
else:
test_image_list = init_sop('test')
return image_iterator(test_image_list, 'test')()
image_shape = settings.image_shape.split(',')
assert(image_shape[1] == image_shape[2])
image_size = int(image_shape[2])
keep_order = (mode == 'train') and (settings.loss_name not in ['softmax', 'arcmargin'])
image_mapper = functools.partial(process_image,
mode=mode, color_jitter=False, rotate=False, crop_size=image_size)
reader = paddle.reader.xmap_readers(
image_mapper, metric_reader, 8, 1000, order=keep_order)
return reader
def train(settings):
return createreader(settings, "train")
def test(settings):
return createreader(settings, "val")
def infer(settings):
return createreader(settings, "test")
import os
import sys
import math
import time
import argparse
import functools
import numpy as np
import paddle
import paddle.fluid as fluid
import models
from losses import tripletloss
from losses import quadrupletloss
from losses import emlloss
from losses.metrics import recall_topk
from utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('train_batch_size', int, 80, "Minibatch size.")
add_arg('test_batch_size', int, 10, "Minibatch size.")
add_arg('num_epochs', int, 120, "number of epochs.")
add_arg('image_shape', str, "3,224,224", "input image size")
add_arg('model_save_dir', str, "output", "model save directory")
add_arg('with_mem_opt', bool, True,
"Whether to use memory optimization or not.")
add_arg('pretrained_model', str, None, "Whether to use pretrained model.")
add_arg('checkpoint', str, None, "Whether to resume checkpoint.")
add_arg('lr', float, 0.1, "set learning rate.")
add_arg('lr_strategy', str, "piecewise_decay",
"Set the learning rate decay strategy.")
add_arg('model', str, "SE_ResNeXt50_32x4d", "Set the network to use.")
add_arg('loss_name', str, "tripletloss", "Set the loss type to use.")
add_arg('samples_each_class', int, 2, "Samples each class.")
add_arg('margin', float, 0.1, "margin.")
add_arg('alpha', float, 0.0, "alpha.")
# yapf: enable
model_list = [m for m in dir(models) if "__" not in m]
def optimizer_setting(params):
ls = params["learning_strategy"]
assert ls["name"] == "piecewise_decay", \
"learning rate strategy must be {}, \
but got {}".format("piecewise_decay", lr["name"])
step = 10000
bd = [step * e for e in ls["epochs"]]
base_lr = params["lr"]
lr = [base_lr * (0.1 ** i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
return optimizer
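# Worked example: with the default schedule (epochs [30, 60, 90], lr 0.1)
# and step = 10000 iterations per epoch, bd = [300000, 600000, 900000] and
# lr = [0.1, 0.01, 0.001, 0.0001], i.e. the learning rate drops by 10x at
# each boundary.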
def train(args):
# parameters from arguments
model_name = args.model
checkpoint = args.checkpoint
pretrained_model = args.pretrained_model
with_memory_optimization = args.with_mem_opt
model_save_dir = args.model_save_dir
loss_name = args.loss_name
image_shape = [int(m) for m in args.image_shape.split(",")]
assert model_name in model_list, "{} is not in lists: {}".format(args.model, model_list)
image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# model definition
model = models.__dict__[model_name]()
out = model.net(input=image, class_dim=200)
if loss_name == "tripletloss":
metricloss = tripletloss(
train_batch_size = args.train_batch_size,
margin=args.margin)
cost_metric = metricloss.loss(out[0])
avg_cost_metric = fluid.layers.mean(x=cost_metric)
elif loss_name == "quadrupletloss":
metricloss = quadrupletloss(
train_batch_size = args.train_batch_size,
samples_each_class = args.samples_each_class,
margin=args.margin)
cost_metric = metricloss.loss(out[0])
avg_cost_metric = fluid.layers.mean(x=cost_metric)
elif loss_name == "emlloss":
metricloss = emlloss(
train_batch_size = args.train_batch_size,
samples_each_class = args.samples_each_class
)
cost_metric = metricloss.loss(out[0])
avg_cost_metric = fluid.layers.mean(x=cost_metric)
cost_cls = fluid.layers.cross_entropy(input=out[1], label=label)
avg_cost_cls = fluid.layers.mean(x=cost_cls)
acc_top1 = fluid.layers.accuracy(input=out[1], label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out[1], label=label, k=5)
avg_cost = avg_cost_metric + args.alpha*avg_cost_cls
test_program = fluid.default_main_program().clone(for_test=True)
# parameters from model and arguments
params = model.params
params["lr"] = args.lr
params["num_epochs"] = args.num_epochs
params["learning_strategy"]["batch_size"] = args.train_batch_size
params["learning_strategy"]["name"] = args.lr_strategy
# initialize optimizer
optimizer = optimizer_setting(params)
opts = optimizer.minimize(avg_cost)
global_lr = optimizer._global_learning_rate()
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
if checkpoint is not None:
fluid.io.load_persistables(exe, checkpoint)
if pretrained_model:
assert(checkpoint is None)
def if_exist(var):
has_var = os.path.exists(os.path.join(pretrained_model, var.name))
if has_var:
print('var: %s found' % (var.name))
return has_var
fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
train_reader = paddle.batch(metricloss.train_reader, batch_size=args.train_batch_size)
test_reader = paddle.batch(metricloss.test_reader, batch_size=args.test_batch_size)
feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
fetch_list_train = [avg_cost_metric.name, avg_cost_cls.name, acc_top1.name, acc_top5.name, global_lr.name]
fetch_list_test = [out[0].name]
if with_memory_optimization:
fluid.memory_optimize(fluid.default_main_program(), skip_opt_set=set(fetch_list_train))
for pass_id in range(params["num_epochs"]):
train_info = [[], [], [], []]
for batch_id, data in enumerate(train_reader()):
t1 = time.time()
loss_metric, loss_cls, acc1, acc5, lr = train_exe.run(fetch_list_train, feed=feeder.feed(data))
t2 = time.time()
period = t2 - t1
loss_metric = np.mean(np.array(loss_metric))
loss_cls = np.mean(np.array(loss_cls))
acc1 = np.mean(np.array(acc1))
acc5 = np.mean(np.array(acc5))
lr = np.mean(np.array(lr))
train_info[0].append(loss_metric)
train_info[1].append(loss_cls)
train_info[2].append(acc1)
train_info[3].append(acc5)
if batch_id % 10 == 0:
print("Pass {0}, trainbatch {1}, lr {2}, loss_metric {3}, loss_cls {4}, acc1 {5}, acc5 {6}, time {7}".format(pass_id, \
batch_id, lr, loss_metric, loss_cls, acc1, acc5, "%2.2f sec" % period))
train_loss_metric = np.array(train_info[0]).mean()
train_loss_cls = np.array(train_info[1]).mean()
train_acc1 = np.array(train_info[2]).mean()
train_acc5 = np.array(train_info[3]).mean()
f = []
l = []
for batch_id, data in enumerate(test_reader()):
if len(data) < args.test_batch_size:
continue
t1 = time.time()
[feas] = exe.run(test_program, fetch_list = fetch_list_test, feed=feeder.feed(data))
label = np.asarray([x[1] for x in data])
f.append(feas)
l.append(label)
t2 = time.time()
period = t2 - t1
if batch_id % 20 == 0:
print("Pass {0}, testbatch {1}, time {2}".format(pass_id, \
batch_id, "%2.2f sec" % period))
f = np.vstack(f)
l = np.hstack(l)
recall = recall_topk(f, l, k = 1)
print("End pass {0}, train_loss_metric {1}, train_loss_cls {2}, train_acc1 {3}, train_acc5 {4}, test_recall {5}".format(pass_id, \
train_loss_metric, train_loss_cls, train_acc1, train_acc5, recall))
sys.stdout.flush()
model_path = os.path.join(model_save_dir, model_name, str(pass_id))
if not os.path.isdir(model_path):
os.makedirs(model_path)
fluid.io.save_persistables(exe, model_path)
def main():
args = parser.parse_args()
print_arguments(args)
train(args)
if __name__ == '__main__':
main()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import math
import time
import logging
import argparse
import functools
import threading
import subprocess
import numpy as np
import paddle
import paddle.fluid as fluid
import models
import reader
from losses import SoftmaxLoss
from losses import ArcMarginLoss
from utility import add_arguments, print_arguments
from utility import fmt_time, recall_topk, get_gpu_num
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('model', str, "ResNet50", "Set the network to use.")
add_arg('embedding_size', int, 0, "Embedding size.")
add_arg('train_batch_size', int, 256, "Minibatch size.")
add_arg('test_batch_size', int, 50, "Minibatch size.")
add_arg('image_shape', str, "3,224,224", "input image size")
add_arg('class_dim', int, 11318, "Class number.")
add_arg('lr', float, 0.01, "set learning rate.")
add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
add_arg('lr_steps', str, "30000", "step of lr")
add_arg('total_iter_num', int, 30000, "total_iter_num")
add_arg('display_iter_step', int, 10, "display_iter_step.")
add_arg('test_iter_step', int, 1000, "test_iter_step.")
add_arg('save_iter_step', int, 1000, "save_iter_step.")
add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.")
add_arg('pretrained_model', str, None, "Whether to use pretrained model.")
add_arg('checkpoint', str, None, "Whether to resume checkpoint.")
add_arg('model_save_dir', str, "output", "model save directory")
add_arg('loss_name', str, "softmax", "Set the loss type to use.")
add_arg('arc_scale', float, 80.0, "arc scale.")
add_arg('arc_margin', float, 0.15, "arc margin.")
add_arg('arc_easy_margin', bool, False, "arc easy margin.")
add_arg('enable_ce', bool, False, "If set True, enable continuous evaluation job.")
# yapf: enable
model_list = [m for m in dir(models) if "__" not in m]
def optimizer_setting(params):
ls = params["learning_strategy"]
assert ls["name"] == "piecewise_decay", \
"learning rate strategy must be {}, \
but got {}".format("piecewise_decay", lr["name"])
bd = [int(e) for e in ls["lr_steps"].split(',')]
base_lr = params["lr"]
lr = [base_lr * (0.1 ** i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
return optimizer
def net_config(image, label, model, args, is_train):
assert args.model in model_list, "{} is not in lists: {}".format(
args.model, model_list)
out = model.net(input=image, embedding_size=args.embedding_size)
if not is_train:
return None, None, None, out
if args.loss_name == "softmax":
metricloss = SoftmaxLoss(
class_dim=args.class_dim,
)
elif args.loss_name == "arcmargin":
metricloss = ArcMarginLoss(
class_dim = args.class_dim,
margin = args.arc_margin,
scale = args.arc_scale,
easy_margin = args.arc_easy_margin,
)
cost, logit = metricloss.loss(out, label)
avg_cost = fluid.layers.mean(x=cost)
acc_top1 = fluid.layers.accuracy(input=logit, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=logit, label=label, k=5)
return avg_cost, acc_top1, acc_top5, out
def build_program(is_train, main_prog, startup_prog, args):
image_shape = [int(m) for m in args.image_shape.split(",")]
model = models.__dict__[args.model]()
with fluid.program_guard(main_prog, startup_prog):
if is_train:
queue_capacity = 64
py_reader = fluid.layers.py_reader(
capacity=queue_capacity,
shapes=[[-1] + image_shape, [-1, 1]],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
use_double_buffer=True)
image, label = fluid.layers.read_file(py_reader)
else:
image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
with fluid.unique_name.guard():
avg_cost, acc_top1, acc_top5, out = net_config(image, label, model, args, is_train)
if is_train:
params = model.params
params["lr"] = args.lr
params["learning_strategy"]["lr_steps"] = args.lr_steps
params["learning_strategy"]["name"] = args.lr_strategy
optimizer = optimizer_setting(params)
optimizer.minimize(avg_cost)
global_lr = optimizer._global_learning_rate()
"""
if not is_train:
main_prog = main_prog.clone(for_test=True)
"""
if is_train:
return py_reader, avg_cost, acc_top1, acc_top5, global_lr
else:
return out, image, label
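# Two-program pattern: the training program owns a py_reader and the
# optimizer, while evaluation is built on a separate Program that the caller
# clones with clone(for_test=True); fluid.unique_name.guard() keeps
# parameter names identical across both programs so they share the same
# variables in the executor scope.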
def train_async(args):
# parameters from arguments
logging.debug('enter train')
model_name = args.model
checkpoint = args.checkpoint
pretrained_model = args.pretrained_model
model_save_dir = args.model_save_dir
startup_prog = fluid.Program()
train_prog = fluid.Program()
tmp_prog = fluid.Program()
if args.enable_ce:
assert args.model == "ResNet50"
assert args.loss_name == "arcmargin"
np.random.seed(0)
startup_prog.random_seed = 1000
train_prog.random_seed = 1000
tmp_prog.random_seed = 1000
train_py_reader, train_cost, train_acc1, train_acc5, global_lr = build_program(
is_train=True,
main_prog=train_prog,
startup_prog=startup_prog,
args=args)
test_feas, image, label = build_program(
is_train=False,
main_prog=tmp_prog,
startup_prog=startup_prog,
args=args)
test_prog = tmp_prog.clone(for_test=True)
train_fetch_list = [global_lr.name, train_cost.name, train_acc1.name, train_acc5.name]
test_fetch_list = [test_feas.name]
if args.with_mem_opt:
fluid.memory_optimize(train_prog, skip_opt_set=set(train_fetch_list))
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
logging.debug('after run startup program')
if checkpoint is not None:
fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)
if pretrained_model:
def if_exist(var):
return os.path.exists(os.path.join(pretrained_model, var.name))
fluid.io.load_vars(
exe, pretrained_model, main_program=train_prog, predicate=if_exist)
devicenum = get_gpu_num()
assert (args.train_batch_size % devicenum) == 0
train_batch_size = args.train_batch_size // devicenum
test_batch_size = args.test_batch_size
train_reader = paddle.batch(reader.train(args), batch_size=train_batch_size, drop_last=True)
test_reader = paddle.batch(reader.test(args), batch_size=test_batch_size, drop_last=False)
test_feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
train_py_reader.decorate_paddle_reader(train_reader)
train_exe = fluid.ParallelExecutor(
main_program=train_prog,
use_cuda=args.use_gpu,
loss_name=train_cost.name)
totalruntime = 0
train_py_reader.start()
iter_no = 0
train_info = [0, 0, 0, 0]
while iter_no <= args.total_iter_num:
t1 = time.time()
lr, loss, acc1, acc5 = train_exe.run(fetch_list=train_fetch_list)
t2 = time.time()
period = t2 - t1
lr = np.mean(np.array(lr))
train_info[0] += np.mean(np.array(loss))
train_info[1] += np.mean(np.array(acc1))
train_info[2] += np.mean(np.array(acc5))
train_info[3] += 1
if iter_no % args.display_iter_step == 0:
avgruntime = totalruntime / args.display_iter_step
avg_loss = train_info[0] / train_info[3]
avg_acc1 = train_info[1] / train_info[3]
avg_acc5 = train_info[2] / train_info[3]
print("[%s] trainbatch %d, lr %.6f, loss %.6f, "\
"acc1 %.4f, acc5 %.4f, time %2.2f sec" % \
(fmt_time(), iter_no, lr, avg_loss, avg_acc1, avg_acc5, avgruntime))
sys.stdout.flush()
totalruntime = 0
if iter_no % 1000 == 0:
train_info = [0, 0, 0, 0]
totalruntime += period
if iter_no % args.test_iter_step == 0 and iter_no != 0:
f, l = [], []
for batch_id, data in enumerate(test_reader()):
t1 = time.time()
[feas] = exe.run(test_prog, fetch_list = test_fetch_list, feed=test_feeder.feed(data))
label = np.asarray([x[1] for x in data])
f.append(feas)
l.append(label)
t2 = time.time()
period = t2 - t1
if batch_id % 20 == 0:
print("[%s] testbatch %d, time %2.2f sec" % \
(fmt_time(), batch_id, period))
f = np.vstack(f)
l = np.hstack(l)
recall = recall_topk(f, l, k=1)
print("[%s] test_img_num %d, trainbatch %d, test_recall %.5f" % \
(fmt_time(), len(f), iter_no, recall))
sys.stdout.flush()
if iter_no % args.save_iter_step == 0 and iter_no != 0:
model_path = os.path.join(model_save_dir, model_name, str(iter_no))
if not os.path.isdir(model_path):
os.makedirs(model_path)
fluid.io.save_persistables(exe, model_path, main_program=train_prog)
iter_no += 1
# This is for continuous evaluation only
if args.enable_ce:
# Use the mean cost/acc for training
print("kpis train_cost %s" % (avg_loss))
print("kpis test_recall %s" % (recall))
def initlogging():
for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler)
loglevel = logging.DEBUG
logging.basicConfig(
level=loglevel,
# logger.BASIC_FORMAT,
format=
"%(levelname)s:%(filename)s[%(lineno)s] %(name)s:%(funcName)s->%(message)s",
datefmt='%a, %d %b %Y %H:%M:%S')
def main():
args = parser.parse_args()
print_arguments(args)
train_async(args)
if __name__ == '__main__':
main()
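# recall_topk is imported from utility, which is not part of this diff. A
# plain NumPy sketch of the Recall@K it is assumed to compute: a query
# counts as a hit if any of its k nearest neighbours (excluding itself)
# shares the query's label.
import numpy as np

def recall_topk_ref(fea, lab, k=1):
    d = ((fea[:, None, :] - fea[None, :, :]) ** 2).sum(-1)  # pairwise squared L2
    np.fill_diagonal(d, np.inf)           # never match a query with itself
    nn = np.argsort(d, axis=1)[:, :k]     # k nearest neighbours per query
    hits = [(lab[row] == lab[i]).any() for i, row in enumerate(nn)]
    return float(np.mean(hits))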
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import math
import time
import logging
import argparse
import functools
import threading
import subprocess
import numpy as np
import paddle
import paddle.fluid as fluid
import models
import reader
from losses import TripletLoss
from losses import QuadrupletLoss
from losses import EmlLoss
from utility import add_arguments, print_arguments
from utility import fmt_time, recall_topk, get_gpu_num
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('model', str, "ResNet50", "Set the network to use.")
add_arg('embedding_size', int, 0, "Embedding size.")
add_arg('train_batch_size', int, 120, "Minibatch size.")
add_arg('test_batch_size', int, 50, "Minibatch size.")
add_arg('image_shape', str, "3,224,224", "input image size")
add_arg('class_dim', int, 11318, "Class number.")
add_arg('lr', float, 0.0001, "set learning rate.")
add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
add_arg('lr_steps', str, "100000", "step of lr")
add_arg('total_iter_num', int, 100000, "total_iter_num")
add_arg('display_iter_step', int, 10, "display_iter_step.")
add_arg('test_iter_step', int, 5000, "test_iter_step.")
add_arg('save_iter_step', int, 5000, "save_iter_step.")
add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.")
add_arg('pretrained_model', str, None, "Whether to use pretrained model.")
add_arg('checkpoint', str, None, "Whether to resume checkpoint.")
add_arg('model_save_dir', str, "output", "model save directory")
add_arg('loss_name', str, "triplet", "Set the loss type to use.")
add_arg('samples_each_class', int, 2, "samples_each_class.")
add_arg('margin', float, 0.1, "margin.")
# yapf: enable
model_list = [m for m in dir(models) if "__" not in m]
def optimizer_setting(params):
ls = params["learning_strategy"]
assert ls["name"] == "piecewise_decay", \
"learning rate strategy must be {}, \
but got {}".format("piecewise_decay", lr["name"])
bd = [int(e) for e in ls["lr_steps"].split(',')]
base_lr = params["lr"]
lr = [base_lr * (0.1 ** i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
return optimizer
def net_config(image, label, model, args, is_train):
assert args.model in model_list, "{} is not in lists: {}".format(
args.model, model_list)
out = model.net(input=image, embedding_size=args.embedding_size)
if not is_train:
return None, out
if args.loss_name == "triplet":
metricloss = TripletLoss(
margin=args.margin,
)
elif args.loss_name == "quadruplet":
metricloss = QuadrupletLoss(
train_batch_size = args.train_batch_size,
samples_each_class = args.samples_each_class,
margin=args.margin,
)
elif args.loss_name == "eml":
metricloss = EmlLoss(
train_batch_size = args.train_batch_size,
samples_each_class = args.samples_each_class,
)
cost = metricloss.loss(out)
avg_cost = fluid.layers.mean(x=cost)
return avg_cost, out
def build_program(is_train, main_prog, startup_prog, args):
image_shape = [int(m) for m in args.image_shape.split(",")]
model = models.__dict__[args.model]()
with fluid.program_guard(main_prog, startup_prog):
if is_train:
queue_capacity = 64
py_reader = fluid.layers.py_reader(
capacity=queue_capacity,
shapes=[[-1] + image_shape, [-1, 1]],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
use_double_buffer=True)
image, label = fluid.layers.read_file(py_reader)
else:
image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
with fluid.unique_name.guard():
avg_cost, out = net_config(image, label, model, args, is_train)
if is_train:
params = model.params
params["lr"] = args.lr
params["learning_strategy"]["lr_steps"] = args.lr_steps
params["learning_strategy"]["name"] = args.lr_strategy
optimizer = optimizer_setting(params)
optimizer.minimize(avg_cost)
global_lr = optimizer._global_learning_rate()
"""
if not is_train:
main_prog = main_prog.clone(for_test=True)
"""
if is_train:
return py_reader, avg_cost, global_lr, out, label
else:
return out, image, label
def train_async(args):
# parameters from arguments
logging.debug('enter train')
model_name = args.model
checkpoint = args.checkpoint
pretrained_model = args.pretrained_model
model_save_dir = args.model_save_dir
startup_prog = fluid.Program()
train_prog = fluid.Program()
tmp_prog = fluid.Program()
train_py_reader, train_cost, global_lr, train_feas, train_label = build_program(
is_train=True,
main_prog=train_prog,
startup_prog=startup_prog,
args=args)
test_feas, image, label = build_program(
is_train=False,
main_prog=tmp_prog,
startup_prog=startup_prog,
args=args)
test_prog = tmp_prog.clone(for_test=True)
train_fetch_list = [global_lr.name, train_cost.name, train_feas.name, train_label.name]
test_fetch_list = [test_feas.name]
if args.with_mem_opt:
fluid.memory_optimize(train_prog, skip_opt_set=set(train_fetch_list))
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
logging.debug('after run startup program')
if checkpoint is not None:
fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)
if pretrained_model:
def if_exist(var):
return os.path.exists(os.path.join(pretrained_model, var.name))
fluid.io.load_vars(
exe, pretrained_model, main_program=train_prog, predicate=if_exist)
devicenum = get_gpu_num()
assert (args.train_batch_size % devicenum) == 0
train_batch_size = args.train_batch_size // devicenum
test_batch_size = args.test_batch_size
train_reader = paddle.batch(reader.train(args), batch_size=train_batch_size, drop_last=True)
test_reader = paddle.batch(reader.test(args), batch_size=test_batch_size, drop_last=False)
test_feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
train_py_reader.decorate_paddle_reader(train_reader)
train_exe = fluid.ParallelExecutor(
main_program=train_prog,
use_cuda=args.use_gpu,
loss_name=train_cost.name)
totalruntime = 0
train_py_reader.start()
iter_no = 0
train_info = [0, 0, 0]
while iter_no <= args.total_iter_num:
t1 = time.time()
lr, loss, feas, label = train_exe.run(fetch_list=train_fetch_list)
t2 = time.time()
period = t2 - t1
lr = np.mean(np.array(lr))
train_info[0] += np.mean(np.array(loss))
train_info[1] += recall_topk(feas, label, k=1)
train_info[2] += 1
if iter_no % args.display_iter_step == 0:
avgruntime = totalruntime / args.display_iter_step
avg_loss = train_info[0] / train_info[2]
avg_recall = train_info[1] / train_info[2]
print("[%s] trainbatch %d, lr %.6f, loss %.6f, "\
"recall %.4f, time %2.2f sec" % \
(fmt_time(), iter_no, lr, avg_loss, avg_recall, avgruntime))
sys.stdout.flush()
totalruntime = 0
if iter_no % 1000 == 0:
train_info = [0, 0, 0]
totalruntime += period
if iter_no % args.test_iter_step == 0 and iter_no != 0:
f, l = [], []
for batch_id, data in enumerate(test_reader()):
t1 = time.time()
[feas] = exe.run(test_prog, fetch_list = test_fetch_list, feed=test_feeder.feed(data))
label = np.asarray([x[1] for x in data])
f.append(feas)
l.append(label)
t2 = time.time()
period = t2 - t1
if batch_id % 20 == 0:
print("[%s] testbatch %d, time %2.2f sec" % \
(fmt_time(), batch_id, period))
f = np.vstack(f)
l = np.hstack(l)
recall = recall_topk(f, l, k=1)
print("[%s] test_img_num %d, trainbatch %d, test_recall %.5f" % \
(fmt_time(), len(f), iter_no, recall))
sys.stdout.flush()
if iter_no % args.save_iter_step == 0 and iter_no != 0:
model_path = os.path.join(model_save_dir, model_name, str(iter_no))
if not os.path.isdir(model_path):
os.makedirs(model_path)
fluid.io.save_persistables(exe, model_path, main_program=train_prog)
iter_no += 1
def initlogging():
for handler in logging.root.handlers[:]:
logging.root.removeHandler(handler)
loglevel = logging.DEBUG
logging.basicConfig(
level=loglevel,
# logger.BASIC_FORMAT,
format=
"%(levelname)s:%(filename)s[%(lineno)s] %(name)s:%(funcName)s->%(message)s",
datefmt='%a, %d %b %Y %H:%M:%S')
def main():
args = parser.parse_args()
print_arguments(args)
train_async(args)
if __name__ == '__main__':
main()