diff --git a/fluid/PaddleCV/metric_learning/.run_ce.sh b/fluid/PaddleCV/metric_learning/.run_ce.sh
new file mode 100755
index 0000000000000000000000000000000000000000..6565109568127fd29e5fb8ad962fa160168c726f
--- /dev/null
+++ b/fluid/PaddleCV/metric_learning/.run_ce.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+
+
+cudaid=${metric_learning:=0} # use card 0 as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true python train_elem.py --model=ResNet50 --train_batch_size=80 --test_batch_size=80 --lr=0.01 --total_iter_num=10 --use_gpu=True --model_save_dir=out_put --loss_name=arcmargin --arc_scale=80.0 --arc_margin=0.15 --arc_easy_margin=False --enable_ce=True | python _ce.py
+
+
+cudaid=${metric_learning_4:=0,1,2,3} # use cards 0,1,2,3 as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true python train_elem.py --model=ResNet50 --train_batch_size=80 --test_batch_size=80 --lr=0.01 --total_iter_num=10 --use_gpu=True --model_save_dir=out_put --loss_name=arcmargin --arc_scale=80.0 --arc_margin=0.15 --arc_easy_margin=False --enable_ce=True | python _ce.py
+
+
+cudaid=${metric_learning_8:=0,1,2,3,4,5,6,7} # use cards 0-7 as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true python train_elem.py --model=ResNet50 --train_batch_size=80 --test_batch_size=80 --lr=0.01 --total_iter_num=10 --use_gpu=True --model_save_dir=out_put --loss_name=arcmargin --arc_scale=80.0 --arc_margin=0.15 --arc_easy_margin=False --enable_ce=True | python _ce.py
+
diff --git a/fluid/PaddleCV/metric_learning/__init__.py b/fluid/PaddleCV/metric_learning/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/fluid/PaddleCV/metric_learning/_ce.py b/fluid/PaddleCV/metric_learning/_ce.py
index ad1d8e4b7eae8b51336a713c9fa40a108bdfa438..4d250efc46172d9a3b24f57851d7c574fb99298d 100644
--- a/fluid/PaddleCV/metric_learning/_ce.py
+++ b/fluid/PaddleCV/metric_learning/_ce.py
@@ -3,18 +3,25 @@
 import os
 import sys
 sys.path.append(os.environ['ceroot'])
-from kpi import CostKpi, DurationKpi, AccKpi
+from kpi import CostKpi
+from kpi import DurationKpi
 
-# NOTE kpi.py should shared in models in some way!!!!
-train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
-test_recall_kpi = AccKpi('test_recall', 0.02, 0, actived=True)
+each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.08, 0, actived=True)
+train_avg_loss_card1_kpi = CostKpi('train_avg_loss_card1', 0.08, 0)
+each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.08, 0, actived=True)
+train_avg_loss_card4_kpi = CostKpi('train_avg_loss_card4', 0.08, 0)
+each_pass_duration_card8_kpi = DurationKpi('each_pass_duration_card8', 0.08, 0, actived=True)
+train_avg_loss_card8_kpi = CostKpi('train_avg_loss_card8', 0.08, 0)
 
 tracking_kpis = [
-    train_cost_kpi,
-    test_recall_kpi,
-]
-
+    each_pass_duration_card1_kpi,
+    train_avg_loss_card1_kpi,
+    each_pass_duration_card4_kpi,
+    train_avg_loss_card4_kpi,
+    each_pass_duration_card8_kpi,
+    train_avg_loss_card8_kpi,
+    ]
 
 def parse_log(log):
     '''
@@ -55,4 +62,3 @@ def log_to_ce(log):
 if __name__ == '__main__':
     log = sys.stdin.read()
     log_to_ce(log)
-
diff --git a/fluid/PaddleCV/metric_learning/imgtool_ce.py b/fluid/PaddleCV/metric_learning/imgtool_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..b66ef12e6c7f08516cb349bdd25a6f5449a30d0d
--- /dev/null
+++ b/fluid/PaddleCV/metric_learning/imgtool_ce.py
@@ -0,0 +1,124 @@
+""" Tools for processing images (deterministic variants for continuous evaluation).
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import cv2
+import math
+import random
+import functools
+import numpy as np
+
+#random.seed(0)
+
+def rotate_image(img):
+    """ Rotate img around its center; the angle is fixed to 0 for CE. """
+    (h, w) = img.shape[:2]
+    center = (w // 2, h // 2)
+    #angle = random.randint(-10, 10)
+    angle = 0
+    M = cv2.getRotationMatrix2D(center, angle, 1.0)
+    rotated = cv2.warpAffine(img, M, (w, h))
+    return rotated
+
+def random_crop(img, size, scale=None, ratio=None):
+    """ Deterministic crop: fixed aspect ratio and a centered window. """
+    scale = [0.08, 1.0] if scale is None else scale
+    ratio = [3. / 4., 4. / 3.] if ratio is None else ratio
+
+    #aspect_ratio = math.sqrt(random.uniform(*ratio))
+    aspect_ratio = math.sqrt(1.)
+    w = 1. * aspect_ratio
+    h = 1. / aspect_ratio
+
+    bound = min((float(img.shape[1]) / img.shape[0]) / (w ** 2),
+                (float(img.shape[0]) / img.shape[1]) / (h ** 2))
+    scale_max = min(scale[1], bound)
+    scale_min = min(scale[0], bound)
+
+    #target_area = img.shape[0] * img.shape[1] * random.uniform(scale_min,
+    #                                                           scale_max)
+    target_area = img.shape[0] * img.shape[1] * (scale_min + scale_max) / 2.
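+    # CE determinism: the midpoint of [scale_min, scale_max] above stands in for the
+    # commented-out random.uniform draw, so every run selects the same crop area.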
+    target_size = math.sqrt(target_area)
+    w = int(target_size * w)
+    h = int(target_size * h)
+
+    #i = random.randint(0, img.shape[0] - h)
+    #j = random.randint(0, img.shape[1] - w)
+    i = int(img.shape[0] - h) // 2
+    j = int(img.shape[1] - w) // 2
+
+    img = img[i:i+h, j:j+w, :]
+    resized = cv2.resize(img, (size, size), interpolation=cv2.INTER_LANCZOS4)
+    return resized
+
+def distort_color(img):
+    return img  # color jitter is disabled for CE
+
+def resize_short(img, target_size):
+    """ Resize img so that its shorter side equals target_size. """
+    percent = float(target_size) / min(img.shape[0], img.shape[1])
+    resized_width = int(round(img.shape[1] * percent))
+    resized_height = int(round(img.shape[0] * percent))
+    resized = cv2.resize(img, (resized_width, resized_height), interpolation=cv2.INTER_LANCZOS4)
+    return resized
+
+def crop_image(img, target_size, center):
+    """ Crop a target_size x target_size patch; always centered for CE. """
+    height, width = img.shape[:2]
+    size = target_size
+    if center:
+        w_start = (width - size) // 2
+        h_start = (height - size) // 2
+    else:
+        #w_start = random.randint(0, width - size)
+        #h_start = random.randint(0, height - size)
+        w_start = (width - size) // 2
+        h_start = (height - size) // 2
+    w_end = w_start + size
+    h_end = h_start + size
+    img = img[h_start:h_end, w_start:w_end, :]
+    return img
+
+def process_image(sample, mode, color_jitter, rotate,
+                  crop_size=224, mean=None, std=None):
+    """ Decode, crop, flip and normalize one image sample. """
+
+    mean = [0.485, 0.456, 0.406] if mean is None else mean
+    std = [0.229, 0.224, 0.225] if std is None else std
+
+    image_name = sample[0]
+    img = cv2.imread(image_name)  # BGR mode, but need RGB mode
+
+    if mode == 'train':
+        if rotate:
+            img = rotate_image(img)
+        if crop_size > 0:
+            img = random_crop(img, crop_size)
+        if color_jitter:
+            img = distort_color(img)
+        #if random.randint(0, 1) == 1:
+        if random.randint(0, 1) in [0, 1]:  # always true: every image is mirrored for CE
+            img = img[:, ::-1, :]
+    else:
+        if crop_size > 0:
+            img = resize_short(img, crop_size)
+            img = crop_image(img, target_size=crop_size, center=True)
+
+    img = img[:, :, ::-1].astype('float32').transpose((2, 0, 1)) / 255
+
+    img_mean = np.array(mean).reshape((3, 1, 1))
+    img_std = np.array(std).reshape((3, 1, 1))
+    img -= img_mean
+    img /= img_std
+
+    if mode == 'train' or mode == 'val':
+        return (img, sample[1])
+    elif mode == 'test':
+        return (img, )
+
+def image_mapper(**kwargs):
+    """ Return a partial of process_image with the given options bound. """
+    return functools.partial(process_image, **kwargs)
diff --git a/fluid/PaddleCV/metric_learning/reader_ce.py b/fluid/PaddleCV/metric_learning/reader_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..06e37c8ff4cf3e0899012f8b49e310c56167a373
--- /dev/null
+++ b/fluid/PaddleCV/metric_learning/reader_ce.py
@@ -0,0 +1,178 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import math
+import random
+import functools
+import numpy as np
+import paddle
+from imgtool_ce import process_image
+
+random.seed(0)
+
+DATA_DIR = "./data/Stanford_Online_Products/"
+TRAIN_LIST = './data/Stanford_Online_Products/Ebay_train.txt'
+VAL_LIST = './data/Stanford_Online_Products/Ebay_test.txt'
+
+
+def init_sop(mode):
+    if mode == 'train':
+        train_data = {}
+        train_image_list = []
+        train_list = open(TRAIN_LIST, "r").readlines()
+        for i, item in enumerate(train_list):
+            items = item.strip().split()
+            if items[0] == 'image_id':
+                continue
+            path = items[3]
+            label = int(items[1]) - 1
+            train_image_list.append((path, label))
+            if label not in train_data:
+                train_data[label] = []
+            train_data[label].append(path)
+        #random.shuffle(train_image_list)
+        print("{} dataset size: {}".format(mode, len(train_data)))
dataset size: {}".format(mode, len(train_data))) + return train_data, train_image_list + else: + val_data = {} + val_image_list = [] + test_image_list = [] + val_list = open(VAL_LIST, "r").readlines() + for i, item in enumerate(val_list): + items = item.strip().split() + if items[0] == 'image_id': + continue + path = items[3] + label = int(items[1]) + val_image_list.append((path, label)) + test_image_list.append(path) + if label not in val_data: + val_data[label] = [] + val_data[label].append(path) + print("{} dataset size: {}".format(mode, len(val_data))) + if mode == 'val': + return val_data, val_image_list + else: + return test_image_list + +def common_iterator(data, settings): + batch_size = settings.train_batch_size + samples_each_class = settings.samples_each_class + assert (batch_size % samples_each_class == 0) + class_num = batch_size // samples_each_class + def train_iterator(): + labs = list(data.keys()) + lab_num = len(labs) + ind = list(range(0, lab_num)) + while True: + #random.shuffle(ind) + ind_sample = ind[:class_num] + for ind_i in ind_sample: + lab = labs[ind_i] + data_list = data[lab] + data_ind = list(range(0, len(data_list))) + #random.shuffle(data_ind) + anchor_ind = data_ind[:samples_each_class] + + for anchor_ind_i in anchor_ind: + anchor_path = DATA_DIR + data_list[anchor_ind_i] + yield anchor_path, lab + + return train_iterator + +def triplet_iterator(data, settings): + batch_size = settings.train_batch_size + assert (batch_size % 3 == 0) + def train_iterator(): + labs = list(data.keys()) + lab_num = len(labs) + ind = list(range(0, lab_num)) + while True: + #random.shuffle(ind) + ind_pos, ind_neg = ind[:2] + lab_pos = labs[ind_pos] + pos_data_list = data[lab_pos] + data_ind = list(range(0, len(pos_data_list))) + #random.shuffle(data_ind) + anchor_ind, pos_ind = data_ind[:2] + + lab_neg = labs[ind_neg] + neg_data_list = data[lab_neg] + #neg_ind = random.randint(0, len(neg_data_list) - 1) + neg_ind = 1 + + anchor_path = DATA_DIR + pos_data_list[anchor_ind] + yield anchor_path, lab_pos + pos_path = DATA_DIR + pos_data_list[pos_ind] + yield pos_path, lab_pos + neg_path = DATA_DIR + neg_data_list[neg_ind] + yield neg_path, lab_neg + + return train_iterator + +def arcmargin_iterator(data, settings): + def train_iterator(): + while True: + for items in data: + path, label = items + path = DATA_DIR + path + yield path, label + return train_iterator + +def image_iterator(data, mode): + def val_iterator(): + for items in data: + path, label = items + path = DATA_DIR + path + yield path, label + def test_iterator(): + for item in data: + path = item + path = DATA_DIR + path + yield [path] + if mode == 'val': + return val_iterator + else: + return test_iterator + +def createreader(settings, mode): + def metric_reader(): + if mode == 'train': + train_data, train_image_list = init_sop('train') + loss_name = settings.loss_name + if loss_name in ["softmax", "arcmargin"]: + return arcmargin_iterator(train_image_list, settings)() + elif loss_name == 'triplet': + return triplet_iterator(train_data, settings)() + else: + return common_iterator(train_data, settings)() + elif mode == 'val': + val_data, val_image_list = init_sop('val') + return image_iterator(val_image_list, 'val')() + else: + test_image_list = init_sop('test') + return image_iterator(test_image_list, 'test')() + + image_shape = settings.image_shape.split(',') + assert(image_shape[1] == image_shape[2]) + image_size = int(image_shape[2]) + #keep_order = False if mode != 'train' or settings.loss_name in ['softmax', 
+    keep_order = True
+    image_mapper = functools.partial(process_image,
+        mode=mode, color_jitter=False, rotate=False, crop_size=image_size)
+    reader = paddle.reader.xmap_readers(
+        image_mapper, metric_reader, 8, 1000, order=keep_order)
+    return reader
+
+
+def train(settings):
+    return createreader(settings, "train")
+
+def test(settings):
+    return createreader(settings, "val")
+
+def infer(settings):
+    return createreader(settings, "test")
diff --git a/fluid/PaddleCV/metric_learning/run.xsh b/fluid/PaddleCV/metric_learning/run.xsh
new file mode 100755
index 0000000000000000000000000000000000000000..7c93b526f82a5666cd56bea9cf1fba4186618809
--- /dev/null
+++ b/fluid/PaddleCV/metric_learning/run.xsh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+./.run_ce.sh
diff --git a/fluid/PaddleCV/metric_learning/train_elem.py b/fluid/PaddleCV/metric_learning/train_elem.py
index 126a1ff1201301fa7cd3f8be924554c5ded22bdc..3e8a3f1aa8eba4164dac2e763eb4ab0c0544af36 100644
--- a/fluid/PaddleCV/metric_learning/train_elem.py
+++ b/fluid/PaddleCV/metric_learning/train_elem.py
@@ -194,6 +194,10 @@ def train_async(args):
 
     train_reader = paddle.batch(reader.train(args), batch_size=train_batch_size, drop_last=True)
     test_reader = paddle.batch(reader.test(args), batch_size=test_batch_size, drop_last=False)
+    if args.enable_ce:
+        import reader_ce
+        train_reader = paddle.batch(reader_ce.train(args), batch_size=train_batch_size, drop_last=False)
+        test_reader = paddle.batch(reader_ce.test(args), batch_size=test_batch_size, drop_last=False)
     test_feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
 
     train_py_reader.decorate_paddle_reader(train_reader)
@@ -202,6 +206,7 @@
         use_cuda=args.use_gpu,
         loss_name=train_cost.name)
 
+    total_time = 0
     totalruntime = 0
     train_py_reader.start()
     iter_no = 0
@@ -230,10 +235,16 @@
             train_info = [0, 0, 0, 0]
 
         totalruntime += period
+        total_time += period  # accumulate per-iteration runtime for the duration KPI
 
-        if iter_no % args.test_iter_step == 0 and iter_no != 0:
+        #if iter_no % args.test_iter_step == 0 and iter_no != 0:
+        if (iter_no % args.test_iter_step == 0 and iter_no != 0) or args.enable_ce:
            f, l = [], []
            for batch_id, data in enumerate(test_reader()):
+                if args.enable_ce:
+                    if batch_id > 1:
+                        break
+
                t1 = time.time()
                [feas] = exe.run(test_prog, fetch_list = test_fetch_list, feed=test_feeder.feed(data))
                label = np.asarray([x[1] for x in data])
@@ -263,10 +274,17 @@
         iter_no += 1
 
     # This is for continuous evaluation only
+    # KPI lines below are parsed by _ce.py (piped in .run_ce.sh)
     if args.enable_ce:
-        # Use the mean cost/acc for training
-        print("kpis train_cost %s" % (avg_loss))
-        print("kpis test_recall %s" % (recall))
+        gpu_num = devicenum
+        epoch_idx = args.total_iter_num
+        print("kpis\teach_pass_duration_card%s\t%s" %
+              (gpu_num, total_time / epoch_idx))
+        print("kpis\ttrain_avg_loss_card%s\t%s" %
+              (gpu_num, avg_loss))
+        #print("kpis\ttrain_recall_card%s\t%s" %
+        #      (gpu_num, recall))
+
 
 
 def initlogging():