diff --git a/fluid/face_detection/profile.py b/fluid/face_detection/profile.py
index 7de20d6cb7c527ae3d08d07ef8eea1d1b12d0dd8..d73bff19c147050a02e9148493b868f5955e4dc6 100644
--- a/fluid/face_detection/profile.py
+++ b/fluid/face_detection/profile.py
@@ -24,13 +24,13 @@ add_arg('parallel',         bool,  True,            "parallel")
 add_arg('learning_rate',    float, 0.001,           "Learning rate.")
 add_arg('batch_size',       int,   20,              "Minibatch size.")
 add_arg('num_iteration',    int,   10,              "Epoch number.")
-add_arg('skip_reader',      bool,  False,            "Whether to skip data reader.")
 add_arg('use_gpu',          bool,  True,            "Whether use GPU.")
 add_arg('use_pyramidbox',   bool,  True,            "Whether use PyramidBox model.")
 add_arg('model_save_dir',   str,   'output',        "The path to save model.")
 add_arg('pretrained_model', str,   './vgg_ilsvrc_16_fc_reduced', "The init model path.")
 add_arg('resize_h',         int,   640,             "The resized image height.")
 add_arg('resize_w',         int,   640,             "The resized image height.")
+add_arg('data_dir',         str,   'data',          "The base dir of dataset")
 #yapf: enable
 
 
@@ -43,52 +43,64 @@ def train(args, config, train_file_list, optimizer_method):
     use_pyramidbox = args.use_pyramidbox
     model_save_dir = args.model_save_dir
     pretrained_model = args.pretrained_model
-    skip_reader = args.skip_reader
     num_iterations = args.num_iteration
     parallel = args.parallel
 
     num_classes = 2
     image_shape = [3, height, width]
 
-    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
-    devices_num = len(devices.split(","))
-
-    fetches = []
-    network = PyramidBox(image_shape, num_classes,
-                         sub_network=use_pyramidbox)
-    if use_pyramidbox:
-        face_loss, head_loss, loss = network.train()
-        fetches = [face_loss, head_loss]
-    else:
-        loss = network.vgg_ssd_loss()
-        fetches = [loss]
-
-    epocs = 12880 // batch_size
-    boundaries = [epocs * 40, epocs * 60, epocs * 80, epocs * 100]
-    values = [
-        learning_rate, learning_rate * 0.5, learning_rate * 0.25,
-        learning_rate * 0.1, learning_rate * 0.01
-    ]
-
-    if optimizer_method == "momentum":
-        optimizer = fluid.optimizer.Momentum(
-            learning_rate=fluid.layers.piecewise_decay(
-                boundaries=boundaries, values=values),
-            momentum=0.9,
-            regularization=fluid.regularizer.L2Decay(0.0005),
-        )
-    else:
-        optimizer = fluid.optimizer.RMSProp(
-            learning_rate=fluid.layers.piecewise_decay(boundaries, values),
-            regularization=fluid.regularizer.L2Decay(0.0005),
-        )
+    startup_prog = fluid.Program()
+    train_prog = fluid.Program()
+    with fluid.program_guard(train_prog, startup_prog):
+        py_reader = fluid.layers.py_reader(
+            capacity=8,
+            shapes=[[-1] + image_shape, [-1, 4], [-1, 4], [-1, 1]],
+            lod_levels=[0, 1, 1, 1],
+            dtypes=["float32", "float32", "float32", "int32"],
+            use_double_buffer=True)
+        with fluid.unique_name.guard():
+            image, face_box, head_box, gt_label = fluid.layers.read_file(py_reader)
+            fetches = []
+            network = PyramidBox(image=image,
+                                 face_box=face_box,
+                                 head_box=head_box,
+                                 gt_label=gt_label,
+                                 sub_network=use_pyramidbox)
+            if use_pyramidbox:
+                face_loss, head_loss, loss = network.train()
+                fetches = [face_loss, head_loss]
+            else:
+                loss = network.vgg_ssd_loss()
+                fetches = [loss]
+            devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
+            devices_num = len(devices.split(","))
+            batch_size_per_device = batch_size // devices_num
+            steps_per_pass = 12880 // batch_size
+            boundaries = [steps_per_pass * 50, steps_per_pass * 80,
+                          steps_per_pass * 120, steps_per_pass * 140]
+            values = [
+                learning_rate, learning_rate * 0.5, learning_rate * 0.25,
+                learning_rate * 0.1, learning_rate * 0.01]
+            if optimizer_method == "momentum":
+                optimizer = fluid.optimizer.Momentum(
+                    learning_rate=fluid.layers.piecewise_decay(
+                        boundaries=boundaries, values=values),
+                    momentum=0.9,
+                    regularization=fluid.regularizer.L2Decay(0.0005),
+                )
+            else:
+                optimizer = fluid.optimizer.RMSProp(
+                    learning_rate=
+                    fluid.layers.piecewise_decay(boundaries, values),
+                    regularization=fluid.regularizer.L2Decay(0.0005),
+                )
+            optimizer.minimize(loss)
+    fluid.memory_optimize(train_prog)
 
-    optimizer.minimize(loss)
-    fluid.memory_optimize(fluid.default_main_program())
 
     place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
     exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
+    exe.run(startup_prog)
 
     start_pass = 0
     if pretrained_model:
@@ -106,49 +118,26 @@ def train(args, config, train_file_list, optimizer_method):
 
     if parallel:
         train_exe = fluid.ParallelExecutor(
-            use_cuda=use_gpu, loss_name=loss.name)
-
-    train_reader = reader.train_batch_reader(config, train_file_list, batch_size=batch_size)
-
-    def tensor(data, place, lod=None):
-        t = fluid.core.LoDTensor()
-        t.set(data, place)
-        if lod:
-            t.set_lod(lod)
-        return t
-
-    im, face_box, head_box, labels, lod = next(train_reader)
-    im_t = tensor(im, place)
-    box1 = tensor(face_box, place, [lod])
-    box2 = tensor(head_box, place, [lod])
-    lbl_t = tensor(labels, place, [lod])
-    feed_data = {'image': im_t, 'face_box': box1,
-                 'head_box': box2, 'gt_label': lbl_t}
-
-    def run(iterations, feed_data):
+            use_cuda=use_gpu, loss_name=loss.name, main_program = train_prog)
+    train_reader = reader.train(config,
+                                train_file_list,
+                                batch_size_per_device,
+                                shuffle=False,
+                                use_multiprocessing=True,
+                                num_workers=8,
+                                max_queue=24)
+    py_reader.decorate_paddle_reader(train_reader)
+
+    def run(iterations):
         # global feed_data
-        reader_time = []
+        py_reader.start()
         run_time = []
         for batch_id in range(iterations):
-            start_time = time.time()
-            if not skip_reader:
-                im, face_box, head_box, labels, lod = next(train_reader)
-                im_t = tensor(im, place)
-                box1 = tensor(face_box, place, [lod])
-                box2 = tensor(head_box, place, [lod])
-                lbl_t = tensor(labels, place, [lod])
-                feed_data = {'image': im_t, 'face_box': box1,
-                             'head_box': box2, 'gt_label': lbl_t}
-            end_time = time.time()
-            reader_time.append(end_time - start_time)
-
             start_time = time.time()
             if parallel:
-                fetch_vars = train_exe.run(fetch_list=[v.name for v in fetches],
-                                           feed=feed_data)
+                fetch_vars = train_exe.run(fetch_list=[v.name for v in fetches])
             else:
-                fetch_vars = exe.run(fluid.default_main_program(),
-                                     feed=feed_data,
+                fetch_vars = exe.run(train_prog,
                                      fetch_list=fetches)
             end_time = time.time()
             run_time.append(end_time - start_time)
@@ -158,31 +147,31 @@ def train(args, config, train_file_list, optimizer_method):
             else:
                 print("Batch {0}, face loss {1}, head loss {2}".format(
                        batch_id, fetch_vars[0], fetch_vars[1]))
-
-        return reader_time, run_time
+        return run_time
 
     # start-up
-    run(2, feed_data)
+    run(2)
 
     # profiling
     start = time.time()
     if not parallel:
         with profiler.profiler('All', 'total', '/tmp/profile_file'):
-            reader_time, run_time = run(num_iterations, feed_data)
+            run_time = run(num_iterations)
     else:
-        reader_time, run_time = run(num_iterations, feed_data)
+        run_time = run(num_iterations)
     end = time.time()
     total_time = end - start
     print("Total time: {0}, reader time: {1} s, run time: {2} s".format(
-        total_time, np.sum(reader_time), np.sum(run_time)))
+        total_time, total_time - np.sum(run_time), np.sum(run_time)))
 
 
 if __name__ == '__main__':
     args = parser.parse_args()
     print_arguments(args)
 
-    data_dir = 'data/WIDER_train/images/'
-    train_file_list = 'data/wider_face_split/wider_face_train_bbx_gt.txt'
+    data_dir = os.path.join(args.data_dir, 'WIDER_train/images/')
+    train_file_list = os.path.join(args.data_dir,
+        'wider_face_split/wider_face_train_bbx_gt.txt')
 
     config = reader.Settings(
         data_dir=data_dir,
diff --git a/fluid/face_detection/pyramidbox.py b/fluid/face_detection/pyramidbox.py
index 5701f2fa333649e05122da1fd66974c54d73a285..4012d77cb7b99c36793807f173e38062f1b846ad 100644
--- a/fluid/face_detection/pyramidbox.py
+++ b/fluid/face_detection/pyramidbox.py
@@ -63,8 +63,11 @@ def conv_block(input, groups, filters, ksizes, strides=None, with_pool=True):
 
 class PyramidBox(object):
     def __init__(self,
-                 data_shape,
-                 num_classes=None,
+                 data_shape=None,
+                 image=None,
+                 face_box=None,
+                 head_box=None,
+                 gt_label=None,
                  use_transposed_conv2d=True,
                  is_infer=False,
                  sub_network=False):
@@ -74,13 +77,17 @@ class PyramidBox(object):
         self.data_shape = data_shape
         self.min_sizes = [16., 32., 64., 128., 256., 512.]
         self.steps = [4., 8., 16., 32., 64., 128.]
-        self.num_classes = num_classes
         self.use_transposed_conv2d = use_transposed_conv2d
         self.is_infer = is_infer
         self.sub_network = sub_network
+        self.image = image
+        self.face_box = face_box
+        self.head_box = head_box
+        self.gt_label = gt_label
 
         # the base network is VGG with atrous layers
-        self._input()
+        if is_infer:
+            self._input()
         self._vgg()
         if sub_network:
             self._low_level_fpn()
@@ -89,12 +96,6 @@ class PyramidBox(object):
         else:
             self._vgg_ssd()
 
-    def feeds(self):
-        if self.is_infer:
-            return [self.image]
-        else:
-            return [self.image, self.face_box, self.head_box, self.gt_label]
-
     def _input(self):
         self.image = fluid.layers.data(
             name='image', shape=self.data_shape, dtype='float32')
diff --git a/fluid/face_detection/reader.py b/fluid/face_detection/reader.py
index 1e142adf9f00ecfa2af6dc75512babe71fb4496b..ae90ad6b51db274446785dd6d9def57a606747d2 100644
--- a/fluid/face_detection/reader.py
+++ b/fluid/face_detection/reader.py
@@ -231,9 +231,7 @@ def train_generator(settings, file_list, batch_size, shuffle=True):
     while True:
         if shuffle:
             np.random.shuffle(file_dict)
-        images, face_boxes, head_boxes, label_ids = [], [], [], []
-        label_offs = [0]
-
+        batch_out = []
         for index_image in file_dict.keys():
             image_name = file_dict[index_image][0]
             image_path = os.path.join(settings.data_dir, image_name)
@@ -261,7 +259,6 @@ def train_generator(settings, file_list, batch_size, shuffle=True):
                     bbox_sample.append(float(xmax) / im_width)
                     bbox_sample.append(float(ymax) / im_height)
                     bbox_labels.append(bbox_sample)
-
             im, sample_labels = preprocess(im, bbox_labels, "train", settings,
                                            image_path)
             sample_labels = np.array(sample_labels)
@@ -271,46 +268,40 @@ def train_generator(settings, file_list, batch_size, shuffle=True):
             face_box = sample_labels[:, 1:5]
             head_box = expand_bboxes(face_box)
             label = [1] * len(face_box)
-
-            images.append(im)
-            face_boxes.extend(face_box)
-            head_boxes.extend(head_box)
-            label_ids.extend(label)
-            label_offs.append(label_offs[-1] + len(face_box))
-
-            if len(images) == batch_size:
-                images = np.array(images).astype('float32')
-                face_boxes = np.array(face_boxes).astype('float32')
-                head_boxes = np.array(head_boxes).astype('float32')
-                label_ids = np.array(label_ids).astype('int32')
-                yield images, face_boxes, head_boxes, label_ids, label_offs
-                images, face_boxes, head_boxes = [], [], []
-                label_ids, label_offs = [], [0]
-
-
-def train_batch_reader(settings,
-                       file_list,
-                       batch_size,
-                       shuffle=True,
-                       num_workers=8):
-    try:
-        enqueuer = GeneratorEnqueuer(
-            train_generator(settings, file_list, batch_size, shuffle),
-            use_multiprocessing=False)
-        enqueuer.start(max_queue_size=24, workers=num_workers)
-        generator_output = None
-        while True:
-            while enqueuer.is_running():
-                if not enqueuer.queue.empty():
-                    generator_output = enqueuer.queue.get()
-                    break
-                else:
-                    time.sleep(0.01)
-            yield generator_output
+            batch_out.append((im, face_box, head_box, label))
+            if len(batch_out) == batch_size:
+                yield batch_out
+                batch_out = []
+
+
+def train(settings, file_list, batch_size, shuffle=True,
+          use_multiprocessing=True, num_workers=8, max_queue=24):
+    """Return a reader creator yielding items taken from the enqueuer queue."""
+    def reader():
+        # Bind before `try` so the `finally` clause never hits an unbound
+        # name when the GeneratorEnqueuer constructor itself raises.
+        enqueuer = None
+        try:
+            enqueuer = GeneratorEnqueuer(
+                train_generator(settings, file_list, batch_size, shuffle),
+                use_multiprocessing=use_multiprocessing)
+            # max_queue / num_workers are forwarded unchanged to the enqueuer.
+            enqueuer.start(max_queue_size=max_queue, workers=num_workers)
             generator_output = None
-    finally:
-        if enqueuer is not None:
-            enqueuer.stop()
+            while True:
+                while enqueuer.is_running():
+                    if not enqueuer.queue.empty():
+                        generator_output = enqueuer.queue.get()
+                        break
+                    # Queue is empty: back off briefly before polling again.
+                    time.sleep(0.02)
+                yield generator_output
+                generator_output = None
+        finally:
+            if enqueuer is not None:
+                enqueuer.stop()
+
+    return reader
 
 
 def test(settings, file_list):
diff --git a/fluid/face_detection/train.py b/fluid/face_detection/train.py
index 0a71606bf300c2354d6e8429b70e96a8635b1421..13744562c9d1814d457af20e3185d2d3c7a22fb7 100644
--- a/fluid/face_detection/train.py
+++ b/fluid/face_detection/train.py
@@ -9,6 +9,7 @@ import time
 import argparse
 import functools
 
+import paddle
 import paddle.fluid as fluid
 from pyramidbox import PyramidBox
 import reader
@@ -21,59 +22,42 @@ add_arg = functools.partial(add_arguments, argparser=parser)
 add_arg('parallel',         bool,  True,            "Whether use multi-GPU/threads or not.")
 add_arg('learning_rate',    float, 0.001,           "The start learning rate.")
 add_arg('batch_size',       int,   16,              "Minibatch size.")
-add_arg('num_passes',       int,   160,             "Epoch number.")
+add_arg('epoc_num',         int,   160,             "Epoch number.")
 add_arg('use_gpu',          bool,  True,            "Whether use GPU.")
 add_arg('use_pyramidbox',   bool,  True,            "Whether use PyramidBox model.")
 add_arg('model_save_dir',   str,   'output',        "The path to save model.")
 add_arg('resize_h',         int,   640,             "The resized image height.")
 add_arg('resize_w',         int,   640,             "The resized image width.")
+add_arg('mean_BGR',         str,   '104., 117., 123.', "Mean value for B,G,R channel which will be subtracted.")
 add_arg('with_mem_opt',     bool,  True,            "Whether to use memory optimization or not.")
 add_arg('pretrained_model', str,   './vgg_ilsvrc_16_fc_reduced/', "The init model path.")
 add_arg('data_dir',         str,   'data',          "The base dir of dataset")
 #yapf: enable
 
-
-def train(args, config, train_file_list, optimizer_method):
-    learning_rate = args.learning_rate
-    batch_size = args.batch_size
-    num_passes = args.num_passes
-    height = args.resize_h
-    width = args.resize_w
-    use_gpu = args.use_gpu
-    use_pyramidbox = args.use_pyramidbox
-    model_save_dir = args.model_save_dir
-    pretrained_model = args.pretrained_model
-    with_memory_optimization = args.with_mem_opt
-
-    num_classes = 2
-    image_shape = [3, height, width]
-
-    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
-    devices_num = len(devices.split(","))
-
-
-    fetches = []
-    network = PyramidBox(image_shape, num_classes,
-                         sub_network=use_pyramidbox)
-    if use_pyramidbox:
-        face_loss, head_loss, loss = network.train()
-        fetches = [face_loss, head_loss]
-    else:
-        loss = network.vgg_ssd_loss()
-        fetches = [loss]
-
-    steps_per_pass = 12880 // batch_size
-    boundaries = [steps_per_pass * 99, steps_per_pass * 124,
-                  steps_per_pass * 149]
-    values = [
-        learning_rate, learning_rate * 0.1,
-        learning_rate * 0.01, learning_rate * 0.001
-    ]
+train_parameters = {
+    "train_images": 12880,
+    "image_shape": [3, 640, 640],
+    "class_num": 2,
+    "batch_size": 16,
+    "lr": 0.001,
+    "lr_epochs": [99, 124, 149],
+    "lr_decay": [1, 0.1, 0.01, 0.001],
+    "epoc_num": 160,
+    "optimizer_method": "momentum",
+    "use_pyramidbox": True
+}
+
+def optimizer_setting(train_params):
+    batch_size = train_params["batch_size"]
+    iters = train_params["train_images"] // batch_size
+    lr = train_params["lr"]
+    optimizer_method = train_params["optimizer_method"]
+    boundaries = [i * iters for i in train_params["lr_epochs"]]
+    values = [i * lr for i in train_params["lr_decay"]]
 
     if optimizer_method == "momentum":
         optimizer = fluid.optimizer.Momentum(
-            learning_rate=fluid.layers.piecewise_decay(
-                boundaries=boundaries, values=values),
+            learning_rate=fluid.layers.piecewise_decay(boundaries, values),
             momentum=0.9,
             regularization=fluid.regularizer.L2Decay(0.0005),
         )
@@ -82,19 +66,76 @@ def train(args, config, train_file_list, optimizer_method):
             learning_rate=fluid.layers.piecewise_decay(boundaries, values),
             regularization=fluid.regularizer.L2Decay(0.0005),
         )
+    return optimizer
+
+
+def build_program(train_params, main_prog, startup_prog, args):
+    """Build the train graph; return (py_reader, fetches, loss)."""
+    use_pyramidbox = train_params["use_pyramidbox"]
+    image_shape = train_params["image_shape"]
+    with fluid.program_guard(main_prog, startup_prog):
+        # Reader slots, in order: image, face_box, head_box, gt_label.
+        py_reader = fluid.layers.py_reader(
+            capacity=8,
+            shapes=[[-1] + image_shape, [-1, 4], [-1, 4], [-1, 1]],
+            lod_levels=[0, 1, 1, 1],
+            dtypes=["float32", "float32", "float32", "int32"],
+            use_double_buffer=True)
+        with fluid.unique_name.guard():
+            image, face_box, head_box, gt_label = fluid.layers.read_file(
+                py_reader)
+            network = PyramidBox(image=image, face_box=face_box,
+                                 head_box=head_box, gt_label=gt_label,
+                                 sub_network=use_pyramidbox)
+            if use_pyramidbox:
+                face_loss, head_loss, loss = network.train()
+                fetches = [face_loss, head_loss]
+            else:
+                loss = network.vgg_ssd_loss()
+                fetches = [loss]
+            # `args` is not read here; it is kept for interface compatibility.
+            optimizer = optimizer_setting(train_params)
+            optimizer.minimize(loss)
+    return py_reader, fetches, loss
+
+def train(args, config, train_params, train_file_list):
+    batch_size = train_params["batch_size"]
+    epoc_num = train_params["epoc_num"]
+    optimizer_method = train_params["optimizer_method"]
+    use_pyramidbox = train_params["use_pyramidbox"]
+
+    use_gpu = args.use_gpu
+    model_save_dir = args.model_save_dir
+    pretrained_model = args.pretrained_model
+    with_memory_optimization = args.with_mem_opt
+
+    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
+    devices_num = len(devices.split(","))
+    batch_size_per_device = batch_size // devices_num
+    iters_per_epoc = train_params["train_images"] // batch_size
+    num_workers = 8
+    is_shuffle = True
+
+    startup_prog = fluid.Program()
+    train_prog = fluid.Program()
+
+    train_py_reader, fetches, loss = build_program(
+        train_params = train_params,
+        main_prog = train_prog,
+        startup_prog = startup_prog,
+        args=args)
 
-    optimizer.minimize(loss)
     if with_memory_optimization:
-        fluid.memory_optimize(fluid.default_main_program())
+        fluid.memory_optimize(train_prog)
 
     place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
     exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
+    exe.run(startup_prog)
 
-    start_pass = 0
+    start_epoc = 0
     if pretrained_model:
         if pretrained_model.isdigit():
-            start_pass = int(pretrained_model) + 1
+            start_epoc = int(pretrained_model) + 1
             pretrained_model = os.path.join(model_save_dir, pretrained_model)
             print("Resume from %s " %(pretrained_model))
 
@@ -103,66 +144,67 @@ def train(args, config, train_file_list, optimizer_method):
                              (pretrained_model))
         def if_exist(var):
             return os.path.exists(os.path.join(pretrained_model, var.name))
-        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
+        fluid.io.load_vars(
+            exe, pretrained_model, main_program=train_prog, predicate=if_exist)
+    train_reader = reader.train(config,
+                                train_file_list,
+                                batch_size_per_device,
+                                shuffle = is_shuffle,
+                                use_multiprocessing=True,
+                                num_workers = num_workers,
+                                max_queue=24)
+    train_py_reader.decorate_paddle_reader(train_reader)
 
     if args.parallel:
         train_exe = fluid.ParallelExecutor(
-            use_cuda=use_gpu, loss_name=loss.name)
-
-    train_reader = reader.train_batch_reader(config, train_file_list, batch_size=batch_size)
+            main_program = train_prog,
+            use_cuda=use_gpu,
+            loss_name=loss.name)
 
-    def save_model(postfix):
+    def save_model(postfix, program):
+        """Save the persistables of `program` to model_save_dir/postfix."""
         model_path = os.path.join(model_save_dir, postfix)
         if os.path.isdir(model_path):
             shutil.rmtree(model_path)
         print('save models to %s' % (model_path))
-        fluid.io.save_persistables(exe, model_path)
+        fluid.io.save_persistables(exe, model_path, main_program=program)
 
-    def tensor(data, place, lod=None):
-        t = fluid.core.LoDTensor()
-        t.set(data, place)
-        if lod:
-            t.set_lod(lod)
-        return t
-
-    for pass_id in range(start_pass, num_passes):
-        start_time = time.time()
-        prev_start_time = start_time
-        end_time = 0
-        for batch_id in range(steps_per_pass):
-            im, face_box, head_box, labels, lod = next(train_reader)
-            im_t = tensor(im, place)
-            box1 = tensor(face_box, place, [lod])
-            box2 = tensor(head_box, place, [lod])
-            lbl_t = tensor(labels, place, [lod])
-            feeding = {'image': im_t, 'face_box': box1,
-                       'head_box': box2, 'gt_label': lbl_t}
-
-            prev_start_time = start_time
+    train_py_reader.start()
+    try:
+        for pass_id in range(start_epoc, epoc_num):
             start_time = time.time()
-            if args.parallel:
-                fetch_vars = train_exe.run(fetch_list=[v.name for v in fetches],
-                                           feed=feeding)
-            else:
-                fetch_vars = exe.run(fluid.default_main_program(),
-                                     feed=feeding,
-                                     fetch_list=fetches)
-            end_time = time.time()
-            fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
-            if batch_id % 10 == 0:
-                if not args.use_pyramidbox:
-                    print("Pass {0}, batch {1}, loss {2}, time {3}".format(
-                        pass_id, batch_id, fetch_vars[0],
-                        start_time - prev_start_time))
+            prev_start_time = start_time
+            end_time = 0
+            batch_id = 0
+            for batch_id in range(iters_per_epoc):
+                prev_start_time = start_time
+                start_time = time.time()
+                if args.parallel:
+                    fetch_vars = train_exe.run(fetch_list=
+                        [v.name for v in fetches])
                 else:
-                    print("Pass {0}, batch {1}, face loss {2}, head loss {3}, " \
-                          "time {4}".format(pass_id,
-                           batch_id, fetch_vars[0], fetch_vars[1],
-                           start_time - prev_start_time))
-
-        if pass_id % 1 == 0 or pass_id == num_passes - 1:
-            save_model(str(pass_id))
-
+                    fetch_vars = exe.run(train_prog,
+                                         fetch_list=fetches)
+                end_time = time.time()
+                fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
+                if batch_id % 10 == 0:
+                    if not args.use_pyramidbox:
+                        print("Pass {0}, batch {1}, loss {2}, time {3}".format(
+                            pass_id, batch_id, fetch_vars[0],
+                            start_time - prev_start_time))
+                    else:
+                        print("Pass {0}, batch {1}, face loss {2}, " \
+                              "head loss {3}, " \
+                              "time {4}".format(pass_id,
+                               batch_id, fetch_vars[0], fetch_vars[1],
+                               start_time - prev_start_time))
+            if pass_id % 1 == 0 or pass_id == epoc_num - 1:
+                save_model(str(pass_id), train_prog)
+    except fluid.core.EOFException:
+        train_py_reader.reset()
+    except StopIteration:
+        train_py_reader.reset()
+    train_py_reader.reset()
 
 if __name__ == '__main__':
     args = parser.parse_args()
@@ -171,13 +213,21 @@ if __name__ == '__main__':
     data_dir = os.path.join(args.data_dir, 'WIDER_train/images/')
     train_file_list = os.path.join(args.data_dir,
         'wider_face_split/wider_face_train_bbx_gt.txt')
+    mean_BGR = [float(m) for m in args.mean_BGR.split(",")]
+    image_shape = [3, int(args.resize_h), int(args.resize_w)]
+    train_parameters["image_shape"] = image_shape
+    train_parameters["use_pyramidbox"] = args.use_pyramidbox
+    train_parameters["batch_size"] = args.batch_size
+    train_parameters["lr"] = args.learning_rate
+    train_parameters["epoc_num"] = args.epoc_num
+
 
     config = reader.Settings(
         data_dir=data_dir,
-        resize_h=args.resize_h,
-        resize_w=args.resize_w,
+        resize_h=image_shape[1],
+        resize_w=image_shape[2],
         apply_distort=True,
         apply_expand=False,
-        mean_value=[104., 117., 123.],
+        mean_value=mean_BGR,
         ap_version='11point')
-    train(args, config, train_file_list, optimizer_method="momentum")
+    train(args, config, train_parameters, train_file_list)
diff --git a/fluid/face_detection/widerface_eval.py b/fluid/face_detection/widerface_eval.py
index ea9e8b11e60591e2372e286b1533c1346e2ca9ba..2a1addd1ed3313f8bb472bde2dad7fe90dd1c591 100644
--- a/fluid/face_detection/widerface_eval.py
+++ b/fluid/face_detection/widerface_eval.py
@@ -305,7 +305,9 @@ if __name__ == '__main__':
     image_shape = [3, 1024, 1024]
     with fluid.program_guard(main_program, startup_program):
         network = PyramidBox(
-            image_shape, sub_network=args.use_pyramidbox, is_infer=True)
+            data_shape=image_shape,
+            sub_network=args.use_pyramidbox,
+            is_infer=True)
         infer_program, nmsed_out = network.infer(main_program)
         fetches = [nmsed_out]
         fluid.io.load_persistables(
diff --git a/fluid/object_detection/train.py b/fluid/object_detection/train.py
index 806887edca06df3102bcaa09f4b83d9d7e6524ea..1106635dabab26ff70ccad81d477af25819cec17 100644
--- a/fluid/object_detection/train.py
+++ b/fluid/object_detection/train.py
@@ -62,7 +62,7 @@ train_parameters = {
 
 def optimizer_setting(train_params):
     batch_size = train_params["batch_size"]
-    iters = train_params["train_images"] / batch_size
+    iters = train_params["train_images"] // batch_size
     lr = train_params["lr"]
     boundaries = [i * iters  for i in train_params["lr_epochs"]]
     values = [ i * lr for i in train_params["lr_decay"]]
@@ -118,7 +118,6 @@ def train(args,
 
     model_save_dir = args.model_save_dir
     pretrained_model = args.pretrained_model
-    epoc_num = args.epoc_num
     use_gpu = args.use_gpu
     parallel = args.parallel
     enable_ce = args.enable_ce
@@ -127,6 +126,7 @@ def train(args,
     devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
     devices_num = len(devices.split(","))
     batch_size = train_params['batch_size']
+    epoc_num = train_params['epoch_num']
     batch_size_per_device = batch_size // devices_num
     iters_per_epoc = train_params["train_images"] // batch_size
     num_workers = 8