diff --git a/yolov3.py b/yolov3.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d1cad446a27031ff875e194cda3ad4a7efea012
--- /dev/null
+++ b/yolov3.py
@@ -0,0 +1,540 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+
+import argparse
+import contextlib
+import os
+import random
+import time
+
+import cv2
+import numpy as np
+from pycocotools.coco import COCO
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.nn import Conv2D
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.regularizer import L2Decay
+
+from model import Model, Loss, shape_hints
+from resnet import ResNet, ConvBNLayer
+
+
+# XXX transfer learning
+class ResNetBackBone(ResNet):
+    """ResNet used as a multi-scale feature extractor.
+
+    Builds a regular ``ResNet`` of the requested depth, removes its ``fc``
+    attribute and, on forward, returns the output of every entry of
+    ``self.layers`` instead of a single result.
+    """
+
+    def __init__(self, depth=50):
+        super(ResNetBackBone, self).__init__(depth=depth)
+        # the detector never calls the ``fc`` layer, so drop it
+        del self.fc
+
+    def forward(self, inputs):
+        """Return a list with one feature map per entry of ``self.layers``."""
+        feat = self.pool(self.conv(inputs))
+        feats = []
+        for stage in self.layers:
+            feat = stage(feat)
+            feats.append(feat)
+        return feats
+
+
+class YoloDetectionBlock(fluid.dygraph.Layer):
+    """Stack of alternating 1x1 / 3x3 ``ConvBNLayer`` used by each YOLO head.
+
+    Args:
+        num_channels: channel count of the incoming feature map.
+        num_filters: width of the 1x1 layers; the 3x3 layers use twice as
+            many filters. Must be even.
+    """
+
+    def __init__(self, num_channels, num_filters):
+        super(YoloDetectionBlock, self).__init__()
+
+        assert num_filters % 2 == 0, \
+            "num_filters {} cannot be divided by 2".format(num_filters)
+
+        def conv_bn(in_channels, out_channels, kernel):
+            # every layer of the block shares the same activation
+            return ConvBNLayer(
+                num_channels=in_channels,
+                num_filters=out_channels,
+                filter_size=kernel,
+                act='leaky_relu')
+
+        narrow, wide = num_filters, num_filters * 2
+        self.conv0 = conv_bn(num_channels, narrow, 1)
+        self.conv1 = conv_bn(narrow, wide, 3)
+        self.conv2 = conv_bn(wide, narrow, 1)
+        self.conv3 = conv_bn(narrow, wide, 3)
+        self.route = conv_bn(wide, narrow, 1)
+        self.tip = conv_bn(narrow, wide, 3)
+
+    def forward(self, inputs):
+        """Return ``(route, tip)``.
+
+        ``route`` is the output of the last 1x1 layer and ``tip`` is the
+        output of the final 3x3 layer applied to ``route``.
+        """
+        hidden = inputs
+        for conv in (self.conv0, self.conv1, self.conv2, self.conv3):
+            hidden = conv(hidden)
+        route = self.route(hidden)
+        return route, self.tip(route)
+
+
+class YOLOv3(Model):
+    """YOLOv3 detector built on ``ResNetBackBone``.
+
+    Three detection heads are created, one per entry of ``anchor_masks``.
+    Each head is a ``YoloDetectionBlock`` followed by a 1x1 ``Conv2D`` that
+    emits ``len(mask) * (num_classes + 5)`` channels. The first two heads
+    also own a 1x1 ``ConvBNLayer`` whose upsampled output is concatenated
+    with the next backbone feature map.
+    """
+
+    def __init__(self):
+        super(YOLOv3, self).__init__()
+        self.num_classes = 80
+        # flat list of (width, height) anchor pairs
+        self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45,
+                        59, 119, 116, 90, 156, 198, 373, 326]
+        # indices of the anchor pairs used by each head
+        self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
+        self.valid_thresh = 0.005
+        # forward() passes this to multiclass_nms as ``nms_threshold``; it
+        # used to be read there without ever being set, so test mode failed
+        # with AttributeError.
+        self.nms_thresh = 0.45
+        self.nms_topk = 400
+        self.nms_posk = 100
+        self.draw_thresh = 0.5
+
+        self.backbone = ResNetBackBone()
+        self.block_outputs = []
+        self.yolo_blocks = []
+        self.route_blocks = []
+
+        # input channels of each head: the deepest backbone map, then the
+        # next maps concatenated with the upsampled route of the previous head
+        for idx, num_chan in enumerate([2048, 1280, 640]):
+            # NOTE: "detecton" (sic) is kept as is because the sublayer name
+            # is a runtime string.
+            yolo_block = self.add_sublayer(
+                "detecton_block_{}".format(idx),
+                YoloDetectionBlock(num_chan, num_filters=512 // (2**idx)))
+            self.yolo_blocks.append(yolo_block)
+
+            num_filters = len(self.anchor_masks[idx]) * (self.num_classes + 5)
+
+            block_out = self.add_sublayer(
+                "block_out_{}".format(idx),
+                Conv2D(num_channels=1024 // (2**idx),
+                       num_filters=num_filters,
+                       filter_size=1,
+                       param_attr=ParamAttr(
+                           initializer=fluid.initializer.Normal(0., 0.02)),
+                       bias_attr=ParamAttr(
+                           initializer=fluid.initializer.Constant(0.0),
+                           regularizer=L2Decay(0.))))
+            self.block_outputs.append(block_out)
+            if idx < 2:
+                # the last head has no following head to feed
+                route = self.add_sublayer(
+                    "route_{}".format(idx),
+                    ConvBNLayer(num_channels=512 // (2**idx),
+                                num_filters=256 // (2**idx),
+                                filter_size=1,
+                                act='leaky_relu'))
+                self.route_blocks.append(route)
+
+    @shape_hints(inputs=[None, 3, None, None])
+    def forward(self, inputs, im_shape):
+        """Run the detector.
+
+        Args:
+            inputs: image batch fed to the backbone.
+            im_shape: passed to ``yolo_box`` as ``img_size``.
+
+        Returns:
+            The list of raw head outputs when ``self.mode`` is not
+            ``'test'``; otherwise the result of ``multiclass_nms`` over the
+            boxes and scores decoded from all heads.
+        """
+        outputs = []
+        boxes = []
+        scores = []
+        downsample = 32
+
+        feats = self.backbone(inputs)
+        # deepest feature map first, one map per head
+        feats = feats[::-1][:len(self.anchor_masks)]
+        route = None
+        for idx, feat in enumerate(feats):
+            if idx > 0:
+                feat = fluid.layers.concat(input=[route, feat], axis=1)
+            route, tip = self.yolo_blocks[idx](feat)
+            block_out = self.block_outputs[idx](tip)
+
+            if idx < 2:
+                route = self.route_blocks[idx](route)
+                route = fluid.layers.resize_nearest(route, scale=2)
+
+            # flatten the (w, h) pairs selected by this head's mask
+            mask_anchors = [
+                self.anchors[2 * m + k]
+                for m in self.anchor_masks[idx]
+                for k in (0, 1)
+            ]
+            b, s = fluid.layers.yolo_box(
+                x=block_out,
+                img_size=im_shape,
+                anchors=mask_anchors,
+                class_num=self.num_classes,
+                conf_thresh=self.valid_thresh,
+                downsample_ratio=downsample)
+
+            outputs.append(block_out)
+            boxes.append(b)
+            scores.append(fluid.layers.transpose(s, perm=[0, 2, 1]))
+
+            downsample //= 2
+
+        if self.mode != 'test':
+            return outputs
+
+        return fluid.layers.multiclass_nms(
+            bboxes=fluid.layers.concat(boxes, axis=1),
+            scores=fluid.layers.concat(scores, axis=2),
+            score_threshold=self.valid_thresh,
+            nms_top_k=self.nms_topk,
+            keep_top_k=self.nms_posk,
+            nms_threshold=self.nms_thresh,
+            background_label=-1)
+
+
+class YoloLoss(Loss):
+    """Per-head YOLOv3 loss computed with ``fluid.layers.yolov3_loss``.
+
+    Args:
+        num_classes: number of object classes.
+        num_max_boxes: number of ground-truth slots per image, used for the
+            label shapes reported by ``infer_shape``.
+    """
+
+    def __init__(self, num_classes=80, num_max_boxes=50):
+        super(YoloLoss, self).__init__()
+        self.num_classes = num_classes
+        self.num_max_boxes = num_max_boxes
+        self.ignore_thresh = 0.7
+        self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45,
+                        59, 119, 116, 90, 156, 198, 373, 326]
+        self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
+
+    def forward(self, outputs, labels):
+        """Return one loss per head output.
+
+        ``labels`` is unpacked as ``(gt_box, gt_label, gt_score)``. The
+        downsample ratio starts at 32 and halves for every further head.
+        """
+        gt_box, gt_label, gt_score = labels
+        return [
+            fluid.layers.yolov3_loss(
+                x=head_out,
+                gt_box=gt_box,
+                gt_label=gt_label,
+                gt_score=gt_score,
+                anchor_mask=self.anchor_masks[level],
+                downsample_ratio=32 // (2 ** level),
+                anchors=self.anchors,
+                class_num=self.num_classes,
+                ignore_thresh=self.ignore_thresh,
+                use_label_smooth=True)
+            for level, head_out in enumerate(outputs)
+        ]
+
+    def infer_shape(self, _):
+        """Shapes of the box, label and score inputs, in that order."""
+        slots = self.num_max_boxes
+        return [[None, slots, 4], [None, slots], [None, slots]]
+
+    def infer_dtype(self, _):
+        """Dtypes of the box, label and score inputs, in that order."""
+        return ['float32', 'int32', 'float32']
+
+
+def make_optimizer(parameter_list=None):
+    """Build the Momentum optimizer used for training.
+
+    The learning rate is a piecewise-constant schedule that drops by a
+    factor of ten at each boundary, wrapped in a linear warm-up from 0 to
+    the base rate. L2 weight decay is applied through ``regularization``.
+
+    Args:
+        parameter_list: forwarded unchanged to ``fluid.optimizer.Momentum``.
+    """
+    base_lr = 0.001
+    momentum = 0.9
+    weight_decay = 5e-4
+    warm_up_iter = 4000
+    boundaries = [400000, 450000]
+
+    # one value per interval delimited by the boundaries
+    num_stages = len(boundaries) + 1
+    values = [base_lr * (0.1 ** i) for i in range(num_stages)]
+
+    schedule = fluid.layers.piecewise_decay(
+        boundaries=boundaries, values=values)
+    schedule = fluid.layers.linear_lr_warmup(
+        learning_rate=schedule,
+        warmup_steps=warm_up_iter,
+        start_lr=0.0,
+        end_lr=base_lr)
+
+    return fluid.optimizer.Momentum(
+        learning_rate=schedule,
+        regularization=fluid.regularizer.L2Decay(weight_decay),
+        momentum=momentum,
+        parameter_list=parameter_list)
+
+
+def _iou_matrix(a, b):
+    """Pairwise IoU between two sets of corner-format boxes.
+
+    Args:
+        a: array of shape (N, 4), columns ``x1, y1, x2, y2``.
+        b: array of shape (M, 4), same layout.
+
+    Returns:
+        Array of shape (N, M) holding the IoU of every ``a``/``b`` pair.
+    """
+    a_rows = a[:, np.newaxis, :]
+    top_left = np.maximum(a_rows[..., :2], b[:, :2])
+    bottom_right = np.minimum(a_rows[..., 2:], b[:, 2:])
+    # pairs that do not overlap on both axes contribute zero area
+    overlaps = (top_left < bottom_right).all(axis=2)
+    inter = np.prod(bottom_right - top_left, axis=2) * overlaps
+
+    area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
+    area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
+    union = area_a[:, np.newaxis] + area_b - inter
+    # small epsilon avoids division by zero for degenerate boxes
+    return inter / (union + 1e-10)
+
+
+def _crop_box_with_center_constraint(box, crop):
+    """Clip boxes to a crop window and report which ones survive.
+
+    Args:
+        box: array of shape (N, 4), columns ``x1, y1, x2, y2``.
+        crop: array ``[x1, y1, x2, y2]`` describing the crop window.
+
+    Returns:
+        ``(clipped, keep)``: ``clipped`` holds all N boxes clipped to the
+        window and shifted to its origin; ``keep`` holds the indices of
+        boxes whose original center lies inside the window and whose
+        clipped extent is non-empty.
+    """
+    origin, limit = crop[:2], crop[2:]
+
+    clipped = box.copy()
+    clipped[:, :2] = np.maximum(box[:, :2], origin)
+    clipped[:, 2:] = np.minimum(box[:, 2:], limit)
+    # express both corners relative to the crop's top-left corner
+    clipped -= np.tile(origin, 2)
+
+    centers = (box[:, :2] + box[:, 2:]) / 2
+    center_inside = ((origin <= centers) & (centers < limit)).all(axis=1)
+    non_empty = (clipped[:, :2] < clipped[:, 2:]).all(axis=1)
+    keep = np.flatnonzero(np.logical_and(center_inside, non_empty))
+    return clipped, keep
+
+
+def random_crop(inputs):
+    """Randomly crop an image together with its ground-truth boxes.
+
+    IoU thresholds are tried in random order. For each threshold up to 50
+    candidate windows are sampled; the first window whose best IoU with a
+    ground-truth box reaches the threshold and that keeps at least one box
+    (center inside the window) is used.
+
+    Args:
+        inputs: tuple ``(img, gt_box, gt_label)``; ``img`` is indexed as
+            ``[row, column, channel]`` and ``gt_box`` holds corner-format
+            boxes.
+
+    Returns:
+        ``(img, gt_box, gt_label)`` for the chosen crop, or ``inputs``
+        unchanged when there are no boxes or no threshold yields a crop.
+    """
+    aspect_ratios = [.5, 2.]
+    thresholds = [.0, .1, .3, .5, .7, .9]
+    scaling = [.3, 1.]
+
+    img, gt_box, gt_label = inputs
+    h, w = img.shape[:2]
+
+    if len(gt_box) == 0:
+        return inputs
+
+    min_ar, max_ar = aspect_ratios
+    np.random.shuffle(thresholds)
+    for thresh in thresholds:
+        for _ in range(50):
+            scale = np.random.uniform(*scaling)
+            ar = np.random.uniform(max(min_ar, scale**2),
+                                   min(max_ar, scale**-2))
+            crop_h = int(h * scale / np.sqrt(ar))
+            crop_w = int(w * scale * np.sqrt(ar))
+            # randint excludes its upper bound; the +1 keeps the range
+            # non-empty when the window spans a whole side of the image
+            crop_y = np.random.randint(0, h - crop_h + 1)
+            crop_x = np.random.randint(0, w - crop_w + 1)
+            crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h]
+            iou = _iou_matrix(gt_box, np.array([crop_box], dtype=np.float32))
+            if iou.max() < thresh:
+                continue
+
+            cropped_box, valid_ids = _crop_box_with_center_constraint(
+                gt_box, np.array(crop_box, dtype=np.float32))
+            if valid_ids.size == 0:
+                continue
+
+            x1, y1, x2, y2 = crop_box
+            img = img[y1:y2, x1:x2, :]
+            gt_box = np.take(cropped_box, valid_ids, axis=0)
+            gt_label = np.take(gt_label, valid_ids, axis=0)
+            return img, gt_box, gt_label
+
+    # Only give up after every threshold has been tried. The fallback used
+    # to sit inside the loop, so just the first threshold was ever used.
+    return inputs
+
+
+# XXX mix up, color distort and random expand are skipped for simplicity
+def sample_transform(inputs, mode='train', num_max_boxes=50):
+    """Augment one sample and convert its boxes to the training layout.
+
+    In ``'train'`` mode the sample is randomly cropped and, with
+    probability one half, flipped horizontally. Boxes are then truncated
+    to ``num_max_boxes``, normalized by the image size, converted from
+    corners to ``(center_x, center_y, width, height)`` and zero-padded.
+
+    Args:
+        inputs: tuple ``(img, gt_box, gt_label)`` with corner-format boxes
+            and labels of shape ``[N, 1]``. The arrays are not modified.
+        mode: ``'train'`` enables the random augmentations.
+        num_max_boxes: number of box/label slots in the output.
+
+    Returns:
+        ``(img, gt_box, gt_label)`` where ``gt_box`` is float32 with shape
+        ``[num_max_boxes, 4]`` and ``gt_label`` has shape
+        ``[num_max_boxes]``.
+    """
+    if mode == 'train':
+        img, gt_box, gt_label = random_crop(inputs)
+    else:
+        img, gt_box, gt_label = inputs
+
+    h, w = img.shape[:2]
+    # random flip
+    flip = mode == 'train' and np.random.uniform(0., 1.) > .5
+    if flip:
+        img = img[:, ::-1, :]
+
+    if len(gt_label) == 0:
+        gt_box = np.zeros([num_max_boxes, 4], dtype=np.float32)
+        # 1-D like the non-empty path, so a batch stacks to one shape
+        gt_label = np.zeros([num_max_boxes], dtype=np.int32)
+        return img, gt_box, gt_label
+
+    # Work on a private copy: the incoming array may be the one the dataset
+    # reader keeps and yields again on the next epoch, and the in-place
+    # operations below would otherwise corrupt it.
+    gt_box = np.array(gt_box[:num_max_boxes, :], dtype=np.float32)
+    gt_label = gt_label[:num_max_boxes, 0]
+
+    if flip:
+        left = gt_box[:, 0].copy()
+        gt_box[:, 0] = w - gt_box[:, 2] - 1
+        gt_box[:, 2] = w - left - 1
+
+    # normalize boxes
+    gt_box /= np.array([w, h] * 2, dtype=np.float32)
+    # corners -> (center, size)
+    gt_box[:, 2:] = gt_box[:, 2:] - gt_box[:, :2]
+    gt_box[:, :2] = gt_box[:, :2] + gt_box[:, 2:] / 2.
+
+    pad = num_max_boxes - gt_label.size
+    gt_box = np.pad(gt_box, ((0, pad), (0, 0)), mode='constant')
+    gt_label = np.pad(gt_label, [(0, pad)], mode='constant')
+
+    return img, gt_box, gt_label
+
+
+def batch_transform(batch, mode='train'):
+    """Resize, normalize and stack a list of samples into network inputs.
+
+    In ``'train'`` mode the square target size and the interpolation flag
+    are drawn at random once per batch; otherwise 608 and
+    ``cv2.INTER_CUBIC`` are used.
+
+    Args:
+        batch: sequence of ``(img, gt_box, gt_label)`` samples.
+        mode: ``'train'`` enables the random size / interpolation.
+
+    Returns:
+        ``[imgs, im_shapes], [gt_boxes, gt_labels, gt_scores]``.
+    """
+    if mode == 'train':
+        side = np.random.choice(
+            [320, 352, 384, 416, 448, 480, 512, 544, 576, 608])
+        interp = np.random.choice(range(5))
+    else:
+        side, interp = 608, cv2.INTER_CUBIC
+
+    # list of samples -> tuple of images, tuple of boxes, tuple of labels
+    images, boxes, labels = zip(*batch)
+    resized = [
+        cv2.resize(image, (side, side), interpolation=interp)
+        for image in images
+    ]
+
+    # reverse the channel axis, then normalize channel-wise
+    imgs = np.array(resized).astype(np.float32)[..., ::-1]
+    mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)
+    std = np.array([58.395, 57.120, 57.375], dtype=np.float32)
+    imgs -= mean
+    imgs *= 1. / std
+    # move the channel axis in front of the two spatial axes
+    imgs = imgs.transpose((0, 3, 1, 2))
+
+    im_shapes = np.full([len(imgs), 2], side, dtype=np.int32)
+    gt_boxes = np.array(boxes)
+    gt_labels = np.array(labels)
+    # XXX since mix up is not used, scores are all 1s
+    gt_scores = np.ones_like(gt_labels, dtype=np.float32)
+    return [imgs, im_shapes], [gt_boxes, gt_labels, gt_scores]
+
+
+def coco2017(root_dir, mode='train'):
+    """Create a reader over the COCO 2017 ``mode`` split.
+
+    Annotations are read from
+    ``<root_dir>/annotations/instances_<mode>2017.json`` and images from
+    ``<root_dir>/<mode>2017``. Crowd annotations and boxes that are empty
+    after clipping to the image are skipped. Images without any box are
+    kept only in ``'train'`` mode.
+
+    Args:
+        root_dir: dataset root directory.
+        mode: split name, e.g. ``'train'`` or ``'val'``.
+
+    Returns:
+        A zero-argument callable; calling it returns a generator of
+        ``(img, gt_box, gt_label)`` with ``gt_box`` as float32 corner boxes
+        of shape ``[N, 4]`` and ``gt_label`` as int32 of shape ``[N, 1]``.
+        The box and label arrays are the same objects on every pass, so
+        consumers must not modify them in place.
+    """
+    json_path = os.path.join(
+        root_dir, 'annotations/instances_{}2017.json'.format(mode))
+    coco = COCO(json_path)
+    imgs = coco.loadImgs(coco.getImgIds())
+    # Zero-based contiguous class ids: the detector and its loss are built
+    # with class_num=80, so valid labels are 0..79. The previous 1-based
+    # mapping produced label 80, which is outside that range.
+    class_map = {cat_id: idx for idx, cat_id in enumerate(coco.getCatIds())}
+    samples = []
+
+    for img in imgs:
+        file_path = os.path.join(
+            root_dir, '{}2017'.format(mode), img['file_name'])
+        width = img['width']
+        height = img['height']
+        ann_ids = coco.getAnnIds(imgIds=img['id'], iscrowd=False)
+        anns = coco.loadAnns(ann_ids)
+
+        gt_box = []
+        gt_label = []
+
+        for ann in anns:
+            # COCO stores boxes as (x, y, width, height)
+            x1, y1, w, h = ann['bbox']
+            x2 = x1 + w - 1
+            y2 = y1 + h - 1
+            x1 = np.clip(x1, 0, width - 1)
+            x2 = np.clip(x2, 0, width - 1)
+            y1 = np.clip(y1, 0, height - 1)
+            y2 = np.clip(y2, 0, height - 1)
+            if ann['area'] <= 0 or x2 < x1 or y2 < y1:
+                continue
+            gt_label.append(class_map[ann['category_id']])
+            gt_box.append([x1, y1, x2, y2])
+
+        gt_box = np.array(gt_box, dtype=np.float32)
+        gt_label = np.array(gt_label, dtype=np.int32)[:, np.newaxis]
+
+        if gt_label.size == 0 and mode != 'train':
+            continue
+        samples.append((file_path, gt_box, gt_label))
+
+    def iterator():
+        if mode == 'train':
+            random.shuffle(samples)
+        for file_path, gt_box, gt_label in samples:
+            img = cv2.imread(file_path)
+            yield img, gt_box, gt_label
+
+    return iterator
+
+
+# XXX coco metrics not included for simplicity
+def run(model, loader, mode='train'):
+    """Run one pass over ``loader`` and print running statistics.
+
+    For every batch the method of ``model`` named ``mode`` is called with
+    the batch inputs and labels. Every tenth step prints the mean loss so
+    far and the mean step time. The number of devices is read from the
+    module-level ``FLAGS``.
+
+    Args:
+        model: object exposing a method named ``mode`` that returns
+            ``(outputs, losses)``.
+        loader: zero-argument callable returning an iterable of
+            ``(inputs, labels)`` batches.
+        mode: name of the model method to call for each batch.
+    """
+    total_loss = 0.0
+    total_time = 0.
+    device_ids = list(range(FLAGS.num_devices))
+    start = time.time()
+    for idx, batch in enumerate(loader()):
+        _, losses = getattr(model, mode)(
+            batch[0], batch[1], device='gpu', device_ids=device_ids)
+
+        total_loss += np.sum(losses)
+        if idx > 1:  # skip first two step
+            total_time += time.time() - start
+        if idx % 10 == 0:
+            # idx - 1 steps have been timed; clamp so the report at step 0
+            # divides by 1 rather than -1 (which printed "-0.000")
+            timed_steps = max(idx - 1, 1)
+            print("{:04d}: loss {:0.3f} time: {:0.3f}".format(
+                idx, total_loss / (idx + 1), total_time / timed_steps))
+        start = time.time()
+
+
+def main():
+    """Train and evaluate YOLOv3 on COCO 2017 using the options in FLAGS.
+
+    Builds the training and validation readers, creates the model inside
+    a dygraph guard when ``FLAGS.dynamic`` is set, optionally loads
+    backbone weights from ``FLAGS.weights``, then alternates one training
+    pass, a checkpoint save and one evaluation pass per epoch.
+    """
+    @contextlib.contextmanager
+    def null_guard():
+        # no-op stand-in for the dygraph guard
+        yield
+
+    epoch = FLAGS.epoch
+    batch_size = FLAGS.batch_size
+    if FLAGS.dynamic:
+        guard = fluid.dygraph.guard()
+    else:
+        guard = null_guard()
+
+    train_loader = fluid.io.xmap_readers(
+        lambda batch: batch_transform(batch, 'train'),
+        paddle.batch(
+            fluid.io.xmap_readers(
+                lambda inputs: sample_transform(inputs, 'train'),
+                coco2017(FLAGS.data, 'train'),
+                process_num=8,
+                buffer_size=4 * batch_size),
+            batch_size=batch_size,
+            drop_last=True),
+        process_num=2, buffer_size=4)
+
+    # Validation batches must use the fixed evaluation size and
+    # interpolation; this used to pass 'train', which enabled the random
+    # resize during evaluation.
+    val_loader = fluid.io.xmap_readers(
+        lambda batch: batch_transform(batch, 'val'),
+        paddle.batch(
+            fluid.io.xmap_readers(
+                lambda inputs: sample_transform(inputs, 'val'),
+                coco2017(FLAGS.data, 'val'),
+                process_num=8,
+                buffer_size=4 * batch_size),
+            batch_size=batch_size),
+        process_num=2, buffer_size=4)
+
+    # Save into the directory that is actually created; checkpoints used
+    # to be written under 'checkpoints/', which nothing here creates.
+    checkpoint_dir = 'yolo_checkpoints'
+    if not os.path.exists(checkpoint_dir):
+        os.mkdir(checkpoint_dir)
+
+    with guard:
+        model = YOLOv3()
+        # XXX transfer learning
+        if FLAGS.weights is not None:
+            model.backbone.load(FLAGS.weights)
+        optim = make_optimizer(parameter_list=model.parameters())
+        model.prepare(optim, YoloLoss())
+
+        for e in range(epoch):
+            print("======== train epoch {} ========".format(e))
+            run(model, train_loader)
+            model.save(os.path.join(checkpoint_dir, '{:02d}'.format(e)))
+            print("======== eval epoch {} ========".format(e))
+            run(model, val_loader, mode='eval')
+
+
+if __name__ == '__main__':
+    # The parsed options are published as the module-level FLAGS, which
+    # run() and main() read.
+    parser = argparse.ArgumentParser("Yolov3 Training on COCO")
+    parser.add_argument('data', metavar='DIR', help='path to COCO dataset')
+
+    # (flags, keyword arguments) of every optional argument, in help order
+    option_specs = (
+        (("-e", "--epoch"),
+         dict(default=300, type=int, help="number of epoch")),
+        (("-b", "--batch_size"),
+         dict(default=32, type=int, help="batch size")),
+        (("-n", "--num_devices"),
+         dict(default=8, type=int, help="number of devices")),
+        (("-d", "--dynamic"),
+         dict(action='store_true', help="enable dygraph mode")),
+        (("-w", "--weights"),
+         dict(default=None, type=str, help="path to pretrained weights")),
+    )
+    for option_flags, option_kwargs in option_specs:
+        parser.add_argument(*option_flags, **option_kwargs)
+
+    FLAGS = parser.parse_args()
+    main()