diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e0084aca349e870154c828c7aeb55b017a3ba03
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
@@ -0,0 +1,739 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import numpy as np
+import unittest
+
+import paddle.fluid as fluid
+from paddle.fluid import ParamAttr
+from paddle.fluid.dygraph import to_variable
+from paddle.fluid.dygraph import declarative, ProgramTranslator
+
+SEED = 2020
+DATATYPE = 'float32'
+program_translator = ProgramTranslator()
+
+# Note: set FLAGS_cudnn_deterministic to True to eliminate randomness:
+# for a single operation, cuDNN provides several algorithms, and some of
+# them are non-deterministic (e.g. convolution algorithms).
+if fluid.is_compiled_with_cuda():
+    fluid.set_flags({'FLAGS_cudnn_deterministic': True})
+
+
+def get_interp1d_mask(tscale, dscale, prop_boundary_ratio, num_sample,
+                      num_sample_perbin):
+    """Generate the sample mask for each point in the Boundary-Matching map."""
+    mask_mat = []
+    for start_index in range(tscale):
+        mask_mat_vector = []
+        for duration_index in range(dscale):
+            if start_index + duration_index < tscale:
+                p_xmin = start_index
+                p_xmax = start_index + duration_index
+                center_len = float(p_xmax - p_xmin) + 1
+                sample_xmin = p_xmin - center_len * prop_boundary_ratio
+                sample_xmax = p_xmax + center_len * prop_boundary_ratio
+                p_mask = _get_interp1d_bin_mask(sample_xmin, sample_xmax,
+                                                tscale, num_sample,
+                                                num_sample_perbin)
+            else:
+                p_mask = np.zeros([tscale, num_sample])
+            mask_mat_vector.append(p_mask)
+        mask_mat_vector = np.stack(mask_mat_vector, axis=2)
+        mask_mat.append(mask_mat_vector)
+    mask_mat = np.stack(mask_mat, axis=3)
+    mask_mat = mask_mat.astype(np.float32)
+
+    sample_mask = np.reshape(mask_mat, [tscale, -1])
+    return sample_mask
+
+
+def _get_interp1d_bin_mask(seg_xmin, seg_xmax, tscale, num_sample,
+                           num_sample_perbin):
+    """Generate the sample mask for a single boundary-matching pair."""
+    plen = float(seg_xmax - seg_xmin)
+    plen_sample = plen / (num_sample * num_sample_perbin - 1.0)
+    total_samples = [
+        seg_xmin + plen_sample * ii
+        for ii in range(num_sample * num_sample_perbin)
+    ]
+    p_mask = []
+    for idx in range(num_sample):
+        bin_samples = total_samples[idx * num_sample_perbin:(idx + 1) *
+                                    num_sample_perbin]
+        bin_vector = np.zeros([tscale])
+        for sample in bin_samples:
+            sample_upper = math.ceil(sample)
+            sample_decimal, sample_down = math.modf(sample)
+            if int(sample_down) <= (tscale - 1) and int(sample_down) >= 0:
+                bin_vector[int(sample_down)] += 1 - sample_decimal
+            if int(sample_upper) <= (tscale - 1) and int(sample_upper) >= 0:
+                bin_vector[int(sample_upper)] += sample_decimal
+        bin_vector = 1.0 / num_sample_perbin * bin_vector
+        p_mask.append(bin_vector)
+    p_mask = np.stack(p_mask, axis=1)
+    return p_mask
+
+
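+# Worked example for _get_interp1d_bin_mask: a fractional sample position s
+# contributes weight (1 - frac(s)) to index floor(s) and frac(s) to ceil(s),
+# e.g. s = 3.25 adds 0.75 to bin_vector[3] and 0.25 to bin_vector[4];
+# averaging over num_sample_perbin such samples gives one column of p_mask.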
+class Conv1D(fluid.dygraph.Layer):
+    def __init__(self,
+                 prefix,
+                 num_channels=256,
+                 num_filters=256,
+                 size_k=3,
+                 padding=1,
+                 groups=1,
+                 act="relu"):
+        super(Conv1D, self).__init__()
+        fan_in = num_channels * size_k * 1
+        k = 1. / math.sqrt(fan_in)
+        param_attr = ParamAttr(
+            name=prefix + "_w",
+            initializer=fluid.initializer.Uniform(
+                low=-k, high=k))
+        bias_attr = ParamAttr(
+            name=prefix + "_b",
+            initializer=fluid.initializer.Uniform(
+                low=-k, high=k))
+
+        self._conv2d = fluid.dygraph.Conv2D(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=(1, size_k),
+            stride=1,
+            padding=(0, padding),
+            groups=groups,
+            act=act,
+            param_attr=param_attr,
+            bias_attr=bias_attr)
+
+    def forward(self, x):
+        x = fluid.layers.unsqueeze(input=x, axes=[2])
+        x = self._conv2d(x)
+        x = fluid.layers.squeeze(input=x, axes=[2])
+        return x
+
+
+class BMN(fluid.dygraph.Layer):
+    def __init__(self, cfg):
+        super(BMN, self).__init__()
+
+        self.tscale = cfg.tscale
+        self.dscale = cfg.dscale
+        self.prop_boundary_ratio = cfg.prop_boundary_ratio
+        self.num_sample = cfg.num_sample
+        self.num_sample_perbin = cfg.num_sample_perbin
+
+        self.hidden_dim_1d = 256
+        self.hidden_dim_2d = 128
+        self.hidden_dim_3d = 512
+
+        # Base Module
+        self.b_conv1 = Conv1D(
+            prefix="Base_1",
+            num_channels=cfg.feat_dim,
+            num_filters=self.hidden_dim_1d,
+            size_k=3,
+            padding=1,
+            groups=4,
+            act="relu")
+        self.b_conv2 = Conv1D(
+            prefix="Base_2",
+            num_filters=self.hidden_dim_1d,
+            size_k=3,
+            padding=1,
+            groups=4,
+            act="relu")
+
+        # Temporal Evaluation Module
+        self.ts_conv1 = Conv1D(
+            prefix="TEM_s1",
+            num_filters=self.hidden_dim_1d,
+            size_k=3,
+            padding=1,
+            groups=4,
+            act="relu")
+        self.ts_conv2 = Conv1D(
+            prefix="TEM_s2", num_filters=1, size_k=1, padding=0, act="sigmoid")
+        self.te_conv1 = Conv1D(
+            prefix="TEM_e1",
+            num_filters=self.hidden_dim_1d,
+            size_k=3,
+            padding=1,
+            groups=4,
+            act="relu")
+        self.te_conv2 = Conv1D(
+            prefix="TEM_e2", num_filters=1, size_k=1, padding=0, act="sigmoid")
+
+        # Proposal Evaluation Module
+        self.p_conv1 = Conv1D(
+            prefix="PEM_1d",
+            num_filters=self.hidden_dim_2d,
+            size_k=3,
+            padding=1,
+            act="relu")
+
+        # Precompute the sample mask once in `__init__` to speed up `forward`.
+        self.sample_mask = get_interp1d_mask(
+            self.tscale, self.dscale, self.prop_boundary_ratio,
+            self.num_sample, self.num_sample_perbin)
+        # self.sample_mask = fluid.dygraph.base.to_variable(sample_mask)
+        # self.sample_mask.stop_gradient = True
+
+        self.p_conv3d1 = fluid.dygraph.Conv3D(
+            num_channels=128,
+            num_filters=self.hidden_dim_3d,
+            filter_size=(self.num_sample, 1, 1),
+            stride=(self.num_sample, 1, 1),
+            padding=0,
+            act="relu",
+            param_attr=ParamAttr(name="PEM_3d1_w"),
+            bias_attr=ParamAttr(name="PEM_3d1_b"))
+
+        self.p_conv2d1 = fluid.dygraph.Conv2D(
+            num_channels=512,
+            num_filters=self.hidden_dim_2d,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act="relu",
+            param_attr=ParamAttr(name="PEM_2d1_w"),
+            bias_attr=ParamAttr(name="PEM_2d1_b"))
+        self.p_conv2d2 = fluid.dygraph.Conv2D(
+            num_channels=128,
+            num_filters=self.hidden_dim_2d,
+            filter_size=3,
+            stride=1,
+            padding=1,
+            act="relu",
+            param_attr=ParamAttr(name="PEM_2d2_w"),
+            bias_attr=ParamAttr(name="PEM_2d2_b"))
+        self.p_conv2d3 = fluid.dygraph.Conv2D(
+            num_channels=128,
+            num_filters=self.hidden_dim_2d,
+            filter_size=3,
+            stride=1,
+            padding=1,
+            act="relu",
+            param_attr=ParamAttr(name="PEM_2d3_w"),
+            bias_attr=ParamAttr(name="PEM_2d3_b"))
+        self.p_conv2d4 = fluid.dygraph.Conv2D(
+            num_channels=128,
+            num_filters=2,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act="sigmoid",
+            param_attr=ParamAttr(name="PEM_2d4_w"),
+            bias_attr=ParamAttr(name="PEM_2d4_b"))
+
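+    # `forward` takes video features x of shape [N, feat_dim, tscale] and
+    # returns xp, the boundary-matching confidence map of shape
+    # [N, 2, dscale, tscale] (regression and classification channels),
+    # plus xs and xe, start/end probabilities of shape [N, tscale].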
param_attr=ParamAttr(name="PEM_2d4_w"), + bias_attr=ParamAttr(name="PEM_2d4_b")) + + @declarative + def forward(self, x): + # TODO(Aurelius84): sample_mask is created in `__init__`, + # but currently we don't support that. The two lines code + # will be removed when support creating var outside of forward. + sample_mask = to_variable(self.sample_mask) + sample_mask.stop_gradient = True + + # Base Module + x = self.b_conv1(x) + x = self.b_conv2(x) + + # TEM + xs = self.ts_conv1(x) + xs = self.ts_conv2(xs) + xs = fluid.layers.squeeze(xs, axes=[1]) + xe = self.te_conv1(x) + xe = self.te_conv2(xe) + xe = fluid.layers.squeeze(xe, axes=[1]) + + # PEM + xp = self.p_conv1(x) + # BM layer + xp = fluid.layers.matmul(xp, sample_mask) + xp = fluid.layers.reshape( + xp, shape=[0, 0, -1, self.dscale, self.tscale]) + + xp = self.p_conv3d1(xp) + xp = fluid.layers.squeeze(xp, axes=[2]) + xp = self.p_conv2d1(xp) + xp = self.p_conv2d2(xp) + xp = self.p_conv2d3(xp) + xp = self.p_conv2d4(xp) + return xp, xs, xe + + +def bmn_loss_func(pred_bm, pred_start, pred_end, gt_iou_map, gt_start, gt_end, + cfg): + def _get_mask(cfg): + dscale = cfg.dscale + tscale = cfg.tscale + bm_mask = [] + for idx in range(dscale): + mask_vector = [1 for i in range(tscale - idx) + ] + [0 for i in range(idx)] + bm_mask.append(mask_vector) + bm_mask = np.array(bm_mask, dtype=np.float32) + self_bm_mask = fluid.layers.create_global_var( + shape=[dscale, tscale], value=0, dtype=DATATYPE, persistable=True) + fluid.layers.assign(bm_mask, self_bm_mask) + self_bm_mask.stop_gradient = True + return self_bm_mask + + def tem_loss_func(pred_start, pred_end, gt_start, gt_end): + def bi_loss(pred_score, gt_label): + pred_score = fluid.layers.reshape( + x=pred_score, shape=[-1], inplace=False) + gt_label = fluid.layers.reshape( + x=gt_label, shape=[-1], inplace=False) + gt_label.stop_gradient = True + pmask = fluid.layers.cast(x=(gt_label > 0.5), dtype=DATATYPE) + num_entries = fluid.layers.cast( + fluid.layers.shape(pmask), dtype=DATATYPE) + num_positive = fluid.layers.cast( + fluid.layers.reduce_sum(pmask), dtype=DATATYPE) + ratio = num_entries / num_positive + coef_0 = 0.5 * ratio / (ratio - 1) + coef_1 = 0.5 * ratio + epsilon = 0.000001 + # temp = fluid.layers.log(pred_score + epsilon) + loss_pos = fluid.layers.elementwise_mul( + fluid.layers.log(pred_score + epsilon), pmask) + loss_pos = coef_1 * fluid.layers.reduce_mean(loss_pos) + loss_neg = fluid.layers.elementwise_mul( + fluid.layers.log(1.0 - pred_score + epsilon), (1.0 - pmask)) + loss_neg = coef_0 * fluid.layers.reduce_mean(loss_neg) + loss = -1 * (loss_pos + loss_neg) + return loss + + loss_start = bi_loss(pred_start, gt_start) + loss_end = bi_loss(pred_end, gt_end) + loss = loss_start + loss_end + return loss + + def pem_reg_loss_func(pred_score, gt_iou_map, mask): + + gt_iou_map = fluid.layers.elementwise_mul(gt_iou_map, mask) + + u_hmask = fluid.layers.cast(x=gt_iou_map > 0.7, dtype=DATATYPE) + u_mmask = fluid.layers.logical_and(gt_iou_map <= 0.7, gt_iou_map > 0.3) + u_mmask = fluid.layers.cast(x=u_mmask, dtype=DATATYPE) + u_lmask = fluid.layers.logical_and(gt_iou_map <= 0.3, gt_iou_map >= 0.) 
+    def pem_reg_loss_func(pred_score, gt_iou_map, mask):
+        gt_iou_map = fluid.layers.elementwise_mul(gt_iou_map, mask)
+
+        u_hmask = fluid.layers.cast(x=gt_iou_map > 0.7, dtype=DATATYPE)
+        u_mmask = fluid.layers.logical_and(gt_iou_map <= 0.7, gt_iou_map > 0.3)
+        u_mmask = fluid.layers.cast(x=u_mmask, dtype=DATATYPE)
+        u_lmask = fluid.layers.logical_and(gt_iou_map <= 0.3, gt_iou_map >= 0.)
+        u_lmask = fluid.layers.cast(x=u_lmask, dtype=DATATYPE)
+        u_lmask = fluid.layers.elementwise_mul(u_lmask, mask)
+
+        num_h = fluid.layers.cast(
+            fluid.layers.reduce_sum(u_hmask), dtype=DATATYPE)
+        num_m = fluid.layers.cast(
+            fluid.layers.reduce_sum(u_mmask), dtype=DATATYPE)
+        num_l = fluid.layers.cast(
+            fluid.layers.reduce_sum(u_lmask), dtype=DATATYPE)
+
+        r_m = num_h / num_m
+        u_smmask = fluid.layers.assign(
+            local_random.uniform(
+                0., 1.,
+                [gt_iou_map.shape[1], gt_iou_map.shape[2]]).astype(DATATYPE))
+        u_smmask = fluid.layers.elementwise_mul(u_mmask, u_smmask)
+        u_smmask = fluid.layers.cast(x=(u_smmask > (1. - r_m)), dtype=DATATYPE)
+
+        r_l = num_h / num_l
+        u_slmask = fluid.layers.assign(
+            local_random.uniform(
+                0., 1.,
+                [gt_iou_map.shape[1], gt_iou_map.shape[2]]).astype(DATATYPE))
+        u_slmask = fluid.layers.elementwise_mul(u_lmask, u_slmask)
+        u_slmask = fluid.layers.cast(x=(u_slmask > (1. - r_l)), dtype=DATATYPE)
+
+        weights = u_hmask + u_smmask + u_slmask
+        weights.stop_gradient = True
+        loss = fluid.layers.square_error_cost(pred_score, gt_iou_map)
+        loss = fluid.layers.elementwise_mul(loss, weights)
+        loss = 0.5 * fluid.layers.reduce_sum(loss) / fluid.layers.reduce_sum(
+            weights)
+
+        return loss
+
+    def pem_cls_loss_func(pred_score, gt_iou_map, mask):
+        gt_iou_map = fluid.layers.elementwise_mul(gt_iou_map, mask)
+        gt_iou_map.stop_gradient = True
+        pmask = fluid.layers.cast(x=(gt_iou_map > 0.9), dtype=DATATYPE)
+        nmask = fluid.layers.cast(x=(gt_iou_map <= 0.9), dtype=DATATYPE)
+        nmask = fluid.layers.elementwise_mul(nmask, mask)
+
+        num_positive = fluid.layers.reduce_sum(pmask)
+        num_entries = num_positive + fluid.layers.reduce_sum(nmask)
+        ratio = num_entries / num_positive
+        coef_0 = 0.5 * ratio / (ratio - 1)
+        coef_1 = 0.5 * ratio
+        epsilon = 0.000001
+        loss_pos = fluid.layers.elementwise_mul(
+            fluid.layers.log(pred_score + epsilon), pmask)
+        loss_pos = coef_1 * fluid.layers.reduce_sum(loss_pos)
+        loss_neg = fluid.layers.elementwise_mul(
+            fluid.layers.log(1.0 - pred_score + epsilon), nmask)
+        loss_neg = coef_0 * fluid.layers.reduce_sum(loss_neg)
+        loss = -1 * (loss_pos + loss_neg) / num_entries
+        return loss
+
+    pred_bm_reg = fluid.layers.squeeze(
+        fluid.layers.slice(
+            pred_bm, axes=[1], starts=[0], ends=[1]), axes=[1])
+    pred_bm_cls = fluid.layers.squeeze(
+        fluid.layers.slice(
+            pred_bm, axes=[1], starts=[1], ends=[2]), axes=[1])
+
+    bm_mask = _get_mask(cfg)
+
+    pem_reg_loss = pem_reg_loss_func(pred_bm_reg, gt_iou_map, bm_mask)
+    pem_cls_loss = pem_cls_loss_func(pred_bm_cls, gt_iou_map, bm_mask)
+
+    tem_loss = tem_loss_func(pred_start, pred_end, gt_start, gt_end)
+
+    loss = tem_loss + 10 * pem_reg_loss + pem_cls_loss
+    return loss, tem_loss, pem_reg_loss, pem_cls_loss
+
+
+class Args(object):
+    epoch = 1
+    batch_size = 4
+    learning_rate = 0.1
+    learning_rate_decay = 0.1
+    lr_decay_iter = 4200
+    l2_weight_decay = 1e-4
+    valid_interval = 20
+    log_interval = 5
+    train_batch_num = valid_interval
+    valid_batch_num = 5
+
+    tscale = 50
+    dscale = 50
+    feat_dim = 100
+    prop_boundary_ratio = 0.5
+    num_sample = 2
+    num_sample_perbin = 2
+    infer_dir = './bmn_infer_model'
+    dy_param_path = './bmn_dy_param'
+
+
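+# Adam below uses a two-piece learning-rate schedule: `learning_rate` until
+# step `lr_decay_iter`, then `learning_rate * learning_rate_decay` (a 10x
+# drop with the defaults above), plus L2 regularization of `l2_weight_decay`.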
+def optimizer(cfg, parameter_list):
+    bd = [cfg.lr_decay_iter]
+    base_lr = cfg.learning_rate
+    lr_decay = cfg.learning_rate_decay
+    l2_weight_decay = cfg.l2_weight_decay
+    lr = [base_lr, base_lr * lr_decay]
+    optimizer = fluid.optimizer.Adam(
+        fluid.layers.piecewise_decay(
+            boundaries=bd, values=lr),
+        parameter_list=parameter_list,
+        regularization=fluid.regularizer.L2DecayRegularizer(
+            regularization_coeff=l2_weight_decay))
+    return optimizer
+
+
+def fake_data_reader(args, mode='train'):
+    def iou_with_anchors(anchors_min, anchors_max, box_min, box_max):
+        """Compute the Jaccard (IoU) score between a box and the anchors."""
+        len_anchors = anchors_max - anchors_min
+        int_xmin = np.maximum(anchors_min, box_min)
+        int_xmax = np.minimum(anchors_max, box_max)
+        inter_len = np.maximum(int_xmax - int_xmin, 0.)
+        union_len = len_anchors - inter_len + box_max - box_min
+        jaccard = np.divide(inter_len, union_len)
+        return jaccard
+
+    def ioa_with_anchors(anchors_min, anchors_max, box_min, box_max):
+        """Compute the IoA (intersection over anchor) score between a box
+        and the anchors."""
+        len_anchors = anchors_max - anchors_min
+        int_xmin = np.maximum(anchors_min, box_min)
+        int_xmax = np.minimum(anchors_max, box_max)
+        inter_len = np.maximum(int_xmax - int_xmin, 0.)
+        scores = np.divide(inter_len, len_anchors)
+        return scores
+
+    def get_match_map(tscale):
+        match_map = []
+        tgap = 1. / tscale
+        for idx in range(tscale):
+            tmp_match_window = []
+            xmin = tgap * idx
+            for jdx in range(1, tscale + 1):
+                xmax = xmin + tgap * jdx
+                tmp_match_window.append([xmin, xmax])
+            match_map.append(tmp_match_window)
+        match_map = np.array(match_map)
+        match_map = np.transpose(match_map, [1, 0, 2])
+        match_map = np.reshape(match_map, [-1, 2])
+        anchor_xmin = [tgap * i for i in range(tscale)]
+        anchor_xmax = [tgap * i for i in range(1, tscale + 1)]
+
+        return match_map, anchor_xmin, anchor_xmax
+
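+    # After the transpose and reshape, row d * tscale + s of match_map holds
+    # the [xmin, xmax] window that starts at offset s and spans d + 1 time
+    # steps (in units of 1 / tscale), matching the [dscale, tscale] layout
+    # used for gt_iou_map below.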
+    def get_video_label(match_map, anchor_xmin, anchor_xmax):
+        video_second = local_random.randint(75, 90)
+        label_num = local_random.randint(1, 3)
+
+        gt_bbox = []
+        gt_iou_map = []
+        for idx in range(label_num):
+            duration = local_random.uniform(video_second * 0.4,
+                                            video_second * 0.8)
+            start_t = local_random.uniform(0.1 * video_second,
+                                           video_second - duration)
+            tmp_start = max(min(1, start_t / video_second), 0)
+            tmp_end = max(min(1, (start_t + duration) / video_second), 0)
+            gt_bbox.append([tmp_start, tmp_end])
+            tmp_gt_iou_map = iou_with_anchors(match_map[:, 0], match_map[:, 1],
+                                              tmp_start, tmp_end)
+            tmp_gt_iou_map = np.reshape(tmp_gt_iou_map,
+                                        [args.dscale, args.tscale])
+            gt_iou_map.append(tmp_gt_iou_map)
+        gt_iou_map = np.array(gt_iou_map)
+        gt_iou_map = np.max(gt_iou_map, axis=0)
+
+        gt_bbox = np.array(gt_bbox)
+        gt_xmins = gt_bbox[:, 0]
+        gt_xmaxs = gt_bbox[:, 1]
+        gt_len_small = 3. / args.tscale
+        gt_start_bboxs = np.stack(
+            (gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1)
+        gt_end_bboxs = np.stack(
+            (gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1)
+
+        match_score_start = []
+        for jdx in range(len(anchor_xmin)):
+            match_score_start.append(
+                np.max(
+                    ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
+                                     gt_start_bboxs[:, 0],
+                                     gt_start_bboxs[:, 1])))
+        match_score_end = []
+        for jdx in range(len(anchor_xmin)):
+            match_score_end.append(
+                np.max(
+                    ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
+                                     gt_end_bboxs[:, 0], gt_end_bboxs[:, 1])))
+
+        gt_start = np.array(match_score_start)
+        gt_end = np.array(match_score_end)
+        return gt_iou_map, gt_start, gt_end
+
+    def reader():
+        batch_out = []
+        iter_num = args.batch_size * 100
+        match_map, anchor_xmin, anchor_xmax = get_match_map(args.tscale)
+
+        for video_idx in range(iter_num):
+            video_feat = local_random.random_sample(
+                [args.feat_dim, args.tscale]).astype('float32')
+            gt_iou_map, gt_start, gt_end = get_video_label(
+                match_map, anchor_xmin, anchor_xmax)
+
+            if mode == 'train' or mode == 'valid':
+                batch_out.append((video_feat, gt_iou_map, gt_start, gt_end))
+            elif mode == 'test':
+                batch_out.append(
+                    (video_feat, gt_iou_map, gt_start, gt_end, video_idx))
+            else:
+                raise NotImplementedError('mode {} not implemented'.format(
+                    mode))
+            if len(batch_out) == args.batch_size:
+                yield batch_out
+                batch_out = []
+
+    return reader
+
+
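+# Both train_bmn runs (to_static True/False) reseed `local_random` and the
+# program random seeds with SEED, so the fake reader and the random sampling
+# in pem_reg_loss_func generate identical sequences and the recorded loss
+# traces can be compared elementwise in TestTrain.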
+def train_bmn(args, place, to_static):
+    program_translator.enable(to_static)
+    loss_data = []
+
+    with fluid.dygraph.guard(place):
+        fluid.default_main_program().random_seed = SEED
+        fluid.default_startup_program().random_seed = SEED
+        global local_random
+        local_random = np.random.RandomState(SEED)
+
+        bmn = BMN(args)
+        adam = optimizer(args, parameter_list=bmn.parameters())
+
+        train_reader = fake_data_reader(args, 'train')
+
+        for epoch in range(args.epoch):
+            for batch_id, data in enumerate(train_reader()):
+                video_feat = np.array(
+                    [item[0] for item in data]).astype(DATATYPE)
+                gt_iou_map = np.array(
+                    [item[1] for item in data]).astype(DATATYPE)
+                gt_start = np.array(
+                    [item[2] for item in data]).astype(DATATYPE)
+                gt_end = np.array([item[3] for item in data]).astype(DATATYPE)
+
+                x_data = to_variable(video_feat)
+                gt_iou_map = to_variable(gt_iou_map)
+                gt_start = to_variable(gt_start)
+                gt_end = to_variable(gt_end)
+                gt_iou_map.stop_gradient = True
+                gt_start.stop_gradient = True
+                gt_end.stop_gradient = True
+
+                pred_bm, pred_start, pred_end = bmn(x_data)
+
+                loss, tem_loss, pem_reg_loss, pem_cls_loss = bmn_loss_func(
+                    pred_bm, pred_start, pred_end, gt_iou_map, gt_start,
+                    gt_end, args)
+                avg_loss = fluid.layers.mean(loss)
+
+                avg_loss.backward()
+                adam.minimize(avg_loss)
+                bmn.clear_gradients()
+                # log loss data to verify correctness
+                loss_data += [
+                    avg_loss.numpy()[0], tem_loss.numpy()[0],
+                    pem_reg_loss.numpy()[0], pem_cls_loss.numpy()[0]
+                ]
+
+                if args.log_interval > 0 and (
+                        batch_id % args.log_interval == 0):
+                    print('[TRAIN] Epoch {}, iter {} '.format(epoch, batch_id)
+                          + '\tLoss = {}, \ttem_loss = {}, \tpem_reg_loss = {}, \tpem_cls_loss = {}'.format(
+                              '%f' % avg_loss.numpy()[0],
+                              '%f' % tem_loss.numpy()[0],
+                              '%f' % pem_reg_loss.numpy()[0],
+                              '%f' % pem_cls_loss.numpy()[0]))
+
+                # validation
+                if batch_id % args.valid_interval == 0 and batch_id > 0:
+                    bmn.eval()
+                    val_loss_data = val_bmn(bmn, args)
+                    bmn.train()
+                    loss_data += val_loss_data
+
+                if batch_id == args.train_batch_num:
+                    if to_static:
+                        program_translator.save_inference_model(
+                            args.infer_dir)
+                    else:
+                        fluid.dygraph.save_dygraph(bmn.state_dict(),
+                                                   args.dy_param_path)
+                    break
+    return np.array(loss_data)
+
+
+# Validation
+def val_bmn(model, args):
+    val_reader = fake_data_reader(args, 'valid')
+
+    loss_data = []
+    for batch_id, data in enumerate(val_reader()):
+        video_feat = np.array([item[0] for item in data]).astype(DATATYPE)
+        gt_iou_map = np.array([item[1] for item in data]).astype(DATATYPE)
+        gt_start = np.array([item[2] for item in data]).astype(DATATYPE)
+        gt_end = np.array([item[3] for item in data]).astype(DATATYPE)
+
+        x_data = to_variable(video_feat)
+        gt_iou_map = to_variable(gt_iou_map)
+        gt_start = to_variable(gt_start)
+        gt_end = to_variable(gt_end)
+        gt_iou_map.stop_gradient = True
+        gt_start.stop_gradient = True
+        gt_end.stop_gradient = True
+
+        pred_bm, pred_start, pred_end = model(x_data)
+
+        loss, tem_loss, pem_reg_loss, pem_cls_loss = bmn_loss_func(
+            pred_bm, pred_start, pred_end, gt_iou_map, gt_start, gt_end, args)
+        avg_loss = fluid.layers.mean(loss)
+
+        loss_data += [
+            avg_loss.numpy()[0], tem_loss.numpy()[0],
+            pem_reg_loss.numpy()[0], pem_cls_loss.numpy()[0]
+        ]
+
+        print('[VALID] iter {} '.format(batch_id)
+              + '\tLoss = {}, \ttem_loss = {}, \tpem_reg_loss = {}, \tpem_cls_loss = {}'.format(
+                  '%f' % avg_loss.numpy()[0],
+                  '%f' % tem_loss.numpy()[0],
+                  '%f' % pem_reg_loss.numpy()[0],
+                  '%f' % pem_cls_loss.numpy()[0]))
+
+        if batch_id == args.valid_batch_num:
+            break
+    return loss_data
+
+
+class TestTrain(unittest.TestCase):
+    def setUp(self):
+        self.args = Args()
+        self.place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda() \
+            else fluid.CUDAPlace(0)
+
+    def test_train(self):
+        static_res = train_bmn(self.args, self.place, to_static=True)
+        dygraph_res = train_bmn(self.args, self.place, to_static=False)
+        self.assertTrue(
+            np.allclose(dygraph_res, static_res),
+            "dygraph_res: {},\n static_res: {}".format(
+                dygraph_res[~np.isclose(dygraph_res, static_res)],
+                static_res[~np.isclose(dygraph_res, static_res)]))
+
+        # Prediction needs trained models, so `verify_predict` runs at the
+        # end of `test_train`.
+        self.verify_predict()
+
+    def verify_predict(self):
+        args = Args()
+        args.batch_size = 1  # change batch_size
+        test_reader = fake_data_reader(args, 'test')
+        for batch_id, data in enumerate(test_reader()):
+            video_data = np.array([item[0] for item in data]).astype(DATATYPE)
+            static_pred_res = self.predict_static(video_data)
+            dygraph_pred_res = self.predict_dygraph(video_data)
+
+            for dy_res, st_res in zip(dygraph_pred_res, static_pred_res):
+                self.assertTrue(
+                    np.allclose(st_res, dy_res),
+                    "dygraph_res: {},\n static_res: {}".format(
+                        dy_res[~np.isclose(st_res, dy_res)],
+                        st_res[~np.isclose(st_res, dy_res)]))
+            break
+
+    def predict_dygraph(self, data):
+        program_translator.enable(False)
+        with fluid.dygraph.guard(self.place):
+            bmn = BMN(self.args)
+            # load dygraph trained parameters
+            model_dict, _ = fluid.load_dygraph(self.args.dy_param_path +
+                                               ".pdparams")
+            bmn.set_dict(model_dict)
+            bmn.eval()
+
+            x = to_variable(data)
+            pred_res = bmn(x)
+            pred_res = [var.numpy() for var in pred_res]
+
+            return pred_res
+
+    def predict_static(self, data):
+        exe = fluid.Executor(self.place)
+        # load inference model
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(
+             self.args.infer_dir, executor=exe)
+        pred_res = exe.run(inference_program,
+                           feed={feed_target_names[0]: data},
+                           fetch_list=fetch_targets)
+
+        return pred_res
+
+
+if __name__ == "__main__":
+    unittest.main()