提交 8207f5fe 编写于 作者: F frankwhzhang

Merge branch 'develop' of https://github.com/PaddlePaddle/models into develop

[submodule "fluid/PaddleNLP/SimNet"]
path = fluid/PaddleNLP/SimNet
url = https://github.com/baidu/AnyQ.git
[submodule "fluid/PaddleNLP/LAC"]
path = fluid/PaddleNLP/LAC
url = https://github.com/baidu/lac.git
[submodule "fluid/PaddleNLP/SimNet"]
path = fluid/PaddleNLP/SimNet
url = https://github.com/baidu/AnyQ.git
[submodule "fluid/PaddleNLP/Senta"]
path = fluid/PaddleNLP/Senta
url = https://github.com/baidu/Senta.git
......@@ -250,7 +250,8 @@ class PyramidBox(object):
face_loc, head_loc = fluid.layers.split(
mbox_loc, num_or_sections=2, dim=1)
face_loc = permute_and_reshape(face_loc, 4)
head_loc = permute_and_reshape(head_loc, 4)
if not self.is_infer:
head_loc = permute_and_reshape(head_loc, 4)
mbox_conf = fluid.layers.conv2d(
self.ssh_conv3_norm, 8, 3, 1, 1, bias_attr=b_attr)
......@@ -259,16 +260,19 @@ class PyramidBox(object):
face_conf3_maxin = fluid.layers.reduce_max(
face_conf3, dim=1, keep_dim=True)
face_conf = fluid.layers.concat([face_conf3_maxin, face_conf1], axis=1)
head_conf3_maxin = fluid.layers.reduce_max(
head_conf3, dim=1, keep_dim=True)
head_conf = fluid.layers.concat([head_conf3_maxin, head_conf1], axis=1)
face_conf = permute_and_reshape(face_conf, 2)
head_conf = permute_and_reshape(head_conf, 2)
if not self.is_infer:
head_conf3_maxin = fluid.layers.reduce_max(
head_conf3, dim=1, keep_dim=True)
head_conf = fluid.layers.concat(
[head_conf3_maxin, head_conf1], axis=1)
head_conf = permute_and_reshape(head_conf, 2)
face_locs.append(face_loc)
face_confs.append(face_conf)
head_locs.append(head_loc)
head_confs.append(head_conf)
if not self.is_infer:
head_locs.append(head_loc)
head_confs.append(head_conf)
box, var = fluid.layers.prior_box(
self.ssh_conv3_norm,
......@@ -293,7 +297,8 @@ class PyramidBox(object):
face_loc, head_loc = fluid.layers.split(
mbox_loc, num_or_sections=2, dim=1)
face_loc = permute_and_reshape(face_loc, 4)
head_loc = permute_and_reshape(head_loc, 4)
if not self.is_infer:
head_loc = permute_and_reshape(head_loc, 4)
mbox_conf = fluid.layers.conv2d(input, 6, 3, 1, 1, bias_attr=b_attr)
face_conf1, face_conf3, head_conf = fluid.layers.split(
......@@ -304,13 +309,15 @@ class PyramidBox(object):
[face_conf1, face_conf3_maxin], axis=1)
face_conf = permute_and_reshape(face_conf, 2)
head_conf = permute_and_reshape(head_conf, 2)
if not self.is_infer:
head_conf = permute_and_reshape(head_conf, 2)
face_locs.append(face_loc)
face_confs.append(face_conf)
head_locs.append(head_loc)
head_confs.append(head_conf)
if not self.is_infer:
head_locs.append(head_loc)
head_confs.append(head_conf)
box, var = fluid.layers.prior_box(
input,
......@@ -330,8 +337,9 @@ class PyramidBox(object):
self.face_mbox_loc = fluid.layers.concat(face_locs, axis=1)
self.face_mbox_conf = fluid.layers.concat(face_confs, axis=1)
self.head_mbox_loc = fluid.layers.concat(head_locs, axis=1)
self.head_mbox_conf = fluid.layers.concat(head_confs, axis=1)
if not self.is_infer:
self.head_mbox_loc = fluid.layers.concat(head_locs, axis=1)
self.head_mbox_conf = fluid.layers.concat(head_confs, axis=1)
self.prior_boxes = fluid.layers.concat(boxes)
self.box_vars = fluid.layers.concat(vars)
......
......@@ -308,6 +308,9 @@ if __name__ == '__main__':
infer_program, nmsed_out = network.infer(main_program)
fetches = [nmsed_out]
fluid.io.load_persistables(
exe, args.model_dir, main_program=main_program)
exe, args.model_dir, main_program=infer_program)
# save model and program
#fluid.io.save_inference_model('pyramidbox_model',
# ['image'], [nmsed_out], exe, main_program=infer_program,
# model_filename='model', params_filename='params')
infer(args, config)
......@@ -125,11 +125,9 @@ Faster RCNN mAP
| Model | RoI function | Batch size | Max iteration | mAP |
| :--------------- | :--------: | :------------: | :------------------: |------: |
| Detectron_RoIPool | RoIPool | 8 | 180000 | 0.315 |
| Fluid RoIPool minibatch padding | RoIPool | 8 | 180000 | 0.314 |
| Fluid RoIPool no padding | RoIPool | 8 | 180000 | 0.316 |
| Detectron_RoIAlign | RoIAlign | 8 | 180000 | 0.346 |
| Fluid RoIAlign no padding | RoIAlign | 8 | 180000 | 0.345 |
| [Fluid RoIPool minibatch padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_pool_minibatch_padding.tar.gz) | RoIPool | 8 | 180000 | 0.314 |
| [Fluid RoIPool no padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_pool_no_padding.tar.gz) | RoIPool | 8 | 180000 | 0.316 |
| [Fluid RoIAlign no padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_align_no_padding.tar.gz) | RoIAlign | 8 | 180000 | 0.345 |
* Fluid RoIPool minibatch padding: Use RoIPool. Images in one batch padding to the same size. This method is same as detectron.
* Fluid RoIPool no padding: Images without padding.
......
......@@ -119,11 +119,9 @@ Faster RCNN mAP
| 模型 | RoI处理方式 | 批量大小 | 迭代次数 | mAP |
| :--------------- | :--------: | :------------: | :------------------: |------: |
| Detectron RoIPool | RoIPool | 8 | 180000 | 0.315 |
| Fluid RoIPool minibatch padding | RoIPool | 8 | 180000 | 0.314 |
| Fluid RoIPool no padding | RoIPool | 8 | 180000 | 0.316 |
| Detectron RoIAlign | RoIAlign | 8 | 180000 | 0.346 |
| Fluid RoIAlign no padding | RoIAlign | 8 | 180000 | 0.345 |
| [Fluid RoIPool minibatch padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_pool_minibatch_padding.tar.gz) | RoIPool | 8 | 180000 | 0.314 |
| [Fluid RoIPool no padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_pool_no_padding.tar.gz) | RoIPool | 8 | 180000 | 0.316 |
| [Fluid RoIAlign no padding](http://paddlemodels.bj.bcebos.com/faster_rcnn/model_align_no_padding.tar.gz) | RoIAlign | 8 | 180000 | 0.345 |
......
#!/bin/bash
# This file is only used for continuous evaluation.
export FLAGS_cudnn_deterministic=True
export ce_mode=1
(CUDA_VISIBLE_DEVICES=6 python c_gan.py --batch_size=121 --epoch=1 --run_ce=True --use_gpu=True & \
CUDA_VISIBLE_DEVICES=7 python dc_gan.py --batch_size=121 --epoch=1 --run_ce=True --use_gpu=True) | python _ce.py
####this file is only used for continuous evaluation test!
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE kpi.py should shared in models in some way!!!!
cgan_d_train_cost_kpi = CostKpi('cgan_d_train_cost', 0.02, 0, actived=True, desc='train cost of discriminator')
cgan_g_train_cost_kpi = CostKpi('cgan_g_train_cost', 0.02, 0, actived=True, desc='train cost of generator')
cgan_train_speed_kpi = DurationKpi(
'cgan_duration',
0.05,
0,
actived=True,
unit_repr='second',
desc='train time used in one GPU card')
dcgan_d_train_cost_kpi = CostKpi('dcgan_d_train_cost', 0.02, 0, actived=True, desc='train cost of discriminator')
dcgan_g_train_cost_kpi = CostKpi('dcgan_g_train_cost', 0.02, 0, actived=True, desc='train cost of generator')
dcgan_train_speed_kpi = DurationKpi(
'dcgan_duration',
0.05,
0,
actived=True,
unit_repr='second',
desc='train time used in one GPU card')
tracking_kpis = [dcgan_d_train_cost_kpi, dcgan_g_train_cost_kpi,
dcgan_train_speed_kpi, cgan_d_train_cost_kpi, cgan_g_train_cost_kpi, cgan_train_speed_kpi]
def parse_log(log):
'''
This method should be implemented by model developers.
The suggestion:
each line in the log should be key, value, for example:
"
train_cost\t1.0
test_cost\t1.0
train_cost\t1.0
train_cost\t1.0
train_acc\t1.2
"
'''
for line in log.split('\n'):
fs = line.strip().split(',')
print(fs)
if len(fs) == 3 and fs[0] == 'kpis':
kpi_name = fs[1]
kpi_value = float(fs[2])
print("kpi {}={}".format(kpi_name, kpi_value))
yield kpi_name, kpi_value
def log_to_ce(log):
kpi_tracker = {}
for kpi in tracking_kpis:
kpi_tracker[kpi.name] = kpi
for (kpi_name, kpi_value) in parse_log(log):
print(kpi_name, kpi_value)
kpi_tracker[kpi_name].add_record(kpi_value)
kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
log = sys.stdin.read()
# print("*****")
# print(log)
# print("****")
log_to_ce(log)
......@@ -23,6 +23,7 @@ import functools
import matplotlib
import numpy as np
import paddle
import time
import paddle.fluid as fluid
from utility import get_parent_function_name, plot, check, add_arguments, print_arguments
from network import G_cond, D_cond
......@@ -30,6 +31,7 @@ matplotlib.use('agg')
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
NOISE_SIZE = 100
LEARNING_RATE = 2e-4
......@@ -40,6 +42,7 @@ add_arg('batch_size', int, 121, "Minibatch size.")
add_arg('epoch', int, 20, "The number of epoched to be trained.")
add_arg('output', str, "./output", "The directory the model and the test result to be saved to.")
add_arg('use_gpu', bool, True, "Whether to use GPU to train.")
add_arg('run_ce', bool, False, "Whether to run for model ce.")
# yapf: enable
......@@ -51,6 +54,10 @@ def loss(x, label):
def train(args):
if args.run_ce:
np.random.seed(10)
fluid.default_startup_program().random_seed = 90
d_program = fluid.Program()
dg_program = fluid.Program()
......@@ -89,16 +96,22 @@ def train(args):
if args.use_gpu:
exe = fluid.Executor(fluid.CUDAPlace(0))
exe.run(fluid.default_startup_program())
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=60000),
batch_size=args.batch_size)
if args.run_ce:
train_reader = paddle.batch(
paddle.dataset.mnist.train(),
batch_size=args.batch_size)
else:
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=60000),
batch_size=args.batch_size)
NUM_TRAIN_TIMES_OF_DG = 2
const_n = np.random.uniform(
low=-1.0, high=1.0,
size=[args.batch_size, NOISE_SIZE]).astype('float32')
t_time = 0
losses = [[],[]]
for pass_id in range(args.epoch):
for batch_id, data in enumerate(train_reader()):
if len(data) != args.batch_size:
......@@ -115,7 +128,7 @@ def train(args):
fake_labels = np.zeros(
shape=[real_image.shape[0], 1], dtype='float32')
total_label = np.concatenate([real_labels, fake_labels])
s_time = time.time()
generated_image = exe.run(
g_program,
feed={'noise': noise_data,
......@@ -130,7 +143,7 @@ def train(args):
'label': fake_labels,
'conditions': conditions_data
},
fetch_list={d_loss})
fetch_list={d_loss})[0][0]
d_loss_2 = exe.run(d_program,
feed={
......@@ -138,20 +151,25 @@ def train(args):
'label': real_labels,
'conditions': conditions_data
},
fetch_list={d_loss})
d_loss_np = [d_loss_1[0][0], d_loss_2[0][0]]
fetch_list={d_loss})[0][0]
d_loss_n = d_loss_1 + d_loss_2
losses[0].append(d_loss_n)
for _ in six.moves.xrange(NUM_TRAIN_TIMES_OF_DG):
noise_data = np.random.uniform(
low=-1.0, high=1.0,
size=[args.batch_size, NOISE_SIZE]).astype('float32')
dg_loss_np = exe.run(
dg_loss_n = exe.run(
dg_program,
feed={'noise': noise_data,
'conditions': conditions_data},
fetch_list={dg_loss})[0]
if batch_id % 10 == 0:
fetch_list={dg_loss})[0][0]
losses[1].append(dg_loss_n)
t_time += (time.time() - s_time)
if batch_id % 10 == 0 and not args.run_ce:
if not os.path.exists(args.output):
os.makedirs(args.output)
# generate image each batch
......@@ -163,9 +181,7 @@ def train(args):
total_images = np.concatenate([real_image, generated_images])
fig = plot(total_images)
msg = "Epoch ID={0}\n Batch ID={1}\n D-Loss={2}\n DG-Loss={3}\n gen={4}".format(
pass_id, batch_id,
np.sum(d_loss_np),
np.sum(dg_loss_np), check(generated_images))
pass_id, batch_id, d_loss_n, dg_loss_n, check(generated_images))
print(msg)
plt.title(msg)
plt.savefig(
......@@ -174,6 +190,11 @@ def train(args):
bbox_inches='tight')
plt.close(fig)
if args.run_ce:
print("kpis,cgan_d_train_cost,{}".format(np.mean(losses[0])))
print("kpis,cgan_g_train_cost,{}".format(np.mean(losses[1])))
print("kpis,cgan_duration,{}".format(t_time / args.epoch))
if __name__ == "__main__":
args = parser.parse_args()
......
......@@ -23,6 +23,7 @@ import matplotlib
import six
import numpy as np
import paddle
import time
import paddle.fluid as fluid
from utility import get_parent_function_name, plot, check, add_arguments, print_arguments
from network import G, D
......@@ -40,6 +41,7 @@ add_arg('batch_size', int, 128, "Minibatch size.")
add_arg('epoch', int, 20, "The number of epoched to be trained.")
add_arg('output', str, "./output_dcgan", "The directory the model and the test result to be saved to.")
add_arg('use_gpu', bool, True, "Whether to use GPU to train.")
add_arg('run_ce', bool, False, "Whether to run for model ce.")
# yapf: enable
......@@ -51,6 +53,9 @@ def loss(x, label):
def train(args):
if args.run_ce:
np.random.seed(10)
fluid.default_startup_program().random_seed = 90
d_program = fluid.Program()
dg_program = fluid.Program()
......@@ -86,15 +91,23 @@ def train(args):
exe = fluid.Executor(fluid.CUDAPlace(0))
exe.run(fluid.default_startup_program())
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=60000),
batch_size=args.batch_size)
if args.run_ce:
train_reader = paddle.batch(
paddle.dataset.mnist.train(),
batch_size=args.batch_size)
else:
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=60000),
batch_size=args.batch_size)
NUM_TRAIN_TIMES_OF_DG = 2
const_n = np.random.uniform(
low=-1.0, high=1.0,
size=[args.batch_size, NOISE_SIZE]).astype('float32')
t_time = 0
losses = [[], []]
for pass_id in range(args.epoch):
for batch_id, data in enumerate(train_reader()):
if len(data) != args.batch_size:
......@@ -109,7 +122,7 @@ def train(args):
fake_labels = np.zeros(
shape=[real_image.shape[0], 1], dtype='float32')
total_label = np.concatenate([real_labels, fake_labels])
s_time = time.time()
generated_image = exe.run(g_program,
feed={'noise': noise_data},
fetch_list={g_img})[0]
......@@ -121,25 +134,27 @@ def train(args):
'img': generated_image,
'label': fake_labels,
},
fetch_list={d_loss})
fetch_list={d_loss})[0][0]
d_loss_2 = exe.run(d_program,
feed={
'img': real_image,
'label': real_labels,
},
fetch_list={d_loss})
d_loss_np = [d_loss_1[0][0], d_loss_2[0][0]]
fetch_list={d_loss})[0][0]
d_loss_n = d_loss_1 + d_loss_2
losses[0].append(d_loss_n)
for _ in six.moves.xrange(NUM_TRAIN_TIMES_OF_DG):
noise_data = np.random.uniform(
low=-1.0, high=1.0,
size=[args.batch_size, NOISE_SIZE]).astype('float32')
dg_loss_np = exe.run(dg_program,
dg_loss_n = exe.run(dg_program,
feed={'noise': noise_data},
fetch_list={dg_loss})[0]
if batch_id % 10 == 0:
fetch_list={dg_loss})[0][0]
losses[1].append(dg_loss_n)
t_time += (time.time() - s_time)
if batch_id % 10 == 0 and not args.run_ce:
if not os.path.exists(args.output):
os.makedirs(args.output)
# generate image each batch
......@@ -150,8 +165,7 @@ def train(args):
fig = plot(total_images)
msg = "Epoch ID={0} Batch ID={1} D-Loss={2} DG-Loss={3}\n gen={4}".format(
pass_id, batch_id,
np.sum(d_loss_np),
np.sum(dg_loss_np), check(generated_images))
d_loss_n, dg_loss_n, check(generated_images))
print(msg)
plt.title(msg)
plt.savefig(
......@@ -159,7 +173,11 @@ def train(args):
batch_id),
bbox_inches='tight')
plt.close(fig)
if args.run_ce:
print("kpis,dcgan_d_train_cost,{}".format(np.mean(losses[0])))
print("kpis,dcgan_g_train_cost,{}".format(np.mean(losses[1])))
print("kpis,dcgan_duration,{}".format(t_time / args.epoch))
if __name__ == "__main__":
args = parser.parse_args()
......
......@@ -4,6 +4,7 @@ from __future__ import print_function
import paddle
import paddle.fluid as fluid
from utility import get_parent_function_name
import os
gf_dim = 64
df_dim = 64
......@@ -16,6 +17,9 @@ y_dim = 1
output_height = 28
output_width = 28
use_cudnn = True
if 'ce_mode' in os.environ:
use_cudnn = False
def bn(x, name=None, act='relu'):
if name is None:
......@@ -42,6 +46,7 @@ def conv(x, num_filters, name=None, act=None):
pool_stride=2,
param_attr=name + 'w',
bias_attr=name + 'b',
use_cudnn=use_cudnn,
act=act)
......@@ -76,6 +81,7 @@ def deconv(x,
stride=stride,
dilation=dilation,
padding=padding,
use_cudnn=use_cudnn,
act=act)
......
#!/bin/bash
# This file is only used for continuous evaluation.
export FLAGS_cudnn_deterministic=True
export ce_mode=1
CUDA_VISIBLE_DEVICES=0 python train.py --batch_size=1 --epoch=10 --run_ce=True --use_gpu=True | python _ce.py
# this file is only used for continuous evaluation test!
####this file is only used for continuous evaluation test!
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi
from kpi import DurationKpi
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE kpi.py should shared in models in some way!!!!
d_train_cost_kpi = CostKpi('d_train_cost', 0.05, 0, actived=True, desc='train cost of discriminator')
g_train_cost_kpi = CostKpi('g_train_cost', 0.05, 0, actived=True, desc='train cost of generator')
train_speed_kpi = DurationKpi(
'duration',
0.05,
0,
actived=True,
unit_repr='second',
desc='train time used in one GPU card')
imikolov_20_avg_ppl_kpi = CostKpi('imikolov_20_avg_ppl', 0.2, 0)
imikolov_20_pass_duration_kpi = DurationKpi(
'imikolov_20_pass_duration', 0.02, 0, actived=True)
imikolov_20_avg_ppl_kpi_card4 = CostKpi('imikolov_20_avg_ppl_card4', 0.2, 0)
imikolov_20_pass_duration_kpi_card4 = DurationKpi(
'imikolov_20_pass_duration_card4', 0.03, 0, actived=True)
tracking_kpis = [
imikolov_20_avg_ppl_kpi,
imikolov_20_pass_duration_kpi,
imikolov_20_avg_ppl_kpi_card4,
imikolov_20_pass_duration_kpi_card4,
]
tracking_kpis = [d_train_cost_kpi, g_train_cost_kpi, train_speed_kpi]
def parse_log(log):
......@@ -38,11 +40,12 @@ def parse_log(log):
"
'''
for line in log.split('\n'):
fs = line.strip().split('\t')
fs = line.strip().split(',')
print(fs)
if len(fs) == 3 and fs[0] == 'kpis':
kpi_name = fs[1]
kpi_value = float(fs[2])
print("kpi {}={}".format(kpi_name, kpi_value))
yield kpi_name, kpi_value
......@@ -59,4 +62,7 @@ def log_to_ce(log):
if __name__ == '__main__':
log = sys.stdin.read()
# print("*****")
# print(log)
# print("****")
log_to_ce(log)
......@@ -46,18 +46,18 @@ def reader_creater(list_file, cycle=True, shuffle=True, return_name=False):
return reader
def a_reader():
def a_reader(shuffle=True):
"""
Reader of images with A style for training.
"""
return reader_creater(A_LIST_FILE)
return reader_creater(A_LIST_FILE, shuffle=shuffle)
def b_reader():
def b_reader(shuffle=True):
"""
Reader of images with B style for training.
"""
return reader_creater(B_LIST_FILE)
return reader_creater(B_LIST_FILE, shuffle=shuffle)
def a_test_reader():
......
from __future__ import division
import paddle.fluid as fluid
import numpy as np
import os
use_cudnn = True
if 'ce_mode' in os.environ:
use_cudnn = False
def cal_padding(img_size, stride, filter_size, dilation=1):
"""Calculate padding size."""
......@@ -82,7 +86,7 @@ def conv2d(input,
name=name,
stride=stride,
padding=padding,
use_cudnn=False,
use_cudnn=use_cudnn,
param_attr=param_attr,
bias_attr=bias_attr)
if need_crop:
......@@ -137,6 +141,7 @@ def deconv2d(input,
filter_size=filter_size,
stride=stride,
padding=padding,
use_cudnn=use_cudnn,
param_attr=param_attr,
bias_attr=bias_attr)
......
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import data_reader
import os
import random
import sys
import paddle
import argparse
import functools
import paddle.fluid as fluid
import time
import numpy as np
from paddle.fluid import core
from trainer import *
from scipy.misc import imsave
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import data_reader
from utility import add_arguments, print_arguments, ImagePool
from trainer import *
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 1, "Minibatch size.")
add_arg('epoch', int, 2, "The number of epoched to be trained.")
add_arg('output', str, "./output_1", "The directory the model and the test result to be saved to.")
add_arg('output', str, "./output_0", "The directory the model and the test result to be saved to.")
add_arg('init_model', str, None, "The init model file of directory.")
add_arg('save_checkpoints', bool, True, "Whether to save checkpoints.")
add_arg('run_test', bool, True, "Whether to run test.")
add_arg('use_gpu', bool, True, "Whether to use GPU to train.")
add_arg('profile', bool, False, "Whether to profile.")
add_arg('run_ce', bool, False, "Whether to run for model ce.")
# yapf: enable
def train(args):
data_shape = [-1] + data_reader.image_shape()
max_images_num = data_reader.max_images_num()
shuffle=True
if args.run_ce:
np.random.seed(10)
fluid.default_startup_program().random_seed = 90
max_images_num = 1
shuffle = False
data_shape = [-1] + data_reader.image_shape()
input_A = fluid.layers.data(
name='input_A', shape=data_shape, dtype='float32')
......@@ -56,12 +66,12 @@ def train(args):
exe.run(fluid.default_startup_program())
A_pool = ImagePool()
B_pool = ImagePool()
A_reader = paddle.batch(data_reader.a_reader(), args.batch_size)()
B_reader = paddle.batch(data_reader.b_reader(), args.batch_size)()
A_test_reader = data_reader.a_test_reader()
B_test_reader = data_reader.b_test_reader()
A_reader = paddle.batch(data_reader.a_reader(shuffle=shuffle), args.batch_size)()
B_reader = paddle.batch(data_reader.b_reader(shuffle=shuffle), args.batch_size)()
if not args.run_ce:
A_test_reader = data_reader.a_test_reader()
B_test_reader = data_reader.b_test_reader()
def test(epoch):
out_path = args.output + "/test"
......@@ -109,13 +119,13 @@ def train(args):
if not os.path.exists(out_path):
os.makedirs(out_path)
fluid.io.save_persistables(
exe, out_path + "/g_a", main_program=g_A_trainer.program)
exe, out_path + "/g_a", main_program=g_A_trainer.program, filename="params")
fluid.io.save_persistables(
exe, out_path + "/g_b", main_program=g_B_trainer.program)
exe, out_path + "/g_b", main_program=g_B_trainer.program, filename="params")
fluid.io.save_persistables(
exe, out_path + "/d_a", main_program=d_A_trainer.program)
exe, out_path + "/d_a", main_program=d_A_trainer.program, filename="params")
fluid.io.save_persistables(
exe, out_path + "/d_b", main_program=d_B_trainer.program)
exe, out_path + "/d_b", main_program=d_B_trainer.program, filename="params")
print("saved checkpoint to {}".format(out_path))
sys.stdout.flush()
......@@ -134,7 +144,8 @@ def train(args):
if args.init_model:
init_model()
losses=[[], []]
t_time = 0
for epoch in range(args.epoch):
batch_id = 0
for i in range(max_images_num):
......@@ -144,6 +155,7 @@ def train(args):
tensor_B = core.LoDTensor()
tensor_A.set(data_A, place)
tensor_B.set(data_B, place)
s_time = time.time()
# optimize the g_A network
g_A_loss, fake_B_tmp = exe.run(
g_A_trainer.program,
......@@ -158,7 +170,7 @@ def train(args):
d_B_trainer.program,
fetch_list=[d_B_trainer.d_loss_B],
feed={"input_B": tensor_B,
"fake_pool_B": fake_pool_B})
"fake_pool_B": fake_pool_B})[0]
# optimize the g_B network
g_B_loss, fake_A_tmp = exe.run(
......@@ -174,18 +186,24 @@ def train(args):
d_A_trainer.program,
fetch_list=[d_A_trainer.d_loss_A],
feed={"input_A": tensor_A,
"fake_pool_A": fake_pool_A})
"fake_pool_A": fake_pool_A})[0]
t_time += (time.time() - s_time)
print("epoch{}; batch{}; g_A_loss: {}; d_B_loss: {}; g_B_loss: {}; d_A_loss: {};".format(
epoch, batch_id, g_A_loss[0], d_B_loss[0], g_B_loss[0],
d_A_loss[0]))
losses[0].append(g_A_loss[0])
losses[1].append(d_A_loss[0])
sys.stdout.flush()
batch_id += 1
if args.run_test:
if args.run_test and not args.run_ce:
test(epoch)
if args.save_checkpoints:
if args.save_checkpoints and not args.run_ce:
checkpoints(epoch)
if args.run_ce:
print("kpis,g_train_cost,{}".format(np.mean(losses[0])))
print("kpis,d_train_cost,{}".format(np.mean(losses[1])))
print("kpis,duration,{}".format(t_time / args.epoch))
if __name__ == "__main__":
......
......@@ -5,6 +5,7 @@ import argparse
import functools
import shutil
import math
import multiprocessing
import paddle
import paddle.fluid as fluid
......@@ -127,7 +128,7 @@ def train(args,
if parallel:
train_exe = fluid.ParallelExecutor(main_program=train_prog,
use_cuda=use_gpu, loss_name=loss.name)
use_cuda=True if use_gpu else False, loss_name=loss.name)
train_reader = reader.train(data_args,
train_file_list,
......
......@@ -24,8 +24,6 @@ def conv_bn(input,
use_cudnn=use_cudnn,
param_attr=parameter_attr,
bias_attr=False)
parameter_attr = ParamAttr(learning_rate=0.1, initializer=MSRA())
bias_attr = ParamAttr(learning_rate=0.2)
return fluid.layers.batch_norm(input=conv, act=act)
......
Subproject commit 870651e257750f2c237f0b0bc9a27e5d062d1909
Subproject commit 733c1d02085a3092dd262c4f396563962a514c3e
Subproject commit 4dbe7f7b0e76c188eb7f448d104f0165f0a12229
Subproject commit 60b698a294c34420a7f0aab3112f27649aed1445
###!/bin/bash
####This file is only used for continuous evaluation.
export CE_MODE_X=1
export CUDA_VISIBLE_DEVICES=0
export FLAGS_eager_delete_tensor_gb=0.0
if [ ! -e data_small.pkl ]; then
wget -c http://dam-data.bj.bcebos.com/data_small.pkl
fi
python train_and_evaluate.py --data_path data_small.pkl \
--use_cuda \
--use_pyreader \
--num_scan_data 1 \
--batch_size 100 | python _ce.py
# this file is only used for continuous evaluation test!
####this file is only used for continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi
from kpi import DurationKpi
from kpi import CostKpi, DurationKpi, AccKpi
imikolov_20_avg_ppl_kpi = CostKpi('lstm_language_model_loss', 0.02, 0)
imikolov_20_pass_duration_kpi = DurationKpi(
'lstm_language_model_duration', 0.02, 0, actived=True)
#### NOTE kpi.py should shared in models in some way!!!!
train_cost_kpi = CostKpi('train_cost', 0.02, actived=True)
train_duration_kpi = DurationKpi('train_duration', 0.05, actived=True)
tracking_kpis = [
imikolov_20_avg_ppl_kpi,
imikolov_20_pass_duration_kpi,
train_cost_kpi,
train_duration_kpi,
]
def parse_log(log):
'''
This method should be implemented by model developers.
The suggestion:
each line in the log should be key, value, for example:
"
train_cost\t1.0
test_cost\t1.0
train_cost\t1.0
train_cost\t1.0
train_acc\t1.2
"
'''
for line in log.split('\n'):
fs = line.strip().split('\t')
print(fs)
kpi_name = fs[0]
kpi_value = float(fs[1])
yield kpi_name, kpi_value
if len(fs) == 3 and fs[0] == 'kpis':
print("-----%s" % fs)
kpi_name = fs[1]
kpi_value = float(fs[2])
yield kpi_name, kpi_value
def log_to_ce(log):
......@@ -53,4 +40,7 @@ def log_to_ce(log):
if __name__ == '__main__':
log = sys.stdin.read()
print("*****")
print(log)
print("****")
log_to_ce(log)
......@@ -25,7 +25,7 @@ class Net(object):
# turns ids
shapes = [[-1, self._max_turn_len, 1]
for i in six.moves.xrange(self._max_turn_num)]
dtypes = ["int32" for i in six.moves.xrange(self._max_turn_num)]
dtypes = ["int64" for i in six.moves.xrange(self._max_turn_num)]
# turns mask
shapes += [[-1, self._max_turn_len, 1]
for i in six.moves.xrange(self._max_turn_num)]
......@@ -34,7 +34,7 @@ class Net(object):
# response ids, response mask, label
shapes += [[-1, self._max_turn_len, 1], [-1, self._max_turn_len, 1],
[-1, 1]]
dtypes += ["int32", "float32", "float32"]
dtypes += ["int64", "float32", "float32"]
py_reader = fluid.layers.py_reader(
capacity=capacity,
......@@ -60,7 +60,7 @@ class Net(object):
for i in six.moves.xrange(self._max_turn_num):
name = "turn_%d" % i
turn = fluid.layers.data(
name=name, shape=[self._max_turn_len, 1], dtype="int32")
name=name, shape=[self._max_turn_len, 1], dtype="int64")
self.turns_data.append(turn)
self._feed_names.append(name)
......@@ -73,7 +73,7 @@ class Net(object):
self._feed_names.append(name)
self.response = fluid.layers.data(
name="response", shape=[self._max_turn_len, 1], dtype="int32")
name="response", shape=[self._max_turn_len, 1], dtype="int64")
self.response_mask = fluid.layers.data(
name="response_mask",
shape=[self._max_turn_len, 1],
......@@ -141,7 +141,7 @@ class Net(object):
mask_cache=mask_cache)
Hu_stack.append(Hu)
# cross attention
# cross attention
r_a_t_stack = []
t_a_r_stack = []
for index in six.moves.xrange(self._stack_num + 1):
......@@ -183,7 +183,7 @@ class Net(object):
t_a_r = fluid.layers.concat(input=t_a_r_stack, axis=1)
r_a_t = fluid.layers.concat(input=r_a_t_stack, axis=1)
# sim shape: [batch_size, 2*(stack_num+1), max_turn_len, max_turn_len]
# sim shape: [batch_size, 2*(stack_num+1), max_turn_len, max_turn_len]
sim = fluid.layers.matmul(
x=t_a_r, y=r_a_t, transpose_y=True, alpha=1 / np.sqrt(200.0))
sim_turns.append(sim)
......
......@@ -126,6 +126,7 @@ def test(args):
dam = Net(args.max_turn_num, args.max_turn_len, args.vocab_size,
args.emb_size, args.stack_num, args.channel1_num,
args.channel2_num)
dam.create_data_layers()
loss, logits = dam.create_network()
loss.persistable = True
......@@ -144,7 +145,7 @@ def test(args):
staircase=True))
optimizer.minimize(loss)
# The fethced loss is wrong when mem opt is enabled
# The fethced loss is wrong when mem opt is enabled
fluid.memory_optimize(fluid.default_main_program())
if args.use_cuda:
......@@ -191,7 +192,8 @@ def test(args):
feed_list = []
for dev in six.moves.xrange(dev_count):
index = it * dev_count + dev
feed_dict = reader.make_one_batch_input(test_batches, index)
batch_data = reader.make_one_batch_input(test_batches, index)
feed_dict = dict(zip(dam.get_feed_names(), batch_data))
feed_list.append(feed_dict)
predicts = test_exe.run(feed=feed_list, fetch_list=[logits.name])
......
......@@ -192,6 +192,9 @@ def train(args):
train_program = fluid.Program()
train_startup = fluid.Program()
if "CE_MODE_X" in os.environ:
train_program.random_seed = 110
train_startup.random_seed = 110
with fluid.program_guard(train_program, train_startup):
with fluid.unique_name.guard():
if args.use_pyreader:
......@@ -213,10 +216,15 @@ def train(args):
decay_rate=0.9,
staircase=True))
optimizer.minimize(loss)
print("begin memory optimization ...")
fluid.memory_optimize(train_program)
print("end memory optimization ...")
test_program = fluid.Program()
test_startup = fluid.Program()
if "CE_MODE_X" in os.environ:
test_program.random_seed = 110
test_startup.random_seed = 110
with fluid.program_guard(test_program, test_startup):
with fluid.unique_name.guard():
if args.use_pyreader:
......@@ -322,9 +330,9 @@ def train(args):
result_file_path = os.path.join(args.save_path,
'result.' + str(step))
evaluate(score_path, result_file_path)
return step
return step, np.array(cost[0]).mean()
# train on one epoch with pyreader
# train on one epoch with pyreader
def train_with_pyreader(step):
def data_provider():
for index in six.moves.xrange(batch_num):
......@@ -367,18 +375,25 @@ def train(args):
except fluid.core.EOFException:
train_pyreader.reset()
break
return step
return step, np.array(cost[0]).mean()
# train over different epoches
global_step = 0
global_step, train_time = 0, 0.0
for epoch in six.moves.xrange(args.num_scan_data):
shuffle_train = reader.unison_shuffle(train_data)
shuffle_train = reader.unison_shuffle(
train_data, seed=110 if ("CE_MODE_X" in os.environ) else None)
train_batches = reader.build_batches(shuffle_train, data_conf)
begin_time = time.time()
if args.use_pyreader:
global_step = train_with_pyreader(global_step)
global_step, last_cost = train_with_pyreader(global_step)
else:
global_step = train_with_feed(global_step)
global_step, last_cost = train_with_feed(global_step)
train_time += time.time() - begin_time
# For internal continuous evaluation
if "CE_MODE_X" in os.environ:
print("kpis train_cost %f" % last_cost)
print("kpis train_duration %f" % train_time)
if __name__ == '__main__':
......
......@@ -17,6 +17,7 @@ def unison_shuffle(data, seed=None):
assert len(y) == len(c) == len(r)
p = np.random.permutation(len(y))
print(p)
shuffle_data = {six.b('y'): y[p], six.b('c'): c[p], six.b('r'): r[p]}
return shuffle_data
......
......@@ -148,7 +148,7 @@ def train(train_reader,
if pass_idx == pass_num - 1 and args.enable_ce:
#Note: The following logs are special for CE monitoring.
#Other situations do not need to care about these logs.
gpu_num = get_cards(args.enable_ce)
gpu_num = get_cards(args)
if gpu_num == 1:
print("kpis imikolov_20_pass_duration %s" %
(total_time / epoch_idx))
......
......@@ -35,9 +35,10 @@ def parse_log(log):
for line in log.split('\n'):
fs = line.strip().split('\t')
print(fs)
kpi_name = fs[0]
kpi_value = float(fs[1])
yield kpi_name, kpi_value
if len(fs) == 3 and fs[0] == 'ptblm':
kpi_name = fs[1]
kpi_value = float(fs[2])
yield kpi_name, kpi_value
def log_to_ce(log):
......
......@@ -28,7 +28,10 @@ Py3 = sys.version_info[0] == 3
def _read_words(filename):
data = []
with open(filename, "r") as f:
return f.read().decode("utf-8").replace("\n", "<eos>").split()
if Py3:
return f.read().replace("\n", "<eos>").split()
else:
return f.read().decode("utf-8").replace("\n", "<eos>").split()
def _build_vocab(filename):
......
......@@ -258,7 +258,8 @@ def train():
fetch_list=[
loss.name, last_hidden.name,
last_cell.name, 'learning_rate'
])
],
use_program_cache=True)
cost_train = np.array(fetch_outs[0])
init_hidden = np.array(fetch_outs[1])
......@@ -282,8 +283,9 @@ def train():
print("train ppl", ppl[0])
if epoch_id == max_epoch - 1 and args.enable_ce:
print("lstm_language_model_duration\t%s" % (total_time / max_epoch))
print("lstm_language_model_loss\t%s" % ppl[0])
print("ptblm\tlstm_language_model_duration\t%s" %
(total_time / max_epoch))
print("ptblm\tlstm_language_model_loss\t%s" % ppl[0])
model_path = os.path.join("model_new/", str(epoch_id))
if not os.path.isdir(model_path):
......
#!/bin/bash
DATA_PATH=./data
if [ ! -e $DATA_PATH/demo ] ; then
mkdir -p $DATA_PATH/demo
if [ ! -e $DATA_PATH/demo.tgz ] ; then
cd $DATA_PATH
wget -c --no-check-certificate http://dureader.gz.bcebos.com/demo.tgz
cd -
fi
tar -zxf $DATA_PATH/demo.tgz -C $DATA_PATH/demo
fi
train(){
python -u run.py \
--trainset 'data/demo/search.train.json' \
--devset 'data/demo/search.dev.json' \
--testset 'data/demo/search.test.json' \
--vocab_dir 'data/demo/' \
--use_gpu true \
--save_dir ./models \
--pass_num 1 \
--learning_rate 0.001 \
--batch_size 32 \
--embed_size 300 \
--hidden_size 150 \
--max_p_num 5 \
--max_p_len 500 \
--max_q_len 60 \
--max_a_len 200 \
--drop_rate 0.2 \
--log_interval 1 \
--enable_ce \
--train
}
cudaid=${single:=0} # use 0-th card as default
export CUDA_VISIBLE_DEVICES=$cudaid
train | python _ce.py
cudaid=${multi:=0,1,2,3} # use 0,1,2,3 card as default
export CUDA_VISIBLE_DEVICES=$cudaid
train | python _ce.py
####this file is only used for continuous evaluation test!
import os
import sys
#sys.path.insert(0, os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE kpi.py should shared in models in some way!!!!
train_cost_card1_kpi = CostKpi('train_cost_card1', 0.02, 0, actived=True)
test_cost_card1_kpi = CostKpi('test_cost_card1', 0.005, 0, actived=True)
train_duration_card1_kpi = DurationKpi(
'train_duration_card1', 0.06, 0, actived=True)
train_cost_card4_kpi = CostKpi('train_cost_card4', 0.01, 0, actived=True)
test_cost_card4_kpi = CostKpi('test_cost_card4', 0.005, 0, actived=True)
train_duration_card4_kpi = DurationKpi(
'train_duration_card4', 0.06, 0, actived=True)
tracking_kpis = [
train_cost_card1_kpi,
test_cost_card1_kpi,
train_duration_card1_kpi,
train_cost_card4_kpi,
test_cost_card4_kpi,
train_duration_card4_kpi,
]
def parse_log(log):
'''
This method should be implemented by model developers.
The suggestion:
each line in the log should be key, value, for example:
"
train_cost\t1.0
test_cost\t1.0
train_cost\t1.0
train_cost\t1.0
train_acc\t1.2
"
'''
for line in log.split('\n'):
fs = line.strip().split('\t')
print(fs)
if len(fs) == 3 and fs[0] == 'kpis':
print("-----%s" % fs)
kpi_name = fs[1]
kpi_value = float(fs[2])
yield kpi_name, kpi_value
def log_to_ce(log):
kpi_tracker = {}
for kpi in tracking_kpis:
kpi_tracker[kpi.name] = kpi
for (kpi_name, kpi_value) in parse_log(log):
print(kpi_name, kpi_value)
kpi_tracker[kpi_name].add_record(kpi_value)
kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
log = sys.stdin.read()
print("*****")
print(log)
print("****")
log_to_ce(log)
......@@ -120,5 +120,9 @@ def parse_args():
'--result_name',
default='test_result',
help='the file name of the results')
parser.add_argument(
"--enable_ce",
action='store_true',
help="If set, run the task with continuous evaluation logs.")
args = parser.parse_args()
return args
......@@ -21,6 +21,7 @@ if [[ -d preprocessed ]] && [[ -d raw ]]; then
exit 0
else
wget -c --no-check-certificate http://dureader.gz.bcebos.com/dureader_preprocessed.zip
wget -c --no-check-certificate http://dureader.gz.bcebos.com/demo.tgz
fi
if md5sum --status -c md5sum.txt; then
......
......@@ -152,7 +152,7 @@ class BRCDataset(object):
batch_data['passage_token_ids'].append(passage_token_ids)
batch_data['passage_length'].append(
min(len(passage_token_ids), self.max_p_len))
# record the start passage index of current doc
# record the start passage index of current sample
passade_idx_offset = sum(batch_data['passage_num'])
batch_data['passage_num'].append(count)
gold_passage_offset = 0
......
......@@ -248,18 +248,18 @@ def validation(inference_program, avg_cost, s_probs, e_probs, match, feed_order,
n_batch_loss / n_batch_cnt)))
n_batch_loss = 0.0
n_batch_cnt = 0
batch_offset = 0
for idx, batch in enumerate(batch_list):
#one batch
batch_size = len(batch['raw_data'])
batch_range = match_lod[0][idx * batch_size:(idx + 1) * batch_size +
batch_range = match_lod[0][batch_offset:batch_offset + batch_size +
1]
batch_lod = [[batch_range[x], batch_range[x + 1]]
for x in range(len(batch_range[:-1]))]
start_prob_batch = start_probs_m[idx * batch_size:(idx + 1) *
batch_size]
end_prob_batch = end_probs_m[idx * batch_size:(idx + 1) *
batch_size]
start_prob_batch = start_probs_m[batch_offset:batch_offset +
batch_size + 1]
end_prob_batch = end_probs_m[batch_offset:batch_offset + batch_size
+ 1]
for sample, start_prob_inst, end_prob_inst, inst_range in zip(
batch['raw_data'], start_prob_batch, end_prob_batch,
batch_lod):
......@@ -284,6 +284,7 @@ def validation(inference_program, avg_cost, s_probs, e_probs, match, feed_order,
'yesno_answers': []
}
ref_answers.append(ref)
batch_offset = batch_offset + batch_size
result_dir = args.result_dir
result_prefix = args.result_name
......@@ -312,6 +313,15 @@ def validation(inference_program, avg_cost, s_probs, e_probs, match, feed_order,
return ave_loss, bleu_rouge
def l2_loss(train_prog):
param_list = train_prog.block(0).all_parameters()
para_sum = []
for para in param_list:
para_mul = fluid.layers.elementwise_mul(x=para, y=para, axis=0)
para_sum.append(fluid.layers.reduce_sum(input=para_mul, dim=None))
return fluid.layers.sums(para_sum) * 0.5
def train(logger, args):
logger.info('Load data_set and vocab...')
with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
......@@ -337,8 +347,9 @@ def train(logger, args):
# build model
main_program = fluid.Program()
startup_prog = fluid.Program()
main_program.random_seed = args.random_seed
startup_prog.random_seed = args.random_seed
if args.enable_ce:
main_program.random_seed = args.random_seed
startup_prog.random_seed = args.random_seed
with fluid.program_guard(main_program, startup_prog):
with fluid.unique_name.guard():
avg_cost, s_probs, e_probs, match, feed_order = rc_model.rc_model(
......@@ -349,24 +360,22 @@ def train(logger, args):
# build optimizer
if args.optim == 'sgd':
optimizer = fluid.optimizer.SGD(
learning_rate=args.learning_rate,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=args.weight_decay))
learning_rate=args.learning_rate)
elif args.optim == 'adam':
optimizer = fluid.optimizer.Adam(
learning_rate=args.learning_rate,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=args.weight_decay))
learning_rate=args.learning_rate)
elif args.optim == 'rprop':
optimizer = fluid.optimizer.RMSPropOptimizer(
learning_rate=args.learning_rate,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=args.weight_decay))
learning_rate=args.learning_rate)
else:
logger.error('Unsupported optimizer: {}'.format(args.optim))
exit(-1)
optimizer.minimize(avg_cost)
if args.weight_decay > 0.0:
obj_func = avg_cost + args.weight_decay * l2_loss(main_program)
optimizer.minimize(obj_func)
else:
obj_func = avg_cost
optimizer.minimize(obj_func)
# initialize parameters
place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
......@@ -398,7 +407,10 @@ def train(logger, args):
for pass_id in range(1, args.pass_num + 1):
pass_start_time = time.time()
pad_id = vocab.get_id(vocab.pad_token)
train_reader = lambda:brc_data.gen_mini_batches('train', args.batch_size, pad_id, shuffle=False)
if args.enable_ce:
train_reader = lambda:brc_data.gen_mini_batches('train', args.batch_size, pad_id, shuffle=False)
else:
train_reader = lambda:brc_data.gen_mini_batches('train', args.batch_size, pad_id, shuffle=True)
train_reader = read_multiple(train_reader, dev_count)
log_every_n_batch, n_batch_loss = args.log_interval, 0
total_num, total_loss = 0, 0
......@@ -406,13 +418,15 @@ def train(logger, args):
feed_data = batch_reader(batch_list, args)
fetch_outs = parallel_executor.run(
feed=list(feeder.feed_parallel(feed_data, dev_count)),
fetch_list=[avg_cost.name],
fetch_list=[obj_func.name],
return_numpy=False)
cost_train = np.array(fetch_outs[0]).mean()
total_num += args.batch_size * dev_count
n_batch_loss += cost_train
total_loss += cost_train * args.batch_size * dev_count
if args.enable_ce and batch_id >= 100:
break
if log_every_n_batch > 0 and batch_id % log_every_n_batch == 0:
print_para(main_program, parallel_executor, logger,
args)
......@@ -457,6 +471,14 @@ def train(logger, args):
executor=exe,
dirname=model_path,
main_program=main_program)
if args.enable_ce: # For CE
print("kpis\ttrain_cost_card%d\t%f" %
(dev_count, total_loss / total_num))
if brc_data.dev_set is not None:
print("kpis\ttest_cost_card%d\t%f" %
(dev_count, eval_loss))
print("kpis\ttrain_duration_card%d\t%f" %
(dev_count, time_consumed))
def evaluate(logger, args):
......@@ -474,8 +496,6 @@ def evaluate(logger, args):
# build model
main_program = fluid.Program()
startup_prog = fluid.Program()
main_program.random_seed = args.random_seed
startup_prog.random_seed = args.random_seed
with fluid.program_guard(main_program, startup_prog):
with fluid.unique_name.guard():
avg_cost, s_probs, e_probs, match, feed_order = rc_model.rc_model(
......@@ -523,8 +543,6 @@ def predict(logger, args):
# build model
main_program = fluid.Program()
startup_prog = fluid.Program()
main_program.random_seed = args.random_seed
startup_prog.random_seed = args.random_seed
with fluid.program_guard(main_program, startup_prog):
with fluid.unique_name.guard():
avg_cost, s_probs, e_probs, match, feed_order = rc_model.rc_model(
......@@ -592,8 +610,9 @@ def prepare(logger, args):
if __name__ == '__main__':
args = parse_args()
random.seed(args.random_seed)
np.random.seed(args.random_seed)
if args.enable_ce:
random.seed(args.random_seed)
np.random.seed(args.random_seed)
logger = logging.getLogger("brc")
logger.setLevel(logging.INFO)
......
......@@ -18,5 +18,5 @@ python run.py \
--max_p_len 500 \
--max_q_len 60 \
--max_a_len 200 \
--weight_decay 0.0 \
--weight_decay 0.0001 \
--drop_rate 0.2 $@\
......@@ -129,10 +129,12 @@ def multi_head_attention(queries,
# input from the previous time step first.
k = cache["k"] = layers.concat(
[layers.reshape(
cache["k"], shape=[0, 0, d_model]), k], axis=1)
cache["k"], shape=[0, 0, d_key * n_head]), k],
axis=1)
v = cache["v"] = layers.concat(
[layers.reshape(
cache["v"], shape=[0, 0, d_model]), v], axis=1)
cache["v"], shape=[0, 0, d_value * n_head]), v],
axis=1)
q = __split_heads(q, n_head)
k = __split_heads(k, n_head)
......@@ -657,8 +659,7 @@ def wrap_decoder(trg_vocab_size,
else:
predict = layers.fc(input=dec_output,
size=trg_vocab_size,
bias_attr=False,
num_flatten_dims=2)
bias_attr=False)
if dec_inputs is None:
# Return probs for independent decoder program.
predict = layers.softmax(predict)
......
......@@ -15,8 +15,12 @@ def ctr_dnn_model(embedding_size, sparse_feature_dim):
def embedding_layer(input):
return fluid.layers.embedding(
input=input,
is_sparse=True,
# you need to patch https://github.com/PaddlePaddle/Paddle/pull/14190
# if you want to set is_distributed to True
is_distributed=False,
size=[sparse_feature_dim, embedding_size],
param_attr=fluid.ParamAttr(name="SparseFeatFactors", initializer=fluid.initializer.Normal(scale=1/math.sqrt(sparse_feature_dim))))
param_attr=fluid.ParamAttr(name="SparseFeatFactors", initializer=fluid.initializer.Uniform()))
sparse_embed_seq = map(embedding_layer, sparse_input_ids)
concated = fluid.layers.concat(sparse_embed_seq + [dense_input], axis=1)
......
......@@ -21,10 +21,10 @@ class CriteoDataset(Dataset):
for line in f:
line_idx += 1
if is_train and line_idx > self.train_idx_:
continue
break
elif not is_train and line_idx <= self.train_idx_:
continue
if trainer_id > 0 and line_idx % trainer_num != trainer_id:
if line_idx % trainer_num != trainer_id:
continue
features = line.rstrip('\n').split('\t')
dense_feature = []
......
......@@ -138,7 +138,7 @@ def train():
if args.is_local:
logger.info("run local training")
main_program = fluid.default_main_program()
train_loop(args, main_program, data_list, loss, auc_var, batch_auc_var, 1, -1)
train_loop(args, main_program, data_list, loss, auc_var, batch_auc_var, 1, 0)
else:
logger.info("run dist training")
t = fluid.DistributeTranspiler()
......@@ -154,7 +154,7 @@ def train():
logger.info("run trainer")
train_prog = t.get_trainer_program()
train_loop(args, train_prog, data_list, loss, auc_var, batch_auc_var,
args.trainers, args.trainer_id + 1)
args.trainers, args.trainer_id)
if __name__ == '__main__':
......
......@@ -162,7 +162,7 @@ SimNet是百度自然语言处理部于2013年自主研发的语义匹配框架
在工业可用的推荐系统中,推荐策略一般会被划分为多个模块串联执行。以新闻推荐系统为例,存在多个可以使用深度学习技术的环节,例如新闻的自动化标注,个性化新闻召回,个性化匹配与排序等。PaddlePaddle对推荐算法的训练提供了完整的支持,并提供了多种模型配置供用户选择。
- [TagSpace](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleRec/TagSpace)
- [TagSpace](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleRec/tagspace)
- [GRU4Rec](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleRec/gru4rec)
- [SequenceSemanticRetrieval](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleRec/ssr)
- [DeepCTR](https://github.com/PaddlePaddle/models/blob/develop/fluid/PaddleRec/ctr/README.cn.md)
......
#!/bin/bash
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
cudaid=${language_model:=0} # use 0-th card as default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py --enable_ce | python _ce.py
cudaid=${language_model_m:=0,1,2,3} # use 0,1,2,3 card as default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py --enable_ce | python _ce.py
# 语言模型
以下是本例的简要目录结构及说明:
```text
.
├── README.md # 文档
├── train.py # 训练脚本
├── infer.py # 预测脚本
└── utils.py # 通用函数
```
## 简介
循环神经网络语言模型的介绍可以参阅论文[Recurrent Neural Network Regularization](https://arxiv.org/abs/1409.2329),在本例中,我们实现了GRU-RNN语言模型。
## 训练
运行命令 `python train.py` 开始训练模型。
```python
python train.py
```
当前支持的参数可参见[train.py](./train.py) `train_net` 函数
```python
vocab, train_reader, test_reader = utils.prepare_data(
batch_size=20, # batch size
buffer_size=1000, # buffer size, default value is OK
word_freq_threshold=0) # vocabulary related parameter, and words with frequency below this value will be filtered
train(train_reader=train_reader,
vocab=vocab,
network=network,
hid_size=200, # embedding and hidden size
base_lr=1.0, # base learning rate
batch_size=20, # batch size, the same as that in prepare_data
pass_num=12, # the number of passes for training
use_cuda=True, # whether to use GPU card
parallel=False, # whether to be parallel
model_dir="model", # directory to save model
init_low_bound=-0.1, # uniform parameter initialization lower bound
init_high_bound=0.1) # uniform parameter initialization upper bound
```
## 自定义网络结构
可在[train.py](./train.py) `network` 函数中调整网络结构,当前的网络结构如下:
```python
emb = fluid.layers.embedding(input=src, size=[vocab_size, hid_size],
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound),
learning_rate=emb_lr_x),
is_sparse=True)
fc0 = fluid.layers.fc(input=emb, size=hid_size * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound),
learning_rate=gru_lr_x))
gru_h0 = fluid.layers.dynamic_gru(input=fc0, size=hid_size,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound),
learning_rate=gru_lr_x))
fc = fluid.layers.fc(input=gru_h0, size=vocab_size, act='softmax',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound),
learning_rate=fc_lr_x))
cost = fluid.layers.cross_entropy(input=fc, label=dst)
```
## 训练结果示例
我们在Tesla K40m单GPU卡上训练的日志如下所示
```text
epoch_1 start
step:100 ppl:771.053
step:200 ppl:449.597
step:300 ppl:642.654
step:400 ppl:458.128
step:500 ppl:510.912
step:600 ppl:451.545
step:700 ppl:364.404
step:800 ppl:324.272
step:900 ppl:360.797
step:1000 ppl:275.761
step:1100 ppl:294.599
step:1200 ppl:335.877
step:1300 ppl:185.262
step:1400 ppl:241.744
step:1500 ppl:211.507
step:1600 ppl:233.431
step:1700 ppl:298.767
step:1800 ppl:203.403
step:1900 ppl:158.828
step:2000 ppl:171.148
step:2100 ppl:280.884
epoch:1 num_steps:2104 time_cost(s):47.478780
model saved in model/epoch_1
epoch_2 start
step:100 ppl:238.099
step:200 ppl:136.527
step:300 ppl:204.184
step:400 ppl:252.886
step:500 ppl:177.377
step:600 ppl:197.688
step:700 ppl:131.650
step:800 ppl:223.906
step:900 ppl:144.785
step:1000 ppl:176.286
step:1100 ppl:148.158
step:1200 ppl:203.581
step:1300 ppl:168.208
step:1400 ppl:159.412
step:1500 ppl:114.032
step:1600 ppl:157.985
step:1700 ppl:147.743
step:1800 ppl:88.676
step:1900 ppl:141.962
step:2000 ppl:106.087
step:2100 ppl:122.709
epoch:2 num_steps:2104 time_cost(s):47.583789
model saved in model/epoch_2
...
```
## 预测
运行命令 `python infer.py model_dir start_epoch last_epoch(inclusive)` 开始预测,其中,start_epoch指定开始预测的轮次,last_epoch指定结束的轮次,例如
```python
python infer.py model 1 12 # prediction from epoch 1 to epoch 12
```
## 预测结果示例
```text
model:model/epoch_1 ppl:254.540 time_cost(s):3.29
model:model/epoch_2 ppl:177.671 time_cost(s):3.27
model:model/epoch_3 ppl:156.251 time_cost(s):3.27
model:model/epoch_4 ppl:139.036 time_cost(s):3.27
model:model/epoch_5 ppl:132.661 time_cost(s):3.27
model:model/epoch_6 ppl:130.092 time_cost(s):3.28
model:model/epoch_7 ppl:128.751 time_cost(s):3.27
model:model/epoch_8 ppl:125.411 time_cost(s):3.27
model:model/epoch_9 ppl:124.604 time_cost(s):3.28
model:model/epoch_10 ppl:124.754 time_cost(s):3.29
model:model/epoch_11 ppl:125.421 time_cost(s):3.27
model:model/epoch_12 ppl:125.676 time_cost(s):3.27
```
import sys
import time
import math
import unittest
import contextlib
import numpy as np
import six
import paddle
import paddle.fluid as fluid
import utils
def infer(test_reader, use_cuda, model_path):
""" inference function """
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
with fluid.scope_guard(fluid.core.Scope()):
infer_program, feed_target_names, fetch_vars = fluid.io.load_inference_model(
model_path, exe)
accum_cost = 0.0
accum_words = 0
t0 = time.time()
for data in test_reader():
src_wordseq = utils.to_lodtensor([dat[0] for dat in data], place)
dst_wordseq = utils.to_lodtensor([dat[1] for dat in data], place)
avg_cost = exe.run(
infer_program,
feed={"src_wordseq": src_wordseq,
"dst_wordseq": dst_wordseq},
fetch_list=fetch_vars)
nwords = src_wordseq.lod()[0][-1]
cost = np.array(avg_cost) * nwords
accum_cost += cost
accum_words += nwords
ppl = math.exp(accum_cost / accum_words)
t1 = time.time()
print("model:%s ppl:%.3f time_cost(s):%.2f" %
(model_path, ppl, t1 - t0))
if __name__ == "__main__":
if len(sys.argv) != 4:
print("Usage: %s model_dir start_epoch last_epoch(inclusive)")
exit(0)
model_dir = sys.argv[1]
try:
start_index = int(sys.argv[2])
last_index = int(sys.argv[3])
except:
print("Usage: %s model_dir start_epoch last_epoch(inclusive)")
exit(-1)
vocab, train_reader, test_reader = utils.prepare_data(
batch_size=20, buffer_size=1000, word_freq_threshold=0)
for epoch in six.moves.xrange(start_index, last_index + 1):
epoch_path = model_dir + "/epoch_" + str(epoch)
infer(test_reader=test_reader, use_cuda=True, model_path=epoch_path)
import os
import sys
import time
import six
import numpy as np
import math
import argparse
import paddle.fluid as fluid
import paddle
import utils
SEED = 102
def parse_args():
parser = argparse.ArgumentParser("language_model benchmark.")
parser.add_argument(
'--enable_ce',
action='store_true',
help='If set, run \
the task with continuous evaluation logs.')
parser.add_argument(
'--num_devices', type=int, default=1, help='Number of GPU devices')
args = parser.parse_args()
return args
def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
""" network definition """
emb_lr_x = 10.0
gru_lr_x = 1.0
fc_lr_x = 1.0
emb = fluid.layers.embedding(
input=src,
size=[vocab_size, hid_size],
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=init_low_bound, high=init_high_bound),
learning_rate=emb_lr_x),
is_sparse=True)
fc0 = fluid.layers.fc(input=emb,
size=hid_size * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=init_low_bound, high=init_high_bound),
learning_rate=gru_lr_x))
gru_h0 = fluid.layers.dynamic_gru(
input=fc0,
size=hid_size,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=init_low_bound, high=init_high_bound),
learning_rate=gru_lr_x))
fc = fluid.layers.fc(input=gru_h0,
size=vocab_size,
act='softmax',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=init_low_bound, high=init_high_bound),
learning_rate=fc_lr_x))
cost = fluid.layers.cross_entropy(input=fc, label=dst)
return cost
def train(train_reader,
vocab,
network,
hid_size,
base_lr,
batch_size,
pass_num,
use_cuda,
parallel,
model_dir,
init_low_bound=-0.04,
init_high_bound=0.04):
""" train network """
args = parse_args()
if args.enable_ce:
# random seed must set before configuring the network.
fluid.default_startup_program().random_seed = SEED
vocab_size = len(vocab)
#Input data
src_wordseq = fluid.layers.data(
name="src_wordseq", shape=[1], dtype="int64", lod_level=1)
dst_wordseq = fluid.layers.data(
name="dst_wordseq", shape=[1], dtype="int64", lod_level=1)
# Train program
avg_cost = None
cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
init_low_bound, init_high_bound)
avg_cost = fluid.layers.mean(x=cost)
# Optimization to minimize lost
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=base_lr,
decay_steps=2100 * 4,
decay_rate=0.5,
staircase=True))
sgd_optimizer.minimize(avg_cost)
# Initialize executor
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
total_time = 0.0
fetch_list = [avg_cost.name]
for pass_idx in six.moves.xrange(pass_num):
epoch_idx = pass_idx + 1
print("epoch_%d start" % epoch_idx)
t0 = time.time()
i = 0
newest_ppl = 0
for data in train_reader():
i += 1
lod_src_wordseq = utils.to_lodtensor([dat[0] for dat in data],
place)
lod_dst_wordseq = utils.to_lodtensor([dat[1] for dat in data],
place)
ret_avg_cost = train_exe.run(feed={
"src_wordseq": lod_src_wordseq,
"dst_wordseq": lod_dst_wordseq
},
fetch_list=fetch_list)
avg_ppl = np.exp(ret_avg_cost[0])
newest_ppl = np.mean(avg_ppl)
if i % 100 == 0:
print("step:%d ppl:%.3f" % (i, newest_ppl))
t1 = time.time()
total_time += t1 - t0
print("epoch:%d num_steps:%d time_cost(s):%f" %
(epoch_idx, i, total_time / epoch_idx))
if pass_idx == pass_num - 1 and args.enable_ce:
#Note: The following logs are special for CE monitoring.
#Other situations do not need to care about these logs.
gpu_num = get_cards(args.enable_ce)
if gpu_num == 1:
print("kpis imikolov_20_pass_duration %s" %
(total_time / epoch_idx))
print("kpis imikolov_20_avg_ppl %s" % newest_ppl)
else:
print("kpis imikolov_20_pass_duration_card%s %s" % \
(gpu_num, total_time / epoch_idx))
print("kpis imikolov_20_avg_ppl_card%s %s" %
(gpu_num, newest_ppl))
save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
feed_var_names = ["src_wordseq", "dst_wordseq"]
fetch_vars = [avg_cost]
fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
print("model saved in %s" % save_dir)
print("finish training")
def get_cards(args):
if args.enable_ce:
cards = os.environ.get('CUDA_VISIBLE_DEVICES')
num = len(cards.split(","))
return num
else:
return args.num_devices
def train_net():
""" do training """
batch_size = 20
args = parse_args()
vocab, train_reader, test_reader = utils.prepare_data(
batch_size=batch_size * get_cards(args), buffer_size=1000, \
word_freq_threshold=0, enable_ce = args.enable_ce)
train(
train_reader=train_reader,
vocab=vocab,
network=network,
hid_size=200,
base_lr=1.0,
batch_size=batch_size,
pass_num=12,
use_cuda=True,
parallel=True,
model_dir="model",
init_low_bound=-0.1,
init_high_bound=0.1)
if __name__ == "__main__":
train_net()
import os
import sys
import time
import six
import numpy as np
import math
import collections
import paddle
import paddle.fluid as fluid
import paddle.fluid.framework as framework
cluster_train_dir = "./train/"
cluster_test_dir = "./test/"
train_file = "ptb.train.txt"
valid_file = "ptb.valid.txt"
test_file = "ptb.test.txt"
class DataType(object):
""" data type """
NGRAM = 1
SEQ = 2
def word_count(f, word_freq=None):
""" count words """
if word_freq is None:
word_freq = collections.defaultdict(int)
for line in f:
for w in line.strip().split():
word_freq[w] += 1
word_freq['<s>'] += 1
word_freq['<e>'] += 1
return word_freq
def build_dict(min_word_freq=50):
""" build dictionary """
train_filename = cluster_train_dir + train_file
test_filename = cluster_test_dir + valid_file
trainf = open(train_filename).readlines()
testf = open(test_filename).readlines()
word_freq = word_count(testf, word_count(trainf))
if '<unk>' in word_freq:
del word_freq['<unk>']
word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items())
word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*word_freq_sorted))
word_idx = dict(zip(words, six.moves.xrange(len(words))))
word_idx['<unk>'] = len(words)
return word_idx
def reader_creator(filename, word_idx, n, data_type):
""" create reader """
def reader():
if True:
f = open(filename).readlines()
UNK = word_idx['<unk>']
for line in f:
if DataType.NGRAM == data_type:
assert n > -1, 'Invalid gram length'
line = ['<s>'] + line.strip().split() + ['<e>']
if len(line) >= n:
line = [word_idx.get(w, UNK) for w in line]
for i in range(n, len(line) + 1):
yield tuple(line[i - n:i])
elif DataType.SEQ == data_type:
line = line.strip().split()
line = [word_idx.get(w, UNK) for w in line]
src_seq = [word_idx['<s>']] + line
trg_seq = line + [word_idx['<e>']]
if n > 0 and len(src_seq) > n:
continue
yield src_seq, trg_seq
else:
assert False, 'Unknow data type'
return reader
def to_lodtensor(data, place):
""" convert to LODtensor """
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for line in seq_lens:
cur_len += line
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
def prepare_data(batch_size, buffer_size=1000, word_freq_threshold=0):
""" prepare the English Pann Treebank (PTB) data """
vocab = build_dict(word_freq_threshold)
train_reader = paddle.batch(
paddle.reader.shuffle(
reader_creator(
cluster_train_dir + train_file,
vocab,
buffer_size,
data_type=DataType.SEQ),
buf_size=buffer_size),
batch_size)
test_reader = paddle.batch(
reader_creator(
cluster_test_dir + test_file,
vocab,
buffer_size,
data_type=DataType.SEQ),
batch_size)
return vocab, train_reader, test_reader
def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
""" network definition """
emb_lr_x = 10.0
gru_lr_x = 1.0
fc_lr_x = 1.0
emb = fluid.layers.embedding(
input=src,
size=[vocab_size, hid_size],
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=init_low_bound, high=init_high_bound),
learning_rate=emb_lr_x),
is_sparse=True)
fc0 = fluid.layers.fc(input=emb,
size=hid_size * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=init_low_bound, high=init_high_bound),
learning_rate=gru_lr_x))
gru_h0 = fluid.layers.dynamic_gru(
input=fc0,
size=hid_size,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=init_low_bound, high=init_high_bound),
learning_rate=gru_lr_x))
fc = fluid.layers.fc(input=gru_h0,
size=vocab_size,
act='softmax',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=init_low_bound, high=init_high_bound),
learning_rate=fc_lr_x))
cost = fluid.layers.cross_entropy(input=fc, label=dst)
return cost
def do_train(train_reader,
vocab,
network,
hid_size,
base_lr,
batch_size,
pass_num,
use_cuda,
parallel,
model_dir,
init_low_bound=-0.04,
init_high_bound=0.04):
""" train network """
vocab_size = len(vocab)
src_wordseq = fluid.layers.data(
name="src_wordseq", shape=[1], dtype="int64", lod_level=1)
dst_wordseq = fluid.layers.data(
name="dst_wordseq", shape=[1], dtype="int64", lod_level=1)
avg_cost = None
if not parallel:
cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
init_low_bound, init_high_bound)
avg_cost = fluid.layers.mean(x=cost)
else:
places = fluid.layers.device.get_places()
pd = fluid.layers.ParallelDo(places)
with pd.do():
cost = network(
pd.read_input(src_wordseq),
pd.read_input(dst_wordseq), vocab_size, hid_size,
init_low_bound, init_high_bound)
pd.write_output(cost)
cost = pd()
avg_cost = fluid.layers.mean(x=cost)
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=base_lr,
decay_steps=2100 * 4,
decay_rate=0.5,
staircase=True))
sgd_optimizer.minimize(avg_cost)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
total_time = 0.0
for pass_idx in six.moves.xrange(pass_num):
epoch_idx = pass_idx + 1
print("epoch_%d start" % epoch_idx)
t0 = time.time()
i = 0
for data in train_reader():
i += 1
lod_src_wordseq = to_lodtensor([dat[0] for dat in data], place)
lod_dst_wordseq = to_lodtensor([dat[1] for dat in data], place)
ret_avg_cost = exe.run(fluid.default_main_program(),
feed={
"src_wordseq": lod_src_wordseq,
"dst_wordseq": lod_dst_wordseq
},
fetch_list=[avg_cost],
use_program_cache=True)
avg_ppl = math.exp(ret_avg_cost[0])
if i % 100 == 0:
print("step:%d ppl:%.3f" % (i, avg_ppl))
t1 = time.time()
total_time += t1 - t0
print("epoch:%d num_steps:%d time_cost(s):%f" %
(epoch_idx, i, total_time / epoch_idx))
save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
feed_var_names = ["src_wordseq", "dst_wordseq"]
fetch_vars = [avg_cost]
fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
print("model saved in %s" % save_dir)
print("finish training")
def train():
""" do training """
batch_size = 20
vocab, train_reader, test_reader = prepare_data(
batch_size=batch_size, buffer_size=1000, word_freq_threshold=0)
# End batch and end pass event handler
def event_handler(event):
""" event handler """
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print("\nPass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics))
else:
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
print("isinstance(event, paddle.event.EndPass)")
do_train(
train_reader=train_reader,
vocab=vocab,
network=network,
hid_size=200,
base_lr=1.0,
batch_size=batch_size,
pass_num=12,
use_cuda=True,
parallel=False,
model_dir="./output/model",
init_low_bound=-0.1,
init_high_bound=0.1)
if __name__ == "__main__":
if not os.path.exists("./output/model"):
os.makedirs("./output/model")
train()
import sys
import time
import numpy as np
import paddle.fluid as fluid
import paddle
def to_lodtensor(data, place):
""" convert to LODtensor """
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
def prepare_data(batch_size,
buffer_size=1000,
word_freq_threshold=0,
enable_ce=False):
""" prepare the English Pann Treebank (PTB) data """
vocab = paddle.dataset.imikolov.build_dict(word_freq_threshold)
if enable_ce:
train_reader = paddle.batch(
paddle.dataset.imikolov.train(
vocab,
buffer_size,
data_type=paddle.dataset.imikolov.DataType.SEQ),
batch_size)
else:
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imikolov.train(
vocab,
buffer_size,
data_type=paddle.dataset.imikolov.DataType.SEQ),
buf_size=buffer_size),
batch_size)
test_reader = paddle.batch(
paddle.dataset.imikolov.test(
vocab, buffer_size, data_type=paddle.dataset.imikolov.DataType.SEQ),
batch_size)
return vocab, train_reader, test_reader
export CUDA_VISIBLE_DEVICES=0
cd data
sh download_data.sh
cd ..
python train.py \
--data_path data/simple-examples/data/ \
--model_type small \
--use_gpu True \
--enable_ce | python _ce.py
# lstm lm
以下是本例的简要目录结构及说明:
```text
.
├── README.md # 文档
├── train.py # 训练脚本
├── reader.py # 数据读取
└── lm_model.py # 模型定义文件
```
## 简介
循环神经网络语言模型的介绍可以参阅论文[Recurrent Neural Network Regularization](https://arxiv.org/abs/1409.2329),本文主要是说明基于lstm的语言的模型的实现,数据是采用ptb dataset,下载地址为
http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
## 数据下载
用户可以自行下载数据,并解压, 也可以利用目录中的脚本
cd data; sh download_data.sh
## 训练
运行命令
`CUDA_VISIBLE_DEVICES=0 python train.py --data_path data/simple-examples/data/ --model_type small --use_gpu True`
开始训练模型。
model_type 为模型配置的大小,目前支持 small,medium, large 三种配置形式
实现采用双层的lstm,具体的参数和网络配置 可以参考 train.py, lm_model.py 文件中的设置
## 训练结果示例
p40中训练日志如下(small config), test 测试集仅在最后一个epoch完成后进行测试
```text
epoch id 0
ppl 232 865.86505 1.0
ppl 464 632.76526 1.0
ppl 696 510.47153 1.0
ppl 928 437.60617 1.0
ppl 1160 393.38422 1.0
ppl 1392 353.05365 1.0
ppl 1624 325.73267 1.0
ppl 1856 305.488 1.0
ppl 2088 286.3128 1.0
ppl 2320 270.91504 1.0
train ppl 270.86246
valid ppl 181.867964379
...
ppl 2320 40.975872 0.001953125
train ppl 40.974102
valid ppl 117.85741214
test ppl 113.939103843
```
## 与tf结果对比
tf采用的版本是1.6
```text
small config
train valid test
fluid 1.0 40.962 118.111 112.617
tf 1.6 40.492 118.329 113.788
medium config
train valid test
fluid 1.0 45.620 87.398 83.682
tf 1.6 45.594 87.363 84.015
large config
train valid test
fluid 1.0 37.221 82.358 78.137
tf 1.6 38.342 82.311 78.121
```
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import distutils.util
def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--model_type",
type=str,
default="small",
help="model_type [test|small|med|big]")
parser.add_argument(
"--data_path", type=str, help="all the data for train,valid,test")
parser.add_argument('--para_init', action='store_true')
parser.add_argument(
'--use_gpu', type=bool, default=False, help='whether using gpu')
parser.add_argument(
'--log_path',
help='path of the log file. If not set, logs are printed to console')
parser.add_argument('--enable_ce', action='store_true')
args = parser.parse_args()
return args
wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
tar -xzvf simple-examples.tgz
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid.layers as layers
import paddle.fluid as fluid
from paddle.fluid.layers.control_flow import StaticRNN as PaddingRNN
import numpy as np
def lm_model(hidden_size,
vocab_size,
batch_size,
num_layers=2,
num_steps=20,
init_scale=0.1,
dropout=None):
def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
weight_1_arr = []
weight_2_arr = []
bias_arr = []
hidden_array = []
cell_array = []
mask_array = []
for i in range(num_layers):
weight_1 = layers.create_parameter([hidden_size * 2, hidden_size*4], dtype="float32", name="fc_weight1_"+str(i), \
default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))
weight_1_arr.append(weight_1)
bias_1 = layers.create_parameter(
[hidden_size * 4],
dtype="float32",
name="fc_bias1_" + str(i),
default_initializer=fluid.initializer.Constant(0.0))
bias_arr.append(bias_1)
pre_hidden = layers.slice(
init_hidden, axes=[0], starts=[i], ends=[i + 1])
pre_cell = layers.slice(
init_cell, axes=[0], starts=[i], ends=[i + 1])
pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
hidden_array.append(pre_hidden)
cell_array.append(pre_cell)
input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2])
rnn = PaddingRNN()
with rnn.step():
input = rnn.step_input(input_embedding)
for k in range(num_layers):
pre_hidden = rnn.memory(init=hidden_array[k])
pre_cell = rnn.memory(init=cell_array[k])
weight_1 = weight_1_arr[k]
bias = bias_arr[k]
nn = layers.concat([input, pre_hidden], 1)
gate_input = layers.matmul(x=nn, y=weight_1)
gate_input = layers.elementwise_add(gate_input, bias)
#i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
i = layers.slice(
gate_input, axes=[1], starts=[0], ends=[hidden_size])
j = layers.slice(
gate_input,
axes=[1],
starts=[hidden_size],
ends=[hidden_size * 2])
f = layers.slice(
gate_input,
axes=[1],
starts=[hidden_size * 2],
ends=[hidden_size * 3])
o = layers.slice(
gate_input,
axes=[1],
starts=[hidden_size * 3],
ends=[hidden_size * 4])
c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
i) * layers.tanh(j)
m = layers.tanh(c) * layers.sigmoid(o)
rnn.update_memory(pre_hidden, m)
rnn.update_memory(pre_cell, c)
rnn.step_output(m)
rnn.step_output(c)
input = m
if dropout != None and dropout > 0.0:
input = layers.dropout(
input,
dropout_prob=dropout,
dropout_implementation='upscale_in_train')
rnn.step_output(input)
#real_res = layers.concat(res, 0)
rnnout = rnn()
last_hidden_array = []
last_cell_array = []
real_res = rnnout[-1]
for i in range(num_layers):
m = rnnout[i * 2]
c = rnnout[i * 2 + 1]
m.stop_gradient = True
c.stop_gradient = True
last_h = layers.slice(
m, axes=[0], starts=[num_steps - 1], ends=[num_steps])
last_hidden_array.append(last_h)
last_c = layers.slice(
c, axes=[0], starts=[num_steps - 1], ends=[num_steps])
last_cell_array.append(last_c)
'''
else:
real_res = rnnout[-1]
for i in range( num_layers ):
m1, c1, m2, c2 = rnnout
real_res = m2
m1.stop_gradient = True
c1.stop_gradient = True
c2.stop_gradient = True
'''
#layers.Print( first_hidden, message="22", summarize=10)
#layers.Print( rnnout[1], message="11", summarize=10)
#real_res = ( rnnout[1] + rnnout[2] + rnnout[3] + rnnout[4]) / 4.0
real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
last_hidden = layers.concat(last_hidden_array, 0)
last_cell = layers.concat(last_cell_array, 0)
'''
last_hidden = layers.concat( hidden_array, 1 )
last_hidden = layers.reshape( last_hidden, shape=[-1, num_layers, hidden_size])
last_hidden = layers.transpose( x = last_hidden, perm = [1, 0, 2])
last_cell = layers.concat( cell_array, 1)
last_cell = layers.reshape( last_cell, shape=[ -1, num_layers, hidden_size])
last_cell = layers.transpose( x = last_cell, perm = [1, 0, 2])
'''
return real_res, last_hidden, last_cell
def encoder_static(input_embedding, len=3, init_hidden=None,
init_cell=None):
weight_1_arr = []
weight_2_arr = []
bias_arr = []
hidden_array = []
cell_array = []
mask_array = []
for i in range(num_layers):
weight_1 = layers.create_parameter([hidden_size * 2, hidden_size*4], dtype="float32", name="fc_weight1_"+str(i), \
default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))
weight_1_arr.append(weight_1)
bias_1 = layers.create_parameter(
[hidden_size * 4],
dtype="float32",
name="fc_bias1_" + str(i),
default_initializer=fluid.initializer.Constant(0.0))
bias_arr.append(bias_1)
pre_hidden = layers.slice(
init_hidden, axes=[0], starts=[i], ends=[i + 1])
pre_cell = layers.slice(
init_cell, axes=[0], starts=[i], ends=[i + 1])
pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
hidden_array.append(pre_hidden)
cell_array.append(pre_cell)
res = []
for index in range(len):
input = layers.slice(
input_embedding, axes=[1], starts=[index], ends=[index + 1])
input = layers.reshape(input, shape=[-1, hidden_size])
for k in range(num_layers):
pre_hidden = hidden_array[k]
pre_cell = cell_array[k]
weight_1 = weight_1_arr[k]
bias = bias_arr[k]
nn = layers.concat([input, pre_hidden], 1)
gate_input = layers.matmul(x=nn, y=weight_1)
gate_input = layers.elementwise_add(gate_input, bias)
i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
i) * layers.tanh(j)
m = layers.tanh(c) * layers.sigmoid(o)
hidden_array[k] = m
cell_array[k] = c
input = m
if dropout != None and dropout > 0.0:
input = layers.dropout(
input,
dropout_prob=dropout,
dropout_implementation='upscale_in_train')
res.append(layers.reshape(input, shape=[1, -1, hidden_size]))
real_res = layers.concat(res, 0)
real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
last_hidden = layers.concat(hidden_array, 1)
last_hidden = layers.reshape(
last_hidden, shape=[-1, num_layers, hidden_size])
last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])
last_cell = layers.concat(cell_array, 1)
last_cell = layers.reshape(
last_cell, shape=[-1, num_layers, hidden_size])
last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])
return real_res, last_hidden, last_cell
x = layers.data(name="x", shape=[-1, 1, 1], dtype='int64')
y = layers.data(name="y", shape=[-1, 1], dtype='float32')
init_hidden = layers.data(name="init_hidden", shape=[1], dtype='float32')
init_cell = layers.data(name="init_cell", shape=[1], dtype='float32')
init_hidden = layers.reshape(
init_hidden, shape=[num_layers, -1, hidden_size])
init_cell = layers.reshape(init_cell, shape=[num_layers, -1, hidden_size])
x_emb = layers.embedding(
input=x,
size=[vocab_size, hidden_size],
dtype='float32',
is_sparse=True,
param_attr=fluid.ParamAttr(
name='embedding_para',
initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale)))
x_emb = layers.reshape(x_emb, shape=[-1, num_steps, hidden_size])
if dropout != None and dropout > 0.0:
x_emb = layers.dropout(
x_emb,
dropout_prob=dropout,
dropout_implementation='upscale_in_train')
rnn_out, last_hidden, last_cell = padding_rnn(
x_emb, len=num_steps, init_hidden=init_hidden, init_cell=init_cell)
rnn_out = layers.reshape(rnn_out, shape=[-1, num_steps, hidden_size])
softmax_weight = layers.create_parameter([hidden_size, vocab_size], dtype="float32", name="softmax_weight", \
default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))
softmax_bias = layers.create_parameter([vocab_size], dtype="float32", name='softmax_bias', \
default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))
projection = layers.matmul(rnn_out, softmax_weight)
projection = layers.elementwise_add(projection, softmax_bias)
projection = layers.reshape(projection, shape=[-1, vocab_size])
#y = layers.reshape( y, shape=[-1, vocab_size])
loss = layers.softmax_with_cross_entropy(
logits=projection, label=y, soft_label=False)
loss = layers.reshape(loss, shape=[-1, num_steps])
loss = layers.reduce_mean(loss, dim=[0])
loss = layers.reduce_sum(loss)
loss.permissions = True
feeding_list = ['x', 'y', 'init_hidden', 'init_cell']
return loss, last_hidden, last_cell, feeding_list
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for parsing PTB text files."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os
import sys
import numpy as np
Py3 = sys.version_info[0] == 3
def _read_words(filename):
data = []
with open(filename, "r") as f:
return f.read().decode("utf-8").replace("\n", "<eos>").split()
def _build_vocab(filename):
data = _read_words(filename)
counter = collections.Counter(data)
count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*count_pairs))
print("vocab word num", len(words))
word_to_id = dict(zip(words, range(len(words))))
return word_to_id
def _file_to_word_ids(filename, word_to_id):
data = _read_words(filename)
return [word_to_id[word] for word in data if word in word_to_id]
def ptb_raw_data(data_path=None):
"""Load PTB raw data from data directory "data_path".
Reads PTB text files, converts strings to integer ids,
and performs mini-batching of the inputs.
The PTB dataset comes from Tomas Mikolov's webpage:
http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
Args:
data_path: string path to the directory where simple-examples.tgz has
been extracted.
Returns:
tuple (train_data, valid_data, test_data, vocabulary)
where each of the data objects can be passed to PTBIterator.
"""
train_path = os.path.join(data_path, "ptb.train.txt")
#train_path = os.path.join(data_path, "train.fake")
valid_path = os.path.join(data_path, "ptb.valid.txt")
test_path = os.path.join(data_path, "ptb.test.txt")
word_to_id = _build_vocab(train_path)
train_data = _file_to_word_ids(train_path, word_to_id)
valid_data = _file_to_word_ids(valid_path, word_to_id)
test_data = _file_to_word_ids(test_path, word_to_id)
vocabulary = len(word_to_id)
return train_data, valid_data, test_data, vocabulary
def get_data_iter(raw_data, batch_size, num_steps):
data_len = len(raw_data)
raw_data = np.asarray(raw_data, dtype="int64")
#print( "raw", raw_data[:20] )
batch_len = data_len // batch_size
data = raw_data[0:batch_size * batch_len].reshape((batch_size, batch_len))
#h = data.reshape( (-1))
#print( "h", h[:20])
epoch_size = (batch_len - 1) // num_steps
for i in range(epoch_size):
start = i * num_steps
#print( i * num_steps )
x = np.copy(data[:, i * num_steps:(i + 1) * num_steps])
y = np.copy(data[:, i * num_steps + 1:(i + 1) * num_steps + 1])
yield (x, y)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import time
import os
import random
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.framework as framework
from paddle.fluid.executor import Executor
import reader
import sys
if sys.version[0] == '2':
reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append('..')
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
from args import *
import lm_model
import logging
import pickle
SEED = 123
def get_current_model_para(train_prog, train_exe):
param_list = train_prog.block(0).all_parameters()
param_name_list = [p.name for p in param_list]
vals = {}
for p_name in param_name_list:
p_array = np.array(fluid.global_scope().find_var(p_name).get_tensor())
vals[p_name] = p_array
return vals
def save_para_npz(train_prog, train_exe):
print("begin to save model to model_base")
param_list = train_prog.block(0).all_parameters()
param_name_list = [p.name for p in param_list]
vals = {}
for p_name in param_name_list:
p_array = np.array(fluid.global_scope().find_var(p_name).get_tensor())
vals[p_name] = p_array
emb = vals["embedding_para"]
print("begin to save model to model_base")
np.savez("mode_base", **vals)
def train():
args = parse_args()
model_type = args.model_type
logger = logging.getLogger("lm")
logger.setLevel(logging.INFO)
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s')
if args.enable_ce:
fluid.default_startup_program().random_seed = SEED
if args.log_path:
file_handler = logging.FileHandler(args.log_path)
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
else:
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
logger.info('Running with args : {}'.format(args))
vocab_size = 10000
if model_type == "test":
num_layers = 1
batch_size = 2
hidden_size = 10
num_steps = 3
init_scale = 0.1
max_grad_norm = 5.0
epoch_start_decay = 1
max_epoch = 1
dropout = 0.0
lr_decay = 0.5
base_learning_rate = 1.0
elif model_type == "small":
num_layers = 2
batch_size = 20
hidden_size = 200
num_steps = 20
init_scale = 0.1
max_grad_norm = 5.0
epoch_start_decay = 4
max_epoch = 13
dropout = 0.0
lr_decay = 0.5
base_learning_rate = 1.0
elif model_type == "medium":
num_layers = 2
batch_size = 20
hidden_size = 650
num_steps = 35
init_scale = 0.05
max_grad_norm = 5.0
epoch_start_decay = 6
max_epoch = 39
dropout = 0.5
lr_decay = 0.8
base_learning_rate = 1.0
elif model_type == "large":
num_layers = 2
batch_size = 20
hidden_size = 1500
num_steps = 35
init_scale = 0.04
max_grad_norm = 10.0
epoch_start_decay = 14
max_epoch = 55
dropout = 0.65
lr_decay = 1.0 / 1.15
base_learning_rate = 1.0
else:
print("model type not support")
return
# Training process
loss, last_hidden, last_cell, feed_order = lm_model.lm_model(
hidden_size,
vocab_size,
batch_size,
num_layers=num_layers,
num_steps=num_steps,
init_scale=init_scale,
dropout=dropout)
# clone from default main program and use it as the validation program
main_program = fluid.default_main_program()
inference_program = fluid.default_main_program().clone(for_test=True)
fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
clip_norm=max_grad_norm))
learning_rate = fluid.layers.create_global_var(
name="learning_rate",
shape=[1],
value=1.0,
dtype='float32',
persistable=True)
optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
optimizer.minimize(loss)
place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
exe = Executor(place)
exe.run(framework.default_startup_program())
data_path = args.data_path
print("begin to load data")
raw_data = reader.ptb_raw_data(data_path)
print("finished load data")
train_data, valid_data, test_data, _ = raw_data
def prepare_input(batch, init_hidden, init_cell, epoch_id=0, with_lr=True):
x, y = batch
new_lr = base_learning_rate * (lr_decay**max(
epoch_id + 1 - epoch_start_decay, 0.0))
lr = np.ones((1), dtype='float32') * new_lr
res = {}
x = x.reshape((-1, num_steps, 1))
y = y.reshape((-1, 1))
res['x'] = x
res['y'] = y
res['init_hidden'] = init_hidden
res['init_cell'] = init_cell
if with_lr:
res['learning_rate'] = lr
return res
def eval(data):
# when eval the batch_size set to 1
eval_data_iter = reader.get_data_iter(data, 1, num_steps)
total_loss = 0.0
iters = 0
init_hidden = np.zeros((num_layers, 1, hidden_size), dtype='float32')
init_cell = np.zeros((num_layers, 1, hidden_size), dtype='float32')
for batch_id, batch in enumerate(eval_data_iter):
input_data_feed = prepare_input(
batch, init_hidden, init_cell, epoch_id, with_lr=False)
fetch_outs = exe.run(
inference_program,
feed=input_data_feed,
fetch_list=[loss.name, last_hidden.name, last_cell.name])
cost_train = np.array(fetch_outs[0])
init_hidden = np.array(fetch_outs[1])
init_cell = np.array(fetch_outs[2])
total_loss += cost_train
iters += num_steps
ppl = np.exp(total_loss / iters)
return ppl
# get train epoch size
batch_len = len(train_data) // batch_size
epoch_size = (batch_len - 1) // num_steps
log_interval = epoch_size // 10
total_time = 0.0
for epoch_id in range(max_epoch):
start_time = time.time()
print("epoch id", epoch_id)
train_data_iter = reader.get_data_iter(train_data, batch_size,
num_steps)
total_loss = 0
init_hidden = None
init_cell = None
#debug_para(fluid.framework.default_main_program(), parallel_executor)
total_loss = 0
iters = 0
init_hidden = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
init_cell = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
for batch_id, batch in enumerate(train_data_iter):
input_data_feed = prepare_input(
batch, init_hidden, init_cell, epoch_id=epoch_id)
fetch_outs = exe.run(feed=input_data_feed,
fetch_list=[
loss.name, last_hidden.name,
last_cell.name, 'learning_rate'
])
cost_train = np.array(fetch_outs[0])
init_hidden = np.array(fetch_outs[1])
init_cell = np.array(fetch_outs[2])
lr = np.array(fetch_outs[3])
total_loss += cost_train
iters += num_steps
if batch_id > 0 and batch_id % log_interval == 0:
ppl = np.exp(total_loss / iters)
print("ppl ", batch_id, ppl[0], lr[0])
ppl = np.exp(total_loss / iters)
if epoch_id == 0 and ppl[0] > 1000:
# for bad init, after first epoch, the loss is over 1000
# no more need to continue
return
end_time = time.time()
total_time += end_time - start_time
print("train ppl", ppl[0])
if epoch_id == max_epoch - 1 and args.enable_ce:
print("lstm_language_model_duration\t%s" % (total_time / max_epoch))
print("lstm_language_model_loss\t%s" % ppl[0])
model_path = os.path.join("model_new/", str(epoch_id))
if not os.path.isdir(model_path):
os.makedirs(model_path)
fluid.io.save_persistables(
executor=exe, dirname=model_path, main_program=main_program)
valid_ppl = eval(valid_data)
print("valid ppl", valid_ppl[0])
test_ppl = eval(test_data)
print("test ppl", test_ppl[0])
if __name__ == '__main__':
train()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册