Unverified commit 124aa9a1, authored by lujun, committed by GitHub

Merge pull request #4 from PaddlePaddle/develop

merge to local
......@@ -8,15 +8,11 @@ PaddlePaddle provides a rich set of computational units to enable users to adopt
- [fluid models](fluid): use PaddlePaddle's Fluid APIs. We especially recommend users to use Fluid models.
- [legacy models](legacy): use PaddlePaddle's v2 APIs.
PaddlePaddle provides a rich set of computational units so that users can take a modular approach to a wide range of learning problems. In this repo we show how to solve common machine learning tasks with PaddlePaddle, offering a number of easy-to-learn, easy-to-use neural network models.
- [fluid models](fluid): use PaddlePaddle's Fluid APIs. We especially recommend users to use Fluid models.
- [legacy models](legacy): use PaddlePaddle's v2 APIs.
## License
This tutorial is contributed by [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) and licensed under the [Apache-2.0 license](LICENSE).
......
#!/bin/bash
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
DATASET_PATH=${HOME}/.cache/paddle/dataset/cityscape/
cudaid=${deeplabv3plus:=0} # use 0-th card as default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py \
--batch_size=2 \
--train_crop_size=769 \
--total_step=50 \
--save_weights_path=output1 \
--dataset_path=$DATASET_PATH \
--enable_ce | python _ce.py
cudaid=${deeplabv3plus_m:=0,1,2,3} # use 0,1,2,3 card as default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py \
--batch_size=2 \
--train_crop_size=769 \
--total_step=50 \
--save_weights_path=output4 \
--dataset_path=$DATASET_PATH \
--enable_ce | python _ce.py
# this file is only used for continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi
from kpi import DurationKpi
each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.1, 0, actived=True)
train_loss_card1_kpi = CostKpi('train_loss_card1', 0.05, 0)
each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.1, 0, actived=True)
train_loss_card4_kpi = CostKpi('train_loss_card4', 0.05, 0)
tracking_kpis = [
each_pass_duration_card1_kpi,
train_loss_card1_kpi,
each_pass_duration_card4_kpi,
train_loss_card4_kpi,
]
def parse_log(log):
'''
This method should be implemented by model developers.
Suggestion: each KPI line in the log should be tab-separated as
"kpis\t<kpi_name>\t<kpi_value>", for example:
"
kpis\ttrain_cost\t1.0
kpis\ttest_cost\t1.0
kpis\ttrain_acc\t1.2
"
'''
for line in log.split('\n'):
fs = line.strip().split('\t')
print(fs)
if len(fs) == 3 and fs[0] == 'kpis':
kpi_name = fs[1]
kpi_value = float(fs[2])
yield kpi_name, kpi_value
def log_to_ce(log):
kpi_tracker = {}
for kpi in tracking_kpis:
kpi_tracker[kpi.name] = kpi
for (kpi_name, kpi_value) in parse_log(log):
print(kpi_name, kpi_value)
kpi_tracker[kpi_name].add_record(kpi_value)
kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
log = sys.stdin.read()
log_to_ce(log)
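# Usage sketch (illustrative, not part of the original file): piping a captured
# training log through parse_log() yields (name, value) pairs only for lines
# shaped like "kpis\t<name>\t<value>", e.g.:
#   list(parse_log("kpis\ttrain_loss_card1\t0.5\nsome noise line"))
#   # -> [('train_loss_card1', 0.5)]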
......@@ -34,6 +34,10 @@ def add_arguments():
add_argument('parallel', bool, False, "using ParallelExecutor.")
add_argument('use_gpu', bool, True, "Whether use GPU or CPU.")
add_argument('num_classes', int, 19, "Number of classes.")
parser.add_argument(
'--enable_ce',
action='store_true',
help='If set, run the task with continuous evaluation logs.')
def load_model():
......@@ -51,7 +55,10 @@ def load_model():
else:
if args.num_classes == 19:
fluid.io.load_params(
exe, dirname=args.init_weights_path, main_program=tp)
exe,
dirname="",
filename=args.init_weights_path,
main_program=tp)
else:
fluid.io.load_vars(
exe, dirname="", filename=args.init_weights_path, vars=myvars)
......@@ -84,6 +91,15 @@ def loss(logit, label):
return loss, label_nignore
def get_cards(args):
if args.enable_ce:
cards = os.environ.get('CUDA_VISIBLE_DEVICES')
num = len(cards.split(","))
return num
else:
return args.num_devices
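# Illustration (assumes the env var is set): with CUDA_VISIBLE_DEVICES="0,1,2,3"
# and --enable_ce on, get_cards(args) returns 4; otherwise args.num_devices is used.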
CityscapeDataset = reader.CityscapeDataset
parser = argparse.ArgumentParser()
......@@ -99,6 +115,13 @@ deeplabv3p = models.deeplabv3p
sp = fluid.Program()
tp = fluid.Program()
# only for ce
if args.enable_ce:
SEED = 102
sp.random_seed = SEED
tp.random_seed = SEED
crop_size = args.train_crop_size
batch_size = args.batch_size
image_shape = [crop_size, crop_size]
......@@ -155,7 +178,13 @@ if args.parallel:
batches = dataset.get_batch_generator(batch_size, total_step)
total_time = 0.0
epoch_idx = 0
train_loss = 0
for i, imgs, labels, names in batches:
epoch_idx += 1
begin_time = time.time()
prev_start_time = time.time()
if args.parallel:
retv = exe_p.run(fetch_list=[pred.name, loss_mean.name],
......@@ -167,11 +196,21 @@ for i, imgs, labels, names in batches:
'label': labels},
fetch_list=[pred, loss_mean])
end_time = time.time()
total_time += end_time - begin_time
if i % 100 == 0:
print("Model is saved to", args.save_weights_path)
save_model()
print("step {:d}, loss: {:.6f}, step_time_cost: {:.3f}".format(
i, np.mean(retv[1]), end_time - prev_start_time))
# only for ce
train_loss = np.mean(retv[1])
if args.enable_ce:
gpu_num = get_cards(args)
print("kpis\teach_pass_duration_card%s\t%s" %
(gpu_num, total_time / epoch_idx))
print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, train_loss))
print("Training done. Model is saved to", args.save_weights_path)
save_model()
#!/bin/bash
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
cudaid=${face_detection:=0} # use 0-th card as default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py --batch_size=2 --epoc_num=1 --batch_num=200 --parallel=False --enable_ce | python _ce.py
cudaid=${face_detection_m:=0,1,2,3} # use 0,1,2,3 card as default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py --batch_size=8 --epoc_num=1 --batch_num=200 --parallel=False --enable_ce | python _ce.py
# this file is only used for continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi
from kpi import DurationKpi
each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.08, 0, actived=True)
train_face_loss_card1_kpi = CostKpi('train_face_loss_card1', 0.08, 0)
train_head_loss_card1_kpi = CostKpi('train_head_loss_card1', 0.08, 0)
each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.08, 0, actived=True)
train_face_loss_card4_kpi = CostKpi('train_face_loss_card4', 0.08, 0)
train_head_loss_card4_kpi = CostKpi('train_head_loss_card4', 0.08, 0)
tracking_kpis = [
each_pass_duration_card1_kpi,
train_face_loss_card1_kpi,
train_head_loss_card1_kpi,
each_pass_duration_card4_kpi,
train_face_loss_card4_kpi,
train_head_loss_card4_kpi,
]
def parse_log(log):
'''
This method should be implemented by model developers.
Suggestion: each KPI line in the log should be tab-separated as
"kpis\t<kpi_name>\t<kpi_value>", for example:
"
kpis\ttrain_cost\t1.0
kpis\ttest_cost\t1.0
kpis\ttrain_acc\t1.2
"
'''
for line in log.split('\n'):
fs = line.strip().split('\t')
print(fs)
if len(fs) == 3 and fs[0] == 'kpis':
kpi_name = fs[1]
kpi_value = float(fs[2])
yield kpi_name, kpi_value
def log_to_ce(log):
kpi_tracker = {}
for kpi in tracking_kpis:
kpi_tracker[kpi.name] = kpi
for (kpi_name, kpi_value) in parse_log(log):
print(kpi_name, kpi_value)
kpi_tracker[kpi_name].add_record(kpi_value)
kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
log = sys.stdin.read()
log_to_ce(log)
......@@ -32,6 +32,9 @@ add_arg('mean_BGR', str, '104., 117., 123.', "Mean value for B,G,R cha
add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.")
add_arg('pretrained_model', str, './vgg_ilsvrc_16_fc_reduced/', "The init model path.")
add_arg('data_dir', str, 'data', "The base dir of dataset")
parser.add_argument('--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
parser.add_argument('--batch_num', type=int, help="batch num for ce")
parser.add_argument('--num_devices', type=int, default=1, help='Number of GPU devices')
#yapf: enable
train_parameters = {
......@@ -119,6 +122,16 @@ def train(args, config, train_params, train_file_list):
startup_prog = fluid.Program()
train_prog = fluid.Program()
#only for ce
if args.enable_ce:
SEED = 102
startup_prog.random_seed = SEED
train_prog.random_seed = SEED
num_workers = 1
pretrained_model = ""
if args.batch_num is not None:
iters_per_epoc = args.batch_num
train_py_reader, fetches, loss = build_program(
train_params = train_params,
main_prog = train_prog,
......@@ -171,7 +184,12 @@ def train(args, config, train_params, train_file_list):
train_py_reader.start()
try:
total_time = 0.0
epoch_idx = 0
face_loss = 0
head_loss = 0
for pass_id in range(start_epoc, epoc_num):
epoch_idx += 1
start_time = time.time()
prev_start_time = start_time
end_time = 0
......@@ -198,14 +216,38 @@ def train(args, config, train_params, train_file_list):
"time {:.5f}".format(pass_id,
batch_id, fetch_vars[0], fetch_vars[1],
start_time - prev_start_time))
face_loss = fetch_vars[0]
head_loss = fetch_vars[1]
epoch_end_time = time.time()
total_time += epoch_end_time - start_time
if pass_id % 1 == 0 or pass_id == epoc_num - 1:
save_model(str(pass_id), train_prog)
# only for ce
if args.enable_ce:
gpu_num = get_cards(args)
print("kpis\teach_pass_duration_card%s\t%s" %
(gpu_num, total_time / epoch_idx))
print("kpis\ttrain_face_loss_card%s\t%s" %
(gpu_num, face_loss))
print("kpis\ttrain_head_loss_card%s\t%s" %
(gpu_num, head_loss))
except fluid.core.EOFException:
train_py_reader.reset()
except StopIteration:
train_py_reader.reset()
train_py_reader.reset()
def get_cards(args):
if args.enable_ce:
cards = os.environ.get('CUDA_VISIBLE_DEVICES')
num = len(cards.split(","))
return num
else:
return args.num_devices
if __name__ == '__main__':
args = parser.parse_args()
print_arguments(args)
......
#!/bin/bash
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
cudaid=${face_detection:=0} # use 0-th card as default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py --model_save_dir=output/ --data_dir=dataset/coco/ --max_iter=10 --enable_ce --pretrained_model=./imagenet_resnet50_fusebn | python _ce.py
cudaid=${face_detection_m:=0,1,2,3} # use 0,1,2,3 card as default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py --model_save_dir=output/ --data_dir=dataset/coco/ --max_iter=10 --enable_ce --pretrained_model=./imagenet_resnet50_fusebn | python _ce.py
# this file is only used for continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi
from kpi import DurationKpi
each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.08, 0, actived=True)
train_loss_card1_kpi = CostKpi('train_loss_card1', 0.08, 0)
each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.08, 0, actived=True)
train_loss_card4_kpi = CostKpi('train_loss_card4', 0.08, 0)
tracking_kpis = [
each_pass_duration_card1_kpi,
train_loss_card1_kpi,
each_pass_duration_card4_kpi,
train_loss_card4_kpi,
]
def parse_log(log):
'''
This method should be implemented by model developers.
Suggestion: each KPI line in the log should be tab-separated as
"kpis\t<kpi_name>\t<kpi_value>", for example:
"
kpis\ttrain_cost\t1.0
kpis\ttest_cost\t1.0
kpis\ttrain_acc\t1.2
"
'''
for line in log.split('\n'):
fs = line.strip().split('\t')
print(fs)
if len(fs) == 3 and fs[0] == 'kpis':
kpi_name = fs[1]
kpi_value = float(fs[2])
yield kpi_name, kpi_value
def log_to_ce(log):
kpi_tracker = {}
for kpi in tracking_kpis:
kpi_tracker[kpi.name] = kpi
for (kpi_name, kpi_value) in parse_log(log):
print(kpi_name, kpi_value)
kpi_tracker[kpi_name].add_record(kpi_value)
kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
log = sys.stdin.read()
log_to_ce(log)
......@@ -35,7 +35,7 @@ def train():
learning_rate = cfg.learning_rate
image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size]
if cfg.debug:
if cfg.debug or cfg.enable_ce:
fluid.default_startup_program().random_seed = 1000
fluid.default_main_program().random_seed = 1000
import random
......@@ -46,11 +46,14 @@ def train():
devices_num = len(devices.split(","))
total_batch_size = devices_num * cfg.TRAIN.im_per_batch
use_random = True
if cfg.enable_ce:
use_random = False
model = model_builder.FasterRCNN(
add_conv_body_func=resnet.add_ResNet50_conv4_body,
add_roi_box_head_func=resnet.add_ResNet_roi_conv5_head,
use_pyreader=cfg.use_pyreader,
use_random=True)
use_random=use_random)
model.build_model(image_shape)
loss_cls, loss_bbox, rpn_cls_loss, rpn_reg_loss = model.loss()
loss_cls.persistable = True
......@@ -92,16 +95,19 @@ def train():
train_exe = fluid.ParallelExecutor(
use_cuda=bool(cfg.use_gpu), loss_name=loss.name)
shuffle = True
if cfg.enable_ce:
shuffle = False
if cfg.use_pyreader:
train_reader = reader.train(
batch_size=cfg.TRAIN.im_per_batch,
total_batch_size=total_batch_size,
padding_total=cfg.TRAIN.padding_minibatch,
shuffle=True)
shuffle=shuffle)
py_reader = model.py_reader
py_reader.decorate_paddle_reader(train_reader)
else:
train_reader = reader.train(batch_size=total_batch_size, shuffle=True)
train_reader = reader.train(batch_size=total_batch_size, shuffle=shuffle)
feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())
def save_model(postfix):
......@@ -118,6 +124,8 @@ def train():
try:
start_time = time.time()
prev_start_time = start_time
total_time = 0
last_loss = 0
every_pass_loss = []
for iter_id in range(cfg.max_iter):
prev_start_time = start_time
......@@ -131,9 +139,23 @@ def train():
iter_id, lr[0],
smoothed_loss.get_median_value(
), start_time - prev_start_time))
end_time = time.time()
total_time += end_time - start_time
last_loss = np.mean(np.array(losses[0]))
sys.stdout.flush()
if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
save_model("model_iter{}".format(iter_id))
# only for ce
if cfg.enable_ce:
gpu_num = devices_num
epoch_idx = iter_id + 1
loss = last_loss
print("kpis\teach_pass_duration_card%s\t%s" %
(gpu_num, total_time / epoch_idx))
print("kpis\ttrain_loss_card%s\t%s" %
(gpu_num, loss))
except fluid.core.EOFException:
py_reader.reset()
return np.mean(every_pass_loss)
......@@ -142,6 +164,8 @@ def train():
start_time = time.time()
prev_start_time = start_time
start = start_time
total_time = 0
last_loss = 0
every_pass_loss = []
smoothed_loss = SmoothedValue(cfg.log_window)
for iter_id, data in enumerate(train_reader()):
......@@ -154,6 +178,9 @@ def train():
smoothed_loss.add_value(loss_v)
lr = np.array(fluid.global_scope().find_var('learning_rate')
.get_tensor())
end_time = time.time()
total_time += end_time - start_time
last_loss = loss_v
print("Iter {:d}, lr {:.6f}, loss {:.6f}, time {:.5f}".format(
iter_id, lr[0],
smoothed_loss.get_median_value(), start_time - prev_start_time))
......@@ -162,6 +189,16 @@ def train():
save_model("model_iter{}".format(iter_id))
if (iter_id + 1) == cfg.max_iter:
break
# only for ce
if cfg.enable_ce:
gpu_num = devices_num
epoch_idx = iter_id + 1
loss = last_loss
print("kpis\teach_pass_duration_card%s\t%s" %
(gpu_num, total_time / epoch_idx))
print("kpis\ttrain_loss_card%s\t%s" %
(gpu_num, loss))
return np.mean(every_pass_loss)
if cfg.use_pyreader:
......
......@@ -129,6 +129,9 @@ def parse_args():
add_arg('draw_threshold', float, 0.8, "Confidence threshold to draw bbox.")
add_arg('image_path', str, 'data/COCO17/val2017', "The image path used to inference and visualize.")
add_arg('image_name', str, '', "The single image used to inference and visualize.")
# ce
parser.add_argument(
'--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
# yapf: enable
args = parser.parse_args()
file_name = sys.argv[0]
......
......@@ -165,7 +165,8 @@ def train(args):
'conditions': conditions_data},
fetch_list={dg_loss})[0][0]
losses[1].append(dg_loss_n)
t_time += (time.time() - s_time)
batch_time = time.time() - s_time
t_time += batch_time
......@@ -180,8 +181,9 @@ def train(args):
fetch_list={g_img})[0]
total_images = np.concatenate([real_image, generated_images])
fig = plot(total_images)
msg = "Epoch ID={0}\n Batch ID={1}\n D-Loss={2}\n DG-Loss={3}\n gen={4}".format(
pass_id, batch_id, d_loss_n, dg_loss_n, check(generated_images))
msg = "Epoch ID={0}\n Batch ID={1}\n D-Loss={2}\n DG-Loss={3}\n gen={4}\n " \
"Batch_time_cost={5:.2f}".format(
pass_id, batch_id, d_loss_n, dg_loss_n, check(generated_images), batch_time)
print(msg)
plt.title(msg)
plt.savefig(
......
......@@ -187,10 +187,12 @@ def train(args):
fetch_list=[d_A_trainer.d_loss_A],
feed={"input_A": tensor_A,
"fake_pool_A": fake_pool_A})[0]
t_time += (time.time() - s_time)
print("epoch{}; batch{}; g_A_loss: {}; d_B_loss: {}; g_B_loss: {}; d_A_loss: {};".format(
batch_time = time.time() - s_time
t_time += batch_time
print("epoch{}; batch{}; g_A_loss: {}; d_B_loss: {}; g_B_loss: {}; d_A_loss: {}; "
"Batch_time_cost: {:.2f}".format(
epoch, batch_id, g_A_loss[0], d_B_loss[0], g_B_loss[0],
d_A_loss[0]))
d_A_loss[0], batch_time))
losses[0].append(g_A_loss[0])
losses[1].append(d_A_loss[0])
sys.stdout.flush()
......
......@@ -7,6 +7,7 @@ cudaid=${object_detection_cudaid:=0}
export CUDA_VISIBLE_DEVICES=$cudaid
python train.py --batch_size=${BATCH_SIZE} --num_epochs=5 --enable_ce=True --lr_strategy=cosine_decay | python _ce.py
BATCH_SIZE=224
cudaid=${object_detection_cudaid_m:=0, 1, 2, 3}
export CUDA_VISIBLE_DEVICES=$cudaid
python train.py --batch_size=${BATCH_SIZE} --num_epochs=5 --enable_ce=True --lr_strategy=cosine_decay | python _ce.py
......@@ -242,7 +242,7 @@ def train(args):
device_num = subprocess.check_output(['nvidia-smi', '-L']).decode().count('\n')
train_batch_size = args.batch_size / device_num
test_batch_size = 8
test_batch_size = 16
if not args.enable_ce:
train_reader = paddle.batch(
reader.train(), batch_size=train_batch_size, drop_last=True)
......@@ -306,7 +306,7 @@ def train(args):
train_loss = np.array(train_info[0]).mean()
train_acc1 = np.array(train_info[1]).mean()
train_acc5 = np.array(train_info[2]).mean()
train_speed = np.array(train_time).mean() / train_batch_size
train_speed = np.array(train_time).mean() / (train_batch_size * device_num)
test_py_reader.start()
......
......@@ -68,6 +68,7 @@ class GeneratorEnqueuer(object):
try:
task()
except Exception:
traceback.print_exc()
self._stop_event.set()
break
else:
......@@ -75,6 +76,7 @@ class GeneratorEnqueuer(object):
try:
task()
except Exception:
traceback.print_exc()
self._stop_event.set()
break
......
......@@ -176,10 +176,17 @@ def coco(settings, file_list, mode, batch_size, shuffle):
if mode == 'train' and shuffle:
np.random.shuffle(images)
batch_out = []
if '2014' in file_list:
sub_dir = "train2014" if model == "train" else "val2014"
elif '2017' in file_list:
sub_dir = "train2017" if mode == "train" else "val2017"
data_dir = os.path.join(settings.data_dir, sub_dir)
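# Illustrative mapping (hypothetical values): a file_list containing "2017" in
# "train" mode resolves each image under <settings.data_dir>/train2017/.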
for image in images:
image_name = image['file_name']
image_path = os.path.join(settings.data_dir, image_name)
image_path = os.path.join(data_dir, image_name)
if not os.path.exists(image_path):
raise ValueError("%s is not exist, you should specify "
"data path correctly." % image_path)
im = Image.open(image_path)
if im.mode == 'L':
im = im.convert('RGB')
......@@ -242,7 +249,9 @@ def pascalvoc(settings, file_list, mode, batch_size, shuffle):
image_path, label_path = image.split()
image_path = os.path.join(settings.data_dir, image_path)
label_path = os.path.join(settings.data_dir, label_path)
if not os.path.exists(image_path):
raise ValueError("%s is not exist, you should specify "
"data path correctly." % image_path)
im = Image.open(image_path)
if im.mode == 'L':
im = im.convert('RGB')
......@@ -295,7 +304,6 @@ def train(settings,
max_queue=24,
enable_ce=False):
file_list = os.path.join(settings.data_dir, file_list)
if 'coco' in settings.dataset:
generator = coco(settings, file_list, "train", batch_size, shuffle)
else:
......@@ -341,6 +349,9 @@ def test(settings, file_list, batch_size):
def infer(settings, image_path):
def reader():
if not os.path.exists(image_path):
raise ValueError("%s is not exist, you should specify "
"data path correctly." % image_path)
img = Image.open(image_path)
if img.mode == 'L':
img = img.convert('RGB')
......
......@@ -7,8 +7,8 @@ from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE: kpi.py should be shared across models in some way!
train_cost_kpi = CostKpi('train_cost', 0.02, actived=True)
train_duration_kpi = DurationKpi('train_duration', 0.05, actived=True)
train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
train_duration_kpi = DurationKpi('train_duration', 0.05, 0, actived=True)
tracking_kpis = [
train_cost_kpi,
......
......@@ -390,6 +390,8 @@ def train(args):
else:
global_step, last_cost = train_with_feed(global_step)
train_time += time.time() - begin_time
print("Pass {0}, pass_time_cost {1}"
.format(epoch, "%2.2f sec" % (time.time() - begin_time)))
# For internal continuous evaluation
if "CE_MODE_X" in os.environ:
print("kpis train_cost %f" % last_cost)
......
......@@ -3,6 +3,7 @@
import os
import sys
#sys.path.insert(0, os.environ['ceroot'])
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE: kpi.py should be shared across models in some way!
......
......@@ -446,7 +446,9 @@ def train(logger, args):
logger.info('Dev eval result: {}'.format(
bleu_rouge))
pass_end_time = time.time()
time_consumed = pass_end_time - pass_start_time
logger.info('epoch: {0}, epoch_time_cost: {1:.2f}'.format(
pass_id, time_consumed))
logger.info('Evaluating the model after epoch {}'.format(
pass_id))
if brc_data.dev_set is not None:
......@@ -459,7 +461,7 @@ def train(logger, args):
else:
logger.warning(
'No dev set is loaded for evaluation in the dataset!')
time_consumed = pass_end_time - pass_start_time
logger.info('Average train loss for epoch {} is {}'.format(
pass_id, "%.10f" % (1.0 * total_loss / total_num)))
......
......@@ -89,7 +89,7 @@ def train(train_reader,
def train_net():
word_dict, train_reader, test_reader = utils.prepare_data(
"imdb", self_dict=False, batch_size=4, buf_size=50000)
"imdb", self_dict=False, batch_size=128, buf_size=50000)
if sys.argv[1] == "bow":
train(
......
......@@ -6,9 +6,9 @@ export OMP_NUM_THREADS=1
cudaid=${text_matching_on_quora:=0} # use 0-th card as default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce | python _ce.py
FLAGS_benchmark=true python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce --epoch_num=5 | python _ce.py
cudaid=${text_matching_on_quora_m:=0,1,2,3} # use 0,1,2,3 card as default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce | python _ce.py
FLAGS_benchmark=true python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce --epoch_num=5 | python _ce.py
......@@ -7,11 +7,11 @@ from kpi import CostKpi
from kpi import DurationKpi
each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.05, 0, actived=True)
train_avg_cost_card1_kpi = CostKpi('train_avg_cost_card1', 0.2, 0)
each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.08, 0, actived=True)
train_avg_cost_card1_kpi = CostKpi('train_avg_cost_card1', 0.08, 0)
train_avg_acc_card1_kpi = CostKpi('train_avg_acc_card1', 0.02, 0)
each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.05, 0, actived=True)
train_avg_cost_card4_kpi = CostKpi('train_avg_cost_card4', 0.2, 0)
each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.08, 0, actived=True)
train_avg_cost_card4_kpi = CostKpi('train_avg_cost_card4', 0.08, 0)
train_avg_acc_card4_kpi = CostKpi('train_avg_acc_card4', 0.02, 0)
tracking_kpis = [
......
......@@ -34,6 +34,7 @@ parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('--model_name', type=str, default='cdssmNet', help="Which model to train")
parser.add_argument('--config', type=str, default='cdssm_base', help="The global config setting")
parser.add_argument('--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
parser.add_argument('--epoch_num', type=int, help='Number of epochs')
DATA_DIR = os.path.join(os.path.expanduser('~'), '.cache/paddle/dataset')
......@@ -241,6 +242,9 @@ def main():
args = parser.parse_args()
global_config = configs.__dict__[args.config]()
if args.epoch_num is not None:
global_config.epoch_num = args.epoch_num
print("net_name: ", args.model_name)
net = models.__dict__[args.model_name](global_config)
......
......@@ -33,7 +33,7 @@ class CNNEncoder(object):
""" cnn-encoder"""
def __init__(self,
param_name="cnn.w",
param_name="cnn",
win_size=3,
ksize=128,
act='tanh',
......@@ -51,13 +51,15 @@ class CNNEncoder(object):
filter_size=self.win_size,
act=self.act,
pool_type=self.pool_type,
param_attr=str(self.param_name))
param_attr=self.param_name + ".param",
bias_attr=self.param_name + ".bias")
class GrnnEncoder(object):
""" grnn-encoder """
def __init__(self, param_name="grnn.w", hidden_size=128):
def __init__(self, param_name="grnn", hidden_size=128):
self.param_name = param_name
self.hidden_size = hidden_size
......@@ -65,13 +67,15 @@ class GrnnEncoder(object):
fc0 = nn.fc(
input=emb,
size=self.hidden_size * 3,
param_attr=str(str(self.param_name) + "_fc")
)
param_attr=self.param_name + "_fc.w",
bias_attr=False)
gru_h = nn.dynamic_gru(
input=fc0,
size=self.hidden_size,
is_reverse=False,
param_attr=str(self.param_name))
param_attr=self.param_name + ".param",
bias_attr=self.param_name + ".bias")
return nn.sequence_pool(input=gru_h, pool_type='max')
......@@ -139,17 +143,17 @@ class MultiviewSimnet(object):
# lookup embedding for each slot
q_embs = [
nn.embedding(
input=query, size=self.emb_shape, param_attr="emb.w")
input=query, size=self.emb_shape, param_attr="emb")
for query in q_slots
]
pt_embs = [
nn.embedding(
input=title, size=self.emb_shape, param_attr="emb.w")
input=title, size=self.emb_shape, param_attr="emb")
for title in pt_slots
]
nt_embs = [
nn.embedding(
input=title, size=self.emb_shape, param_attr="emb.w")
input=title, size=self.emb_shape, param_attr="emb")
for title in nt_slots
]
......@@ -170,9 +174,9 @@ class MultiviewSimnet(object):
nt_concat = nn.concat(nt_encodes)
# projection of hidden layer
q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w')
pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w')
nt_hid = nn.fc(nt_concat, size=self.hidden_size, param_attr='t_fc.w')
q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w', bias_attr='q_fc.b')
pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w', bias_attr='t_fc.b')
nt_hid = nn.fc(nt_concat, size=self.hidden_size, param_attr='t_fc.w', bias_attr='t_fc.b')
# cosine of hidden layers
cos_pos = nn.cos_sim(q_hid, pt_hid)
......@@ -213,12 +217,12 @@ class MultiviewSimnet(object):
# lookup embedding for each slot
q_embs = [
nn.embedding(
input=query, size=self.emb_shape, param_attr="emb.w")
input=query, size=self.emb_shape, param_attr="emb")
for query in q_slots
]
pt_embs = [
nn.embedding(
input=title, size=self.emb_shape, param_attr="emb.w")
input=title, size=self.emb_shape, param_attr="emb")
for title in pt_slots
]
# encode each embedding field with encoder
......@@ -232,8 +236,8 @@ class MultiviewSimnet(object):
q_concat = nn.concat(q_encodes)
pt_concat = nn.concat(pt_encodes)
# projection of hidden layer
q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w')
pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w')
q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w', bias_attr='q_fc.b')
pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w', bias_attr='t_fc.b')
# cosine of hidden layers
cos = nn.cos_sim(q_hid, pt_hid)
return cos
......@@ -25,6 +25,7 @@ cd data && ./download.sh && cd ..
```bash
python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
```
If you want to use one of the supported third-party vocabularies, set --other_dict_path to the directory where you keep the vocabulary you will use, and enable --with_other_dict to use it.
## Train
The command line options for training can be listed by `python train.py -h`.
......
......@@ -14,6 +14,11 @@ Download dataset:
```bash
cd data && ./download.sh && cd ..
```
If you would like to use our supported third-party vocab, please run:
```bash
wget http://download.tensorflow.org/models/LM_LSTM_CNN/vocab-2016-09-10.txt
```
## Model
This model implements the skip-gram model of word2vec.
......@@ -24,8 +29,10 @@ This model implement a skip-gram model of word2vector.
Preprocess the training data to generate a word dict.
```bash
python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --is_local --dict_path data/1-billion_dict
```
If you would like to use our supported third-party vocab, set --other_dict_path to the directory where you saved the vocab you will use, and turn the --with_other_dict flag on to use it.
## Train
The command line options for training can be listed by `python train.py -h`.
......
......@@ -2,11 +2,9 @@ import time
import os
import paddle.fluid as fluid
import numpy as np
from Queue import PriorityQueue
import logging
import argparse
import preprocess
from sklearn.metrics.pairwise import cosine_similarity
word_to_id = dict()
id_to_word = dict()
......@@ -51,8 +49,8 @@ def parse_args():
'--test_acc',
action='store_true',
required=False,
default=True,
help='if using test_files , (default: True)')
default=False,
help='whether to use test_files (default: False)')
parser.add_argument(
'--test_files_dir',
type=str,
......@@ -85,14 +83,12 @@ def build_test_case_from_file(args, emb):
logger.info("test files list: {}".format(current_list))
test_cases = list()
test_labels = list()
test_case_descs = list()
exclude_lists = list()
for file_dir in current_list:
with open(args.test_files_dir + "/" + file_dir, 'r') as f:
count = 0
for line in f:
if count == 0:
pass
elif ':' in line:
if ':' in line:
logger.info("{}".format(line))
pass
else:
......@@ -102,14 +98,15 @@ def build_test_case_from_file(args, emb):
line.split()[2]]]
test_case_desc = line.split()[0] + " - " + line.split()[
1] + " + " + line.split()[2] + " = " + line.split()[3]
test_cases.append([test_case, test_case_desc])
test_cases.append(test_case)
test_case_descs.append(test_case_desc)
test_labels.append(word_to_id[line.split()[3]])
exclude_lists.append([
word_to_id[line.split()[0]],
word_to_id[line.split()[1]], word_to_id[line.split()[2]]
])
count += 1
return test_cases, test_labels, exclude_lists
test_cases = norm(np.array(test_cases))
return test_cases, test_case_descs, test_labels, exclude_lists
def build_small_test_case(emb):
......@@ -133,8 +130,27 @@ def build_small_test_case(emb):
'deeper']]
desc5 = "old - older + deeper = deep"
label5 = word_to_id["deep"]
return [[emb1, desc1], [emb2, desc2], [emb3, desc3], [emb4, desc4],
[emb5, desc5]], [label1, label2, label3, label4, label5]
emb6 = emb[word_to_id['boy']]
desc6 = "boy"
label6 = word_to_id["boy"]
emb7 = emb[word_to_id['king']]
desc7 = "king"
label7 = word_to_id["king"]
emb8 = emb[word_to_id['sun']]
desc8 = "sun"
label8 = word_to_id["sun"]
emb9 = emb[word_to_id['key']]
desc9 = "key"
label9 = word_to_id["key"]
test_cases = [emb1, emb2, emb3, emb4, emb5, emb6, emb7, emb8, emb9]
test_case_desc = [
desc1, desc2, desc3, desc4, desc5, desc6, desc7, desc8, desc9
]
test_labels = [
label1, label2, label3, label4, label5, label6, label7, label8, label9
]
return norm(np.array(test_cases)), test_case_desc, test_labels
def build_test_case(args, emb):
......@@ -144,86 +160,80 @@ def build_test_case(args, emb):
return build_small_test_case(emb)
def norm(x):
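"""L2-normalize each row of x; afterwards np.dot(a, b.T) directly yields
cosine similarities. Sketch (not in the original): norm(np.array([[3., 4.]]))
-> [[0.6, 0.8]], whose self dot product is 1.0."""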
y = np.linalg.norm(x, axis=1, keepdims=True)
return x / y
def inference_test(scope, model_dir, args):
BuildWord_IdMap(args.dict_path)
logger.info("model_dir is: {}".format(model_dir + "/"))
emb = np.array(scope.find_var("embeding").get_tensor())
x = norm(emb)
logger.info("inference result: ====================")
test_cases = list()
test_cases = None
test_case_desc = list()
test_labels = list()
exclude_lists = list()
if args.test_acc:
test_cases, test_labels, exclude_lists = build_test_case(args, emb)
test_cases, test_case_desc, test_labels, exclude_lists = build_test_case(
args, emb)
else:
test_cases, test_labels = build_test_case(args, emb)
test_cases, test_case_desc, test_labels = build_test_case(args, emb)
exclude_lists = [[-1]]
accual_rank = 1 if args.test_acc else args.rank_num
correct_num = 0
cosine_similarity_matrix = np.dot(test_cases, x.T)
results = topKs(accual_rank, cosine_similarity_matrix, exclude_lists,
args.test_acc)
for i in range(len(test_labels)):
pq = None
if args.test_acc:
pq = topK(
accual_rank,
emb,
test_cases[i][0],
exclude_lists[i],
is_acc=True)
else:
pq = pq = topK(
accual_rank,
emb,
test_cases[i][0],
exclude_lists[0],
is_acc=False)
logger.info("Test result for {}".format(test_cases[i][1]))
logger.info("Test result for {}".format(test_case_desc[i]))
result = results[i]
for j in range(accual_rank):
pq_tmps = pq.get()
if (j == accual_rank - 1) and (
pq_tmps.id == test_labels[i]
): # if the nearest word is what we want
if result[j][1] == test_labels[
i]: # if the nearest word is what we want
correct_num += 1
logger.info("{} nearest is {}, rate is {}".format(
accual_rank - j, id_to_word[pq_tmps.id], pq_tmps.priority))
acc = correct_num / len(test_labels)
logger.info("Test acc is: {}, there are {} / {}}".format(acc, correct_num,
len(test_labels)))
logger.info("{} nearest is {}, rate is {}".format(j, id_to_word[
result[j][1]], result[j][0]))
logger.info("Test acc is: {}, there are {} / {}".format(correct_num / len(
test_labels), correct_num, len(test_labels)))
class PQ_Entry(object):
def __init__(self, cos_similarity, id):
self.priority = cos_similarity
self.id = id
def __cmp__(self, other):
return cmp(self.priority, other.priority)
def topK(k, cosine_similarity_list, exclude_list, is_acc=False):
if k == 1 and is_acc: # accelerate acc calculate
max = cosine_similarity_list[0]
id = 0
for i in range(len(cosine_similarity_list)):
if cosine_similarity_list[i] >= max and (i not in exclude_list):
max = cosine_similarity_list[i]
id = i
else:
pass
return [[max, id]]
else:
result = list()
result_index = np.argpartition(cosine_similarity_list, -k)[-k:]
for index in result_index:
result.append([cosine_similarity_list[index], index])
result.sort(reverse=True)
return result
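# Illustration (assumed inputs): topK(2, np.array([0.1, 0.9, 0.4]), [], False)
# -> [[0.9, 1], [0.4, 2]]: the two largest similarities with their word ids.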
def topK(k, emb, test_emb, exclude_list, is_acc=False):
pq = PriorityQueue(k + 1)
while not pq.empty():
try:
pq.get(False)
except Empty:
continue
pq.task_done()
if len(emb) <= k:
for i in range(len(emb)):
x = cosine_similarity([emb[i]], [test_emb])
pq.put(PQ_Entry(x, i))
return pq
def topKs(k, cosine_similarity_matrix, exclude_lists, is_acc=False):
results = list()
result_queues = list()
correct_num = 0
for i in range(len(emb)):
if is_acc and (i in exclude_list):
pass
for i in range(cosine_similarity_matrix.shape[0]):
tmp_pq = None
if is_acc:
tmp_pq = topK(k, cosine_similarity_matrix[i], exclude_lists[i],
is_acc)
else:
x = cosine_similarity([emb[i]], [test_emb])
pq_e = PQ_Entry(x, i)
if pq.full():
pq.get()
pq.put(pq_e)
pq.get()
return pq
tmp_pq = topK(k, cosine_similarity_matrix[i], exclude_lists[0],
is_acc)
result_queues.append(tmp_pq)
return result_queues
def infer_during_train(args):
......@@ -235,8 +245,6 @@ def infer_during_train(args):
while True:
time.sleep(60)
current_list = os.listdir(args.model_output_dir)
# logger.info("current_list is : {}".format(current_list))
# logger.info("model_file_list is : {}".format(model_file_list))
if set(model_file_list) == set(current_list):
if solved_new:
solved_new = False
......@@ -271,6 +279,8 @@ def infer_once(args):
fluid.io.load_persistables(
executor=exe, dirname=args.model_output_dir + "/")
inference_test(Scope, args.model_output_dir, args)
else:
logger.info("Wrong Directory or save model failed!")
if __name__ == '__main__':
......
......@@ -95,8 +95,7 @@ def skip_gram_word2vec(dict_size,
capacity=64, feed_list=datas, name='py_reader', use_double_buffer=True)
words = fluid.layers.read_file(py_reader)
emb = fluid.layers.embedding(
target_emb = fluid.layers.embedding(
input=words[0],
is_sparse=is_sparse,
size=[dict_size, embedding_size],
......@@ -104,16 +103,23 @@ def skip_gram_word2vec(dict_size,
name='embeding',
initializer=fluid.initializer.Normal(scale=1 /
math.sqrt(dict_size))))
context_emb = fluid.layers.embedding(
input=words[1],
is_sparse=is_sparse,
size=[dict_size, embedding_size],
param_attr=fluid.ParamAttr(
name='embeding',
initializer=fluid.initializer.Normal(scale=1 /
math.sqrt(dict_size))))
cost, cost_nce, cost_hs = None, None, None
if with_nce:
cost_nce = nce_layer(emb, words[1], embedding_size, dict_size, 5,
cost_nce = nce_layer(target_emb, words[1], embedding_size, dict_size, 5,
"uniform", word_frequencys, None)
cost = cost_nce
if with_hsigmoid:
cost_hs = hsigmoid_layer(emb, words[1], words[2], words[3], dict_size,
is_sparse)
cost_hs = hsigmoid_layer(context_emb, words[0], words[2], words[3],
dict_size, is_sparse)
cost = cost_hs
if with_nce and with_hsigmoid:
cost = fluid.layers.elementwise_add(cost_nce, cost_hs)
......
......@@ -3,6 +3,7 @@
import re
import six
import argparse
import io
prog = re.compile("[^a-z ]", flags=0)
word_count = dict()
......@@ -83,7 +84,6 @@ def native_to_unicode(s):
return _to_unicode(s)
except UnicodeDecodeError:
res = _to_unicode(s, ignore_errors=True)
tf.logging.info("Ignoring Unicode error, outputting: %s" % res)
return res
......@@ -199,34 +199,30 @@ def preprocess(args):
# word to count
if args.with_other_dict:
with open(args.other_dict_path, 'r') as f:
with io.open(args.other_dict_path, 'r', encoding='utf-8') as f:
for line in f:
word_count[native_to_unicode(line.strip())] = 1
if args.is_local:
for i in range(1, 100):
with open(args.data_path + "/news.en-000{:0>2d}-of-00100".format(
i)) as f:
with io.open(
args.data_path + "/news.en-000{:0>2d}-of-00100".format(i),
encoding='utf-8') as f:
for line in f:
line = strip_lines(line)
words = line.split()
for item in words:
if item in word_count:
word_count[item] = word_count[item] + 1
else:
word_count[native_to_unicode('<UNK>')] += 1
# with open(args.data_path + "/tmp.txt") as f:
# for line in f:
# print("line before strip is: {}".format(line))
# line = strip_lines(line, word_count)
# print("line after strip is: {}".format(line))
# words = line.split()
# print("words after split is: {}".format(words))
# for item in words:
# if item in word_count:
# word_count[item] = word_count[item] + 1
# else:
# word_count[item] = 1
if args.with_other_dict:
for item in words:
if item in word_count:
word_count[item] = word_count[item] + 1
else:
word_count[native_to_unicode('<UNK>')] += 1
else:
for item in words:
if item in word_count:
word_count[item] = word_count[item] + 1
else:
word_count[item] = 1
item_to_remove = []
for item in word_count:
if word_count[item] <= args.freq:
......@@ -236,21 +232,17 @@ def preprocess(args):
path_table, path_code, word_code_len = build_Huffman(word_count, 40)
with open(args.dict_path, 'w+') as f:
with io.open(args.dict_path, 'w+', encoding='utf-8') as f:
for k, v in word_count.items():
f.write(k.encode("utf-8") + " " + str(v).encode("utf-8") + '\n')
f.write(k + " " + str(v) + '\n')
with open(args.dict_path + "_ptable", 'w+') as f2:
with io.open(args.dict_path + "_ptable", 'w+', encoding='utf-8') as f2:
for pk, pv in path_table.items():
f2.write(
pk.encode("utf-8") + "\t" + ' '.join((str(x).encode("utf-8")
for x in pv)) + '\n')
f2.write(pk + '\t' + ' '.join((str(x) for x in pv)) + '\n')
with open(args.dict_path + "_pcode", 'w+') as f3:
with io.open(args.dict_path + "_pcode", 'w+', encoding='utf-8') as f3:
for pck, pcv in path_code.items():
f3.write(
pck.encode("utf-8") + "\t" + ' '.join((str(x).encode("utf-8")
for x in pcv)) + '\n')
f3.write(pck + '\t' + ' '.join((str(x) for x in pcv)) + '\n')
if __name__ == "__main__":
......
......@@ -2,14 +2,32 @@
import numpy as np
import preprocess
import logging
import io
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
class NumpyRandomInt(object):
def __init__(self, a, b, buf_size=1000):
self.idx = 0
self.buffer = np.random.random_integers(a, b, buf_size)
self.a = a
self.b = b
def __call__(self):
if self.idx == len(self.buffer):
self.buffer = np.random.random_integers(self.a, self.b,
len(self.buffer))
self.idx = 0
result = self.buffer[self.idx]
self.idx += 1
return result
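# Usage sketch (illustrative): sampler = NumpyRandomInt(1, 5); each sampler()
# call returns an int in [1, 5], refilling the 1000-element buffer only when
# it is exhausted, which is much cheaper than one np.random call per sample.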
class Word2VecReader(object):
def __init__(self,
dict_path,
......@@ -24,6 +42,7 @@ class Word2VecReader(object):
self.num_non_leaf = 0
self.word_to_id_ = dict()
self.id_to_word = dict()
self.word_count = dict()
self.word_to_path = dict()
self.word_to_code = dict()
self.trainer_id = trainer_id
......@@ -33,20 +52,19 @@ class Word2VecReader(object):
word_counts = []
word_id = 0
with open(dict_path, 'r') as f:
with io.open(dict_path, 'r', encoding='utf-8') as f:
for line in f:
line = line.decode(encoding='UTF-8')
word, count = line.split()[0], int(line.split()[1])
self.word_count[word] = count
self.word_to_id_[word] = word_id
self.id_to_word[word_id] = word #build id to word dict
word_id += 1
word_counts.append(count)
word_all_count += count
with open(dict_path + "_word_to_id_", 'w+') as f6:
with io.open(dict_path + "_word_to_id_", 'w+', encoding='utf-8') as f6:
for k, v in self.word_to_id_.items():
f6.write(
k.encode("utf-8") + " " + str(v).encode("utf-8") + '\n')
f6.write(k + " " + str(v) + '\n')
self.dict_size = len(self.word_to_id_)
self.word_frequencys = [
......@@ -55,22 +73,22 @@ class Word2VecReader(object):
print("dict_size = " + str(
self.dict_size)) + " word_all_count = " + str(word_all_count)
with open(dict_path + "_ptable", 'r') as f2:
with io.open(dict_path + "_ptable", 'r', encoding='utf-8') as f2:
for line in f2:
self.word_to_path[line.split("\t")[0]] = np.fromstring(
self.word_to_path[line.split('\t')[0]] = np.fromstring(
line.split('\t')[1], dtype=int, sep=' ')
self.num_non_leaf = np.fromstring(
line.split('\t')[1], dtype=int, sep=' ')[0]
print("word_ptable dict_size = " + str(len(self.word_to_path)))
with open(dict_path + "_pcode", 'r') as f3:
with io.open(dict_path + "_pcode", 'r', encoding='utf-8') as f3:
for line in f3:
line = line.decode(encoding='UTF-8')
self.word_to_code[line.split("\t")[0]] = np.fromstring(
self.word_to_code[line.split('\t')[0]] = np.fromstring(
line.split('\t')[1], dtype=int, sep=' ')
print("word_pcode dict_size = " + str(len(self.word_to_code)))
self.random_generator = NumpyRandomInt(1, self.window_size_ + 1)
def get_context_words(self, words, idx, window_size):
def get_context_words(self, words, idx):
"""
Get the context word list of target word.
......@@ -78,31 +96,34 @@ class Word2VecReader(object):
idx: input word index
"""
target_window = np.random.randint(1, window_size + 1)
# need to keep in mind that maybe there are not enough words before the target word.
start_point = idx - target_window if (idx - target_window) > 0 else 0
target_window = self.random_generator()
start_point = idx - target_window # if (idx - target_window) > 0 else 0
if start_point < 0:
start_point = 0
end_point = idx + target_window
# context words of the target word
targets = set(words[start_point:idx] + words[idx + 1:end_point + 1])
return list(targets)
targets = words[start_point:idx] + words[idx + 1:end_point + 1]
return set(targets)
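# Example (illustrative values): for words=[5, 6, 7, 8, 9], idx=2 and a sampled
# target_window of 2, this returns the context set {5, 6, 8, 9}.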
def train(self, with_hs):
def _reader():
for file in self.filelist:
with open(self.data_path_ + "/" + file, 'r') as f:
with io.open(
self.data_path_ + "/" + file, 'r',
encoding='utf-8') as f:
logger.info("running data in {}".format(self.data_path_ +
"/" + file))
count = 1
for line in f:
if self.trainer_id == count % self.trainer_num:
line = preprocess.strip_lines(line)
line = preprocess.strip_lines(line, self.word_count)
word_ids = [
self.word_to_id_[word] for word in line.split()
if word in self.word_to_id_
]
for idx, target_id in enumerate(word_ids):
context_word_ids = self.get_context_words(
word_ids, idx, self.window_size_)
word_ids, idx)
for context_id in context_word_ids:
yield [target_id], [context_id]
else:
......@@ -111,26 +132,28 @@ class Word2VecReader(object):
def _reader_hs():
for file in self.filelist:
with open(self.data_path_ + "/" + file, 'r') as f:
with io.open(
self.data_path_ + "/" + file, 'r',
encoding='utf-8') as f:
logger.info("running data in {}".format(self.data_path_ +
"/" + file))
count = 1
for line in f:
if self.trainer_id == count % self.trainer_num:
line = preprocess.strip_lines(line)
line = preprocess.strip_lines(line, self.word_count)
word_ids = [
self.word_to_id_[word] for word in line.split()
if word in self.word_to_id_
]
for idx, target_id in enumerate(word_ids):
context_word_ids = self.get_context_words(
word_ids, idx, self.window_size_)
word_ids, idx)
for context_id in context_word_ids:
yield [target_id], [context_id], [
self.word_to_code[self.id_to_word[
self.word_to_path[self.id_to_word[
target_id]]
], [
self.word_to_path[self.id_to_word[
self.word_to_code[self.id_to_word[
target_id]]
]
else:
......@@ -144,13 +167,20 @@ class Word2VecReader(object):
if __name__ == "__main__":
window_size = 10
window_size = 5
reader = Word2VecReader(
"./data/1-billion_dict",
"./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/",
["news.en-00001-of-00100"], 0, 1)
reader = Word2VecReader("data/enwik9_dict", "data/enwik9", window_size)
i = 0
for x, y in reader.train()():
# print(reader.train(True))
for x, y, z, f in reader.train(True)():
print("x: " + str(x))
print("y: " + str(y))
print("path: " + str(z))
print("code: " + str(f))
print("\n")
if i == 10:
exit(0)
......
......@@ -12,7 +12,7 @@ os.environ["CUDA_VISIBLE_DEVICES"] = ""
import paddle
import paddle.fluid as fluid
from paddle.fluid.executor import global_scope
import six
import reader
from network_conf import skip_gram_word2vec
from infer import inference_test
......@@ -29,7 +29,7 @@ def parse_args():
'--train_data_path',
type=str,
default='./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled',
help="The path of training dataset")
help="The path of taining dataset")
parser.add_argument(
'--dict_path',
type=str,
......@@ -43,7 +43,7 @@ def parse_args():
parser.add_argument(
'--batch_size',
type=int,
default=100,
default=1000,
help="The size of mini-batch (default:100)")
parser.add_argument(
'--num_passes',
......@@ -125,14 +125,44 @@ def parse_args():
return parser.parse_args()
def convert_python_to_tensor(batch_size, sample_reader, is_hs):
def __reader__():
result = None
if is_hs:
result = [[], [], [], []]
else:
result = [[], []]
for sample in sample_reader():
for i, fea in enumerate(sample):
result[i].append(fea)
if len(result[0]) == batch_size:
tensor_result = []
for tensor in result:
t = fluid.Tensor()
dat = np.array(tensor, dtype='int64')
if len(dat.shape) > 2:
dat = dat.reshape((dat.shape[0], dat.shape[2]))
elif len(dat.shape) == 1:
dat = dat.reshape((-1, 1))
t.set(dat, fluid.CPUPlace())
tensor_result.append(t)
yield tensor_result
if is_hs:
result = [[], [], [], []]
else:
result = [[], []]
return __reader__
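# Shape sketch (assumptions, not from the original): with batch_size=2 each
# yielded item holds fluid.Tensor objects of shape (2, 1) for target/context
# ids, or (2, path_len) for the hsigmoid path/code rows, set on CPUPlace as
# decorate_tensor_provider expects.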
def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.train((args.with_hs or (not args.with_nce))),
buf_size=args.batch_size * 100),
batch_size=args.batch_size)
py_reader.decorate_paddle_reader(train_reader)
py_reader.decorate_tensor_provider(
convert_python_to_tensor(args.batch_size,
reader.train((args.with_hs or (
not args.with_nce))), (args.with_hs or (
not args.with_nce))))
place = fluid.CPUPlace()
......@@ -140,6 +170,7 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
exe.run(fluid.default_startup_program())
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.use_experimental_executor = True
print("CPU_NUM:" + str(os.getenv("CPU_NUM")))
exec_strategy.num_threads = int(os.getenv("CPU_NUM"))
......@@ -161,32 +192,23 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
profiler_step_end = 30
for pass_id in range(args.num_passes):
epoch_start = time.time()
py_reader.start()
time.sleep(10)
epoch_start = time.time()
batch_id = 0
start = time.clock()
try:
while True:
if profiler_step == profiler_step_start:
fluid.profiler.start_profiler(profile_state)
loss_val = train_exe.run(fetch_list=[loss.name])
loss_val = np.mean(loss_val)
if profiler_step == profiler_step_end:
fluid.profiler.stop_profiler('total', 'trainer_profile.log')
profiler_step += 1
else:
profiler_step += 1
if batch_id % 50 == 0:
logger.info(
"TRAIN --> pass: {} batch: {} loss: {} reader queue:{}".
format(pass_id, batch_id,
loss_val.mean() / args.batch_size,
py_reader.queue.size()))
loss_val.mean(), py_reader.queue.size()))
if args.with_speed:
if batch_id % 1000 == 0 and batch_id != 0:
elapsed = (time.clock() - start)
......@@ -256,7 +278,7 @@ def train(args):
optimizer = None
if args.with_Adam:
optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
optimizer = fluid.optimizer.Adam(learning_rate=1e-4, lazy_mode=True)
else:
optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
......
# The models in this directory are no longer maintained and are not recommended. Please use the models under the Fluid directory instead.
# Introduction to models
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://github.com/PaddlePaddle/models)
......