未验证 提交 c2ff52df 编写于 作者: Q Qingsheng Li 提交者: GitHub

Merge branch 'develop' into soft-link

# saved model
output/
# coco and pascalvoc data
data/ILSVRC2012/ILSVRC2012_img_val.tar
data/ILSVRC2012/ILSVRC2012_img_train.tar
data/ILSVRC2012/ImageNet_label.tgz
data/ILSVRC2012/train_list.txt
data/ILSVRC2012/val_list.txt
#!/bin/bash
# This file is only used for continuous evaluation.
cudaid=${object_detection_cudaid:=0}
export CUDA_VISIBLE_DEVICES=$cudaid
python train.py --batch_size=64 --num_passes=10 --total_images=6149 --enable_ce=True | python _ce.py
cudaid=${object_detection_cudaid:=0, 1, 2, 3}
export CUDA_VISIBLE_DEVICES=$cudaid
python train.py --batch_size=64 --num_passes=10 --total_images=6149 --enable_ce=True | python _ce.py
####this file is only used for continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE kpi.py should shared in models in some way!!!!
train_acc_top1_kpi = AccKpi('train_acc_top1', 0.05, 0, desc='TOP1 ACC')
train_acc_top5_kpi = AccKpi(
'train_acc_top5', 0.05, 0, actived=False, desc='TOP5 ACC')
train_cost_kpi = CostKpi('train_cost', 0.3, 0, actived=True, desc='train cost')
test_acc_top1_kpi = AccKpi('test_acc_top1', 0.05, 0, desc='TOP1 ACC')
test_acc_top5_kpi = AccKpi(
'test_acc_top5', 0.05, 0, actived=False, desc='TOP5 ACC')
test_cost_kpi = CostKpi('test_cost', 1.0, 0, actived=True, desc='train cost')
train_speed_kpi = AccKpi(
'train_speed',
0.05,
0,
actived=True,
unit_repr='seconds/image',
desc='train speed in one GPU card')
train_acc_top1_card4_kpi = AccKpi(
'train_acc_top1_card4', 0.05, 0, desc='TOP1 ACC')
train_acc_top5_card4_kpi = AccKpi(
'train_acc_top5_card4', 0.05, 0, actived=False, desc='TOP5 ACC')
train_cost_card4_kpi = CostKpi(
'train_cost_kpi', 0.3, 0, actived=True, desc='train cost')
test_acc_top1_card4_kpi = AccKpi(
'test_acc_top1_card4', 0.05, 0, desc='TOP1 ACC')
test_acc_top5_card4_kpi = AccKpi(
'test_acc_top5_card4', 0.05, 0, actived=False, desc='TOP5 ACC')
test_cost_card4_kpi = CostKpi(
'test_cost_card4', 1.0, 0, actived=True, desc='train cost')
train_speed_card4_kpi = AccKpi(
'train_speed_card4',
0.05,
0,
actived=True,
unit_repr='seconds/image',
desc='train speed in four GPU card')
tracking_kpis = [
train_acc_top1_kpi, train_acc_top5_kpi, train_cost_kpi, test_acc_top1_kpi,
test_acc_top5_kpi, test_cost_kpi, train_speed_kpi, train_acc_top1_card4_kpi,
train_acc_top5_card4_kpi, train_cost_card4_kpi, test_acc_top1_card4_kpi,
test_acc_top5_card4_kpi, test_cost_card4_kpi, train_speed_card4_kpi
]
def parse_log(log):
'''
This method should be implemented by model developers.
The suggestion:
each line in the log should be key, value, for example:
"
train_cost\t1.0
test_cost\t1.0
train_cost\t1.0
train_cost\t1.0
train_acc\t1.2
"
'''
for line in log.split('\n'):
fs = line.strip().split('\t')
print(fs)
if len(fs) == 3 and fs[0] == 'kpis':
print("-----%s" % fs)
kpi_name = fs[1]
kpi_value = float(fs[2])
yield kpi_name, kpi_value
def log_to_ce(log):
kpi_tracker = {}
for kpi in tracking_kpis:
kpi_tracker[kpi.name] = kpi
for (kpi_name, kpi_value) in parse_log(log):
print(kpi_name, kpi_value)
kpi_tracker[kpi_name].add_record(kpi_value)
kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
log = sys.stdin.read()
print("*****")
print log
print("****")
log_to_ce(log)
......@@ -11,6 +11,7 @@ train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"dropout_seed": None,
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
......@@ -101,7 +102,9 @@ class SE_ResNeXt():
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
drop = fluid.layers.dropout(x=pool, dropout_prob=0.5)
# do not set seed when traning, it is only used for debug
drop = fluid.layers.dropout(
x=pool, dropout_prob=0.5, seed=self.params["dropout_seed"])
stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
out = fluid.layers.fc(input=drop,
size=class_dim,
......
......@@ -4,6 +4,7 @@ import time
import sys
import paddle
import paddle.fluid as fluid
import paddle.dataset.flowers as flowers
import models
import reader
import argparse
......@@ -28,6 +29,7 @@ add_arg('checkpoint', str, None, "Whether to resume chec
add_arg('lr', float, 0.1, "set learning rate.")
add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
add_arg('model', str, "SE_ResNeXt50_32x4d", "Set the network to use.")
add_arg('enable_ce', bool, False, "If set True, enable continuous evaluation job.")
# yapf: enable
model_list = [m for m in dir(models) if "__" not in m]
......@@ -100,6 +102,9 @@ def train(args):
# model definition
model = models.__dict__[model_name]()
if args.enable_ce:
assert model_name == "SE_ResNeXt50_32x4d"
if model_name is "GoogleNet":
out0, out1, out2 = model.net(input=image, class_dim=class_dim)
cost0 = fluid.layers.cross_entropy(input=out0, label=label)
......@@ -129,6 +134,8 @@ def train(args):
params["num_epochs"] = args.num_epochs
params["learning_strategy"]["batch_size"] = args.batch_size
params["learning_strategy"]["name"] = args.lr_strategy
if args.enable_ce:
params["dropout_seed"] = 10
# initialize optimizer
optimizer = optimizer_setting(params)
......@@ -137,6 +144,9 @@ def train(args):
if with_memory_optimization:
fluid.memory_optimize(fluid.default_main_program())
if args.enable_ce:
fluid.default_startup_program().random_seed = 1000
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
......@@ -153,8 +163,20 @@ def train(args):
train_batch_size = args.batch_size
test_batch_size = 16
train_reader = paddle.batch(reader.train(), batch_size=train_batch_size)
test_reader = paddle.batch(reader.val(), batch_size=test_batch_size)
if not args.enable_ce:
train_reader = paddle.batch(reader.train(), batch_size=train_batch_size)
test_reader = paddle.batch(reader.val(), batch_size=test_batch_size)
else:
# use flowers dataset for CE and set use_xmap False to avoid disorder data
# but it is time consuming. For faster speed, need another dataset.
import random
random.seed(0)
train_reader = paddle.batch(
flowers.train(use_xmap=False), batch_size=train_batch_size)
test_reader = paddle.batch(
flowers.test(use_xmap=False), batch_size=test_batch_size)
feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
train_exe = fluid.ParallelExecutor(
......@@ -162,9 +184,12 @@ def train(args):
fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]
gpu = os.getenv("CUDA_VISIBLE_DEVICES") or ""
gpu_nums = len(gpu.split(","))
for pass_id in range(params["num_epochs"]):
train_info = [[], [], []]
test_info = [[], [], []]
train_time = []
for batch_id, data in enumerate(train_reader()):
t1 = time.time()
loss, acc1, acc5 = train_exe.run(fetch_list, feed=feeder.feed(data))
......@@ -176,6 +201,7 @@ def train(args):
train_info[0].append(loss)
train_info[1].append(acc1)
train_info[2].append(acc5)
train_time.append(period)
if batch_id % 10 == 0:
print("Pass {0}, trainbatch {1}, loss {2}, \
acc1 {3}, acc5 {4} time {5}"
......@@ -187,6 +213,7 @@ def train(args):
train_loss = np.array(train_info[0]).mean()
train_acc1 = np.array(train_info[1]).mean()
train_acc5 = np.array(train_info[2]).mean()
train_speed = np.array(train_time).mean() / train_batch_size
cnt = 0
for test_batch_id, data in enumerate(test_reader()):
t1 = time.time()
......@@ -226,6 +253,36 @@ def train(args):
os.makedirs(model_path)
fluid.io.save_persistables(exe, model_path)
# This is for continuous evaluation only
if args.enable_ce and pass_id == args.num_epochs - 1:
if gpu_nums == 1:
# Use the last cost/acc for training
print("kpis train_cost %s" % train_loss)
print("kpis train_acc_top1 %s" % train_acc1)
print("kpis train_acc_top5 %s" % train_acc5)
# Use the mean cost/acc for testing
print("kpis test_cost %s" % test_loss)
print("kpis test_acc_top1 %s" % test_acc1)
print("kpis test_acc_top5 %s" % test_acc5)
print("kpis train_speed %s" % train_speed)
else:
# Use the last cost/acc for training
print("kpis train_cost_card%s %s" %
(gpu_nums, train_loss))
print("kpis train_acc_top1_card%s %s" %
(gpu_nums, train_acc1))
print("kpis train_acc_top5_card%s %s" %
(gpu_nums, train_acc5))
# Use the mean cost/acc for testing
print("kpis test_cost_card%s %s" %
(gpu_nums, test_loss))
print("kpis test_acc_top1_card%s %s" %
(gpu_nums, test_acc1))
print("kpis test_acc_top5_card%s %s" %
(gpu_nums, test_acc5))
print("kpis train_speed_card%s %s" %
(gpu_nums, train_speed))
def main():
args = parser.parse_args()
......
......@@ -6,9 +6,9 @@ export OMP_NUM_THREADS=1
cudaid=${language_model:=0} # use 0-th card as default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py | python _ce.py
FLAGS_benchmark=true python train.py --enable_ce | python _ce.py
cudaid=${language_model_m:=0,1,2,3} # use 0,1,2,3 card as default
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py | python _ce.py
FLAGS_benchmark=true python train.py --enable_ce | python _ce.py
......@@ -145,7 +145,7 @@ def train(train_reader,
if pass_idx == pass_num - 1 and args.enable_ce:
#Note: The following logs are special for CE monitoring.
#Other situations do not need to care about these logs.
gpu_num = get_cards()
gpu_num = get_cards(args.enable_ce)
if gpu_num == 1:
print("kpis imikolov_20_pass_duration %s" %
(total_time / epoch_idx))
......
#!/bin/bash
DATA_PATH=$HOME/.cache/paddle/dataset/wmt16
if [ ! -d $DATA_PATH ] ; then
python -c 'import paddle;paddle.dataset.wmt16.train(10000, 10000, "en")'\
'().next()'
tar -zxf $DATA_PATH/wmt16.tar.gz -C $DATA_PATH
fi
train(){
python -u train.py \
--src_vocab_fpath $DATA_PATH/en_10000.dict \
--trg_vocab_fpath $DATA_PATH/de_10000.dict \
--special_token '<s>' '<e>' '<unk>' \
--train_file_pattern $DATA_PATH/wmt16/train \
--val_file_pattern $DATA_PATH/wmt16/val \
--use_token_batch True \
--batch_size 2048 \
--sort_type pool \
--pool_size 10000 \
--enable_ce True \
weight_sharing False \
pass_num 20 \
dropout_seed 10
}
train | python _ce.py
####this file is only used for continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE kpi.py should shared in models in some way!!!!
train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
test_cost_kpi = CostKpi('test_cost', 0.005, 0, actived=True)
train_duration_kpi = DurationKpi('train_duration', 0.06, 0, actived=True)
tracking_kpis = [
train_cost_kpi,
test_cost_kpi,
train_duration_kpi,
]
def parse_log(log):
'''
This method should be implemented by model developers.
The suggestion:
each line in the log should be key, value, for example:
"
train_cost\t1.0
test_cost\t1.0
train_cost\t1.0
train_cost\t1.0
train_acc\t1.2
"
'''
for line in log.split('\n'):
fs = line.strip().split('\t')
print(fs)
if len(fs) == 3 and fs[0] == 'kpis':
print("-----%s" % fs)
kpi_name = fs[1]
kpi_value = float(fs[2])
yield kpi_name, kpi_value
def log_to_ce(log):
kpi_tracker = {}
for kpi in tracking_kpis:
kpi_tracker[kpi.name] = kpi
for (kpi_name, kpi_value) in parse_log(log):
print(kpi_name, kpi_value)
kpi_tracker[kpi_name].add_record(kpi_value)
kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
log = sys.stdin.read()
print("*****")
print(log)
print("****")
log_to_ce(log)
\ No newline at end of file
......@@ -81,6 +81,8 @@ class ModelHyperParams(object):
n_layer = 6
# dropout rate used by all dropout layers.
dropout = 0.1
# random seed used in dropout for CE.
dropout_seed = None
# the flag indicating whether to share embedding and softmax weights.
# vocabularies in source and target should be same for weight sharing.
weight_sharing = True
......
......@@ -111,7 +111,10 @@ def multi_head_attention(queries,
x=weights, shape=product.shape, actual_shape=post_softmax_shape)
if dropout_rate:
weights = layers.dropout(
weights, dropout_prob=dropout_rate, is_test=False)
weights,
dropout_prob=dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
out = layers.matmul(weights, v)
return out
......@@ -171,7 +174,10 @@ def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.):
elif cmd == "d": # add dropout
if dropout_rate:
out = layers.dropout(
out, dropout_prob=dropout_rate, is_test=False)
out,
dropout_prob=dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False)
return out
......@@ -211,7 +217,9 @@ def prepare_encoder(src_word,
shape=[batch_size, seq_len, src_emb_dim],
actual_shape=src_data_shape)
return layers.dropout(
enc_input, dropout_prob=dropout_rate,
enc_input,
dropout_prob=dropout_rate,
seed=ModelHyperParams.dropout_seed,
is_test=False) if dropout_rate else enc_input
......
......@@ -103,6 +103,12 @@ def parse_args():
help="The device type.")
parser.add_argument(
'--sync', type=ast.literal_eval, default=True, help="sync mode.")
parser.add_argument(
"--enable_ce",
type=ast.literal_eval,
default=True,
help="The flag indicating whether to run the task "
"for continuous evaluation.")
args = parser.parse_args()
# Append args related to dict
......@@ -382,6 +388,12 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
data_input_names, util_input_names, sum_cost,
token_num)
# the best cross-entropy value with label smoothing
loss_normalizer = -((1. - TrainTaskConfig.label_smooth_eps) * np.log(
(1. - TrainTaskConfig.label_smooth_eps
)) + TrainTaskConfig.label_smooth_eps *
np.log(TrainTaskConfig.label_smooth_eps / (
ModelHyperParams.trg_vocab_size - 1) + 1e-20))
init = False
for pass_id in xrange(TrainTaskConfig.pass_num):
pass_start_time = time.time()
......@@ -421,19 +433,27 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
) # sum the cost from multi-devices
total_token_num = token_num_val.sum()
total_avg_cost = total_sum_cost / total_token_num
print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
(pass_id, batch_id, total_sum_cost, total_avg_cost,
np.exp([min(total_avg_cost, 100)])))
print("epoch: %d, batch: %d, avg loss: %f, normalized loss: %f,"
" ppl: %f" % (pass_id, batch_id, total_avg_cost,
total_avg_cost - loss_normalizer,
np.exp([min(total_avg_cost, 100)])))
if batch_id > 0 and batch_id % 1000 == 0:
fluid.io.save_persistables(
exe,
os.path.join(TrainTaskConfig.ckpt_dir, "latest.checkpoint"))
init = True
time_consumed = time.time() - pass_start_time
# Validate and save the model for inference.
print("epoch: %d, " % pass_id +
("val avg loss: %f, val ppl: %f, " % test()
if args.val_file_pattern is not None else "") + "consumed %fs" %
(time.time() - pass_start_time))
if args.val_file_pattern is not None:
val_avg_cost, val_ppl = test()
print(
"epoch: %d, val avg loss: %f, val normalized loss: %f, val ppl: %f,"
" consumed %fs" % (pass_id, val_avg_cost,
val_avg_cost - loss_normalizer, val_ppl,
time_consumed))
else:
print("epoch: %d, consumed %fs" % (pass_id, time_consumed))
fluid.io.save_persistables(
exe,
os.path.join(TrainTaskConfig.ckpt_dir,
......@@ -442,6 +462,10 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
os.path.join(TrainTaskConfig.model_dir,
"pass_" + str(pass_id) + ".infer.model"),
data_input_names[:-2] + util_input_names, [predict], exe)
if args.enable_ce: # For CE
print("kpis\ttrain_cost\t%f" % total_avg_cost)
print("kpis\ttest_cost\t%f" % val_avg_cost)
print("kpis\ttrain_duration\t%f" % time_consumed)
def train(args):
......@@ -465,6 +489,9 @@ def train(args):
exe = fluid.Executor(place)
if args.enable_ce:
fluid.default_startup_program().random_seed = 1000
sum_cost, avg_cost, predict, token_num = transformer(
ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
......
......@@ -7,12 +7,12 @@ from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE kpi.py should shared in models in some way!!!!
train_cost_kpi = CostKpi('train_cost', 0.02, actived=True)
test_acc_kpi = AccKpi('test_acc', 0.01, actived=True)
train_speed_kpi = AccKpi('train_speed', 0.2, actived=True)
train_cost_card4_kpi = CostKpi('train_cost_card4', 0.02, actived=True)
test_acc_card4_kpi = AccKpi('test_acc_card4', 0.01, actived=True)
train_speed_card4_kpi = AccKpi('train_speed_card4', 0.2, actived=True)
train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
test_acc_kpi = AccKpi('test_acc', 0.01, 0, actived=True)
train_speed_kpi = AccKpi('train_speed', 0.2, 0, actived=True)
train_cost_card4_kpi = CostKpi('train_cost_card4', 0.02, 0, actived=True)
test_acc_card4_kpi = AccKpi('test_acc_card4', 0.01, 0, actived=True)
train_speed_card4_kpi = AccKpi('train_speed_card4', 0.2, 0, actived=True)
tracking_kpis = [
train_cost_kpi,
......
import paddle.v2 as paddle
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
......
###!/bin/bash
####This file is only used for continuous evaluation.
export CE_MODE_X=1
sh data/download.sh
python train.py | python _ce.py
......@@ -22,11 +22,7 @@
## 数据获取
请参考PaddlePaddle v2版本[命名实体识别](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/README.md) 一节中数据获取方式,将该例中的data文件夹拷贝至本例目录下,运行其中的download.sh脚本获取训练和测试数据。
## 通用脚本获取
请将PaddlePaddle v2版本[命名实体识别](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/README.md)中提供的用于数据读取的文件[reader.py](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/reader.py)以及包含字典导入等通用功能的文件[utils.py](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/utils.py)复制到本目录下。本例将会使用到这两个脚本。
完整数据的获取请参考PaddlePaddle v2版本[命名实体识别](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/README.md) 一节中的方式。本例的示例数据同样可以通过运行data/download.sh来获取。
## 训练
......
####this file is only used for continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE kpi.py should shared in models in some way!!!!
train_acc_kpi = AccKpi('train_precision', 0.005, actived=True)
test_acc_kpi = CostKpi('test_precision', 0.005, actived=True)
train_duration_kpi = DurationKpi('train_duration', 0.05, actived=True)
tracking_kpis = [
train_acc_kpi,
test_acc_kpi,
train_duration_kpi,
]
def parse_log(log):
for line in log.split('\n'):
fs = line.strip().split('\t')
print(fs)
if len(fs) == 3 and fs[0] == 'kpis':
print("-----%s" % fs)
kpi_name = fs[1]
kpi_value = float(fs[2])
yield kpi_name, kpi_value
def log_to_ce(log):
kpi_tracker = {}
for kpi in tracking_kpis:
kpi_tracker[kpi.name] = kpi
for (kpi_name, kpi_value) in parse_log(log):
print(kpi_name, kpi_value)
kpi_tracker[kpi_name].add_record(kpi_value)
kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
log = sys.stdin.read()
print("*****")
print(log)
print("****")
log_to_ce(log)
if [ -f assignment2.zip ]; then
echo "data exist"
exit 0
else
wget http://cs224d.stanford.edu/assignment2/assignment2.zip
fi
if [ $? -eq 0 ];then
unzip assignment2.zip
cp assignment2_release/data/ner/wordVectors.txt ./data
cp assignment2_release/data/ner/vocab.txt ./data
rm -rf assignment2_release
else
echo "download data error!" >> /dev/stderr
exit 1
fi
B-LOC
I-LOC
B-MISC
I-MISC
B-ORG
I-ORG
B-PER
I-PER
O
CRICKET NNP I-NP O
- : O O
LEICESTERSHIRE NNP I-NP I-ORG
TAKE NNP I-NP O
OVER IN I-PP O
AT NNP I-NP O
TOP NNP I-NP O
AFTER NNP I-NP O
INNINGS NNP I-NP O
VICTORY NN I-NP O
. . O O
LONDON NNP I-NP I-LOC
1996-08-30 CD I-NP O
West NNP I-NP I-MISC
Indian NNP I-NP I-MISC
all-rounder NN I-NP O
Phil NNP I-NP I-PER
Simmons NNP I-NP I-PER
took VBD I-VP O
four CD I-NP O
for IN I-PP O
38 CD I-NP O
on IN I-PP O
Friday NNP I-NP O
as IN I-PP O
Leicestershire NNP I-NP I-ORG
beat VBD I-VP O
Somerset NNP I-NP I-ORG
by IN I-PP O
an DT I-NP O
innings NN I-NP O
and CC O O
39 CD I-NP O
runs NNS I-NP O
in IN I-PP O
two CD I-NP O
days NNS I-NP O
to TO I-VP O
take VB I-VP O
over IN I-PP O
at IN B-PP O
the DT I-NP O
head NN I-NP O
of IN I-PP O
the DT I-NP O
county NN I-NP O
championship NN I-NP O
. . O O
Their PRP$ I-NP O
stay NN I-NP O
on IN I-PP O
top NN I-NP O
, , O O
though RB I-ADVP O
, , O O
may MD I-VP O
be VB I-VP O
short-lived JJ I-ADJP O
as IN I-PP O
title NN I-NP O
rivals NNS I-NP O
Essex NNP I-NP I-ORG
, , O O
Derbyshire NNP I-NP I-ORG
and CC I-NP O
Surrey NNP I-NP I-ORG
all DT O O
closed VBD I-VP O
in RP I-PRT O
on IN I-PP O
victory NN I-NP O
while IN I-SBAR O
Kent NNP I-NP I-ORG
made VBD I-VP O
up RP I-PRT O
for IN I-PP O
lost VBN I-NP O
time NN I-NP O
in IN I-PP O
their PRP$ I-NP O
rain-affected JJ I-NP O
match NN I-NP O
against IN I-PP O
Nottinghamshire NNP I-NP I-ORG
. . O O
After IN I-PP O
bowling VBG I-NP O
Somerset NNP I-NP I-ORG
out RP I-PRT O
for IN I-PP O
83 CD I-NP O
on IN I-PP O
the DT I-NP O
opening NN I-NP O
morning NN I-NP O
at IN I-PP O
Grace NNP I-NP I-LOC
Road NNP I-NP I-LOC
, , O O
Leicestershire NNP I-NP I-ORG
extended VBD I-VP O
their PRP$ I-NP O
first JJ I-NP O
innings NN I-NP O
by IN I-PP O
94 CD I-NP O
runs VBZ I-VP O
before IN I-PP O
being VBG I-VP O
bowled VBD I-VP O
out RP I-PRT O
for IN I-PP O
296 CD I-NP O
with IN I-PP O
England NNP I-NP I-LOC
discard VBP I-VP O
Andy NNP I-NP I-PER
Caddick NNP I-NP I-PER
taking VBG I-VP O
three CD I-NP O
for IN I-PP O
83 CD I-NP O
. . O O
EU NNP I-NP I-ORG
rejects VBZ I-VP O
German JJ I-NP I-MISC
call NN I-NP O
to TO I-VP O
boycott VB I-VP O
British JJ I-NP I-MISC
lamb NN I-NP O
. . O O
Peter NNP I-NP I-PER
Blackburn NNP I-NP I-PER
BRUSSELS NNP I-NP I-LOC
1996-08-22 CD I-NP O
The DT I-NP O
European NNP I-NP I-ORG
Commission NNP I-NP I-ORG
said VBD I-VP O
on IN I-PP O
Thursday NNP I-NP O
it PRP B-NP O
disagreed VBD I-VP O
with IN I-PP O
German JJ I-NP I-MISC
advice NN I-NP O
to TO I-PP O
consumers NNS I-NP O
to TO I-VP O
shun VB I-VP O
British JJ I-NP I-MISC
lamb NN I-NP O
until IN I-SBAR O
scientists NNS I-NP O
determine VBP I-VP O
whether IN I-SBAR O
mad JJ I-NP O
cow NN I-NP O
disease NN I-NP O
can MD I-VP O
be VB I-VP O
transmitted VBN I-VP O
to TO I-PP O
sheep NN I-NP O
. . O O
Germany NNP I-NP I-LOC
's POS B-NP O
representative NN I-NP O
to TO I-PP O
the DT I-NP O
European NNP I-NP I-ORG
Union NNP I-NP I-ORG
's POS B-NP O
veterinary JJ I-NP O
committee NN I-NP O
Werner NNP I-NP I-PER
Zwingmann NNP I-NP I-PER
said VBD I-VP O
on IN I-PP O
Wednesday NNP I-NP O
consumers NNS I-NP O
should MD I-VP O
buy VB I-VP O
sheepmeat NN I-NP O
from IN I-PP O
countries NNS I-NP O
other JJ I-ADJP O
than IN I-PP O
Britain NNP I-NP I-LOC
until IN I-SBAR O
the DT I-NP O
scientific JJ I-NP O
advice NN I-NP O
was VBD I-VP O
clearer JJR I-ADJP O
. . O O
" " O O
We PRP I-NP O
do VBP I-VP O
n't RB I-VP O
support VB I-VP O
any DT I-NP O
such JJ I-NP O
recommendation NN I-NP O
because IN I-SBAR O
we PRP I-NP O
do VBP I-VP O
n't RB I-VP O
see VB I-VP O
any DT I-NP O
grounds NNS I-NP O
for IN I-PP O
it PRP I-NP O
, , O O
" " O O
the DT I-NP O
Commission NNP I-NP I-ORG
's POS B-NP O
chief JJ I-NP O
spokesman NN I-NP O
Nikolaus NNP I-NP I-PER
van NNP I-NP I-PER
der FW I-NP I-PER
Pas NNP I-NP I-PER
told VBD I-VP O
a DT I-NP O
news NN I-NP O
briefing NN I-NP O
. . O O
He PRP I-NP O
said VBD I-VP O
further JJ I-NP O
scientific JJ I-NP O
study NN I-NP O
was VBD I-VP O
required VBN I-VP O
and CC O O
if IN I-SBAR O
it PRP I-NP O
was VBD I-VP O
found VBN I-VP O
that IN I-SBAR O
action NN I-NP O
was VBD I-VP O
needed VBN I-VP O
it PRP I-NP O
should MD I-VP O
be VB I-VP O
taken VBN I-VP O
by IN I-PP O
the DT I-NP O
European NNP I-NP I-ORG
Union NNP I-NP I-ORG
. . O O
"""
Conll03 dataset.
"""
from utils import *
__all__ = ["data_reader"]
def canonicalize_digits(word):
if any([c.isalpha() for c in word]): return word
word = re.sub("\d", "DG", word)
if word.startswith("DG"):
word = word.replace(",", "") # remove thousands separator
return word
def canonicalize_word(word, wordset=None, digits=True):
word = word.lower()
if digits:
if (wordset != None) and (word in wordset): return word
word = canonicalize_digits(word) # try to canonicalize numbers
if (wordset == None) or (word in wordset): return word
else: return "UUUNKKK" # unknown token
def data_reader(data_file, word_dict, label_dict):
"""
The dataset can be obtained according to http://www.clips.uantwerpen.be/conll2003/ner/.
It returns a reader creator, each sample in the reader includes:
word id sequence, label id sequence and raw sentence.
:return: reader creator
:rtype: callable
"""
def reader():
UNK_IDX = word_dict["UUUNKKK"]
sentence = []
labels = []
with open(data_file, "r") as f:
for line in f:
if len(line.strip()) == 0:
if len(sentence) > 0:
word_idx = [
word_dict.get(
canonicalize_word(w, word_dict), UNK_IDX)
for w in sentence
]
mark = [1 if w[0].isupper() else 0 for w in sentence]
label_idx = [label_dict[l] for l in labels]
yield word_idx, mark, label_idx
sentence = []
labels = []
else:
segs = line.strip().split()
sentence.append(segs[0])
# transform I-TYPE to BIO schema
if segs[-1] != "O" and (len(labels) == 0 or
labels[-1][1:] != segs[-1][1:]):
labels.append("B" + segs[-1][1:])
else:
labels.append(segs[-1])
return reader
import os
import math
import time
import numpy as np
import paddle.v2 as paddle
import paddle
import paddle.fluid as fluid
import reader
......@@ -24,12 +25,19 @@ def test(exe, chunk_evaluator, inference_program, test_data, place):
return chunk_evaluator.eval(exe)
def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
model_save_dir, num_passes, use_gpu, parallel):
def main(train_data_file,
test_data_file,
vocab_file,
target_file,
emb_file,
model_save_dir,
num_passes,
use_gpu,
parallel,
batch_size=200):
if not os.path.exists(model_save_dir):
os.mkdir(model_save_dir)
BATCH_SIZE = 200
word_dict = load_dict(vocab_file)
label_dict = load_dict(target_file)
......@@ -53,60 +61,76 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
chunk_scheme="IOB",
num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))
inference_program = fluid.default_main_program().clone()
inference_program = fluid.default_main_program().clone(for_test=True)
with fluid.program_guard(inference_program):
test_target = chunk_evaluator.metrics + chunk_evaluator.states
inference_program = fluid.io.get_inference_program(test_target)
train_reader = paddle.batch(
paddle.reader.shuffle(
if "CE_MODE_X" not in os.environ:
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.data_reader(train_data_file, word_dict, label_dict),
buf_size=20000),
batch_size=batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
reader.data_reader(test_data_file, word_dict, label_dict),
buf_size=20000),
batch_size=batch_size)
else:
train_reader = paddle.batch(
reader.data_reader(train_data_file, word_dict, label_dict),
buf_size=20000),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
paddle.reader.shuffle(
batch_size=batch_size)
test_reader = paddle.batch(
reader.data_reader(test_data_file, word_dict, label_dict),
buf_size=20000),
batch_size=BATCH_SIZE)
batch_size=batch_size)
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place)
exe = fluid.Executor(place)
if "CE_MODE_X" in os.environ:
fluid.default_startup_program().random_seed = 110
exe.run(fluid.default_startup_program())
embedding_name = 'emb'
embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor()
embedding_param.set(word_vector_values, place)
batch_id = 0
for pass_id in xrange(num_passes):
chunk_evaluator.reset(exe)
for data in train_reader():
for batch_id, data in enumerate(train_reader()):
cost, batch_precision, batch_recall, batch_f1_score = exe.run(
fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost] + chunk_evaluator.metrics)
if batch_id % 5 == 0:
print(cost)
print("Pass " + str(pass_id) + ", Batch " + str(
batch_id) + ", Cost " + str(cost[0]) + ", Precision " + str(
batch_precision[0]) + ", Recall " + str(batch_recall[0])
+ ", F1_score" + str(batch_f1_score[0]))
batch_id = batch_id + 1
pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(exe)
print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" + str(
pass_precision) + " pass_recall:" + str(pass_recall) +
" pass_f1_score:" + str(pass_f1_score))
pass_precision, pass_recall, pass_f1_score = test(
test_pass_precision, test_pass_recall, test_pass_f1_score = test(
exe, chunk_evaluator, inference_program, test_reader, place)
print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" + str(
pass_precision) + " pass_recall:" + str(pass_recall) +
" pass_f1_score:" + str(pass_f1_score))
test_pass_precision) + " pass_recall:" + str(test_pass_recall) +
" pass_f1_score:" + str(test_pass_f1_score))
save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id)
fluid.io.save_inference_model(save_dirname, ['word', 'mark', 'target'],
[crf_decode], exe)
crf_decode, exe)
if ("CE_MODE_X" in os.environ) and (pass_id % 50 == 0):
if pass_id > 0:
print("kpis train_precision %f" % pass_precision)
print("kpis test_precision %f" % test_pass_precision)
print("kpis train_duration %f" % (time.time() - time_begin))
time_begin = time.time()
if __name__ == "__main__":
......@@ -118,5 +142,6 @@ if __name__ == "__main__":
emb_file="data/wordVectors.txt",
model_save_dir="models",
num_passes=1000,
batch_size=1,
use_gpu=False,
parallel=False)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import os
import re
import argparse
import numpy as np
from collections import defaultdict
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def get_embedding(emb_file='data/wordVectors.txt'):
"""
Get the trained word vector.
"""
return np.loadtxt(emb_file, dtype=float)
def load_dict(dict_path):
"""
Load the word dictionary from the given file.
Each line of the given file is a word, which can include multiple columns
seperated by tab.
This function takes the first column (columns in a line are seperated by
tab) as key and takes line number of a line as the key (index of the word
in the dictionary).
"""
return dict((line.strip().split("\t")[0], idx)
for idx, line in enumerate(open(dict_path, "r").readlines()))
def load_reverse_dict(dict_path):
"""
Load the word dictionary from the given file.
Each line of the given file is a word, which can include multiple columns
seperated by tab.
This function takes line number of a line as the key (index of the word in
the dictionary) and the first column (columns in a line are seperated by
tab) as the value.
"""
return dict((idx, line.strip().split("\t")[0])
for idx, line in enumerate(open(dict_path, "r").readlines()))
###!/bin/bash
####This file is only used for continuous evaluation.
export CE_MODE_X=1
python train.py cnn | python _ce.py
####this file is only used for continuous evaluation test!
import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE kpi.py should shared in models in some way!!!!
train_acc_kpi = AccKpi('train_acc', 0.005, actived=True)
train_cost_kpi = CostKpi('train_cost', 0.005, actived=True)
train_duration_kpi = DurationKpi('train_duration', 0.05, actived=True)
tracking_kpis = [
train_acc_kpi,
train_cost_kpi,
train_duration_kpi,
]
def parse_log(log):
for line in log.split('\n'):
fs = line.strip().split('\t')
print(fs)
if len(fs) == 3 and fs[0] == 'kpis':
print("-----%s" % fs)
kpi_name = fs[1]
kpi_value = float(fs[2])
yield kpi_name, kpi_value
def log_to_ce(log):
kpi_tracker = {}
for kpi in tracking_kpis:
kpi_tracker[kpi.name] = kpi
for (kpi_name, kpi_value) in parse_log(log):
print(kpi_name, kpi_value)
kpi_tracker[kpi_name].add_record(kpi_value)
kpi_tracker[kpi_name].persist()
if __name__ == '__main__':
log = sys.stdin.read()
print("*****")
print(log)
print("****")
log_to_ce(log)
import unittest
import contextlib
import paddle
import paddle.fluid as fluid
import paddle.v2 as paddle
import numpy as np
import sys
import time
......
import unittest
import contextlib
import paddle
import paddle.fluid as fluid
import paddle.v2 as paddle
import numpy as np
import sys
import time
......
......@@ -4,8 +4,8 @@ import unittest
import contextlib
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.v2 as paddle
import utils
......
......@@ -2,8 +2,8 @@ import sys
import time
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.v2 as paddle
def bow_net(data,
......
import os
import sys
import time
import unittest
import contextlib
import paddle
import paddle.fluid as fluid
import paddle.v2 as paddle
import utils
from nets import bow_net
......@@ -53,8 +54,12 @@ def train(train_reader,
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
# For internal continuous evaluation
if "CE_MODE_X" in os.environ:
fluid.default_startup_program().random_seed = 110
exe.run(fluid.default_startup_program())
for pass_id in xrange(pass_num):
pass_start = time.time()
data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
for data in train_reader():
avg_cost_np, avg_acc_np = exe.run(fluid.default_main_program(),
......@@ -73,6 +78,13 @@ def train(train_reader,
epoch_model = save_dirname + "/" + "epoch" + str(pass_id)
fluid.io.save_inference_model(epoch_model, ["words", "label"], acc, exe)
pass_end = time.time()
# For internal continuous evaluation
if "CE_MODE_X" in os.environ:
print("kpis train_acc %f" % avg_acc)
print("kpis train_cost %f" % avg_cost)
print("kpis train_duration %f" % (pass_end - pass_start))
def train_net():
word_dict, train_reader, test_reader = utils.prepare_data(
......
import os
import sys
import time
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.v2 as paddle
def to_lodtensor(data, place):
......@@ -64,15 +65,22 @@ def prepare_data(data_type="imdb",
raise RuntimeError("No such dataset")
if data_type == "imdb":
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.train(word_dict), buf_size=buf_size),
batch_size=batch_size)
if "CE_MODE_X" in os.environ:
train_reader = paddle.batch(
paddle.dataset.imdb.train(word_dict), batch_size=batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.test(word_dict), buf_size=buf_size),
batch_size=batch_size)
test_reader = paddle.batch(
paddle.dataset.imdb.test(word_dict), batch_size=batch_size)
else:
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.train(word_dict), buf_size=buf_size),
batch_size=batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.test(word_dict), buf_size=buf_size),
batch_size=batch_size)
else:
raise RuntimeError("no such dataset")
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册