提交 eee198aa 编写于 作者: T tianxin04

update README

上级 dcfa4624
...@@ -261,3 +261,30 @@ text_a text_b label ...@@ -261,3 +261,30 @@ text_a text_b label
[dev evaluation] f1: 0.951949, precision: 0.944636, recall: 0.959376, elapsed time: 19.156693 s [dev evaluation] f1: 0.951949, precision: 0.944636, recall: 0.959376, elapsed time: 19.156693 s
[test evaluation] f1: 0.937390, precision: 0.925988, recall: 0.949077, elapsed time: 36.565929 s [test evaluation] f1: 0.937390, precision: 0.925988, recall: 0.949077, elapsed time: 36.565929 s
``` ```
### FAQ
#### 如何获取输入句子经过 ERNIE 编码后的 Embedding 表示?
可以通过 ernie_encoder.py 抽取出输入句子的 Embedding 表示和句子中每个 token 的 Embedding 表示,输入数据格式和 [Fine-tuning 任务](#Fine-tuning-任务) 一节中介绍的各种类型 Fine-tuning 任务的训练数据格式一致;以获取 LCQMC dev 数据集中的句子 Embedding 和 token Embedding 为例,示例脚本如下:
```
export FLAGS_sync_nccl_allreduce=1
export CUDA_VISIBLE_DEVICES=7
python -u ernie_encoder.py \
--use_cuda true \
--batch_size 32 \
--output_dir "./test" \
--init_pretraining_params ${MODEL_PATH}/params \
--data_set ${TASK_DATA_PATH}/lcqmc/dev.tsv \
--vocab_path config/vocab.txt \
--max_seq_len 128 \
--ernie_config_path config/ernie_config.json
```
上述脚本运行结束后,会在当前路径的 test 目录下分别生成 `cls_emb.npy` 文件存储句子 embeddings 和 `top_layer_emb.npy` 文件存储 token embeddings;实际使用时,参照示例脚本修改数据路径、embeddings 文件存储路径等配置即可运行。
#### 如何获取输入句子中每个 token 经过 ERNIE 编码后的 Embedding 表示?
[解决方案同上](#如何获取输入句子经过-ERNIE-编码后的-Embedding-表示?)
...@@ -45,16 +45,13 @@ data_g.add_arg("max_seq_len", int, 512, "Number of words of the longe ...@@ -45,16 +45,13 @@ data_g.add_arg("max_seq_len", int, 512, "Number of words of the longe
data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training.") data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training.")
data_g.add_arg("do_lower_case", bool, True, data_g.add_arg("do_lower_case", bool, True,
"Whether to lower case the input text. Should be True for uncased models and False for cased models.") "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
data_g.add_arg("random_seed", int, 0, "Random seed.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.") run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.") run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
run_type_g.add_arg("num_iteration_per_drop_scope", int, 10, "Iteration intervals to drop scope.")
# yapf: enable # yapf: enable
def create_model(args, pyreader_name, ernie_config, is_prediction=False): def create_model(args, pyreader_name, ernie_config):
pyreader = fluid.layers.py_reader( pyreader = fluid.layers.py_reader(
capacity=50, capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
...@@ -108,35 +105,31 @@ def main(args): ...@@ -108,35 +105,31 @@ def main(args):
reader = task_reader.ExtractEmbeddingReader( reader = task_reader.ExtractEmbeddingReader(
vocab_path=args.vocab_path, vocab_path=args.vocab_path,
max_seq_len=args.max_seq_len, max_seq_len=args.max_seq_len,
do_lower_case=args.do_lower_case, do_lower_case=args.do_lower_case)
random_seed=args.random_seed)
startup_prog = fluid.Program() startup_prog = fluid.Program()
if args.random_seed is not None:
startup_prog.random_seed = args.random_seed
data_generator = reader.data_generator( data_generator = reader.data_generator(
input_file=args.data_set, input_file=args.data_set,
batch_size=args.batch_size, batch_size=args.batch_size,
epoch=1, epoch=1,
shuffle=False, shuffle=False)
phase="train")
total_examples = reader.get_num_examples(args.data_set) total_examples = reader.get_num_examples(args.data_set)
print("Device count: %d" % dev_count) print("Device count: %d" % dev_count)
print("Total num examples: %d" % total_examples) print("Total num examples: %d" % total_examples)
train_program = fluid.Program() infer_program = fluid.Program()
with fluid.program_guard(train_program, startup_prog): with fluid.program_guard(infer_program, startup_prog):
with fluid.unique_name.guard(): with fluid.unique_name.guard():
pyreader, graph_vars = create_model( pyreader, graph_vars = create_model(
args, pyreader_name='reader', ernie_config=ernie_config) args, pyreader_name='reader', ernie_config=ernie_config)
fluid.memory_optimize(input_program=train_program) fluid.memory_optimize(input_program=infer_program)
train_program = train_program.clone(for_test=True) infer_program = infer_program.clone(for_test=True)
exe.run(startup_prog) exe.run(startup_prog)
...@@ -148,10 +141,7 @@ def main(args): ...@@ -148,10 +141,7 @@ def main(args):
"WARNING: args 'init_pretraining_params' must be specified") "WARNING: args 'init_pretraining_params' must be specified")
exec_strategy = fluid.ExecutionStrategy() exec_strategy = fluid.ExecutionStrategy()
if args.use_fast_executor:
exec_strategy.use_experimental_executor = True
exec_strategy.num_threads = dev_count exec_strategy.num_threads = dev_count
exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
pyreader.decorate_tensor_provider(data_generator) pyreader.decorate_tensor_provider(data_generator)
pyreader.start() pyreader.start()
...@@ -162,7 +152,7 @@ def main(args): ...@@ -162,7 +152,7 @@ def main(args):
while True: while True:
try: try:
cls_emb, unpad_top_layer_emb = exe.run( cls_emb, unpad_top_layer_emb = exe.run(
program=train_program, program=infer_program,
fetch_list=[ fetch_list=[
graph_vars["cls_embeddings"].name, graph_vars[ graph_vars["cls_embeddings"].name, graph_vars[
"top_layer_embeddings"].name "top_layer_embeddings"].name
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册