Commit 6e5a2af4 authored by wangxiao1021

add examples

Parent fac8802f
*.pyc
__pycache__
pretrain_model
pretrain
output*
output_model
build
dist
......
This diff has been collapsed.
This diff has been collapsed.
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""v1.1
BERT model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from paddle import fluid
from paddle.fluid import layers
from paddlepalm.backbone.utils.transformer import pre_process_layer, encoder
from paddlepalm.interface import backbone
class Model(backbone):
def __init__(self, config, phase):
# self._is_training = phase == 'train'  # the backbone generally does not need to care about the running phase, since its outputs stay essentially the same in every phase
self._emb_size = config["hidden_size"]
self._n_layer = config["num_hidden_layers"]
self._n_head = config["num_attention_heads"]
self._voc_size = config["vocab_size"]
self._max_position_seq_len = config["max_position_embeddings"]
self._sent_types = config["type_vocab_size"]
self._hidden_act = config["hidden_act"]
self._prepostprocess_dropout = config["hidden_dropout_prob"]
self._attention_dropout = config["attention_probs_dropout_prob"]
self._word_emb_name = "word_embedding"
self._pos_emb_name = "pos_embedding"
self._sent_emb_name = "sent_embedding"
# Initialize all weights with a truncated normal initializer; all biases
# are initialized to zero by default.
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config["initializer_range"])
@property
def inputs_attr(self):
return {"token_ids": [[-1, -1, 1], 'int64'],
"position_ids": [[-1, -1, 1], 'int64'],
"segment_ids": [[-1, -1, 1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32']}
@property
def outputs_attr(self):
return {"word_embedding": [[-1, -1, self._emb_size], 'float32'],
"embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'],
"encoder_outputs": [[-1, -1, self._emb_size], 'float32'],
"sentence_embedding": [[-1, self._emb_size], 'float32'],
"sentence_pair_embedding": [[-1, self._emb_size], 'float32']}
def build(self, inputs, scope_name=""):
src_ids = inputs['token_ids']
pos_ids = inputs['position_ids']
sent_ids = inputs['segment_ids']
input_mask = inputs['input_mask']
self._emb_dtype = 'float32'
# padding id in vocabulary must be set to 0
emb_out = fluid.layers.embedding(
input=src_ids,
size=[self._voc_size, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._word_emb_name, initializer=self._param_initializer),
is_sparse=False)
# fluid.global_scope().find_var('backbone-word_embedding').get_tensor()
embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name)
position_emb_out = fluid.layers.embedding(
input=pos_ids,
size=[self._max_position_seq_len, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._pos_emb_name, initializer=self._param_initializer))
sent_emb_out = fluid.layers.embedding(
sent_ids,
size=[self._sent_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._sent_emb_name, initializer=self._param_initializer))
emb_out = emb_out + position_emb_out
emb_out = emb_out + sent_emb_out
emb_out = pre_process_layer(
emb_out, 'nd', self._prepostprocess_dropout, name=scope_name+'pre_encoder')
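# build an additive attention bias from the padding mask: input_mask is
# [batch, seq_len, 1] with 1.0 at real tokens, so matmul(input_mask,
# input_mask^T) is 1 exactly where both positions are real; (x - 1) * 10000
# then maps valid pairs to 0 and pairs involving padding to -10000, pushing
# their softmax attention weights towards 0 inside the encoder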
self_attn_mask = fluid.layers.matmul(
x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.scale(
x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(
x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
enc_out = encoder(
enc_input=emb_out,
attn_bias=n_head_self_attn_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd="",
postprocess_cmd="dan",
param_initializer=self._param_initializer,
name=scope_name+'encoder')
next_sent_feat = fluid.layers.slice(
input=enc_out, axes=[1], starts=[0], ends=[1])
next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]])
next_sent_feat = fluid.layers.fc(
input=next_sent_feat,
size=self._emb_size,
act="tanh",
param_attr=fluid.ParamAttr(
name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer),
bias_attr=scope_name+"pooled_fc.b_0")
return {'embedding_table': embedding_table,
'word_embedding': emb_out,
'encoder_outputs': enc_out,
'sentence_embedding': next_sent_feat,
'sentence_pair_embedding': next_sent_feat}
def postprocess(self, rt_outputs):
pass
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Ernie model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from paddle import fluid
from paddle.fluid import layers
from paddlepalm.backbone.utils.transformer import pre_process_layer, encoder
from paddlepalm.interface import backbone
class Model(backbone):
def __init__(self,
config,
phase):
# self._is_training = phase == 'train'  # the backbone generally does not need to care about the running phase, since its outputs stay essentially the same in every phase
self._emb_size = config['hidden_size']
self._n_layer = config['num_hidden_layers']
self._n_head = config['num_attention_heads']
self._voc_size = config['vocab_size']
self._max_position_seq_len = config['max_position_embeddings']
if config['sent_type_vocab_size']:
self._sent_types = config['sent_type_vocab_size']
else:
self._sent_types = config['type_vocab_size']
self._task_types = config['task_type_vocab_size']
self._hidden_act = config['hidden_act']
self._prepostprocess_dropout = config['hidden_dropout_prob']
self._attention_dropout = config['attention_probs_dropout_prob']
self._word_emb_name = "word_embedding"
self._pos_emb_name = "pos_embedding"
self._sent_emb_name = "sent_embedding"
self._task_emb_name = "task_embedding"
self._emb_dtype = "float32"
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config['initializer_range'])
@property
def inputs_attr(self):
return {"token_ids": [[-1, -1, 1], 'int64'],
"position_ids": [[-1, -1, 1], 'int64'],
"segment_ids": [[-1, -1, 1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32'],
"task_ids": [[-1,-1, 1], 'int64']}
@property
def outputs_attr(self):
return {"word_embedding": [[-1, -1, self._emb_size], 'float32'],
"embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'],
"encoder_outputs": [[-1, -1, self._emb_size], 'float32'],
"sentence_embedding": [[-1, self._emb_size], 'float32'],
"sentence_pair_embedding": [[-1, self._emb_size], 'float32']}
def build(self, inputs, scope_name=""):
src_ids = inputs['token_ids']
pos_ids = inputs['position_ids']
sent_ids = inputs['segment_ids']
input_mask = inputs['input_mask']
task_ids = inputs['task_ids']
# padding id in vocabulary must be set to 0
emb_out = fluid.layers.embedding(
input=src_ids,
size=[self._voc_size, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._word_emb_name, initializer=self._param_initializer),
is_sparse=False)
# fluid.global_scope().find_var('backbone-word_embedding').get_tensor()
embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name)
position_emb_out = fluid.layers.embedding(
input=pos_ids,
size=[self._max_position_seq_len, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._pos_emb_name, initializer=self._param_initializer))
sent_emb_out = fluid.layers.embedding(
sent_ids,
size=[self._sent_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._sent_emb_name, initializer=self._param_initializer))
emb_out = emb_out + position_emb_out
emb_out = emb_out + sent_emb_out
task_emb_out = fluid.layers.embedding(
task_ids,
size=[self._task_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._task_emb_name,
initializer=self._param_initializer))
emb_out = emb_out + task_emb_out
emb_out = pre_process_layer(
emb_out, 'nd', self._prepostprocess_dropout, name=scope_name+'pre_encoder')
self_attn_mask = fluid.layers.matmul(
x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.scale(
x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(
x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
enc_out = encoder(
enc_input=emb_out,
attn_bias=n_head_self_attn_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd="",
postprocess_cmd="dan",
param_initializer=self._param_initializer,
name=scope_name+'encoder')
next_sent_feat = fluid.layers.slice(
input=enc_out, axes=[1], starts=[0], ends=[1])
next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]])
next_sent_feat = fluid.layers.fc(
input=next_sent_feat,
size=self._emb_size,
act="tanh",
param_attr=fluid.ParamAttr(
name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer),
bias_attr=scope_name+"pooled_fc.b_0")
return {'embedding_table': embedding_table,
'word_embedding': emb_out,
'encoder_outputs': enc_out,
'sentence_embedding': next_sent_feat,
'sentence_pair_embedding': next_sent_feat}
def postprocess(self, rt_outputs):
pass
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer encoder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from functools import partial
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.layer_helper import LayerHelper as LayerHelper
from functools import reduce # py3
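# Hand-rolled layer normalization: normalize over the trailing dimensions
# starting at begin_norm_axis (mean subtraction and rsqrt of the variance),
# then apply a learnable per-feature scale and bias created via LayerHelper.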
def layer_norm(x, begin_norm_axis=1, epsilon=1e-6, param_attr=None, bias_attr=None):
helper = LayerHelper('layer_norm', **locals())
mean = layers.reduce_mean(x, dim=begin_norm_axis, keep_dim=True)
shift_x = layers.elementwise_sub(x=x, y=mean, axis=0)
variance = layers.reduce_mean(layers.square(shift_x), dim=begin_norm_axis, keep_dim=True)
r_stdev = layers.rsqrt(variance + epsilon)
norm_x = layers.elementwise_mul(x=shift_x, y=r_stdev, axis=0)
param_shape = [reduce(lambda x, y: x * y, norm_x.shape[begin_norm_axis:])]
param_dtype = norm_x.dtype
scale = helper.create_parameter(
attr=param_attr,
shape=param_shape,
dtype=param_dtype,
default_initializer=fluid.initializer.Constant(1.))
bias = helper.create_parameter(
attr=bias_attr,
shape=param_shape,
dtype=param_dtype,
is_bias=True,
default_initializer=fluid.initializer.Constant(0.))
out = layers.elementwise_mul(x=norm_x, y=scale, axis=-1)
out = layers.elementwise_add(x=out, y=bias, axis=-1)
return out
def multi_head_attention(queries,
keys,
values,
attn_bias,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.,
cache=None,
param_initializer=None,
name='multi_head_att'):
"""
Multi-Head Attention. Note that attn_bias is added to the logits before
computing the softmax activation to mask out selected positions so that
they are not considered in the attention weights.
"""
keys = queries if keys is None else keys
values = keys if values is None else values
if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
raise ValueError(
"Inputs: quries, keys and values should all be 3-D tensors.")
def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
"""
Add linear projection to queries, keys, and values.
"""
q = layers.fc(input=queries,
size=d_key * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_query_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_query_fc.b_0')
k = layers.fc(input=keys,
size=d_key * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_key_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_key_fc.b_0')
v = layers.fc(input=values,
size=d_value * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_value_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_value_fc.b_0')
return q, k, v
def __split_heads(x, n_head):
"""
Reshape the last dimension of the input tensor x into two dimensions and
then transpose. Specifically, given an input tensor with shape
[bs, max_sequence_length, n_head * hidden_dim], the output is a tensor
with shape [bs, n_head, max_sequence_length, hidden_dim].
"""
hidden_size = x.shape[-1]
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
reshaped = layers.reshape(
x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
# permute the dimensions into:
# [batch_size, n_head, max_sequence_len, hidden_size_per_head]
return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
def __combine_heads(x):
"""
Transpose and then reshape the last two dimensions of the input tensor x
into one dimension, which is the inverse of __split_heads.
"""
if len(x.shape) == 3: return x
if len(x.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
return layers.reshape(
x=trans_x,
shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
inplace=True)
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
"""
Scaled Dot-Product Attention
"""
scaled_q = layers.scale(x=q, scale=d_key**-0.5)
product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
if attn_bias:
product += attn_bias
weights = layers.softmax(product)
if dropout_rate:
weights = layers.dropout(
weights,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = layers.matmul(weights, v)
return out
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
if cache is not None: # use cache and concat time steps
# Since the inplace reshape in __split_heads changes the shape of k and
# v, which is the cache input for next time step, reshape the cache
# input from the previous time step first.
k = cache["k"] = layers.concat(
[layers.reshape(
cache["k"], shape=[0, 0, d_model]), k], axis=1)
v = cache["v"] = layers.concat(
[layers.reshape(
cache["v"], shape=[0, 0, d_model]), v], axis=1)
q = __split_heads(q, n_head)
k = __split_heads(k, n_head)
v = __split_heads(v, n_head)
ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
dropout_rate)
out = __combine_heads(ctx_multiheads)
# Project back to the model size.
proj_out = layers.fc(input=out,
size=d_model,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_output_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_output_fc.b_0')
return proj_out
def positionwise_feed_forward(x,
d_inner_hid,
d_hid,
dropout_rate,
hidden_act,
param_initializer=None,
name='ffn'):
"""
Position-wise Feed-Forward Networks.
This module consists of two linear transformations with a ReLU activation
in between, which is applied to each position separately and identically.
"""
hidden = layers.fc(input=x,
size=d_inner_hid,
num_flatten_dims=2,
act=hidden_act,
param_attr=fluid.ParamAttr(
name=name + '_fc_0.w_0',
initializer=param_initializer),
bias_attr=name + '_fc_0.b_0')
if dropout_rate:
hidden = layers.dropout(
hidden,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = layers.fc(input=hidden,
size=d_hid,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_fc_1.w_0', initializer=param_initializer),
bias_attr=name + '_fc_1.b_0')
return out
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
name=''):
"""
Add residual connection, layer normalization and dropout to the out tensor
optionally according to the value of process_cmd.
This will be used before or after multi-head attention and position-wise
feed-forward networks.
"""
for cmd in process_cmd:
if cmd == "a": # add residual connection
out = out + prev_out if prev_out else out
elif cmd == "n": # add layer normalization
out_dtype = out.dtype
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float32")
out = layer_norm(
out,
begin_norm_axis=len(out.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_layer_norm_scale',
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
name=name + '_layer_norm_bias',
initializer=fluid.initializer.Constant(0.)))
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float16")
elif cmd == "d": # add dropout
if dropout_rate:
out = layers.dropout(
out,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
return out
pre_process_layer = partial(pre_post_process_layer, None)
post_process_layer = pre_post_process_layer
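# The process_cmd string selects which steps run and in which order; the
# backbones above use 'nd' (layer norm, then dropout) on the embeddings before
# the encoder stack and 'dan' (dropout, residual add, layer norm) after every
# encoder sub-layer.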
def encoder_layer(enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=''):
"""The encoder layers that can be stacked to form a deep encoder.
This module consists of a multi-head (self-)attention sub-layer followed by
a position-wise feed-forward network, both wrapped with post_process_layer
to add residual connection, layer normalization and dropout.
"""
attn_output = multi_head_attention(
pre_process_layer(
enc_input,
preprocess_cmd,
prepostprocess_dropout,
name=name + '_pre_att'),
None,
None,
attn_bias,
d_key,
d_value,
d_model,
n_head,
attention_dropout,
param_initializer=param_initializer,
name=name + '_multi_head_att')
attn_output = post_process_layer(
enc_input,
attn_output,
postprocess_cmd,
prepostprocess_dropout,
name=name + '_post_att')
ffd_output = positionwise_feed_forward(
pre_process_layer(
attn_output,
preprocess_cmd,
prepostprocess_dropout,
name=name + '_pre_ffn'),
d_inner_hid,
d_model,
relu_dropout,
hidden_act,
param_initializer=param_initializer,
name=name + '_ffn')
return post_process_layer(
attn_output,
ffd_output,
postprocess_cmd,
prepostprocess_dropout,
name=name + '_post_ffn')
def encoder(enc_input,
attn_bias,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=''):
"""
The encoder is composed of a stack of identical layers returned by calling
encoder_layer.
"""
for i in range(n_layer):
enc_output = encoder_layer(
enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd,
postprocess_cmd,
param_initializer=param_initializer,
name=name + '_layer_' + str(i))
enc_input = enc_output
enc_output = pre_process_layer(
enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder")
return enc_output
task_instance: "mrqa"
save_path: "output_model/firstrun"
backbone: "bert"
backbone_config_path: "../../pretrain_model/bert/bert_config.json"
batch_size: 4
num_epochs: 2
optimizer: "adam"
learning_rate: 3e-5
warmup_proportion: 0.1
weight_decay: 0.1
print_every_n_steps: 10
train_file: data/mrqa/train.json
reader: mrc
paradigm: mrc
vocab_path: "../../pretrain_model/bert/vocab.txt"
do_lower_case: True
max_seq_len: 512
doc_stride: 128
max_query_len: 64
import paddlepalm as palm
if __name__ == '__main__':
controller = palm.Controller('config.yaml')
controller.load_pretrain('../../pretrain_model/bert/params')
controller.train()
export CUDA_VISIBLE_DEVICES=0
python run.py
This diff has been collapsed.
This diff has been collapsed.
import paddlepalm as palm
if __name__ == '__main__':
max_seqlen = 512
batch_size = 32
match_reader = palm.reader.match(train_file, vocab, \
max_seqlen, file_format='csv', tokenizer='wordpiece', \
lang='en', shuffle_train=True)
mrc_reader = palm.reader.mrc(train_file, phase='train')
mlm_reader = palm.reader.mlm(train_file, phase='train')
palm.reader.
match = palm.tasktype.cls(num_classes=4)
mrc = palm.tasktype.match(learning_strategy='pairwise')
mlm = palm.tasktype.mlm()
mlm.print()
bb_flags = palm.load_json('./pretrain/ernie/ernie_config.json')
bb = palm.backbone.ernie(bb_flags['xx'], xxx)
bb.print()
match4mrqa = palm.Task('match4mrqa', match_reader, match_tt)
mrc4mrqa = palm.Task('mrc4mrqa', mrc_reader, mrc)
# match4mrqa.reuse_with(mrc4mrqa)
controller = palm.Controller([mrqa, match4mrqa, mlm4mrqa])
loss = controller.build_forward(bb, mask_task=[])
n_steps = controller.estimate_train_steps(basetask=mrqa, num_epochs=2, batch_size=8, dev_count=4)
adam = palm.optimizer.Adam(loss)
sched = palm.schedualer.LinearWarmup(learning_rate, max_train_steps=n_steps, warmup_steps=0.1*n_steps)
controller.build_backward(optimizer=adam, schedualer=sched, weight_decay=0.001, use_ema=True, ema_decay=0.999)
controller.random_init_params()
controller.load_pretrain('../../pretrain_model/ernie/params')
controller.train()
# controller = palm.Controller(config='config.yaml', task_dir='tasks', for_train=False)
# controller.pred('mrqa', inference_model_dir='output_model/secondrun/mrqa/infer_model')
train_file: "data/match4mrqa/train.tsv"
reader: match
paradigm: match
train_file: "data/mlm4mrqa/train.tsv"
reader: mlm
paradigm: mlm
train_file: data/mrqa/train.json
pred_file: data/mrqa/dev.json
pred_output_path: 'mrqa_output'
reader: mrc
paradigm: mrc
doc_stride: 128
max_query_len: 64
max_answer_len: 30
n_best_size: 20
../../pretrain/
\ No newline at end of file
train_file: "data/cls4mrqa/train.tsv"
reader: cls
paradigm: cls
n_classes: 4
train_file: "data/cls4mrqa/train.tsv"
reader: cls
paradigm: cls
n_classes: 4
train_file: "data/cls4mrqa/train.tsv"
reader: cls
paradigm: cls
n_classes: 4
train_file: "data/cls4mrqa/train.tsv"
reader: cls
paradigm: cls
n_classes: 4
train_file: "data/cls4mrqa/train.tsv"
reader: cls
paradigm: cls
n_classes: 4
train_file: "data/cls4mrqa/train.tsv"
reader: cls
paradigm: cls
n_classes: 4
## Examples 1: Classification
This example is a sentiment analysis task. The following sections detail model preparation, dataset preparation, and how to run the task.
### Step 1: Prepare Pre-trained Models & Datasets
#### Pre-trained Model
The pre-trained model used in this task is [ernie-zh-base](https://github.com/PaddlePaddle/PALM/tree/r0.3-api).
Make sure you have downloaded the required pre-trained model to the current folder.
#### Dataset
This task uses the `chnsenticorp` dataset.
Download dataset:
```shell
python download.py
```
If everything goes well, a folder named `data/` will be created with all the data in it.
The data has two fields, `label` and `text_a`, in TSV format. Here are a few sample rows (a quick sanity check follows the sample):
```
label text_a
0 当当网名不符实,订货多日不见送货,询问客服只会推托,只会要求用户再下订单。如此服务留不住顾客的。去别的网站买书服务更好。
0 XP的驱动不好找!我的17号提的货,现在就降价了100元,而且还送杀毒软件!
1 <荐书> 推荐所有喜欢<红楼>的红迷们一定要收藏这本书,要知道当年我听说这本书的时候花很长时间去图书馆找和借都没能如愿,所以这次一看到当当有,马上买了,红迷们也要记得备货哦!
```
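Before training, it can help to confirm the two-column layout and label balance. The sketch below is illustrative only and assumes the file lives at `./data/train.tsv`, as produced by the download step:
```python
# Count the label distribution of the downloaded chnsenticorp training set.
# The path is an assumption based on the download step above.
from collections import Counter

counts = Counter()
with open('./data/train.tsv', 'r', encoding='utf-8') as f:
    next(f)  # skip the "label\ttext_a" header
    for line in f:
        label, _, _text = line.rstrip('\n').partition('\t')
        counts[label] += 1
print(counts)
```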
### Step 2: Train & Predict
The code used to perform this classification task is in `run.py`. Once you have prepared the pre-trained model and the dataset required for the task, run:
```shell
python run.py
```
To run on a specific GPU or on multiple GPUs, set **`CUDA_VISIBLE_DEVICES`**, for example:
```shell
CUDA_VISIBLE_DEVICES=0,1,2 python run.py
```
Logs like the following will be printed:
```
step 1/154 (epoch 0), loss: 5.512, speed: 0.51 steps/s
step 2/154 (epoch 0), loss: 2.595, speed: 3.36 steps/s
step 3/154 (epoch 0), loss: 1.798, speed: 3.48 steps/s
```
After the run, you can view the saved models in the `outputs/` folder and the predictions in the `outputs/predict` folder. Here are some examples of predictions:
```
{"index": 0, "logits": [-0.2014336884021759, 0.6799028515815735], "probs": [0.29290086030960083, 0.7070990800857544], "label": 1}
{"index": 1, "logits": [0.8593899011611938, -0.29743513464927673], "probs": [0.7607553601264954, 0.23924466967582703], "label": 0}
{"index": 2, "logits": [0.7462944388389587, -0.7083730101585388], "probs": [0.8107157349586487, 0.18928426504135132], "label": 0}
```
### Step 3: Evaluate
Once you have the predictions, run the evaluation script to evaluate the model:
```shell
python evaluate.py
```
The evaluation results are as follows:
```
precision: 0.956666666667, recall: 0.949013157895, f1: 0.95688225039
```
# -*- coding: utf-8 -*-
import os
import requests
import tarfile
import shutil
from tqdm import tqdm
def download(src, url):
file_size = int(requests.head(url).headers['Content-Length'])
header = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
'70.0.3538.67 Safari/537.36'
}
pbar = tqdm(total=file_size)
resp = requests.get(url, headers=header, stream=True)
with open(src, 'ab') as f:
for chunk in resp.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
pbar.update(1024)
pbar.close()
return file_size
abs_path = os.path.abspath(__file__)
download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
downlaod_path = os.path.join(os.path.dirname(abs_path), "task_data_zh.tgz")
target_dir = os.path.dirname(abs_path)
download(downlaod_path, download_url)
tar = tarfile.open(downlaod_path)
tar.extractall(target_dir)
os.remove(downlaod_path)
abs_path = os.path.abspath(__file__)
dst_dir = os.path.join(os.path.dirname(abs_path), "data")
if not os.path.exists(dst_dir) or not os.path.isdir(dst_dir):
os.makedirs(dst_dir)
for file in os.listdir(os.path.join(target_dir, 'task_data', 'chnsenticorp')):
shutil.move(os.path.join(target_dir, 'task_data', 'chnsenticorp', file), dst_dir)
shutil.rmtree(os.path.join(target_dir, 'task_data'))
# -*- coding: utf-8 -*-
import json
import numpy as np
def accuracy(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
return (preds == labels).mean()
def f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
tp = np.sum((labels == '1') & (preds == '1'))
tn = np.sum((labels == '0') & (preds == '0'))
fp = np.sum((labels == '0') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
p = tp * 1.0 / (tp + fp)
r = tp * 1.0 / (tp + fn) * 1.0
f1 = (2 * p * r) / (p + r + 1e-8)
return f1
def recall(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
# recall=TP/(TP+FN)
tp = np.sum((labels == '1') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
re = tp * 1.0 / (tp + fn)
return re
def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'):
if eval_phase == 'test':
data_dir="./data/test.tsv"
elif eval_phase == 'dev':
data_dir="./data/dev.tsv"
else:
assert eval_phase in ['dev', 'test'], 'eval_phase should be dev or test'
labels = []
with open(data_dir, "r") as file:
first_flag = True
for line in file:
line = line.split("\t")
label = line[0]
if label=='label':
continue
labels.append(str(label))
file.close()
preds = []
with open(res_dir, "r") as file:
for line in file.readlines():
line = json.loads(line)
pred = line['label']
preds.append(str(pred))
file.close()
assert len(labels) == len(preds), "prediction result doesn't match to labels"
print('data num: {}'.format(len(labels)))
print("precision: {}, recall: {}, f1: {}".format(accuracy(preds, labels), recall(preds, labels), f1(preds, labels)))
res_evaluate()
# coding=utf-8
import paddlepalm as palm
import json
from paddlepalm.distribute import gpu_dev_count
if __name__ == '__main__':
# configs
max_seqlen = 256
batch_size = 8
num_epochs = 10
lr = 5e-5
weight_decay = 0.01
vocab_path = './pretrain/ernie-zh-base/vocab.txt'
train_file = './data/train.tsv'
predict_file = './data/test.tsv'
config = json.load(open('./pretrain/ernie-zh-base/ernie_config.json'))
input_dim = config['hidden_size']
num_classes = 2
dropout_prob = 0.1
random_seed = 1
task_name = 'chnsenticorp'
save_path = './outputs/'
pred_output = './outputs/predict/'
save_type = 'ckpt'
print_steps = 20
pre_params = './pretrain/ernie-zh-base/params'
# ----------------------- for training -----------------------
# step 1-1: create readers for training
cls_reader = palm.reader.ClassifyReader(vocab_path, max_seqlen, seed=random_seed)
# step 1-2: load the training data
cls_reader.load_data(train_file, batch_size, num_epochs=num_epochs)
# step 2: create a backbone of the model to extract text features
ernie = palm.backbone.ERNIE.from_config(config)
# step 3: register the backbone in reader
cls_reader.register_with(ernie)
# step 4: create the task output head
cls_head = palm.head.Classify(num_classes, input_dim, dropout_prob)
# step 5-1: create a task trainer
trainer = palm.Trainer(task_name)
# step 5-2: build forward graph with backbone and task head
loss_var = trainer.build_forward(ernie, cls_head)
# step 6-1*: use warmup
n_steps = cls_reader.num_examples * num_epochs // batch_size
warmup_steps = int(0.1 * n_steps)
sched = palm.lr_sched.TriangularSchedualer(warmup_steps, n_steps)
# step 6-2: create a optimizer
adam = palm.optimizer.Adam(loss_var, lr, sched)
# step 6-3: build backward
trainer.build_backward(optimizer=adam, weight_decay=weight_decay)
# step 7: fit prepared reader and data
trainer.fit_reader(cls_reader)
# step 8-1*: load pretrained parameters
trainer.load_pretrain(pre_params)
# step 8-2*: set saver to save model
# save_steps = n_steps // gpu_dev_count - batch_size
save_steps = 2396
trainer.set_saver(save_steps=save_steps, save_path=save_path, save_type=save_type)
# step 8-3: start training
trainer.train(print_steps=print_steps)
# ----------------------- for prediction -----------------------
# step 1-1: create readers for prediction
print('prepare to predict...')
predict_cls_reader = palm.reader.ClassifyReader(vocab_path, max_seqlen, seed=random_seed, phase='predict')
# step 1-2: load the training data
predict_cls_reader.load_data(predict_file, batch_size)
# step 2: create a backbone of the model to extract text features
pred_ernie = palm.backbone.ERNIE.from_config(config, phase='predict')
# step 3: register the backbone in reader
predict_cls_reader.register_with(pred_ernie)
# step 4: create the task output head
cls_pred_head = palm.head.Classify(num_classes, input_dim, phase='predict')
# step 5: build forward graph with backbone and task head
trainer.build_predict_forward(pred_ernie, cls_pred_head)
# step 6: load pretrained model
# model_path = './outputs/ckpt.step'+str(save_steps)
model_path = './outputs/ckpt.step'+str(11980)
pred_ckpt = trainer.load_ckpt(model_path)
# step 7: fit prepared reader and data
trainer.fit_reader(predict_cls_reader, phase='predict')
# step 8: predict
print('predicting..')
trainer.predict(print_steps=print_steps, output_dir=pred_output)
## Examples 2: Matching
This example is a sentence pair matching task. The following sections detail model preparation, dataset preparation, and how to run the task.
### Step 1: Prepare Pre-trained Models & Datasets
#### Pre-trained Model
The pre-trained model used in this task is [ernie-en-base](https://github.com/PaddlePaddle/PALM/tree/r0.3-api).
Make sure you have downloaded the required pre-trained model to the current folder.
#### Dataset
This task uses the `Quora Question Pairs matching` dataset.
Download dataset:
```shell
python download.py
```
After the dataset is downloaded, you should convert the data format for training:
```shell
python process.py quora_duplicate_questions.tsv train.tsv test.tsv
```
If everything goes well, a folder named `data/` will be created with all the converted data in it.
The data has three fields, `text_a`, `text_b` and `label`, in TSV format. Here are a few sample rows (a quick sanity check follows the sample):
```
text_a text_b label
How can the arrangement of corynebacterium xerosis be described? How would you describe waves? 0
How do you fix a Google Play Store account that isn't working? What can cause the Google Play store to not open? How are such probelms fixed? 1
Which is the best earphone under 1000? What are the best earphones under 1k? 1
What are the differences between the Dell Inspiron 3000, 5000, and 7000 series laptops? "Should I buy an Apple MacBook Pro 15"" or a Dell Inspiron 17 5000 series?" 0
```
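To verify the conversion, the sketch below (illustrative only; adjust the paths to wherever `process.py` wrote the files) checks that every row has exactly three tab-separated fields:
```python
# Sanity-check the converted Quora data: every row should contain exactly
# text_a, text_b and label separated by tabs. The paths are assumptions.
for path in ('./data/train.tsv', './data/test.tsv'):
    bad = 0
    with open(path, 'r', encoding='utf-8') as f:
        next(f)  # skip the "text_a\ttext_b\tlabel" header
        for line in f:
            if len(line.rstrip('\n').split('\t')) != 3:
                bad += 1
    print('{}: {} malformed line(s)'.format(path, bad))
```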
### Step 2: Train & Predict
The code used to perform this matching task is in `run.py`. Once you have prepared the pre-trained model and the dataset required for the task, run:
```shell
python run.py
```
To run on a specific GPU or on multiple GPUs, set **`CUDA_VISIBLE_DEVICES`**, for example:
```shell
CUDA_VISIBLE_DEVICES=0,1,2 python run.py
```
Logs like the following will be printed:
```
step 20/49087 (epoch 0), loss: 1.079, speed: 3.48 steps/s
step 40/49087 (epoch 0), loss: 1.251, speed: 5.18 steps/s
step 60/49087 (epoch 0), loss: 1.193, speed: 5.04 steps/s
```
After the run, you can view the saved models in the `outputs/` folder and the predictions in the `outputs/predict` folder. Here are some examples of predictions:
```
{"index": 0, "logits": [-0.32688724994659424, -0.8568955063819885], "probs": [0.629485011100769, 0.3705149292945862], "label": 0}
{"index": 1, "logits": [-0.2735646963119507, -0.7983021140098572], "probs": [0.6282548904418945, 0.37174513936042786], "label": 0}
{"index": 2, "logits": [-0.3381381630897522, -0.8614270091056824], "probs": [0.6279165148735046, 0.37208351492881775], "label": 0}
```
### Step 3: Evaluate
Once you have the predictions, run the evaluation script to evaluate the model:
```shell
python evaluate.py
```
The evaluation results are as follows:
```
precision: 0.857906976744, recall: 0.824249846908, f1: 0.81501664653
```
# -*- coding: utf-8 -*-
import os
import requests
from tqdm import tqdm
def download(src, url):
file_size = int(requests.head(url).headers['Content-Length'])
header = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
'70.0.3538.67 Safari/537.36'
}
pbar = tqdm(total=file_size)
resp = requests.get(url, headers=header, stream=True)
with open(src, 'ab') as f:
for chunk in resp.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
pbar.update(1024)
pbar.close()
return file_size
abs_path = os.path.abspath(__file__)
data_dir = os.path.join(os.path.dirname(abs_path), "data")
if not os.path.exists(data_dir) or not os.path.isdir(data_dir):
os.makedirs(data_dir)
download_url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
downlaod_path = os.path.join(data_dir, "quora_duplicate_questions.tsv")
download(downlaod_path, download_url)
# -*- coding: utf-8 -*-
import json
import numpy as np
def accuracy(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
return (preds == labels).mean()
def f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
tp = np.sum((labels == '1') & (preds == '1'))
tn = np.sum((labels == '0') & (preds == '0'))
fp = np.sum((labels == '0') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
p = tp * 1.0 / (tp + fp)
r = tp * 1.0 / (tp + fn) * 1.0
f1 = (2 * p * r) / (p + r + 1e-8)
return f1
def recall(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
# recall=TP/(TP+FN)
tp = np.sum((labels == '1') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
re = tp * 1.0 / (tp + fn)
return re
def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'):
if eval_phase == 'test':
data_dir="./data/test.tsv"
elif eval_phase == 'dev':
data_dir="./data/dev.tsv"
else:
assert eval_phase in ['dev', 'test'], 'eval_phase should be dev or test'
labels = []
with open(data_dir, "r") as file:
first_flag = True
for line in file:
line = line.split("\t")
label = line[2][:-1]
if label=='label':
continue
labels.append(str(label))
file.close()
preds = []
with open(res_dir, "r") as file:
for line in file.readlines():
line = json.loads(line)
pred = line['label']
preds.append(str(pred))
file.close()
assert len(labels) == len(preds), "prediction result({}) doesn't match to labels({})".format(len(preds),len(labels))
print('data num: {}'.format(len(labels)))
print("precision: {}, recall: {}, f1: {}".format(accuracy(preds, labels), recall(preds, labels), f1(preds, labels)))
res_evaluate()
# -*- coding: utf-8 -*-
import sys
import os
if len(sys.argv) != 4:
exit(0)
data_dir = sys.argv[1]
if not os.path.exists(data_dir):
print("%s not exists" % data_dir)
exit(0)
train_dir = sys.argv[2]
train_file = open(train_dir, "w")
train_file.write("text_a\ttext_b\tlabel\n")
test_dir = sys.argv[3]
test_file = open(test_dir, "w")
test_file.write("text_a\ttext_b\tlabel\n")
with open(data_dir, "r") as file:
before = ""
cnt = 0
for line in file:
line = line.strip("\n")
line_t = line.split("\t")
flag = 0
if len(line_t) < 6:
if flag:
flag = 0
out_line = "{}{}\n".format(out_line, line)
else:
flag = 1
out_line = "{}".format(line)
continue
else:
out_line = "{}\t{}\t{}\n".format(line_t[3], line_t[4], line_t[5])
cnt += 1
if 2 <= cnt <= 4301:
test_file.write(out_line)
if 4301 <= cnt <= 104301:
train_file.write(out_line)
train_file.close()
test_file.close()
# coding=utf-8
import paddlepalm as palm
import json
from paddlepalm.distribute import gpu_dev_count
if __name__ == '__main__':
# configs
max_seqlen = 128
batch_size = 16
num_epochs = 3
lr = 3e-5
weight_decay = 0.0
num_classes = 2
random_seed = 1
dropout_prob = 0.1
save_path = './outputs/'
save_type = 'ckpt'
pred_model_path = './outputs/ckpt.step'+str(18732)
print_steps = 50
pred_output = './outputs/predict/'
pre_params = './pretrain/ernie-en-base/params'
task_name = 'Quora Question Pairs matching'
vocab_path = './pretrain/ernie-en-base/vocab.txt'
train_file = './data/train.tsv'
predict_file = './data/test.tsv'
config = json.load(open('./pretrain/ernie-en-base/ernie_config.json'))
input_dim = config['hidden_size']
# ----------------------- for training -----------------------
# step 1-1: create readers for training
match_reader = palm.reader.MatchReader(vocab_path, max_seqlen, seed=random_seed)
# step 1-2: load the training data
match_reader.load_data(train_file, file_format='tsv', num_epochs=num_epochs, batch_size=batch_size)
# step 2: create a backbone of the model to extract text features
ernie = palm.backbone.ERNIE.from_config(config)
# step 3: register the backbone in reader
match_reader.register_with(ernie)
# step 4: create the task output head
match_head = palm.head.Match(num_classes, input_dim, dropout_prob)
# step 5-1: create a task trainer
trainer = palm.Trainer(task_name)
# step 5-2: build forward graph with backbone and task head
loss_var = trainer.build_forward(ernie, match_head)
# step 6-1*: use warmup
n_steps = match_reader.num_examples * num_epochs // batch_size
warmup_steps = int(0.1 * n_steps)
print('total_steps: {}'.format(n_steps))
print('warmup_steps: {}'.format(warmup_steps))
sched = palm.lr_sched.TriangularSchedualer(warmup_steps, n_steps)
# step 6-2: create a optimizer
adam = palm.optimizer.Adam(loss_var, lr, sched)
# step 6-3: build backward
trainer.build_backward(optimizer=adam, weight_decay=weight_decay)
# step 7: fit prepared reader and data
trainer.fit_reader(match_reader)
# step 8-1*: load pretrained parameters
trainer.load_pretrain(pre_params, False)
# step 8-2*: set saver to save model
# save_steps = (n_steps-16) // gpu_dev_count
save_steps = 6244
trainer.set_saver(save_path=save_path, save_steps=save_steps, save_type=save_type)
# step 8-3: start training
trainer.train(print_steps=print_steps)
# ----------------------- for prediction -----------------------
# step 1-1: create readers for prediction
print('prepare to predict...')
predict_match_reader = palm.reader.MatchReader(vocab_path, max_seqlen, seed=random_seed, phase='predict')
# step 1-2: load the training data
predict_match_reader.load_data(predict_file, batch_size)
# step 2: create a backbone of the model to extract text features
pred_ernie = palm.backbone.ERNIE.from_config(config, phase='predict')
# step 3: register the backbone in reader
predict_match_reader.register_with(pred_ernie)
# step 4: create the task output head
match_pred_head = palm.head.Match(num_classes, input_dim, phase='predict')
# step 5: build forward graph with backbone and task head
trainer.build_predict_forward(pred_ernie, match_pred_head)
# step 6: load pretrained model
pred_ckpt = trainer.load_ckpt(pred_model_path)
# step 7: fit prepared reader and data
trainer.fit_reader(predict_match_reader, phase='predict')
# step 8: predict
print('predicting..')
trainer.predict(print_steps=print_steps, output_dir=pred_output)
## Examples 4: Machine Reading Comprehension
This example is a machine reading comprehension task. The following sections detail model preparation, dataset preparation, and how to run the task.
### Step 1: Prepare Pre-trained Models & Datasets
#### Pre-trained Model
The pre-trained model used in this task is [ernie-zh-base](https://github.com/PaddlePaddle/PALM/tree/r0.3-api).
Make sure you have downloaded the required pre-trained model to the current folder.
#### Dataset
This task uses the `CMRC2018` dataset. `CMRC2018` is a span-extraction machine reading comprehension evaluation organized by the Chinese Information Processing Society of China; the goal is to extract the answer span for each question from the given passage.
Download dataset:
```shell
python download.py
```
If everything goes well, a folder named `data/` will be created with all the data in it.
Here is a sample record (a small sketch for walking the JSON follows the sample):
```json
"paragraphs": [
{
"id": "TRAIN_36",
"context": "NGC 6231是一个位于天蝎座的疏散星团,天球座标为赤经16时54分,赤纬-41度48分,视觉观测大小约45角分,亮度约2.6视星等,距地球5900光年。NGC 6231年龄约为三百二十万年,是一个非常年轻的星团,星团内的最亮星是5等的天蝎座 ζ1星。用双筒望远镜或小型望远镜就能看到个别的行星。NGC 6231在1654年被意大利天文学家乔瓦尼·巴蒂斯特·霍迪尔纳(Giovanni Battista Hodierna)以Luminosae的名字首次纪录在星表中,但是未见记载于夏尔·梅西耶的天体列表和威廉·赫歇尔的深空天体目录。这个天体在1678年被爱德蒙·哈雷(I.7)、1745年被夏西亚科斯(Jean-Phillippe Loys de Cheseaux)(9)、1751年被尼可拉·路易·拉卡伊(II.13)分别再次独立发现。",
"qas": [
{
"question": "NGC 6231的经纬度是多少?",
"id": "TRAIN_36_QUERY_0",
"answers": [
{
"text": "赤经16时54分,赤纬-41度48分",
"answer_start": 27
}
]
}
```
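The records are nested in the SQuAD style (`data` → `paragraphs` → `qas` → `answers`), which the MRC reader used below consumes directly. If you want to walk the file yourself, a minimal sketch (assuming the data was downloaded to `./data/` as above):
```python
# Walk the SQuAD-style CMRC2018 JSON and count paragraphs and questions.
# The path is an assumption based on the download step above.
import json

with open('./data/train.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

n_paragraphs = n_questions = 0
for article in dataset['data']:
    for paragraph in article['paragraphs']:
        n_paragraphs += 1
        n_questions += len(paragraph['qas'])
print('paragraphs: {}, questions: {}'.format(n_paragraphs, n_questions))
```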
### Step 2: Train & Predict
The code used to perform this machine reading comprehension task is in `run.py`. Once you have prepared the pre-trained model and the dataset required for the task, run:
```shell
python run.py
```
To run on a specific GPU or on multiple GPUs, set **`CUDA_VISIBLE_DEVICES`**, for example:
```shell
CUDA_VISIBLE_DEVICES=0,1,2 python run.py
```
Logs like the following will be printed:
```
step 1/1515 (epoch 0), loss: 6.251, speed: 0.31 steps/s
step 2/1515 (epoch 0), loss: 6.206, speed: 0.80 steps/s
step 3/1515 (epoch 0), loss: 6.172, speed: 0.86 steps/s
```
After the run, you can view the saved models in the `outputs/` folder and the predictions in the `outputs/predict` folder. Here are some examples of predictions:
```json
{
"DEV_0_QUERY_0": "光 荣 和 ω-force 开 发",
"DEV_0_QUERY_1": "任 天 堂 游 戏 谜 之 村 雨 城",
"DEV_0_QUERY_2": "战 史 演 武 」&「 争 霸 演 武 」。",
"DEV_1_QUERY_0": "大 陆 传 统 器 乐 及 戏 曲 里 面 常 用 的 打 击 乐 记 谱 方 法 , 以 中 文 字 的 声 音 模 拟 敲 击 乐 的 声 音 , 纪 录 打 击 乐 的 各 种 不 同 的 演 奏 方 法 。",
"DEV_1_QUERY_1": "「 锣 鼓 点",
"DEV_1_QUERY_2": "锣 鼓 的 运 用 有 约 定 俗 成 的 程 式 , 依 照 角 色 行 当 的 身 份 、 性 格 、 情 绪 以 及 环 境 , 配 合 相 应 的 锣 鼓 点",
"DEV_1_QUERY_3": "鼓 、 锣 、 钹 和 板 四 类 型",
"DEV_2_QUERY_0": "364.6 公 里",
}
```
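Note that the predicted answers are stored with spaces between characters and word pieces. To read them as plain text, a minimal sketch (the predictions path is an assumption; point it at the `predictions.json` produced by `run.py`):
```python
# Strip the separator spaces from the predicted answers to recover plain
# Chinese text. The file path is an assumption; adjust it to your output dir.
import json

with open('./outputs/predictions.json', 'r', encoding='utf-8') as f:
    predictions = json.load(f)

for query_id, answer in list(predictions.items())[:5]:
    print(query_id, answer.replace(' ', ''))
```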
### Step 3: Evaluate
Once you have the predictions, run the evaluation script to evaluate the model:
```shell
python evaluate.py
```
The evaluation results are as follows:
```
data_num: 3219
em_sroce: 0.963031997515, f1: 83.9865402973
```
# -*- coding: utf-8 -*-
import os
import requests
import tarfile
import shutil
from tqdm import tqdm
def download(src, url):
file_size = int(requests.head(url).headers['Content-Length'])
header = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
'70.0.3538.67 Safari/537.36'
}
pbar = tqdm(total=file_size)
resp = requests.get(url, headers=header, stream=True)
with open(src, 'ab') as f:
for chunk in resp.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
pbar.update(1024)
pbar.close()
return file_size
abs_path = os.path.abspath(__file__)
download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
downlaod_path = os.path.join(os.path.dirname(abs_path), "task_data_zh.tgz")
target_dir = os.path.dirname(abs_path)
download(downlaod_path, download_url)
tar = tarfile.open(downlaod_path)
tar.extractall(target_dir)
os.remove(downlaod_path)
abs_path = os.path.abspath(__file__)
dst_dir = os.path.join(os.path.dirname(abs_path), "data")
if not os.path.exists(dst_dir) or not os.path.isdir(dst_dir):
os.makedirs(dst_dir)
for file in os.listdir(os.path.join(target_dir, 'task_data', 'cmrc2018')):
shutil.move(os.path.join(target_dir, 'task_data', 'cmrc2018', file), dst_dir)
shutil.rmtree(os.path.join(target_dir, 'task_data'))
# -*- coding: utf-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Evaluation script for CMRC 2018
version: v5
Note:
v5 formatted output, add usage description
v4 fixed segmentation issues
'''
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from collections import Counter, OrderedDict
import string
import re
import argparse
import json
import sys
import nltk
import pdb
# split Chinese with English
def mixed_segmentation(in_str, rm_punc=False):
in_str = in_str.lower().strip()
segs_out = []
temp_str = ""
sp_char = [
'-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', ',', '。', ':',
'?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、', '「', '」', '(',
')', '-', '~', '『', '』'
]
for char in in_str:
if rm_punc and char in sp_char:
continue
if re.search(r'[\u4e00-\u9fa5]', char) or char in sp_char:
if temp_str != "":
ss = nltk.word_tokenize(temp_str)
segs_out.extend(ss)
temp_str = ""
segs_out.append(char)
else:
temp_str += char
#handling last part
if temp_str != "":
ss = nltk.word_tokenize(temp_str)
segs_out.extend(ss)
return segs_out
# remove punctuation
def remove_punctuation(in_str):
in_str = in_str.lower().strip()
sp_char = [
'-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', ',', '。', ':',
'?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、', '「', '」', '(',
')', '-', '~', '『', '』'
]
out_segs = []
for char in in_str:
if char in sp_char:
continue
else:
out_segs.append(char)
return ''.join(out_segs)
# find longest common string
def find_lcs(s1, s2):
m = [[0 for i in range(len(s2) + 1)] for j in range(len(s1) + 1)]
mmax = 0
p = 0
for i in range(len(s1)):
for j in range(len(s2)):
if s1[i] == s2[j]:
m[i + 1][j + 1] = m[i][j] + 1
if m[i + 1][j + 1] > mmax:
mmax = m[i + 1][j + 1]
p = i + 1
return s1[p - mmax:p], mmax
#
def evaluate(ground_truth_file, prediction_file):
f1 = 0
em = 0
total_count = 0
skip_count = 0
for instances in ground_truth_file["data"]:
for instance in instances["paragraphs"]:
context_text = instance['context'].strip()
for qas in instance['qas']:
total_count += 1
query_id = qas['id'].strip()
query_text = qas['question'].strip()
answers = [ans["text"] for ans in qas["answers"]]
if query_id not in prediction_file:
print('Unanswered question: {}\n'.format(
query_id))
skip_count += 1
continue
prediction = prediction_file[query_id]
f1 += calc_f1_score(answers, prediction)
em += calc_em_score(answers, prediction)
f1_score = 100.0 * f1 / total_count
em_score = 100.0 * em / total_count
return f1_score, em_score, total_count, skip_count
def calc_f1_score(answers, prediction):
f1_scores = []
for ans in answers:
ans_segs = mixed_segmentation(ans, rm_punc=True)
prediction_segs = mixed_segmentation(prediction, rm_punc=True)
lcs, lcs_len = find_lcs(ans_segs, prediction_segs)
if lcs_len == 0:
f1_scores.append(0)
continue
precision = 1.0 * lcs_len / len(prediction_segs)
recall = 1.0 * lcs_len / len(ans_segs)
f1 = (2 * precision * recall) / (precision + recall)
f1_scores.append(f1)
return max(f1_scores)
def calc_em_score(answers, prediction):
em = 0
for ans in answers:
ans_ = remove_punctuation(ans)
prediction_ = remove_punctuation(prediction)
if ans_ == prediction_:
em = 1
break
return em
def eval_file(dataset_file, prediction_file):
ground_truth_file = json.load(open(dataset_file, 'r'))
prediction_file = json.load(open(prediction_file, 'r'))
F1, EM, TOTAL, SKIP = evaluate(ground_truth_file, prediction_file)
AVG = (EM + F1) * 0.5
return EM, F1, AVG, TOTAL
if __name__ == '__main__':
EM, F1, AVG, TOTAL = eval_file("task_data/cmrc2018/dev.json", "predictions.json")
print(EM)
print(F1)
print(TOTAL)
\ No newline at end of file
# coding=utf-8
import paddlepalm as palm
import json
from paddlepalm.distribute import gpu_dev_count
if __name__ == '__main__':
# configs
max_seqlen = 512
batch_size = 8
num_epochs = 8
lr = 3e-5
doc_stride = 128
max_query_len = 64
max_ans_len = 128
weight_decay = 0.01
print_steps = 20
vocab_path = './pretrain/ernie-zh-base/vocab.txt'
do_lower_case = True
train_file = './data/train.json'
predict_file = './data/dev.json'
save_path = './outputs/'
pred_output = './outputs/predict/'
save_type = 'ckpt'
task_name = 'cmrc2018'
pre_params = './pretrain/ernie-zh-base/params'
config = json.load(open('./pretrain/ernie-zh-base/ernie_config.json'))
# ----------------------- for training -----------------------
# step 1-1: create readers for training
mrc_reader = palm.reader.MRCReader(vocab_path, max_seqlen, max_query_len, doc_stride, do_lower_case=do_lower_case)
# step 1-2: load the training data
mrc_reader.load_data(train_file, file_format='json', num_epochs=num_epochs, batch_size=batch_size)
# step 2: create a backbone of the model to extract text features
ernie = palm.backbone.ERNIE.from_config(config)
# step 3: register the backbone in reader
mrc_reader.register_with(ernie)
# step 4: create the task output head
mrc_head = palm.head.MRC(max_query_len, config['hidden_size'], do_lower_case=do_lower_case, max_ans_len=max_ans_len)
# step 5-1: create a task trainer
trainer = palm.Trainer(task_name)
# step 5-2: build forward graph with backbone and task head
loss_var = trainer.build_forward(ernie, mrc_head)
# step 6-1*: use warmup
n_steps = mrc_reader.num_examples * num_epochs // batch_size
warmup_steps = int(0.1 * n_steps)
sched = palm.lr_sched.TriangularSchedualer(warmup_steps, n_steps)
# step 6-2: create a optimizer
adam = palm.optimizer.Adam(loss_var, lr, sched)
# step 6-3: build backward
trainer.build_backward(optimizer=adam, weight_decay=weight_decay)
# step 7: fit prepared reader and data
trainer.fit_reader(mrc_reader)
# step 8-1*: load pretrained parameters
trainer.load_pretrain(pre_params)
# step 8-2*: set saver to save model
# save_steps = (n_steps-8) // gpu_dev_count // 4
save_steps = 1520
trainer.set_saver(save_path=save_path, save_steps=save_steps, save_type=save_type)
# step 8-3: start training
trainer.train(print_steps=print_steps)
# ----------------------- for prediction -----------------------
# step 1-1: create readers for prediction
predict_mrc_reader = palm.reader.MRCReader(vocab_path, max_seqlen, max_query_len, doc_stride, do_lower_case=do_lower_case, phase='predict')
# step 1-2: load the training data
predict_mrc_reader.load_data(predict_file, batch_size)
# step 2: create a backbone of the model to extract text features
pred_ernie = palm.backbone.ERNIE.from_config(config, phase='predict')
# step 3: register the backbone in reader
predict_mrc_reader.register_with(pred_ernie)
# step 4: create the task output head
mrc_pred_head = palm.head.MRC(max_query_len, config['hidden_size'], do_lower_case=do_lower_case, max_ans_len=max_ans_len, phase='predict')
# step 5: build forward graph with backbone and task head
trainer.build_predict_forward(pred_ernie, mrc_pred_head)
# step 6: load pretrained model
pred_model_path = './outputs/ckpt.step'+str(12160)
pred_ckpt = trainer.load_ckpt(pred_model_path)
# step 7: fit prepared reader and data
trainer.fit_reader(predict_mrc_reader, phase='predict')
# step 8: predict
print('predicting..')
trainer.predict(print_steps=print_steps, output_dir="outputs/")
## Examples 5: Predict (Classification)
This example runs prediction for a sentiment analysis task. The following sections detail model preparation, dataset preparation, and how to run the task.
### Step 1: Prepare Pre-trained Models & Datasets
#### Pre-trained Model
The pre-trained model used in this task is [ernie-zh-base](https://github.com/PaddlePaddle/PALM/tree/r0.3-api).
Make sure you have downloaded the required pre-trained model to the current folder.
#### Dataset
This task uses the `chnsenticorp` dataset.
Download dataset:
```shell
python download.py
```
If everything goes well, a folder named `data/` will be created with all the data in it.
The data has two fields, `label` and `text_a`, in TSV format. Here are a few sample rows:
```
label text_a
0 当当网名不符实,订货多日不见送货,询问客服只会推托,只会要求用户再下订单。如此服务留不住顾客的。去别的网站买书服务更好。
0 XP的驱动不好找!我的17号提的货,现在就降价了100元,而且还送杀毒软件!
1 <荐书> 推荐所有喜欢<红楼>的红迷们一定要收藏这本书,要知道当年我听说这本书的时候花很长时间去图书馆找和借都没能如愿,所以这次一看到当当有,马上买了,红迷们也要记得备货哦!
```
### Step 2: Predict
The code used to run prediction for the classification task is in `run.py`. Once you have prepared the pre-trained model and the dataset required for the task, run:
```shell
python run.py
```
To run prediction on a specific GPU or on multiple GPUs, set **`CUDA_VISIBLE_DEVICES`**, for example:
```shell
CUDA_VISIBLE_DEVICES=0,1,2 python run.py
```
Logs like the following will be printed:
```
step 1/154, speed: 0.51 steps/s
step 2/154, speed: 3.36 steps/s
step 3/154, speed: 3.48 steps/s
```
After the run, you can view the predictions in the `outputs/predict` folder. Here are some examples of predictions:
```
{"index": 0, "logits": [-0.2014336884021759, 0.6799028515815735], "probs": [0.29290086030960083, 0.7070990800857544], "label": 1}
{"index": 1, "logits": [0.8593899011611938, -0.29743513464927673], "probs": [0.7607553601264954, 0.23924466967582703], "label": 0}
{"index": 2, "logits": [0.7462944388389587, -0.7083730101585388], "probs": [0.8107157349586487, 0.18928426504135132], "label": 0}
```
### Step 3: Evaluate
Once you have the predictions, run the evaluation script to evaluate the model:
```shell
python evaluate.py
```
The evaluation results are as follows: (need to update)
```
precision: 0.956666666667, recall: 0.949013157895, f1: 0.95688225039
```
# -*- coding: utf-8 -*-
import os
import requests
import tarfile
import shutil
from tqdm import tqdm
def download(src, url):
file_size = int(requests.head(url).headers['Content-Length'])
header = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
'70.0.3538.67 Safari/537.36'
}
pbar = tqdm(total=file_size)
resp = requests.get(url, headers=header, stream=True)
with open(src, 'ab') as f:
for chunk in resp.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
pbar.update(1024)
pbar.close()
return file_size
abs_path = os.path.abspath(__file__)
download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
download_path = os.path.join(os.path.dirname(abs_path), "task_data_zh.tgz")
target_dir = os.path.dirname(abs_path)
download(download_path, download_url)
tar = tarfile.open(download_path)
tar.extractall(target_dir)
tar.close()
os.remove(download_path)
abs_path = os.path.abspath(__file__)
dst_dir = os.path.join(os.path.dirname(abs_path), "data")
if not os.path.exists(dst_dir) or not os.path.isdir(dst_dir):
os.makedirs(dst_dir)
for file in os.listdir(os.path.join(target_dir, 'task_data', 'chnsenticorp')):
shutil.move(os.path.join(target_dir, 'task_data', 'chnsenticorp', file), dst_dir)
shutil.rmtree(os.path.join(target_dir, 'task_data'))
# -*- coding: utf-8 -*-
import json
import numpy as np
def accuracy(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
return (preds == labels).mean()
def f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
tp = np.sum((labels == '1') & (preds == '1'))
tn = np.sum((labels == '0') & (preds == '0'))
fp = np.sum((labels == '0') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
    p = tp * 1.0 / (tp + fp + 1e-8)
    r = tp * 1.0 / (tp + fn + 1e-8)
    f1 = (2 * p * r) / (p + r + 1e-8)
return f1
def recall(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
# recall=TP/(TP+FN)
tp = np.sum((labels == '1') & (preds == '1'))
fn = np.sum((labels == '1') & (preds == '0'))
    re = tp * 1.0 / (tp + fn + 1e-8)
return re
def res_evaluate(res_dir="./outputs/predict/predictions.json", eval_phase='test'):
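    # res_dir: line-delimited predictions.json written by run.py (one JSON object per line);
    # eval_phase: which gold-label tsv to compare against ('test' or 'dev')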
if eval_phase == 'test':
data_dir="./data/test.tsv"
elif eval_phase == 'dev':
data_dir="./data/dev.tsv"
else:
assert eval_phase in ['dev', 'test'], 'eval_phase should be dev or test'
labels = []
with open(data_dir, "r") as file:
first_flag = True
for line in file:
line = line.split("\t")
label = line[0]
if label=='label':
continue
labels.append(str(label))
file.close()
preds = []
with open(res_dir, "r") as file:
for line in file.readlines():
line = json.loads(line)
pred = line['label']
preds.append(str(pred))
    assert len(labels) == len(preds), "number of predictions doesn't match the number of labels"
print('data num: {}'.format(len(labels)))
print("precision: {}, recall: {}, f1: {}".format(accuracy(preds, labels), recall(preds, labels), f1(preds, labels)))
res_evaluate()
# coding=utf-8
import paddlepalm as palm
import json
from paddlepalm.distribute import gpu_dev_count
if __name__ == '__main__':
# configs
max_seqlen = 256
batch_size = 8
vocab_path = './pretrain/ernie-zh-base/vocab.txt'
predict_file = './data/test.tsv'
random_seed = 1
config = json.load(open('./pretrain/ernie-zh-base/ernie_config.json'))
input_dim = config['hidden_size']
num_classes = 2
task_name = 'chnsenticorp'
pred_output = './outputs/predict/'
print_steps = 20
pre_params = './pretrain/ernie-zh-base/params'
# ----------------------- for prediction -----------------------
# step 1-1: create readers for prediction
print('prepare to predict...')
predict_cls_reader = palm.reader.ClassifyReader(vocab_path, max_seqlen, seed=random_seed, phase='predict')
    # step 1-2: load the data for prediction
predict_cls_reader.load_data(predict_file, batch_size)
# step 2: create a backbone of the model to extract text features
pred_ernie = palm.backbone.ERNIE.from_config(config, phase='predict')
# step 3: register the backbone in reader
predict_cls_reader.register_with(pred_ernie)
# step 4: create the task output head
cls_pred_head = palm.head.Classify(num_classes, input_dim, phase='predict')
# step 5-1: create a task trainer
trainer = palm.Trainer(task_name)
# step 5-2: build forward graph with backbone and task head
trainer.build_predict_forward(pred_ernie, cls_pred_head)
# step 6: load pretrained model
pred_model = trainer.load_ckpt(pre_params)
# step 7: fit prepared reader and data
trainer.fit_reader(predict_cls_reader, phase='predict')
# step 8: predict
print('predicting..')
trainer.predict(print_steps=print_steps, output_dir=pred_output)
## Examples 3: Tagging
This task is a named entity recognition task. The following sections detail model preparation, dataset preparation, and how to run the task.
### Step 1: Prepare Pre-trained Models & Datasets
#### Pre-trained Model
The pre-trained model used for this task is [ernie-zh-base](https://github.com/PaddlePaddle/PALM/tree/r0.3-api).
Make sure you have downloaded the required pre-trained model into the current folder.
#### Dataset
This task uses the `MSRA-NER(SIGHAN2006)` dataset.
Download dataset:
```shell
python download.py
```
If everything goes well, a folder named `data/` will be created with all the data in it.
The data has two tab-separated fields, `text_a` and `label`, in tsv format. Here are some example rows:
```
text_a label
在 这 里 恕 弟 不 恭 之 罪 , 敢 在 尊 前 一 诤 : 前 人 论 书 , 每 曰 “ 字 字 有 来 历 , 笔 笔 有 出 处 ” , 细 读 公 字 , 何 尝 跳 出 前 人 藩 篱 , 自 隶 变 而 后 , 直 至 明 季 , 兄 有 何 新 出 ? O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
相 比 之 下 , 青 岛 海 牛 队 和 广 州 松 日 队 的 雨 中 之 战 虽 然 也 是 0 ∶ 0 , 但 乏 善 可 陈 。 O O O O O B-ORG I-ORG I-ORG I-ORG I-ORG O B-ORG I-ORG I-ORG I-ORG I-ORG O O O O O O O O O O O O O O O O O O O
理 由 多 多 , 最 无 奈 的 却 是 : 5 月 恰 逢 双 重 考 试 , 她 攻 读 的 博 士 学 位 论 文 要 通 考 ; 她 任 教 的 两 所 学 校 , 也 要 在 这 段 时 日 大 考 。 O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
```
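To check that the file is read the way the evaluation script expects, the sketch below (a minimal example, not part of PALM) pairs each character with its tag; it assumes the separator inside both fields is the `\x02` character used by `evaluate.py`, which renders as a space in the example above:
```python
# peek at the first training example in data/train.tsv
# (the path and the '\x02' separator are assumptions based on download.py and evaluate.py)
with open('./data/train.tsv', encoding='utf-8') as f:
    next(f)  # skip the "text_a\tlabel" header
    line = next(f).rstrip('\n')
    text_a, label = line.split('\t')
    tokens, tags = text_a.split('\x02'), label.split('\x02')
    print(list(zip(tokens, tags))[:5])
```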
### Step 2: Train & Predict
The code used to perform the sequence labeling task is in `run.py`. If you have prepared the pre-trained model and the dataset required for the task, run:
```shell
python run.py
```
If you want to train on a specific GPU or on multiple GPUs, set **`CUDA_VISIBLE_DEVICES`**, for example:
```shell
CUDA_VISIBLE_DEVICES=0,1,2 python run.py
```
Some logs will be shown below:
```
step 1/652 (epoch 0), loss: 216.002, speed: 0.32 steps/s
step 2/652 (epoch 0), loss: 202.567, speed: 1.28 steps/s
step 3/652 (epoch 0), loss: 170.677, speed: 1.05 steps/s
```
After the run, you can view the saved models in the `outputs/` folder and the predictions in the `outputs/predict` folder. Here are some examples of predictions:
```
[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 6, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
```
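Each line above is the predicted label-id sequence for one example. A minimal sketch (not part of PALM) for turning the ids back into tag names, assuming `data/label_map.json` maps tag names to these ids as `evaluate.py` does:
```python
# map predicted ids back to tag strings via the inverted label map
import json

label_map = json.load(open('./data/label_map.json'))  # tag -> id
id2tag = {v: k for k, v in label_map.items()}          # id -> tag

with open('./outputs/predict/predictions.json', encoding='utf-8') as f:
    ids = json.loads(next(f))  # first predicted sequence
    # the 'O' fallback is only a safeguard for ids missing from the map
    print([id2tag.get(i, 'O') for i in ids[:10]])
```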
### Step 3: Evaluate
Once you have the predictions, you can run the evaluation script to evaluate the model:
```shell
python evaluate.py
```
The evaluation results are as follows:
```
precision: 0.948718989809, recall: 0.944806113784, f1: 0.946758508914
```
# -*- coding: utf-8 -*-
import os
import requests
import tarfile
import shutil
from tqdm import tqdm
def download(src, url):
file_size = int(requests.head(url).headers['Content-Length'])
header = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
'70.0.3538.67 Safari/537.36'
}
pbar = tqdm(total=file_size)
resp = requests.get(url, headers=header, stream=True)
with open(src, 'ab') as f:
for chunk in resp.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
pbar.update(1024)
pbar.close()
return file_size
abs_path = os.path.abspath(__file__)
download_url = "https://ernie.bj.bcebos.com/task_data_zh.tgz"
download_path = os.path.join(os.path.dirname(abs_path), "task_data_zh.tgz")
target_dir = os.path.dirname(abs_path)
download(download_path, download_url)
tar = tarfile.open(download_path)
tar.extractall(target_dir)
tar.close()
os.remove(download_path)
abs_path = os.path.abspath(__file__)
dst_dir = os.path.join(os.path.dirname(abs_path), "data")
if not os.path.exists(dst_dir) or not os.path.isdir(dst_dir):
os.makedirs(dst_dir)
for file in os.listdir(os.path.join(target_dir, 'task_data', 'msra_ner')):
shutil.move(os.path.join(target_dir, 'task_data', 'msra_ner', file), dst_dir)
shutil.rmtree(os.path.join(target_dir, 'task_data'))
# -*- coding: utf-8 -*-
import json
def load_label_map(map_dir="./data/label_map.json"):
"""
    :param map_dir: path to the json file that maps chunk tags to integer ids
    :return: the loaded tag-to-id dict
"""
return json.load(open(map_dir, "r"))
def cal_chunk(total_res, total_label):
    assert len(total_label) == len(total_res), "number of predictions doesn't match the number of labels"
num_labels = 0
num_corr = 0
num_infers = 0
for res, label in zip(total_res, total_label):
        assert len(res) == len(label), "prediction length doesn't match label length"
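        # label id 6 corresponds to the 'O' (non-entity) tag, which is excluded from both counts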
num_labels += sum([0 if i == 6 else 1 for i in label])
num_corr += sum([1 if label[i] == res[i] and label[i] != 6 else 0 for i in range(len(label))])
num_infers += sum([0 if i == 6 else 1 for i in res])
precision = num_corr * 1.0 / num_infers if num_infers > 0 else 0.0
recall = num_corr * 1.0 / num_labels if num_labels > 0 else 0.0
f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
return precision, recall, f1
def res_evaluate(res_dir="./outputs/predict/predictions.json", data_dir="./data/test.tsv"):
label_map = load_label_map()
total_label = []
with open(data_dir, "r") as file:
first_flag = True
for line in file:
if first_flag:
first_flag = False
continue
line = line.strip("\n")
if len(line) == 0:
continue
line = line.split("\t")
if len(line) < 2:
continue
labels = line[1].split("\x02")
total_label.append(labels)
total_label = [[label_map[j] for j in i] for i in total_label]
total_res = []
with open(res_dir, "r") as file:
cnt = 0
for line in file:
line = line.strip("\n")
if len(line) == 0:
continue
try:
res_arr = json.loads(line)
if len(total_label[cnt]) < len(res_arr):
total_res.append(res_arr[1: 1 + len(total_label[cnt])])
elif len(total_label[cnt]) == len(res_arr):
total_res.append(res_arr)
else:
total_res.append(res_arr)
total_label[cnt] = total_label[cnt][: len(res_arr)]
except:
print("json format error: {}".format(cnt))
print(line)
cnt += 1
precision, recall, f1 = cal_chunk(total_res, total_label)
print("precision: {}, recall: {}, f1: {}".format(precision, recall, f1))
res_evaluate()
# coding=utf-8
import paddlepalm as palm
import json
from paddlepalm.distribute import gpu_dev_count
if __name__ == '__main__':
# configs
max_seqlen = 256
batch_size = 16
num_epochs = 6
lr = 5e-5
num_classes = 7
weight_decay = 0.01
dropout_prob = 0.1
vocab_path = './pretrain/ernie-zh-base/vocab.txt'
label_map = './data/label_map.json'
random_seed = 1
train_file = './data/train.tsv'
predict_file = './data/test.tsv'
save_path='./outputs/'
save_type='ckpt'
pre_params = './pretrain/ernie-zh-base/params'
config = json.load(open('./pretrain/ernie-zh-base/ernie_config.json'))
input_dim = config['hidden_size']
task_name = 'msra_ner'
pred_output = './outputs/predict/'
train_print_steps = 10
pred_print_steps = 20
# ----------------------- for training -----------------------
# step 1-1: create readers for training
ner_reader = palm.reader.SequenceLabelReader(vocab_path, max_seqlen, label_map, seed=random_seed)
# step 1-2: load the training data
ner_reader.load_data(train_file, file_format='tsv', num_epochs=num_epochs, batch_size=batch_size)
# step 2: create a backbone of the model to extract text features
ernie = palm.backbone.ERNIE.from_config(config)
# step 3: register the backbone in reader
ner_reader.register_with(ernie)
# step 4: create the task output head
ner_head = palm.head.SequenceLabel(num_classes, input_dim, dropout_prob)
# step 5-1: create a task trainer
trainer = palm.Trainer(task_name)
# step 5-2: build forward graph with backbone and task head
loss_var = trainer.build_forward(ernie, ner_head)
# step 6-1*: use warmup
n_steps = ner_reader.num_examples * num_epochs // batch_size
warmup_steps = int(0.1 * n_steps)
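    # linear warmup over the first ~10% of steps, then decay (slanted triangular schedule)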
print('total_steps: {}'.format(n_steps))
print('warmup_steps: {}'.format(warmup_steps))
sched = palm.lr_sched.TriangularSchedualer(warmup_steps, n_steps)
    # step 6-2: create an optimizer
adam = palm.optimizer.Adam(loss_var, lr, sched)
# step 6-3: build backward
trainer.build_backward(optimizer=adam, weight_decay=weight_decay)
# step 7: fit prepared reader and data
trainer.fit_reader(ner_reader)
# step 8-1*: load pretrained parameters
trainer.load_pretrain(pre_params)
# step 8-2*: set saver to save model
    save_steps = (n_steps - 20) // gpu_dev_count
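    # save a single checkpoint shortly before training ends; dividing by gpu_dev_count accounts for multi-GPU runs taking fewer steps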
print('save_steps: {}'.format(save_steps))
trainer.set_saver(save_path=save_path, save_steps=save_steps, save_type=save_type)
# step 8-3: start training
trainer.train(print_steps=train_print_steps)
# ----------------------- for prediction -----------------------
# step 1-1: create readers for prediction
print('prepare to predict...')
predict_ner_reader = palm.reader.SequenceLabelReader(vocab_path, max_seqlen, label_map, phase='predict')
    # step 1-2: load the data for prediction
predict_ner_reader.load_data(predict_file, batch_size)
# step 2: create a backbone of the model to extract text features
pred_ernie = palm.backbone.ERNIE.from_config(config, phase='predict')
# step 3: register the backbone in reader
predict_ner_reader.register_with(pred_ernie)
# step 4: create the task output head
ner_pred_head = palm.head.SequenceLabel(num_classes, input_dim, phase='predict')
# step 5: build forward graph with backbone and task head
trainer.build_predict_forward(pred_ernie, ner_pred_head)
# step 6: load pretrained model
pred_model_path = './outputs/ckpt.step' + str(save_steps)
pred_ckpt = trainer.load_ckpt(pred_model_path)
# step 7: fit prepared reader and data
trainer.fit_reader(predict_ner_reader, phase='predict')
# step 8: predict
print('predicting..')
trainer.predict(print_steps=pred_print_steps, output_dir=pred_output)
@@ -9,6 +9,7 @@ import head
 from trainer import Trainer
+from multihead_trainer import MultiHeadTrainer
 del interface
 del task_instance
......
from conf_controller import ConfigController
from controller import Controller
@@ -5,5 +5,5 @@ import multiprocessing
 gpu_dev_count = int(fluid.core.get_cuda_device_count())
 cpu_dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-from reader import yield_pieces, data_feeder
+from reader import yield_pieces, data_feeder, decode_fake
from _downloader import *
from slanted_triangular_schedualer import TriangularSchedualer
from warmup_schedualer import WarmupSchedualer
# scheduled_lr = fluid.layers.learning_rate_scheduler\
#            .noam_decay(1/(warmup_steps *(config['learning_rate'] ** 2)),
#            warmup_steps)