提交 b44960b1 编写于 作者: X xixiaoyao

add docs

上级 b34720e4
from slanted_triangular_schedualer import TriangularSchedualer
from warmup_schedualer import WarmupSchedualer
class BaseSchedualer():
class Schedualer():
def __init__(self):
self._prog = None
......@@ -7,6 +7,6 @@ class BaseSchedualer():
def _set_prog(self, prog):
self._prog = prog
def build(self, learning_rate):
def _build(self, learning_rate):
raise NotImplementedError()
# scheduled_lr = fluid.layers.learning_rate_scheduler\
# .noam_decay(1/(warmup_steps *(config['learning_rate'] ** 2)),
# warmup_steps)
from paddlepalm.lr_sched.schedualer import BaseSchedualer
from paddlepalm.lr_sched.schedualer import Schedualer
from paddle import fluid
class TriangularSchedualer(BaseSchedualer):
class TriangularSchedualer(Schedualer):
""" Applies linear warmup of learning rate from 0 to learning_rate until warmup_steps, and then decay to 0 linearly until num_train_steps."""
""" Implementation of the Slanted Triangular learning rate schedule method; for more details refer to https://arxiv.org/pdf/1801.06146.pdf . Applies linear warmup of the learning rate from 0 to learning_rate until warmup_steps, and then decays it linearly to 0 until num_train_steps."""
def __init__(self, warmup_steps, num_train_steps):
"""Create a new TriangularSchedualer object.
Args:
warmup_steps: the learning rate will grow from 0 to max_learning_rate over `warmup_steps` steps.
num_train_steps: the number of train steps.
"""
BaseSchedualer.__init__(self)
assert num_train_steps > warmup_steps > 0
self.warmup_steps = warmup_steps
self.num_train_steps = num_train_steps
def build(self, learning_rate):
def _build(self, learning_rate):
with self._prog._lr_schedule_guard():
lr = fluid.layers.tensor.create_global_var(
shape=[1],
......
from paddlepalm.lr_sched.schedualer import BaseSchedualer
from paddlepalm.lr_sched.schedualer import Schedualer
import paddle.fluid as fluid
def WarmupSchedualer(BaseSchedualer):
def WarmupSchedualer(Schedualer):
""" Applies linear warmup of learning rate from 0 to learning_rate until warmup_steps, and then decay to 0 linearly until num_train_steps."""
def __init__(self, warmup_steps):
schedualer.__init__(self)
self.warmup_steps = warmup_steps
def build(self, learning_rate):
def _build(self, learning_rate):
with self._prog._lr_schedule_guard():
lr = fluid.layers.tensor.create_global_var(
......
......@@ -20,9 +20,9 @@ from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
from paddlepalm.optimizer.base_optimizer import BaseOptimizer
from paddlepalm.optimizer.base_optimizer import Optimizer
class Adam(BaseOptimizer):
class Adam(Optimizer):
def __init__(self, loss_var, lr, lr_schedualer=None):
......@@ -32,10 +32,10 @@ class Adam(BaseOptimizer):
self._lr = lr
self._lr_schedualer = lr_schedualer
def build(self, grad_clip=None):
def _build(self, grad_clip=None):
if self._lr_schedualer is not None:
self._lr = self._lr_schedualer.build(self._lr)
self._lr = self._lr_schedualer._build(self._lr)
optimizer = fluid.optimizer.Adam(learning_rate=self._lr)
......
class BaseOptimizer():
class Optimizer(object):
def __init__(self, loss_var, lr, lr_schedualer=None):
self._prog = None
self._lr_schedualer = lr_schedualer
def build(self, grad_clip=None):
pass
def _build(self, grad_clip=None):
raise NotImplementedError()
def _set_prog(self, prog, init_prog):
self._prog = prog
......
from cls import ClassifyReader
from match import MatchReader
from ner import SequenceLabelReader
from mrc import MrcReader
from seq_label import SequenceLabelReader
from mrc import MRCReader
from mlm import MaskLMReader
......@@ -38,7 +38,7 @@ class ClassifyReader(Reader):
def __init__(self, vocab_path, max_len, tokenizer='wordpiece', \
lang='en', seed=None, do_lower_case=False, phase='train'):
"""Create a new Reader for classification task data.
"""Create a new Reader for loading and processing classification task data.
Args:
vocab_path: the vocab file path to do tokenization and token_ids generation.
......
......@@ -36,21 +36,30 @@ class MatchReader(Reader):
text_a [TAB] text_b [TAB] text_b_neg
Today is a good day. [TAB] what a nice day! [TAB] terrible day!
Such a terrible day! [TAB] So terrible today! [TAB] There is a dog.
I feel lucky to meet you, dear. [TAB] You are my lucky, darling. [TAB]
He likes sunshine and I like him :). [TAB] I like him. He like sunshine.
JUST! GO! OUT! [TAB] Come in please.
I feel lucky to meet you, dear. [TAB] You are my lucky, darling. [TAB] Buy some bananas, okey?
He likes sunshine and I like him :). [TAB] I like him. He like sunshine. [TAB] He has a dog.
JUST! GO! OUT! [TAB] go out now! [TAB] Come in please.
CAUTIOUS: The first line of the file must be header! And areas are splited by tab (\\t).
CAUTION: the HEADER is required for each dataset file! And fields (columns) should be separated by Tab (\\t).
"""
def __init__(self, vocab_path, max_len, tokenizer='wordpiece', lang='en', seed=None, \
do_lower_case=False, learning_strategy='pointwise', phase='train', dev_count=1, print_prefix=''): # 需要什么加什么
"""
"""Create a new Reader for loading and processing matching task data.
Args:
phase: train, eval, pred
lang: en, ch, ...
learning_strategy: pointwise, pairwise
vocab_path: the vocab file path to do tokenization and token_ids generation.
max_len: The maximum length of the sequence (after word segmentation). The part exceeding max_len will be removed from right.
tokenizer: string type. The name of the used tokenizer. A tokenizer converts raw text into tokens. Available tokenizers: wordpiece.
lang: the language of dataset. Supported language: en (English), cn (Chinese). Default is en (English).
seed: int type. The random seed to shuffle dataset. Default is None, means no use of random seed.
do_lower_case: bool type. Whether to do lowercase on English text. Default is False. This argument only works on English text.
learning_strategy: string type. This only works for training phase. Available strategies: pointwise, pairwise.
phase: the running phase of this reader. Supported phase: train, predict. Default is train.
Return:
a Reader object for matching-like task.
"""
Reader.__init__(self, phase)
......@@ -106,6 +115,16 @@ class MatchReader(Reader):
def load_data(self, input_file, batch_size, num_epochs=None, \
file_format='tsv', shuffle_train=True):
"""Load matching data into reader.
Args:
input_file: the dataset file path. File format should keep consistent with `file_format` argument.
batch_size: the number of examples yielded per batch. CAUTION! If your environment has multiple GPU devices (the count is denoted as dev_count), batch_size must be divisible by dev_count with no remainder!
num_epochs: the number of traversals over the input examples. Default is None, which means once for single-task learning and automatically calculated for multi-task learning. This argument only works in the training phase.
file_format: the file format of input file. Supported format: tsv. Default is tsv.
shuffle_train: whether to shuffle training dataset. Default is True. This argument only works on training phase.
"""
self._batch_size = batch_size
self._num_epochs = num_epochs
self._data_generator = self._reader.data_generator( \
......
......@@ -14,17 +14,65 @@
# limitations under the License.
from paddlepalm.reader.base_reader import Reader
from paddlepalm.reader.utils.reader4ernie import MRCReader
from paddlepalm.reader.utils.reader4ernie import MRCReader as MRCReader_t
import numpy as np
class MrcReader(Reader):
class MRCReader(Reader):
"""
The reader completes the loading and processing of SQuAD like machine reading comprehension dataset. Supported file format: json.
The outermost data structure of a dataset is a dictionary, which contains the dataset version number field and the data field. In the data field, each example contains the title of the article and several paragraphs. Each paragraph contains a context and its corresponding question-answer pairs. Each q-a pair contains a question with a globally unique ID, as well as (several) answers. Each answer item contains the text of the answer itself and its starting position in the context. Note that the starting position is at the character level. In addition, for the test set, the answers field is not necessary.
A typical case is shown as follows.
{"version": "1.0",
"data": [
{"title": "...",
"paragraphs": [
{"context": "...",
"qas": [
{"question": "..."
"id": "..."
"answers": [
{"text": "...",
"answer_start": ...}
{...}
...
]
}
{...}
...
]
}
{...},
...
]
}
{...}
...
]
}
"""
def __init__(self, vocab_path, max_len, max_query_len, doc_stride, \
tokenizer='wordpiece', lang='en', seed=None, do_lower_case=False, \
remove_noanswer=True, phase='train'):
"""Create a new Reader for loading and processing machine reading comprehension task data.
def __init__(self, vocab_path, max_len, max_query_len, doc_stride, tokenizer='FullTokenizer', lang='en', seed=None, do_lower_case=False, \
remove_noanswer=True, phase='train', dev_count=1, print_prefix=''):
"""
Args:
phase: train, eval, pred
lang: en, ch, ...
vocab_path: the vocab file path to do tokenization and token_ids generation.
max_len: the maximum length of the sequence (after word segmentation). The part exceeding max_len will be removed from right.
max_query_len: the maximum length of query/question (after word segmentation).
doc_stride: the slice stride of context window.
tokenizer: string type. The name of the used tokenizer. A tokenizer converts raw text into tokens. Available tokenizers: wordpiece.
lang: the language of dataset. Supported language: en (English), cn (Chinese). Default is en (English).
seed: int type. The random seed to shuffle dataset. Default is None, means no use of random seed.
do_lower_case: bool type. Whether to do lowercase on English text. Default is False. This argument only works on English text.
remove_noanswer: bool type. Whether to remove no answer question and invalid answer.
phase: the running phase of this reader. Supported phase: train, predict. Default is train.
Return:
a Reader object for machine reading comprehension task.
"""
Reader.__init__(self, phase)
......@@ -46,19 +94,18 @@ class MrcReader(Reader):
self._is_training = phase == 'train'
mrc_reader = MRCReader(vocab_path,
max_seq_len=max_len,
do_lower_case=do_lower_case,
tokenizer=tokenizer,
doc_stride=doc_stride,
remove_noanswer=remove_noanswer,
max_query_length=max_query_len,
for_cn=for_cn,
random_seed=seed)
mrc_reader = MRCReader_t(vocab_path,
max_seq_len=max_len,
do_lower_case=do_lower_case,
tokenizer=tokenizer,
doc_stride=doc_stride,
remove_noanswer=remove_noanswer,
max_query_length=max_query_len,
for_cn=for_cn,
random_seed=seed)
self._reader = mrc_reader
self._phase = phase
self._dev_count = dev_count
@property
......@@ -81,6 +128,16 @@ class MrcReader(Reader):
"features": None}
def load_data(self, input_file, batch_size, num_epochs=None, file_format='csv', shuffle_train=True):
"""Load mrc data into reader.
Args:
input_file: the dataset file path. File format should keep consistent with `file_format` argument.
batch_size: the number of examples yielded per batch. CAUTION! If your environment has multiple GPU devices (the count is denoted as dev_count), batch_size must be divisible by dev_count with no remainder!
num_epochs: the number of traversals over the input examples. Default is None, which means once for single-task learning and automatically calculated for multi-task learning. This argument only works in the training phase.
file_format: the file format of input file. Supported format: json. Note that the default value in the signature is 'csv'.
shuffle_train: whether to shuffle training dataset. Default is True. This argument only works on training phase.
"""
self._batch_size = batch_size
self._num_epochs = num_epochs
self._data_generator = self._reader.data_generator( \
......
......@@ -17,6 +17,9 @@ from paddlepalm.reader.base_reader import Reader
from paddlepalm.reader.utils.reader4ernie import SequenceLabelReader as SLReader
class SequenceLabelReader(Reader):
"""
The reader completes the loading and processing of sequence labeling type task (e.g., POS tagging, named entity recognition) dataset. Supported file format: tsv.
"""
def __init__(self, vocab_path, max_len, label_map_config, tokenizer='wordpiece', \
lang='en', seed=None, do_lower_case=False, phase='train', dev_count=1, print_prefix=''):
......@@ -65,6 +68,16 @@ class SequenceLabelReader(Reader):
def load_data(self, input_file, batch_size, num_epochs=None, \
file_format='tsv', shuffle_train=True):
"""Load sequence labeling data into reader.
Args:
input_file: the dataset file path. File format should keep consistent with `file_format` argument.
batch_size: the number of examples yielded per batch. CAUTION! If your environment has multiple GPU devices (the count is denoted as dev_count), batch_size must be divisible by dev_count with no remainder!
num_epochs: the number of traversals over the input examples. Default is None, which means once for single-task learning and automatically calculated for multi-task learning. This argument only works in the training phase.
file_format: the file format of input file. Supported format: tsv. Default is tsv.
shuffle_train: whether to shuffle training dataset. Default is True. This argument only works on training phase.
"""
self._batch_size = batch_size
self._num_epochs = num_epochs
self._data_generator = self._reader.data_generator( \
......
......@@ -29,15 +29,12 @@ import six
from io import open
from collections import namedtuple
# from . import gpu_dev_count
gpu_dev_count=1
import paddlepalm as palm
import paddlepalm.tokenizer.ernie_tokenizer as tokenization
from paddlepalm.reader.utils.batching4ernie import pad_batch_data
from paddlepalm.reader.utils.mlm_batching import prepare_batch_data
log = logging.getLogger(__name__)
if six.PY3:
......@@ -481,10 +478,8 @@ class MaskLMReader(Reader):
return_input_mask=True,
return_max_len=False,
return_num_token=False,
# dev_count=gpu_dev_count)
dev_count=1)
# yield batch
for piece in palm.distribute.yield_pieces(batch_data, ['s', 's', 's', 's', 's', 'u', 'u'], batch_size):
yield piece
......
......@@ -295,7 +295,7 @@ class Trainer(object):
assert self._loss_var is not None and self._train_init_prog is not None, "train graph not foung! You should build_forward first."
optimizer._set_prog(self._train_prog, self._train_init_prog)
with fluid.program_guard(self._train_prog, self._train_init_prog):
param_grads = optimizer.build()
param_grads = optimizer._build()
if weight_decay is not None:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册