Commit b44960b1 authored by xixiaoyao

add docs

Parent b34720e4
 from slanted_triangular_schedualer import TriangularSchedualer
 from warmup_schedualer import WarmupSchedualer

-class BaseSchedualer():
+class Schedualer():

     def __init__(self):
         self._prog = None

@@ -7,6 +7,6 @@ class BaseSchedualer():
     def _set_prog(self, prog):
         self._prog = prog

-    def build(self, learning_rate):
+    def _build(self, learning_rate):
         raise NotImplementedError()

     # scheduled_lr = fluid.layers.learning_rate_scheduler\
     #     .noam_decay(1 / (warmup_steps * (config['learning_rate'] ** 2)),
     #                 warmup_steps)
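For orientation (not part of the diff): a minimal sketch of how a concrete subclass is expected to fill in `_build`, assuming the renamed `Schedualer` base above. The class name and the constant schedule are hypothetical.

```python
from paddle import fluid
from paddlepalm.lr_sched.schedualer import Schedualer

class ConstantSchedualer(Schedualer):
    """Hypothetical subclass: keeps the learning rate fixed."""
    def _build(self, learning_rate):
        # self._prog is injected through _set_prog() before _build() runs.
        with self._prog._lr_schedule_guard():
            return fluid.layers.create_global_var(
                shape=[1], value=learning_rate, dtype='float32',
                persistable=True, name='constant_learning_rate')
```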
-from paddlepalm.lr_sched.schedualer import BaseSchedualer
+from paddlepalm.lr_sched.schedualer import Schedualer
 from paddle import fluid

-class TriangularSchedualer(BaseSchedualer):
-    """ Applies linear warmup of learning rate from 0 to learning_rate until warmup_steps, and then decay to 0 linearly until num_train_steps."""
+class TriangularSchedualer(Schedualer):
+    """ Implementation of the Slanted Triangular learning rate schedule; for details see https://arxiv.org/pdf/1801.06146.pdf . Applies linear warmup of the learning rate from 0 to learning_rate over warmup_steps, then decays it linearly to 0 until num_train_steps."""

     def __init__(self, warmup_steps, num_train_steps):
+        """Create a new TriangularSchedualer object.
+
+        Args:
+            warmup_steps: the learning rate will grow from 0 to max_learning_rate over `warmup_steps` steps.
+            num_train_steps: the total number of train steps.
+        """
-        BaseSchedualer.__init__(self)
+        Schedualer.__init__(self)
         assert num_train_steps > warmup_steps > 0
         self.warmup_steps = warmup_steps
         self.num_train_steps = num_train_steps

-    def build(self, learning_rate):
+    def _build(self, learning_rate):
         with self._prog._lr_schedule_guard():
             lr = fluid.layers.tensor.create_global_var(
                 shape=[1],
......
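The fluid graph code is elided by the `......` marker above, but the schedule itself reduces to simple arithmetic; a plain-Python sketch of the slanted triangular rule described in the docstring:

```python
def slanted_triangular_lr(step, learning_rate, warmup_steps, num_train_steps):
    """Linear warmup from 0 to learning_rate, then linear decay back to 0."""
    if step < warmup_steps:
        return learning_rate * step / warmup_steps
    remaining = num_train_steps - step
    return learning_rate * max(0.0, remaining / (num_train_steps - warmup_steps))
```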
-from paddlepalm.lr_sched.schedualer import BaseSchedualer
+from paddlepalm.lr_sched.schedualer import Schedualer
+import paddle.fluid as fluid

-def WarmupSchedualer(BaseSchedualer):
-    """ Applies linear warmup of learning rate from 0 to learning_rate until warmup_steps, and then decay to 0 linearly until num_train_steps."""
+class WarmupSchedualer(Schedualer):
+    """ Applies linear warmup of the learning rate from 0 to learning_rate over the first warmup_steps steps."""

     def __init__(self, warmup_steps):
-        schedualer.__init__(self)
+        Schedualer.__init__(self)
         self.warmup_steps = warmup_steps

-    def build(self, learning_rate):
+    def _build(self, learning_rate):
         with self._prog._lr_schedule_guard():
             lr = fluid.layers.tensor.create_global_var(
......
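Likewise for the warmup-only schedule; a sketch under the assumption (implied by the signature, which takes no num_train_steps) that the rate stays constant after warmup:

```python
def warmup_lr(step, learning_rate, warmup_steps):
    """Linear warmup from 0 to learning_rate, constant afterwards."""
    if step < warmup_steps:
        return learning_rate * step / warmup_steps
    return learning_rate
```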
@@ -20,9 +20,9 @@ from __future__ import print_function

 import numpy as np
 import paddle.fluid as fluid
-from paddlepalm.optimizer.base_optimizer import BaseOptimizer
+from paddlepalm.optimizer.base_optimizer import Optimizer

-class Adam(BaseOptimizer):
+class Adam(Optimizer):

     def __init__(self, loss_var, lr, lr_schedualer=None):

@@ -32,10 +32,10 @@ class Adam(BaseOptimizer):
         self._lr = lr
         self._lr_schedualer = lr_schedualer

-    def build(self, grad_clip=None):
+    def _build(self, grad_clip=None):
         if self._lr_schedualer is not None:
-            self._lr = self._lr_schedualer.build(self._lr)
+            self._lr = self._lr_schedualer._build(self._lr)
         optimizer = fluid.optimizer.Adam(learning_rate=self._lr)
......
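How the renamed pieces are meant to be combined, as a hedged usage sketch; the module-level exports and hyper-parameter values are assumptions, and `loss_var` is whatever the forward graph produced:

```python
import paddlepalm as palm

# Hypothetical hyper-parameters; TriangularSchedualer and Adam are the classes
# from this commit (exports via palm.lr_sched / palm.optimizer are assumed).
sched = palm.lr_sched.TriangularSchedualer(warmup_steps=1000,
                                           num_train_steps=10000)
adam = palm.optimizer.Adam(loss_var, lr=5e-5, lr_schedualer=sched)
# The Trainer later calls adam._set_prog(...) and adam._build() internally.
```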
-class BaseOptimizer():
+class Optimizer(object):

     def __init__(self, loss_var, lr, lr_schedualer=None):
         self._prog = None
         self._lr_schedualer = lr_schedualer

-    def build(self, grad_clip=None):
-        pass
+    def _build(self, grad_clip=None):
+        raise NotImplementedError()

     def _set_prog(self, prog, init_prog):
         self._prog = prog
......
 from cls import ClassifyReader
 from match import MatchReader
-from ner import SequenceLabelReader
+from seq_label import SequenceLabelReader
-from mrc import MrcReader
+from mrc import MRCReader
 from mlm import MaskLMReader
@@ -38,7 +38,7 @@ class ClassifyReader(Reader):

     def __init__(self, vocab_path, max_len, tokenizer='wordpiece', \
                  lang='en', seed=None, do_lower_case=False, phase='train'):
-        """Create a new Reader for classification task data.
+        """Create a new Reader for loading and processing classification task data.

         Args:
             vocab_path: the vocab file path to do tokenization and token_ids generation.
......
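A hedged construction sketch for the reader documented above; the file paths are placeholders, and the load_data arguments are assumed to mirror the other readers in this commit:

```python
import paddlepalm as palm

cls_reader = palm.reader.ClassifyReader(vocab_path='./vocab.txt',  # placeholder path
                                        max_len=128,
                                        tokenizer='wordpiece',
                                        lang='en',
                                        phase='train')
cls_reader.load_data('./train.tsv', batch_size=32, num_epochs=3)
```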
@@ -36,21 +36,30 @@ class MatchReader(Reader):

     text_a [TAB] text_b [TAB] text_b_neg
     Today is a good day. [TAB] what a nice day! [TAB] terrible day!
     Such a terrible day! [TAB] So terrible today! [TAB] There is a dog.
-    I feel lucky to meet you, dear. [TAB] You are my lucky, darling. [TAB]
-    He likes sunshine and I like him :). [TAB] I like him. He likes sunshine.
-    JUST! GO! OUT! [TAB] Come in please.
+    I feel lucky to meet you, dear. [TAB] You are my lucky, darling. [TAB] Buy some bananas, okay?
+    He likes sunshine and I like him :). [TAB] I like him. He likes sunshine. [TAB] He has a dog.
+    JUST! GO! OUT! [TAB] go out now! [TAB] Come in please.

-    CAUTIOUS: The first line of the file must be header! And areas are splited by tab (\\t).
+    CAUTION: a header line is required in each dataset file, and fields (columns) must be separated by Tab (\\t).
     """

     def __init__(self, vocab_path, max_len, tokenizer='wordpiece', lang='en', seed=None, \
                  do_lower_case=False, learning_strategy='pointwise', phase='train', dev_count=1, print_prefix=''):  # add more as needed
-        """
-        Args:
-            phase: train, eval, pred
-            lang: en, ch, ...
-            learning_strategy: pointwise, pairwise
-        """
+        """Create a new Reader for loading and processing matching task data.
+
+        Args:
+            vocab_path: the vocab file path to do tokenization and token_ids generation.
+            max_len: the maximum length of the sequence (after word segmentation). The part exceeding max_len will be removed from the right.
+            tokenizer: string type. The name of the tokenizer to use; a tokenizer converts raw text into tokens. Available tokenizers: wordpiece.
+            lang: the language of the dataset. Supported languages: en (English), cn (Chinese). Default is en (English).
+            seed: int type. The random seed used to shuffle the dataset. Default is None, meaning no random seed is used.
+            do_lower_case: bool type. Whether to lowercase English text. Default is False. This argument only works on English text.
+            learning_strategy: string type. Only used in the training phase. Available strategies: pointwise, pairwise.
+            phase: the running phase of this reader. Supported phases: train, predict. Default is train.
+
+        Return:
+            a Reader object for matching-like tasks.
+        """
         Reader.__init__(self, phase)
@@ -106,6 +115,16 @@ class MatchReader(Reader):

     def load_data(self, input_file, batch_size, num_epochs=None, \
                   file_format='tsv', shuffle_train=True):
+        """Load matching data into the reader.
+
+        Args:
+            input_file: the dataset file path. The file format should be consistent with the `file_format` argument.
+            batch_size: the number of examples per yield. CAUTION: if your environment has multiple GPU devices (dev_count), batch_size must be divisible by dev_count!
+            num_epochs: the number of traversals over the input examples. Default is None, meaning once for single-task learning and automatically calculated for multi-task learning. This argument only works in the training phase.
+            file_format: the file format of the input file. Supported format: tsv. Default is tsv.
+            shuffle_train: whether to shuffle the training dataset. Default is True. This argument only works in the training phase.
+        """
         self._batch_size = batch_size
         self._num_epochs = num_epochs
         self._data_generator = self._reader.data_generator( \
......
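Putting the docstring to work, a hedged sketch of loading a pairwise matching dataset; paths and sizes are placeholders:

```python
import paddlepalm as palm

match_reader = palm.reader.MatchReader(vocab_path='./vocab.txt',
                                       max_len=128,
                                       learning_strategy='pairwise',
                                       phase='train')
# batch_size must be divisible by the number of visible GPU devices.
match_reader.load_data('./train.tsv', batch_size=16, num_epochs=2,
                       file_format='tsv', shuffle_train=True)
```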
@@ -14,17 +14,65 @@
 # limitations under the License.

 from paddlepalm.reader.base_reader import Reader
-from paddlepalm.reader.utils.reader4ernie import MRCReader
+from paddlepalm.reader.utils.reader4ernie import MRCReader as MRCReader_t
 import numpy as np

-class MrcReader(Reader):
+class MRCReader(Reader):
+    """
+    This reader handles the loading and processing of SQuAD-like machine reading comprehension datasets. Supported file format: json.
+
+    The outermost structure of a dataset is a dictionary with a version field and a data field. In the data field, each example contains the title of an article and several paragraphs. Each paragraph contains a context and the corresponding question-answer pairs. Each q-a pair holds a question with a globally unique ID, together with one or more answers. Each answer item contains the answer text itself and its start position in the context. Note that the start position is at the character level. For test sets, the answers field is not required.
+
+    A typical example is shown below.
+
+    {"version": "1.0",
+     "data": [
+         {"title": "...",
+          "paragraphs": [
+              {"context": "...",
+               "qas": [
+                   {"question": "...",
+                    "id": "...",
+                    "answers": [
+                        {"text": "...",
+                         "answer_start": ...},
+                        {...},
+                        ...
+                    ]},
+                   {...},
+                   ...
+               ]},
+              {...},
+              ...
+          ]},
+         {...},
+         ...
+     ]}
+    """
-    def __init__(self, vocab_path, max_len, max_query_len, doc_stride, tokenizer='FullTokenizer', lang='en', seed=None, do_lower_case=False, \
-                 remove_noanswer=True, phase='train', dev_count=1, print_prefix=''):
-        """
-        Args:
-            phase: train, eval, pred
-            lang: en, ch, ...
-        """
+    def __init__(self, vocab_path, max_len, max_query_len, doc_stride, \
+                 tokenizer='wordpiece', lang='en', seed=None, do_lower_case=False, \
+                 remove_noanswer=True, phase='train'):
+        """Create a new Reader for loading and processing machine reading comprehension task data.
+
+        Args:
+            vocab_path: the vocab file path to do tokenization and token_ids generation.
+            max_len: the maximum length of the sequence (after word segmentation). The part exceeding max_len will be removed from the right.
+            max_query_len: the maximum length of the query/question (after word segmentation).
+            doc_stride: the sliding stride of the context window.
+            tokenizer: string type. The name of the tokenizer to use; a tokenizer converts raw text into tokens. Available tokenizers: wordpiece.
+            lang: the language of the dataset. Supported languages: en (English), cn (Chinese). Default is en (English).
+            seed: int type. The random seed used to shuffle the dataset. Default is None, meaning no random seed is used.
+            do_lower_case: bool type. Whether to lowercase English text. Default is False. This argument only works on English text.
+            remove_noanswer: bool type. Whether to remove unanswerable questions and invalid answers.
+            phase: the running phase of this reader. Supported phases: train, predict. Default is train.
+
+        Return:
+            a Reader object for machine reading comprehension tasks.
+        """
         Reader.__init__(self, phase)
@@ -46,19 +94,18 @@ class MrcReader(Reader):
         self._is_training = phase == 'train'

-        mrc_reader = MRCReader(vocab_path,
+        mrc_reader = MRCReader_t(vocab_path,
                                max_seq_len=max_len,
                                do_lower_case=do_lower_case,
                                tokenizer=tokenizer,
                                doc_stride=doc_stride,
                                remove_noanswer=remove_noanswer,
                                max_query_length=max_query_len,
                                for_cn=for_cn,
                                random_seed=seed)
         self._reader = mrc_reader
         self._phase = phase
-        self._dev_count = dev_count

     @property
@@ -81,6 +128,16 @@ class MrcReader(Reader):
             "features": None}

     def load_data(self, input_file, batch_size, num_epochs=None, file_format='csv', shuffle_train=True):
+        """Load mrc data into the reader.
+
+        Args:
+            input_file: the dataset file path. The file format should be consistent with the `file_format` argument.
+            batch_size: the number of examples per yield. CAUTION: if your environment has multiple GPU devices (dev_count), batch_size must be divisible by dev_count!
+            num_epochs: the number of traversals over the input examples. Default is None, meaning once for single-task learning and automatically calculated for multi-task learning. This argument only works in the training phase.
+            file_format: the file format of the input file. Supported format: csv. Default is csv.
+            shuffle_train: whether to shuffle the training dataset. Default is True. This argument only works in the training phase.
+        """
         self._batch_size = batch_size
         self._num_epochs = num_epochs
         self._data_generator = self._reader.data_generator( \
......
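To make the nested structure from the class docstring concrete, a stand-alone sketch (independent of paddlepalm) that walks a SQuAD-like json file of exactly that shape:

```python
import json

with open('train.json', encoding='utf8') as f:  # placeholder path
    dataset = json.load(f)

for article in dataset['data']:
    for paragraph in article['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            question, qid = qa['question'], qa['id']
            # 'answers' may be absent in test sets (see the docstring above).
            for answer in qa.get('answers', []):
                text = answer['text']
                start = answer['answer_start']  # character-level offset
```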
@@ -17,6 +17,9 @@ from paddlepalm.reader.base_reader import Reader
 from paddlepalm.reader.utils.reader4ernie import SequenceLabelReader as SLReader

 class SequenceLabelReader(Reader):
+    """
+    This reader handles the loading and processing of sequence labeling datasets (e.g. POS tagging, named entity recognition). Supported file format: tsv.
+    """

     def __init__(self, vocab_path, max_len, label_map_config, tokenizer='wordpiece', \
                  lang='en', seed=None, do_lower_case=False, phase='train', dev_count=1, print_prefix=''):

@@ -65,6 +68,16 @@ class SequenceLabelReader(Reader):

     def load_data(self, input_file, batch_size, num_epochs=None, \
                   file_format='tsv', shuffle_train=True):
+        """Load sequence labeling data into the reader.
+
+        Args:
+            input_file: the dataset file path. The file format should be consistent with the `file_format` argument.
+            batch_size: the number of examples per yield. CAUTION: if your environment has multiple GPU devices (dev_count), batch_size must be divisible by dev_count!
+            num_epochs: the number of traversals over the input examples. Default is None, meaning once for single-task learning and automatically calculated for multi-task learning. This argument only works in the training phase.
+            file_format: the file format of the input file. Supported format: tsv. Default is tsv.
+            shuffle_train: whether to shuffle the training dataset. Default is True. This argument only works in the training phase.
+        """
         self._batch_size = batch_size
         self._num_epochs = num_epochs
         self._data_generator = self._reader.data_generator( \
......
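And a hedged construction sketch for the sequence labeling reader; the label_map_config value is a placeholder for a label-name-to-id mapping file:

```python
import paddlepalm as palm

ner_reader = palm.reader.SequenceLabelReader(
    vocab_path='./vocab.txt',             # placeholder path
    max_len=128,
    label_map_config='./label_map.json',  # placeholder: label -> id mapping
    phase='train')
ner_reader.load_data('./train.tsv', batch_size=32, num_epochs=2)
```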
@@ -29,15 +29,12 @@ import six
 from io import open
 from collections import namedtuple

-# from . import gpu_dev_count
-gpu_dev_count = 1
 import paddlepalm as palm
 import paddlepalm.tokenizer.ernie_tokenizer as tokenization
 from paddlepalm.reader.utils.batching4ernie import pad_batch_data
 from paddlepalm.reader.utils.mlm_batching import prepare_batch_data

 log = logging.getLogger(__name__)

 if six.PY3:

@@ -481,10 +478,8 @@ class MaskLMReader(Reader):
                 return_input_mask=True,
                 return_max_len=False,
                 return_num_token=False,
-                # dev_count=gpu_dev_count)
                 dev_count=1)

             # yield batch
             for piece in palm.distribute.yield_pieces(batch_data, ['s', 's', 's', 's', 's', 'u', 'u'], batch_size):
                 yield piece
......
@@ -295,7 +295,7 @@ class Trainer(object):
         assert self._loss_var is not None and self._train_init_prog is not None, "train graph not found! You should build_forward first."

         optimizer._set_prog(self._train_prog, self._train_init_prog)
         with fluid.program_guard(self._train_prog, self._train_init_prog):
-            param_grads = optimizer.build()
+            param_grads = optimizer._build()

             if weight_decay is not None:
......
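From the user's side this hunk is reached through build_backward; a heavily hedged end-to-end sketch (the Trainer constructor and the build_forward arguments are assumptions, not shown in this commit):

```python
import paddlepalm as palm

trainer = palm.Trainer('classify')                 # hypothetical task name
loss_var = trainer.build_forward(backbone, head)   # assumed signature
trainer.build_backward(optimizer=adam, weight_decay=0.01)
```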