Unverified commit 82d09960, authored by Xiaoyao Xi, committed via GitHub

Merge pull request #1 from PaddlePaddle/master

update from origin
This diff is collapsed.
......@@ -24,6 +24,7 @@ import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.layer_helper import LayerHelper as LayerHelper
from functools import reduce # py3
def layer_norm(x, begin_norm_axis=1, epsilon=1e-6, param_attr=None, bias_attr=None):
helper = LayerHelper('layer_norm', **locals())
mean = layers.reduce_mean(x, dim=begin_norm_axis, keep_dim=True)
......
......@@ -639,7 +639,8 @@ class MRCReader(BaseReader):
for_cn=True,
task_id=0,
doc_stride=128,
max_query_length=64):
max_query_length=64,
remove_noanswer=True):
self.max_seq_len = max_seq_len
self.tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
......@@ -654,6 +655,7 @@ class MRCReader(BaseReader):
self.max_query_length = max_query_length
self.examples = {}
self.features = {}
self.remove_noanswer = remove_noanswer
if random_seed is not None:
np.random.seed(random_seed)
......@@ -758,7 +760,7 @@ class MRCReader(BaseReader):
return cur_span_index == best_span_index
def _convert_example_to_feature(self, examples, max_seq_length, tokenizer,
is_training):
is_training, remove_noanswer=True):
features = []
unique_id = 1000000000
......@@ -845,6 +847,8 @@ class MRCReader(BaseReader):
if out_of_span:
start_position = 0
end_position = 0
if remove_noanswer:
continue
else:
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
......@@ -958,7 +962,7 @@ class MRCReader(BaseReader):
if not examples:
examples = self._read_json(input_file, phase == "train")
features = self._convert_example_to_feature(
examples, self.max_seq_len, self.tokenizer, phase == "train")
examples, self.max_seq_len, self.tokenizer, phase == "train", remove_noanswer=self.remove_noanswer)
self.examples[phase] = examples
self.features[phase] = features
......
......@@ -114,8 +114,10 @@ class TaskInstance(object):
conf = {}
for k, strv in self._save_protocol.items():
exec('v={}'.format(strv))
conf[k] = v
d = None
v = locals()
exec('d={}'.format(strv), globals(), v)
conf[k] = v['d']
with open(os.path.join(dirpath, '__conf__'), 'w') as writer:
writer.write(json.dumps(conf, indent=1))
print(self._name + ': inference model saved at ' + dirpath)
......
......@@ -162,10 +162,12 @@ class BasicTokenizer(object):
def __init__(self, do_lower_case=True):
    """Constructs a BasicTokenizer.

    Args:
        do_lower_case: Whether to lower case the input. Special marker
            tokens listed in `_never_lowercase` are exempt from
            lowercasing regardless of this flag (see `tokenize`).
    """
    self.do_lower_case = do_lower_case
    # Special tokens that must keep their exact casing so they still match
    # their vocabulary entries even when do_lower_case=True; `tokenize`
    # also passes them through without punctuation splitting.
    self._never_lowercase = ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
def tokenize(self, text):
"""Tokenizes a piece of text."""
......@@ -183,9 +185,12 @@ class BasicTokenizer(object):
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
if self.do_lower_case and token not in self._never_lowercase:
token = token.lower()
token = self._run_strip_accents(token)
if token in self._never_lowercase:
split_tokens.extend([token])
else:
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
......@@ -281,14 +286,18 @@ class WordpieceTokenizer(object):
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer.
Returns:
A list of wordpiece tokens.
"""
......
......@@ -24,8 +24,8 @@ import setuptools
with open("README.md", "r") as fh:
long_description = fh.read()
setuptools.setup(
name="paddle-palm",
version="1.2",
name="paddlepalm",
version="1.0.0",
author="PaddlePaddle",
author_email="zhangyiming04@baidu.com",
description="A Multi-task Learning Lib for PaddlePaddle Users.",
......@@ -63,6 +63,9 @@ setuptools.setup(
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
],
install_requires = [
'paddlepaddle-gpu>=1.6.1'
]
)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To comment, please register.