Unverified commit 82d09960, authored by Xiaoyao Xi, committed by GitHub

Merge pull request #1 from PaddlePaddle/master

update from origin
This diff is collapsed.
@@ -24,6 +24,7 @@ import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid.layer_helper import LayerHelper as LayerHelper
+from functools import reduce # py3

 def layer_norm(x, begin_norm_axis=1, epsilon=1e-6, param_attr=None, bias_attr=None):
     helper = LayerHelper('layer_norm', **locals())
     mean = layers.reduce_mean(x, dim=begin_norm_axis, keep_dim=True)
...
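The newly added `from functools import reduce # py3` line is needed because Python 3 dropped the `reduce` builtin that Python 2 provided. A minimal sketch of the kind of shape arithmetic a layer-norm helper typically uses it for (the function name below is illustrative, not taken from the repository):

from functools import reduce  # builtin in Python 2, must be imported in Python 3
import operator

def normalized_size(shape, begin_norm_axis=1):
    # Product of the trailing dimensions that layer normalization operates over,
    # e.g. shape [8, 16, 32] with begin_norm_axis=1 gives 16 * 32 = 512.
    return reduce(operator.mul, shape[begin_norm_axis:], 1)

print(normalized_size([8, 16, 32]))  # 512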
@@ -639,7 +639,8 @@ class MRCReader(BaseReader):
                  for_cn=True,
                  task_id=0,
                  doc_stride=128,
-                 max_query_length=64):
+                 max_query_length=64,
+                 remove_noanswer=True):
         self.max_seq_len = max_seq_len
         self.tokenizer = tokenization.FullTokenizer(
             vocab_file=vocab_path, do_lower_case=do_lower_case)
@@ -654,6 +655,7 @@ class MRCReader(BaseReader):
         self.max_query_length = max_query_length
         self.examples = {}
         self.features = {}
+        self.remove_noanswer = remove_noanswer

         if random_seed is not None:
             np.random.seed(random_seed)
@@ -758,7 +760,7 @@ class MRCReader(BaseReader):
         return cur_span_index == best_span_index

     def _convert_example_to_feature(self, examples, max_seq_length, tokenizer,
-                                    is_training):
+                                    is_training, remove_noanswer=True):
         features = []
         unique_id = 1000000000
@@ -845,6 +847,8 @@ class MRCReader(BaseReader):
                     if out_of_span:
                         start_position = 0
                         end_position = 0
+                        if remove_noanswer:
+                            continue
                     else:
                         doc_offset = len(query_tokens) + 2
                         start_position = tok_start_position - doc_start + doc_offset
@@ -958,7 +962,7 @@ class MRCReader(BaseReader):
         if not examples:
             examples = self._read_json(input_file, phase == "train")
             features = self._convert_example_to_feature(
-                examples, self.max_seq_len, self.tokenizer, phase == "train")
+                examples, self.max_seq_len, self.tokenizer, phase == "train", remove_noanswer=self.remove_noanswer)
             self.examples[phase] = examples
             self.features[phase] = features
...
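The `remove_noanswer` flag added above decides what happens when the labeled answer falls outside the current sliding-window span during training: the new default drops such spans instead of keeping them with start and end positions of 0. A self-contained sketch of that decision (simplified names, not the reader's actual helper):

def label_positions(tok_start, tok_end, doc_start, doc_end, query_len,
                    remove_noanswer=True):
    # Returns (start, end) label positions for one document span,
    # or None when the span should be dropped from training.
    out_of_span = not (tok_start >= doc_start and tok_end <= doc_end)
    if out_of_span:
        if remove_noanswer:
            return None      # new behaviour: skip the unanswerable span
        return (0, 0)        # old behaviour: keep it, labeled as no-answer
    doc_offset = query_len + 2   # accounts for [CLS], the query tokens and [SEP]
    return (tok_start - doc_start + doc_offset,
            tok_end - doc_start + doc_offset)

print(label_positions(50, 53, 0, 40, query_len=10))  # None: answer lies outside the span
print(label_positions(20, 23, 0, 40, query_len=10))  # (32, 35)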
@@ -114,8 +114,10 @@ class TaskInstance(object):
         conf = {}
         for k, strv in self._save_protocol.items():
-            exec('v={}'.format(strv))
-            conf[k] = v
+            d = None
+            v = locals()
+            exec('d={}'.format(strv), globals(), v)
+            conf[k] = v['d']

         with open(os.path.join(dirpath, '__conf__'), 'w') as writer:
             writer.write(json.dumps(conf, indent=1))
         print(self._name + ': inference model saved at ' + dirpath)
...
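This rewrite works around a Python 3 restriction: `exec` inside a function can no longer create or rebind that function's local variables, so the old pattern of `exec('v=...')` followed by reading `v` typically fails with a NameError. Evaluating into an explicit namespace dictionary and reading the result back, as the diff does, behaves the same on Python 2 and 3. A small sketch of the same pattern (the helper name is illustrative):

def evaluate_entry(expression, context):
    # Evaluate `expression` with access to the names in `context`, then read the
    # result back from the explicit namespace dict that exec() wrote into.
    namespace = dict(context)
    exec('d = {}'.format(expression), globals(), namespace)
    return namespace['d']

print(evaluate_entry('x * 2', {'x': 21}))  # 42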
...@@ -162,10 +162,12 @@ class BasicTokenizer(object): ...@@ -162,10 +162,12 @@ class BasicTokenizer(object):
def __init__(self, do_lower_case=True): def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer. """Constructs a BasicTokenizer.
Args: Args:
do_lower_case: Whether to lower case the input. do_lower_case: Whether to lower case the input.
""" """
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self._never_lowercase = ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
def tokenize(self, text): def tokenize(self, text):
"""Tokenizes a piece of text.""" """Tokenizes a piece of text."""
...@@ -183,9 +185,12 @@ class BasicTokenizer(object): ...@@ -183,9 +185,12 @@ class BasicTokenizer(object):
orig_tokens = whitespace_tokenize(text) orig_tokens = whitespace_tokenize(text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if self.do_lower_case: if self.do_lower_case and token not in self._never_lowercase:
token = token.lower() token = token.lower()
token = self._run_strip_accents(token) token = self._run_strip_accents(token)
if token in self._never_lowercase:
split_tokens.extend([token])
else:
split_tokens.extend(self._run_split_on_punc(token)) split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens)) output_tokens = whitespace_tokenize(" ".join(split_tokens))
@@ -281,14 +286,18 @@ class WordpieceTokenizer(object):
     def tokenize(self, text):
         """Tokenizes a piece of text into its word pieces.

         This uses a greedy longest-match-first algorithm to perform tokenization
         using the given vocabulary.

         For example:
             input = "unaffable"
             output = ["un", "##aff", "##able"]

         Args:
             text: A single token or whitespace separated tokens. This should have
                 already been passed through `BasicTokenizer.

         Returns:
             A list of wordpiece tokens.
         """
...
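The BasicTokenizer changes above keep special tokens such as [SEP] and [MASK] intact: they are neither lowercased nor split on punctuation. A standalone sketch of that behaviour, with a deliberately crude punctuation splitter standing in for `_run_split_on_punc` (none of the names below come from the repository):

NEVER_LOWERCASE = ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

def split_on_punc(token):
    # Crude stand-in for the tokenizer's punctuation splitting.
    pieces, current = [], ''
    for ch in token:
        if ch.isalnum():
            current += ch
        else:
            if current:
                pieces.append(current)
                current = ''
            pieces.append(ch)
    if current:
        pieces.append(current)
    return pieces

def basic_tokenize(text, do_lower_case=True):
    split_tokens = []
    for token in text.split():
        if token in NEVER_LOWERCASE:
            split_tokens.append(token)  # special tokens pass through untouched
            continue
        if do_lower_case:
            token = token.lower()
        split_tokens.extend(split_on_punc(token))
    return split_tokens

print(basic_tokenize('Hello, world [SEP] Bye!'))
# ['hello', ',', 'world', '[SEP]', 'bye', '!']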
@@ -24,8 +24,8 @@ import setuptools
 with open("README.md", "r") as fh:
     long_description = fh.read()

 setuptools.setup(
-    name="paddle-palm",
-    version="1.2",
+    name="paddlepalm",
+    version="1.0.0",
     author="PaddlePaddle",
     author_email="zhangyiming04@baidu.com",
     description="A Multi-task Learning Lib for PaddlePaddle Users.",
@@ -63,6 +63,9 @@ setuptools.setup(
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
     ],
+    install_requires = [
+        'paddlepaddle-gpu>=1.6.1'
+    ]
 )
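With the distribution renamed to paddlepalm, the version set to 1.0.0, and paddlepaddle-gpu>=1.6.1 declared in install_requires, the installed environment can be sanity-checked at runtime. An illustrative snippet, assuming `paddle.__version__` reports the installed framework version (not part of the repository):

import paddle

major, minor = (int(p) for p in paddle.__version__.split('.')[:2])
assert (major, minor) >= (1, 6), 'paddlepalm 1.0.0 expects PaddlePaddle >= 1.6.1'
print('PaddlePaddle version:', paddle.__version__)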