PaddlePaddle / PALM

Commit 82d09960 (unverified)
Authored by Xiaoyao Xi on Nov 26, 2019; committed via GitHub on Nov 26, 2019.
Parents: 4c4807ad, 7a7e7551

Merge pull request #1 from PaddlePaddle/master

update from origin
Showing 6 changed files with 895 additions and 52 deletions.
README.md                                   +865   -41
paddlepalm/backbone/utils/transformer.py      +1    -0
paddlepalm/reader/utils/reader4ernie.py       +7    -3
paddlepalm/task_instance.py                   +6    -4
paddlepalm/tokenizer/ernie_tokenizer.py      +11    -2
setup.py                                      +5    -2
README.md  (+865, -41)

This diff is collapsed (too large to display inline).
paddlepalm/backbone/utils/transformer.py  (+1, -0)

@@ -24,6 +24,7 @@ import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid.layer_helper import LayerHelper as LayerHelper
+from functools import reduce  # py3

 def layer_norm(x, begin_norm_axis=1, epsilon=1e-6, param_attr=None, bias_attr=None):
     helper = LayerHelper('layer_norm', **locals())
     mean = layers.reduce_mean(x, dim=begin_norm_axis, keep_dim=True)
 ...
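Note: the single added line imports reduce from functools. reduce() was a builtin in Python 2 but lives in functools under Python 3 (hence the "# py3" comment), so shape-folding code in this module would raise NameError without the import. A minimal, self-contained illustration (the shape values are made up):

    # Python 3: reduce is no longer a builtin and must come from functools.
    from functools import reduce

    dims = [8, 128, 768]                      # hypothetical [batch, seq_len, hidden] shape
    n_elements = reduce(lambda a, b: a * b, dims, 1)
    print(n_elements)                         # 786432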
paddlepalm/reader/utils/reader4ernie.py  (+7, -3)

@@ -639,7 +639,8 @@ class MRCReader(BaseReader):
                  for_cn=True,
                  task_id=0,
                  doc_stride=128,
-                 max_query_length=64):
+                 max_query_length=64,
+                 remove_noanswer=True):
         self.max_seq_len = max_seq_len
         self.tokenizer = tokenization.FullTokenizer(
             vocab_file=vocab_path, do_lower_case=do_lower_case)

@@ -654,6 +655,7 @@ class MRCReader(BaseReader):
         self.max_query_length = max_query_length
         self.examples = {}
         self.features = {}
+        self.remove_noanswer = remove_noanswer
         if random_seed is not None:
             np.random.seed(random_seed)

@@ -758,7 +760,7 @@ class MRCReader(BaseReader):
         return cur_span_index == best_span_index

     def _convert_example_to_feature(self, examples, max_seq_length, tokenizer,
-                                    is_training):
+                                    is_training, remove_noanswer=True):
         features = []
         unique_id = 1000000000

@@ -845,6 +847,8 @@ class MRCReader(BaseReader):
                 if out_of_span:
                     start_position = 0
                     end_position = 0
+                    if remove_noanswer:
+                        continue
                 else:
                     doc_offset = len(query_tokens) + 2
                     start_position = tok_start_position - doc_start + doc_offset

@@ -958,7 +962,7 @@ class MRCReader(BaseReader):
         if not examples:
             examples = self._read_json(input_file, phase == "train")
             features = self._convert_example_to_feature(
-                examples, self.max_seq_len, self.tokenizer, phase == "train")
+                examples, self.max_seq_len, self.tokenizer, phase == "train", remove_noanswer=self.remove_noanswer)
             self.examples[phase] = examples
             self.features[phase] = features
 ...
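Taken together, these hunks thread a new remove_noanswer flag (default True) from the MRCReader constructor down to _convert_example_to_feature: when a training example's gold answer span falls outside the current sliding-window doc span, the feature is now skipped entirely instead of being emitted with start and end positions of 0. A reduced sketch of just that decision rule, with the surrounding reader machinery omitted (the function name and span arguments are illustrative, not PALM API):

    # Sketch of the out-of-span rule the patch changes. Returns True if the
    # feature should be kept for training.
    def keep_feature(tok_start, tok_end, doc_start, doc_end, remove_noanswer=True):
        out_of_span = not (doc_start <= tok_start and tok_end <= doc_end)
        if out_of_span:
            # Before the patch: keep the feature with positions (0, 0).
            # After the patch (remove_noanswer=True): drop it.
            return not remove_noanswer
        return True

    assert keep_feature(5, 9, 0, 20)                           # answer inside the span
    assert not keep_feature(25, 30, 0, 20)                     # dropped under the new default
    assert keep_feature(25, 30, 0, 20, remove_noanswer=False)  # old behavior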
paddlepalm/task_instance.py  (+6, -4)

@@ -113,9 +113,11 @@ class TaskInstance(object):
         fluid.io.save_inference_model(dirpath, self._pred_input_varname_list, self._pred_fetch_var_list, self._exe, prog)
         conf = {}
         for k, strv in self._save_protocol.items():
-            exec('v={}'.format(strv))
-            conf[k] = v
+            d = None
+            v = locals()
+            exec('d={}'.format(strv), globals(), v)
+            conf[k] = v['d']
         with open(os.path.join(dirpath, '__conf__'), 'w') as writer:
             writer.write(json.dumps(conf, indent=1))
         print(self._name + ': inference model saved at ' + dirpath)

@@ -123,7 +125,7 @@ class TaskInstance(object):
     def load(self, infer_model_path=None):
         if infer_model_path is None:
             infer_model_path = self._save_infermodel_path
         for k, v in json.load(open(os.path.join(infer_model_path, '__conf__'))).items():
             strv = self._save_protocol[k]
             exec('{}=v'.format(strv))
         pred_prog, self._pred_input_varname_list, self._pred_fetch_var_list = \
 ...
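The save() hunk is a Python 3 compatibility fix: inside a function, exec() cannot rebind local variables, so the old exec('v={}'.format(strv)) followed by conf[k] = v fails under Python 3. The patched code passes an explicit namespace dict to exec() and reads the result back out of it. A self-contained demonstration of the pattern, independent of PALM's classes:

    # In Python 3, exec() writes into the mapping you hand it, not into the
    # enclosing function's locals; fetch the result from that mapping.
    def eval_into_dict(expression):
        ns = {'answer': 41}                    # stands in for locals() in the real code
        exec('d={}'.format(expression), globals(), ns)
        return ns['d']

    print(eval_into_dict('answer + 1'))        # 42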
paddlepalm/tokenizer/ernie_tokenizer.py  (+11, -2)

@@ -162,10 +162,12 @@ class BasicTokenizer(object):
     def __init__(self, do_lower_case=True):
         """Constructs a BasicTokenizer.

         Args:
             do_lower_case: Whether to lower case the input.
         """
         self.do_lower_case = do_lower_case
+        self._never_lowercase = ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

     def tokenize(self, text):
         """Tokenizes a piece of text."""

@@ -183,10 +185,13 @@ class BasicTokenizer(object):
         orig_tokens = whitespace_tokenize(text)
         split_tokens = []
         for token in orig_tokens:
-            if self.do_lower_case:
+            if self.do_lower_case and token not in self._never_lowercase:
                 token = token.lower()
                 token = self._run_strip_accents(token)
-            split_tokens.extend(self._run_split_on_punc(token))
+            if token in self._never_lowercase:
+                split_tokens.extend([token])
+            else:
+                split_tokens.extend(self._run_split_on_punc(token))
         output_tokens = whitespace_tokenize(" ".join(split_tokens))
         return output_tokens

@@ -281,14 +286,18 @@ class WordpieceTokenizer(object):
     def tokenize(self, text):
         """Tokenizes a piece of text into its word pieces.

         This uses a greedy longest-match-first algorithm to perform tokenization
         using the given vocabulary.

         For example:
             input = "unaffable"
             output = ["un", "##aff", "##able"]

         Args:
             text: A single token or whitespace separated tokens. This should have
                 already been passed through `BasicTokenizer.

         Returns:
             A list of wordpiece tokens.
         """
 ...
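These hunks make BasicTokenizer treat the special markers '[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]' as atomic: they are no longer lowercased, accent-stripped, or split on punctuation, so '[CLS]' can no longer degrade into '[', 'cls', ']'. A reduced sketch of the new control flow (whitespace splitting and punctuation splitting are simplified stand-ins for the real helpers):

    import re

    NEVER_LOWERCASE = ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

    def basic_tokenize(text, do_lower_case=True):
        split_tokens = []
        for token in text.split():
            # Special markers bypass lowercasing...
            if do_lower_case and token not in NEVER_LOWERCASE:
                token = token.lower()
            # ...and bypass punctuation splitting as well.
            if token in NEVER_LOWERCASE:
                split_tokens.append(token)
            else:
                split_tokens.extend(t for t in re.split(r'(\W)', token) if t.strip())
        return split_tokens

    print(basic_tokenize('[CLS] Hello, world! [SEP]'))
    # ['[CLS]', 'hello', ',', 'world', '!', '[SEP]']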
setup.py  (+5, -2)

@@ -24,8 +24,8 @@ import setuptools
 with open("README.md", "r") as fh:
     long_description = fh.read()

 setuptools.setup(
-    name="paddle-palm",
-    version="1.2",
+    name="paddlepalm",
+    version="1.0.0",
     author="PaddlePaddle",
     author_email="zhangyiming04@baidu.com",
     description="A Multi-task Learning Lib for PaddlePaddle Users.",

@@ -63,6 +63,9 @@ setuptools.setup(
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
     ],
+    install_requires=[
+        'paddlepaddle-gpu>=1.6.1'
+    ]
 )
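Net effect of the setup.py changes: the distribution is renamed from paddle-palm to paddlepalm, the version is reset to 1.0.0, and paddlepaddle-gpu>=1.6.1 is now declared as an install-time dependency, so pip pulls in the framework automatically. Assuming the package is published under the new name, a quick post-install sanity check might be:

    # After `pip install paddlepalm`, both the library and the framework it
    # now depends on should import cleanly.
    import paddlepalm
    import paddle.fluid as fluid

    print(paddlepalm.__name__, fluid.__name__)   # paddlepalm paddle.fluid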