magicwindyyd / mindspore
Forked from MindSpore / mindspore
Commit 980ddd32
Authored Jun 20, 2020 by qianlong

change output of WordpieceTokenizer and BertTokenizer to 1-D string tensors

Parent: e8639ad9
Changes: 4 changed files with 25 additions and 68 deletions (+25 -68)

  mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc   +7  -28
  mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.h    +0   -2
  mindspore/dataset/text/transforms.py                             +1   -1
  tests/ut/python/dataset/test_bert_tokenizer.py                  +17  -37
mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc

@@ -32,23 +32,6 @@ WordpieceTokenizerOp::WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab,
       max_bytes_per_token_(max_bytes_per_token),
       unknown_token_(unknown_token) {}
 
-void WordpieceTokenizerOp::PadTokens(const std::vector<std::vector<std::string>> &tokens,
-                                     const std::string &padded_str, std::vector<std::string> *out_padded_tokens,
-                                     int *out_cols) const {
-  int rows = tokens.size();
-  int max_cols = 0;
-  for (int i = 0; i < rows; i++) {
-    max_cols = std::max(max_cols, static_cast<int>(tokens[i].size()));
-  }
-  out_padded_tokens->resize(rows * max_cols, padded_str);
-  for (int i = 0; i < rows; i++) {
-    int index = i * max_cols;
-    for (int j = 0; j < tokens[i].size(); j++) {
-      (*out_padded_tokens)[index++] = tokens[i][j];
-    }
-  }
-  *out_cols = max_cols;
-}
-
 Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start,
                                         bool *out_found, int *out_end) const {
   CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && start < input_token.size(), "Out of range");

@@ -117,20 +100,16 @@ Status WordpieceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::
   if (input->Rank() > 1 || input->type() != DataType::DE_STRING) {
     RETURN_STATUS_UNEXPECTED("The input tensor should be scalar or 1-D string tensor");
   }
-  std::vector<std::vector<std::string>> out_tokens(input->Size());
-  int i = 0;
+  std::vector<std::string> out_tokens;
   for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
-    RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &out_tokens[i++]));
+    std::vector<std::string> temp_tokens;
+    RETURN_IF_NOT_OK(GetTokens(std::string(*iter), &temp_tokens));
+    out_tokens.insert(out_tokens.end(), temp_tokens.begin(), temp_tokens.end());
   }
-  std::vector<std::string> padded_tokens;
-  int cols = 0;
-  PadTokens(out_tokens, "<pad>", &padded_tokens, &cols);
-  std::vector<dsize_t> shapes;
-  if (input->Rank() == 1) {
-    shapes.push_back(out_tokens.size());
+  if (out_tokens.empty()) {
+    out_tokens.emplace_back("");
   }
-  shapes.push_back(cols);
-  *output = std::make_shared<Tensor>(std::move(padded_tokens), TensorShape(shapes));
+  *output = std::make_shared<Tensor>(out_tokens, TensorShape({(dsize_t)out_tokens.size()}));
   return Status::OK();
 }
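The net effect of the Compute() change above: WordpieceTokenizerOp no longer pads each row of subwords to the longest row and emits a rows x max_cols 2-D tensor; it concatenates all subwords into a single 1-D string tensor. A minimal Python sketch of the two layouts (illustrative only, not part of the commit; get_tokens stands in for WordpieceTokenizerOp::GetTokens):

# Illustrative sketch only: mirrors the removed PadTokens() layout vs. the new 1-D layout.
def tokenize_old(strings, get_tokens, pad="<pad>"):
    """Old behaviour: pad every row to the longest row -> rows x max_cols values."""
    token_lists = [get_tokens(s) for s in strings]
    max_cols = max((len(t) for t in token_lists), default=0)
    values = []
    for tokens in token_lists:
        values.extend(tokens + [pad] * (max_cols - len(tokens)))
    return values, (len(token_lists), max_cols)   # 2-D shape

def tokenize_new(strings, get_tokens):
    """New behaviour: concatenate all subwords into one flat list."""
    values = []
    for s in strings:
        values.extend(get_tokens(s))
    if not values:
        values.append("")                         # same empty-input guard as Compute()
    return values, (len(values),)                 # 1-D shape

For example, with get_tokens = str.split, tokenize_old(["a b", "c"], str.split) returns (['a', 'b', 'c', '<pad>'], (2, 2)), while tokenize_new(["a b", "c"], str.split) returns (['a', 'b', 'c'], (3,)).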
mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.h

@@ -48,8 +48,6 @@ class WordpieceTokenizerOp : public TensorOp {
   Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
 
  protected:
-  void PadTokens(const std::vector<std::vector<std::string>> &tokens, const std::string &padded_str,
-                 std::vector<std::string> *out_padded_tokens, int *out_cols) const;
   Status AddSubword(const std::string &input_token, const int start, const int end,
                     std::vector<std::string> *out_token) const;
   Status FoundNoToken(const std::string &input_token, std::vector<std::string> *out_tokens) const;
mindspore/dataset/text/transforms.py

@@ -188,7 +188,7 @@ class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp):
 class WordpieceTokenizer(cde.WordpieceTokenizerOp):
     """
-    Tokenize scalar token or 1-D tokens to subword tokens.
+    Tokenize scalar token or 1-D tokens to 1-D subword tokens.
 
     Args:
         vocab(Vocab): a Vocab object.
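A hedged usage sketch of the updated docstring (not taken from this commit: the file name corpus.txt and the vocabulary are made up, and the mindspore.dataset.text calls follow my reading of the mid-2020 API). A whitespace-then-wordpiece pipeline now yields one 1-D string tensor of subwords per sample:

import mindspore.dataset as ds
import mindspore.dataset.text as text

# Hypothetical vocabulary and input file, for illustration only.
vocab = text.Vocab.from_list(['i', 'am', 'mak', '##ing', 'mistake', '##s', '[UNK]'])

data = ds.TextFileDataset("corpus.txt", shuffle=False)   # one sentence per line, column name "text"
data = data.map(input_columns=["text"], operations=text.WhitespaceTokenizer())
data = data.map(input_columns=["text"],
                operations=text.WordpieceTokenizer(vocab, unknown_token='[UNK]'))

for row in data.create_dict_iterator():
    # After this commit each row is a 1-D string array such as
    # ['i' 'am' 'mak' '##ing' 'mistake' '##s'], not a padded 2-D array.
    print(row["text"])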
tests/ut/python/dataset/test_bert_tokenizer.py

@@ -35,38 +35,24 @@ test_paras = [
     dict(first=1, last=4,
-         expect_str=[[['床'], ['前'], ['明'], ['月'], ['光']],
-                     [['疑'], ['是'], ['地'], ['上'], ['霜']],
-                     [['举'], ['头'], ['望'], ['明'], ['月']],
-                     [['低'], ['头'], ['思'], ['故'], ['乡']]],
+         expect_str=[['床', '前', '明', '月', '光'],
+                     ['疑', '是', '地', '上', '霜'],
+                     ['举', '头', '望', '明', '月'],
+                     ['低', '头', '思', '故', '乡']],
          vocab_list=vocab_bert),
     # test english text
     dict(first=5, last=5,
-         expect_str=[[['i', pad], ["am", pad], ['mak', '##ing'], ['small', pad], ['mistake', '##s'],
-                      ['during', pad], ['work', '##ing'], ['hour', '##s']]],
+         expect_str=[['i', 'am', 'mak', '##ing', 'small', 'mistake', '##s',
+                      'during', 'work', '##ing', 'hour', '##s']],
          lower_case=True,
          vocab_list=vocab_bert),
     dict(first=5, last=5,
-         expect_str=[[['I', pad], ["am", pad], ['mak', '##ing'], ['small', pad], ['mistake', '##s'],
-                      ['during', pad], ['work', '##ing'], ['hour', '##s']]],
+         expect_str=[['I', "am", 'mak', '##ing', 'small', 'mistake', '##s',
+                      'during', 'work', '##ing', 'hour', '##s']],
          lower_case=False,
          vocab_list=vocab_bert),

@@ -75,8 +61,8 @@ test_paras = [
     dict(first=6, last=7,
-         expect_str=[[['😀'], ['嘿'], ['嘿'], ['😃'], ['哈'], ['哈'], ['😄'], ['大'], ['笑'], ['😁'], ['嘻'], ['嘻']],
-                     [['繁'], ['體'], ['字']]],
+         expect_str=[['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'],
+                     ['繁', '體', '字']],
          normalization_form=nlp.utils.NormalizeForm.NFKC,
          vocab_list=vocab_bert),

@@ -85,11 +71,11 @@ test_paras = [
     dict(first=8, last=12,
-         expect_str=[[['[UNK]'], ['[CLS]']],
-                     [['[UNK]'], ['[SEP]']],
-                     [['[UNK]'], ['[UNK]']],
-                     [['[UNK]'], ['[PAD]']],
-                     [['[UNK]'], ['[MASK]']]],
+         expect_str=[['[UNK]', '[CLS]'],
+                     ['[UNK]', '[SEP]'],
+                     ['[UNK]', '[UNK]'],
+                     ['[UNK]', '[PAD]'],
+                     ['[UNK]', '[MASK]']],
          lower_case=False,
          vocab_list=vocab_bert,

@@ -99,7 +85,7 @@ test_paras = [
     dict(first=13, last=13,
-         expect_str=[[['12'], ['+'], ['/'], ['-'], ['28'], ['='], ['40'], ['/'], ['-'], ['16']]],
+         expect_str=[['12', '+', '/', '-', '28', '=', '40', '/', '-', '16']],
          preserve_unused_token=True,
          vocab_list=vocab_bert),

@@ -107,9 +93,7 @@ test_paras = [
     dict(first=8, last=8,
-         expect_str=[
-             [['[UNK]'], [' '], ['[CLS]']],
-         ],
+         expect_str=[['[UNK]', ' ', '[CLS]']],
          lower_case=False,
          vocab_list=vocab_bert,
          preserve_unused_token=True,

@@ -118,9 +102,7 @@ test_paras = [
     dict(first=8, last=8,
-         expect_str=[
-             [['unused'], [' '], ['[CLS]']],
-         ],
+         expect_str=[['unused', ' ', '[CLS]']],
          lower_case=False,
          vocab_list=vocab_bert,
          preserve_unused_token=True,

@@ -130,9 +112,7 @@ test_paras = [
     dict(first=8, last=8,
-         expect_str=[
-             [['unused'], [' '], ['['], ['CLS'], [']']],
-         ],
+         expect_str=[['unused', ' ', '[', 'CLS', ']']],
          lower_case=False,
          vocab_list=vocab_bert,
          preserve_unused_token=False,