magicwindyyd / mindspore (forked from MindSpore / mindspore)
Commit 363489d0

Authored on Jun 28, 2020 by mindspore-ci-bot, committed via Gitee on Jun 28, 2020

!2580 BasicTokenizer do not case fold on preserved words

Merge pull request !2580 from qianlong21st/fix_basic_tokenizer

Parents: db6c4bf7, cae77c0c
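In short: when lower_case is enabled, BasicTokenizerOp used to apply NFKC case folding to the entire input, so special tokens such as "[MASK]" were rewritten and could no longer be matched against the vocabulary; with preserve_unused_token set, they must now pass through unchanged. A minimal illustration of why folding hurts these tokens (plain Python, not MindSpore code; the [UNK] degradation is how a BERT-style vocabulary lookup would treat the folded token):

# Illustration only: why case folding must skip preserved tokens.
vocab = {"[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"}
token = "[MASK]"
print(token.lower())           # '[mask]'
print(token.lower() in vocab)  # False: after folding, the vocabulary lookup misses it
print(token in vocab)          # True: kept verbatim, it still matches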
Showing 4 changed files with 109 additions and 10 deletions (+109 -10).
mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.cc  +77 -4
mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.h   +7 -0
tests/ut/data/dataset/testTokenizerData/bert_tokenizer.txt  +3 -2
tests/ut/python/dataset/test_bert_tokenizer.py              +22 -4
mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.cc
@@ -15,11 +15,16 @@
 */
#include "dataset/text/kernels/basic_tokenizer_op.h"
#include <memory>
#include <queue>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "unicode/errorcode.h"
#include "unicode/normalizer2.h"
#include "unicode/utypes.h"

namespace mindspore {
namespace dataset {
const bool BasicTokenizerOp::kDefLowerCase = false;
@@ -40,8 +45,8 @@ const char BasicTokenizerOp::kCommonPattern[] =
   "|[\\x{2B820}-\\x{2CEAF}]"
   "|[\\x{F900}-\\x{FAFF}]"
   "|[\\x{2F800}-\\x{2FA1F}]";
-const char BasicTokenizerOp::kUnusedPattern[] = "\\[CLS\\]|\\[SEP\\]|\\[UNK\\]|\\[PAD\\]|\\[MASK\\]|";
+const char BasicTokenizerOp::kUnusedPattern[] = "\\[CLS\\]|\\[SEP\\]|\\[UNK\\]|\\[PAD\\]|\\[MASK\\]|\\[unused\\d+\\]|";
+const std::unordered_set<std::string> BasicTokenizerOp::kUnusedWords{"[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"};

 BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, NormalizeForm normalization_form,
                                    bool preserve_unused_token)
     : lower_case_(lower_case),
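Note that the diff treats the two guards differently: kUnusedPattern is the regex that (when preserve_unused_token is set) appears to feed the keep-delimiter pattern handed to RegexTokenizerOp, and it now also matches [unused<N>]; kUnusedWords is a new exact-match set used only by the case-fold path added below, and it lists just the five bracketed specials, since "[unused1]"-style tokens are already lower case and case folding leaves them unchanged anyway. A quick check of that reasoning (plain Python; the escaping mirrors the C++ string literal, none of this is MindSpore API):

import re
# Mirrors the two C++ constants above; illustration only.
unused_pattern = re.compile(r"\[CLS\]|\[SEP\]|\[UNK\]|\[PAD\]|\[MASK\]|\[unused\d+\]")
unused_words = {"[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"}

print(unused_pattern.findall("[MASK] [unused1] [unused10]"))  # ['[MASK]', '[unused1]', '[unused10]']
print("[unused10]" in unused_words)          # False: not protected by the case-fold path...
print("[unused10]".lower() == "[unused10]")  # True: ...but lower-casing cannot change it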
@@ -67,6 +72,69 @@ BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, Normal
     regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern);
 }

+Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::string_view &text,
+                                                    const std::unordered_set<std::string> &unused_words,
+                                                    std::string *outupt) {
+  icu::ErrorCode error;
+  const icu::Normalizer2 *nfkc_case_fold = icu::Normalizer2::getNFKCCasefoldInstance(error);
+  CHECK_FAIL_RETURN_UNEXPECTED(error.isSuccess(), "getNFKCCasefoldInstance failed.");
+  outupt->clear();
+
+  // 1. get start and end offsets of not case fold strs
+  std::queue<std::pair<int, int>> offsets;  // offsets of not used words
+  int start = -1;
+  int len = 0;
+  for (int i = 0; i < text.length(); i++) {
+    if (text[i] == '[') {
+      start = i;
+      ++len;
+    } else if (text[i] == ']' && start >= 0) {
+      ++len;
+      std::string word(text.substr(start, len));
+      if (unused_words.find(word) != unused_words.end()) {
+        offsets.push(std::make_pair(start, start + len - 1));
+      }
+      start = -1;
+      len = 0;
+    } else if (start >= 0) {
+      ++len;
+    }
+  }
+
+  // 2. Do not apply case fold on `unused_words`
+  start = 0;
+  for (int i = 0; i < text.length();) {
+    std::string_view process_text;
+    std::string preserve_token;
+    if (offsets.empty()) {
+      i = text.length();
+      process_text = text.substr(start, i - start);
+    } else {
+      preserve_token = text.substr(offsets.front().first, offsets.front().second - offsets.front().first + 1);
+      process_text = text.substr(start, offsets.front().first - start);
+      i = offsets.front().second + 1;
+      offsets.pop();
+    }
+    std::string temp;
+    icu::StringByteSink<std::string> sink(&temp);
+    nfkc_case_fold->normalizeUTF8(0, icu::StringPiece(process_text.data(), process_text.size()), sink, nullptr, error);
+    *outupt += temp + preserve_token;
+  }
+  return Status::OK();
+}
+
+Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::shared_ptr<Tensor> &input,
+                                                    std::shared_ptr<Tensor> *output) {
+  IO_CHECK(input, output);
+  std::vector<std::string> strs(input->Size());
+  int i = 0;
+  for (auto iter = input->begin<std::string_view>(); iter != input->end<std::string_view>(); iter++) {
+    RETURN_IF_NOT_OK(CaseFoldWithoutUnusedWords(*iter, kUnusedWords, &strs[i++]));
+  }
+  *output = std::make_shared<Tensor>(std::move(strs), input->shape());
+  return Status::OK();
+}
+
 Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
   IO_CHECK(input, output);
   if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
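For readability, here is the same two-pass idea in compact Python: scan for bracketed words that appear in the unused-word set, then case-fold only the text between them. This is an illustrative sketch, not a line-for-line port of the C++ above, and str.lower() stands in for ICU's NFKC case fold:

UNUSED_WORDS = {"[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"}

def case_fold_without_unused_words(text, unused_words=UNUSED_WORDS):
    # Pass 1: record (start, end) offsets of bracketed words that must be preserved.
    offsets = []
    start, length = -1, 0
    for i, ch in enumerate(text):
        if ch == '[':
            start, length = i, 1
        elif ch == ']' and start >= 0:
            length += 1
            if text[start:start + length] in unused_words:
                offsets.append((start, start + length - 1))
            start, length = -1, 0
        elif start >= 0:
            length += 1
    # Pass 2: case fold everything outside the recorded offsets.
    out, pos = [], 0
    for begin, end in offsets:
        out.append(text[pos:begin].lower())  # stand-in for nfkc_case_fold->normalizeUTF8
        out.append(text[begin:end + 1])      # preserved word, copied verbatim
        pos = end + 1
    out.append(text[pos:].lower())
    return "".join(out)

print(case_fold_without_unused_words("Unused [MASK]"))  # -> 'unused [MASK]'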
@@ -75,8 +143,13 @@ Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shar
   std::shared_ptr<Tensor> cur_input;
   std::shared_ptr<Tensor> processed_tensor;
   if (lower_case_) {
-    // to lower case
-    RETURN_IF_NOT_OK(case_fold_->Compute(input, &processed_tensor));
+    if (!preserve_unused_token_) {
+      // to lower case
+      RETURN_IF_NOT_OK(case_fold_->Compute(input, &processed_tensor));
+    } else {
+      // to lower case except words in kUnusedWords
+      RETURN_IF_NOT_OK(CaseFoldWithoutUnusedWords(input, &processed_tensor));
+    }
     cur_input = processed_tensor;
     // strip accent characters
     RETURN_IF_NOT_OK(nfd_normalize_->Compute(cur_input, &processed_tensor));
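Putting the branch in context: when lower_case_ is set, Compute() now chooses between plain case folding and the preserved-word variant, and the rest of the pipeline (accent stripping, then regex tokenization) is untouched. A rough Python mirror of that ordering, assuming the case_fold_without_unused_words sketch defined above (illustrative only; split() stands in for RegexTokenizerOp and unicodedata for the NFD step):

import unicodedata

def strip_accents(text):
    # NFD-normalize, then drop combining marks, roughly what the nfd_normalize_ step achieves.
    return "".join(c for c in unicodedata.normalize("NFD", text) if not unicodedata.combining(c))

def basic_tokenize(text, lower_case=True, preserve_unused_token=True):
    if lower_case:
        text = case_fold_without_unused_words(text) if preserve_unused_token else text.lower()
        text = strip_accents(text)  # accent stripping follows, as in the C++ branch
    return text.split()  # crude stand-in for the regex tokenizer step

print(basic_tokenize("Héllo [MASK]"))  # -> ['hello', '[MASK]']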
mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.h
@@ -17,6 +17,7 @@
 #define DATASET_TEXT_KERNELS_BASIC_TOKENIZER_OP_H_

 #include <memory>
 #include <string>
+#include <unordered_set>

 #include "dataset/core/tensor.h"
 #include "dataset/kernels/tensor_op.h"

@@ -45,9 +46,15 @@ class BasicTokenizerOp : public TensorOp {
   Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;

+ protected:
+  Status CaseFoldWithoutUnusedWords(const std::string_view &text,
+                                    const std::unordered_set<std::string> &unused_words, std::string *outupt);
+  Status CaseFoldWithoutUnusedWords(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output);
+
  private:
   static const char kCommonPattern[];
   static const char kUnusedPattern[];
+  static const std::unordered_set<std::string> kUnusedWords;
   bool lower_case_;
   bool keep_whitespace_;
   NormalizeForm normalization_form_;
tests/ut/data/dataset/testTokenizerData/bert_tokenizer.txt
@@ -10,5 +10,6 @@ unused [SEP]
 unused [UNK]
 unused [PAD]
 unused [MASK]
-12+/-28=40/-16
-Hello World!
\ No newline at end of file
+[unused1]
+[unused10]
+12+/-28=40/-16
\ No newline at end of file
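The rows above are what the extended test cases below index: in the new data file, lines 8 to 14 are the five "unused [X]" lines plus [unused1] and [unused10], and line 15 is the arithmetic string. "unused" is not in vocab_bert, so it maps to [UNK], while the bracketed tokens are expected to survive intact when preserve_unused_token=True. Summarised as plain data (values copied from expect_str in test_bert_tokenizer.py below):

# Expected tokenizations of the new/updated rows, taken from expect_str in the test file.
expected = {
    "unused [MASK]":  ["[UNK]", "[MASK]"],   # "unused" is out of vocab -> [UNK]
    "[unused1]":      ["[unused1]"],
    "[unused10]":     ["[unused10]"],
    "12+/-28=40/-16": ["12", "+", "/", "-", "28", "=", "40", "/", "-", "16"],
}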
tests/ut/python/dataset/test_bert_tokenizer.py
@@ -27,7 +27,7 @@ vocab_bert = [
     "繁", "體", "字", "嘿", "哈", "大", "笑", "嘻", "i", "am", "mak", "make", "small", "mistake",
     "##s", "during", "work", "##ing", "hour", "😀", "😃", "😄", "😁", "+", "/", "-", "=",
-    "12", "28", "40", "16", " ", "I", "[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"
+    "12", "28", "40", "16", " ", "I", "[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]", "[unused1]", "[unused10]"
 ]
 pad = '<pad>'
 test_paras = [
@@ -69,22 +69,40 @@ test_paras = [
     # test preserved tokens
     dict(
         first=8,
-        last=12,
+        last=14,
         expect_str=[
             ['[UNK]', '[CLS]'],
             ['[UNK]', '[SEP]'],
             ['[UNK]', '[UNK]'],
             ['[UNK]', '[PAD]'],
             ['[UNK]', '[MASK]'],
+            ['[unused1]'],
+            ['[unused10]']
         ],
         lower_case=False,
         vocab_list=vocab_bert,
         preserve_unused_token=True,
     ),
+    dict(
+        first=8,
+        last=14,
+        expect_str=[
+            ['[UNK]', '[CLS]'],
+            ['[UNK]', '[SEP]'],
+            ['[UNK]', '[UNK]'],
+            ['[UNK]', '[PAD]'],
+            ['[UNK]', '[MASK]'],
+            ['[unused1]'],
+            ['[unused10]']
+        ],
+        lower_case=True,
+        vocab_list=vocab_bert,
+        preserve_unused_token=True,
+    ),
     # test special symbol
     dict(
-        first=13,
-        last=13,
+        first=15,
+        last=15,
         expect_str=[['12', '+', '/', '-', '28', '=', '40', '/', '-', '16']],
         preserve_unused_token=True,
         vocab_list=vocab_bert