magicwindyyd / mindspore (forked from MindSpore / mindspore)
Commit 47060631: add offsets feature to tokenizer
Authored by xiefangqi on Jul 08, 2020
Parent commit: 4bdd8e16

Showing 26 changed files with 2067 additions and 330 deletions (+2067 / -330)
Changed files:

mindspore/ccsrc/dataset/api/python_bindings.cc  +24 -15
mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.cc  +16 -10
mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.h  +8 -4
mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.cc  +3 -3
mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.h  +8 -7
mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.cc  +38 -10
mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.h  +8 -3
mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.cc  +46 -11
mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.h  +10 -4
mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.cc  +25 -5
mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.h  +7 -2
mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.cc  +26 -5
mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.h  +6 -2
mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.cc  +29 -5
mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.h  +7 -2
mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc  +54 -13
mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.h  +9 -5
mindspore/dataset/text/transforms.py  +169 -18
mindspore/dataset/text/validators.py  +135 -3
tests/ut/cpp/dataset/jieba_tokenizer_op_test.cc  +24 -21
tests/ut/cpp/dataset/tokenizer_op_test.cc  +183 -164
tests/ut/python/dataset/test_text_basic_tokenizer.py  +138 -0
tests/ut/python/dataset/test_text_bert_tokenizer.py  +83 -18
tests/ut/python/dataset/test_text_jieba_tokenizer.py  +471 -0
tests/ut/python/dataset/test_text_tokenizer.py  +380 -0
tests/ut/python/dataset/test_text_wordpiece_tokenizer.py  +160 -0
mindspore/ccsrc/dataset/api/python_bindings.cc

@@ -601,13 +601,14 @@ void bindTensorOps4(py::module *m) {
 void bindTokenizerOps(py::module *m) {
   (void)py::class_<JiebaTokenizerOp, TensorOp, std::shared_ptr<JiebaTokenizerOp>>(*m, "JiebaTokenizerOp", "")
-    .def(py::init<const std::string, std::string, JiebaMode>(), py::arg("hmm_path"), py::arg("mp_path"),
-         py::arg("mode") = JiebaMode::kMix)
+    .def(py::init<const std::string &, const std::string &, const JiebaMode &, const bool &>(), py::arg("hmm_path"),
+         py::arg("mp_path"), py::arg("mode") = JiebaMode::kMix,
+         py::arg("with_offsets") = JiebaTokenizerOp::kDefWithOffsets)
     .def("add_word",
          [](JiebaTokenizerOp &self, const std::string word, int freq) { THROW_IF_ERROR(self.AddWord(word, freq)); });
   (void)py::class_<UnicodeCharTokenizerOp, TensorOp, std::shared_ptr<UnicodeCharTokenizerOp>>(
     *m, "UnicodeCharTokenizerOp", "Tokenize a scalar tensor of UTF-8 string to Unicode characters.")
-    .def(py::init<>());
+    .def(py::init<const bool &>(), py::arg("with_offsets") = UnicodeCharTokenizerOp::kDefWithOffsets);
   (void)py::class_<LookupOp, TensorOp, std::shared_ptr<LookupOp>>(*m, "LookupOp",
                                                                   "Tensor operation to LookUp each word")
     .def(py::init<std::shared_ptr<Vocab>, WordIdType>(), py::arg("vocab"), py::arg("unknown"))

@@ -619,21 +620,25 @@ void bindTokenizerOps(py::module *m) {
                                    py::arg("separator"));
   (void)py::class_<WordpieceTokenizerOp, TensorOp, std::shared_ptr<WordpieceTokenizerOp>>(
     *m, "WordpieceTokenizerOp", "Tokenize scalar token or 1-D tokens to subword tokens.")
-    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &>(),
-         py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
-         py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
-         py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken));
+    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &,
+                  const bool &>(),
+         py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
+         py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
+         py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken),
+         py::arg("with_offsets") = WordpieceTokenizerOp::kDefWithOffsets);
 }

 void bindDependIcuTokenizerOps(py::module *m) {
 #ifdef ENABLE_ICU4C
   (void)py::class_<WhitespaceTokenizerOp, TensorOp, std::shared_ptr<WhitespaceTokenizerOp>>(
     *m, "WhitespaceTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces.")
-    .def(py::init<>());
+    .def(py::init<const bool &>(), py::arg("with_offsets") = WhitespaceTokenizerOp::kDefWithOffsets);
   (void)py::class_<UnicodeScriptTokenizerOp, TensorOp, std::shared_ptr<UnicodeScriptTokenizerOp>>(
     *m, "UnicodeScriptTokenizerOp", "Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.")
     .def(py::init<>())
-    .def(py::init<bool>(), py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace);
+    .def(py::init<const bool &, const bool &>(),
+         py::arg("keep_whitespace") = UnicodeScriptTokenizerOp::kDefKeepWhitespace,
+         py::arg("with_offsets") = UnicodeScriptTokenizerOp::kDefWithOffsets);
   (void)py::class_<CaseFoldOp, TensorOp, std::shared_ptr<CaseFoldOp>>(
     *m, "CaseFoldOp", "Apply case fold operation on utf-8 string tensor")
     .def(py::init<>());

@@ -647,24 +652,28 @@ void bindDependIcuTokenizerOps(py::module *m) {
         py::arg("replace_all"));
   (void)py::class_<RegexTokenizerOp, TensorOp, std::shared_ptr<RegexTokenizerOp>>(
     *m, "RegexTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by regex expression pattern.")
-    .def(py::init<const std::string &, const std::string &>(), py::arg("delim_pattern"),
-         py::arg("keep_delim_pattern"));
+    .def(py::init<const std::string &, const std::string &, const bool &>(), py::arg("delim_pattern"),
+         py::arg("keep_delim_pattern"), py::arg("with_offsets") = RegexTokenizerOp::kDefWithOffsets);
   (void)py::class_<BasicTokenizerOp, TensorOp, std::shared_ptr<BasicTokenizerOp>>(
     *m, "BasicTokenizerOp", "Tokenize a scalar tensor of UTF-8 string by specific rules.")
-    .def(py::init<bool, bool, NormalizeForm, bool>(), py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
+    .def(py::init<const bool &, const bool &, const NormalizeForm &, const bool &, const bool &>(),
+         py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
          py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
          py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
-         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
+         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken,
+         py::arg("with_offsets") = BasicTokenizerOp::kDefWithOffsets);
   (void)py::class_<BertTokenizerOp, TensorOp, std::shared_ptr<BertTokenizerOp>>(*m, "BertTokenizerOp",
                                                                                 "Tokenizer used for Bert text process.")
-    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, bool, bool,
-                  NormalizeForm, bool>(),
+    .def(py::init<const std::shared_ptr<Vocab> &, const std::string &, const int &, const std::string &, const bool &,
+                  const bool &, const NormalizeForm &, const bool &, const bool &>(),
          py::arg("vocab"), py::arg("suffix_indicator") = std::string(WordpieceTokenizerOp::kDefSuffixIndicator),
          py::arg("max_bytes_per_token") = WordpieceTokenizerOp::kDefMaxBytesPerToken,
          py::arg("unknown_token") = std::string(WordpieceTokenizerOp::kDefUnknownToken),
          py::arg("lower_case") = BasicTokenizerOp::kDefLowerCase,
          py::arg("keep_whitespace") = BasicTokenizerOp::kDefKeepWhitespace,
          py::arg("normalization_form") = BasicTokenizerOp::kDefNormalizationForm,
-         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken);
+         py::arg("preserve_unused_token") = BasicTokenizerOp::kDefPreserveUnusedToken,
+         py::arg("with_offsets") = WordpieceTokenizerOp::kDefWithOffsets);
 #endif
 }
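Every tokenizer binding above gains a trailing with_offsets keyword argument that defaults to the op's kDefWithOffsets constant (false). As a rough sketch of the output contract the C++ ops implement when the flag is set, and not MindSpore code, the example below uses str.split() as a stand-in tokenizer and shows the row growing from one column (tokens) to three aligned columns (tokens, offsets_start, offsets_limit), with offsets expressed as UTF-8 byte positions into the input:

# Hypothetical illustration of the with_offsets contract; str.split()
# stands in for a real tokenizer op.
def tokenize_row(text, with_offsets=False):
    tokens, starts, limits = [], [], []
    cursor = 0
    for piece in text.split():
        char_start = text.index(piece, cursor)              # character position
        byte_start = len(text[:char_start].encode("utf-8"))
        byte_limit = byte_start + len(piece.encode("utf-8"))
        tokens.append(piece)
        starts.append(byte_start)
        limits.append(byte_limit)
        cursor = char_start + len(piece)
    if with_offsets:
        return tokens, starts, limits    # three output columns
    return (tokens,)                     # single output column, as before

print(tokenize_row("hello  世界 ok", with_offsets=True))
# (['hello', '世界', 'ok'], [0, 7, 14], [5, 13, 16])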
mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.cc

@@ -27,10 +27,12 @@
 namespace mindspore {
 namespace dataset {
 const bool BasicTokenizerOp::kDefLowerCase = false;
 const bool BasicTokenizerOp::kDefKeepWhitespace = false;
 const NormalizeForm BasicTokenizerOp::kDefNormalizationForm = NormalizeForm::kNone;
 const bool BasicTokenizerOp::kDefPreserveUnusedToken = true;
+const bool BasicTokenizerOp::kDefWithOffsets = false;
 const char BasicTokenizerOp::kCommonPattern[] =
   "[!-/]"
   "|[:-@]"

@@ -47,11 +49,14 @@ const char BasicTokenizerOp::kCommonPattern[] =
   "|[\\x{2F800}-\\x{2FA1F}]";
 const char BasicTokenizerOp::kUnusedPattern[] = "\\[CLS\\]|\\[SEP\\]|\\[UNK\\]|\\[PAD\\]|\\[MASK\\]|\\[unused\\d+\\]|";
 const std::unordered_set<std::string> BasicTokenizerOp::kUnusedWords{"[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"};
-BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, NormalizeForm normalization_form,
-                                   bool preserve_unused_token)
+BasicTokenizerOp::BasicTokenizerOp(const bool &lower_case, const bool &keep_whitespace,
+                                   const NormalizeForm &normalization_form, const bool &preserve_unused_token,
+                                   const bool &with_offsets)
     : lower_case_(lower_case),
       keep_whitespace_(keep_whitespace),
       preserve_unused_token_(preserve_unused_token),
+      with_offsets_(with_offsets),
       case_fold_(std::make_unique<CaseFoldOp>()),
       nfd_normalize_(std::make_unique<NormalizeUTF8Op>(NormalizeForm::kNfd)),
       normalization_form_(normalization_form),

@@ -69,7 +74,7 @@ BasicTokenizerOp::BasicTokenizerOp(bool lower_case, bool keep_whitespace, Normal
     keep_delim_pattern = kUnusedPattern + keep_delim_pattern;
     delim_pattern = kUnusedPattern + delim_pattern;
   }
-  regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern);
+  regex_tokenizer_ = std::make_unique<RegexTokenizerOp>(delim_pattern, keep_delim_pattern, with_offsets_);
 }

 Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::string_view &text,

@@ -135,9 +140,10 @@ Status BasicTokenizerOp::CaseFoldWithoutUnusedWords(const std::shared_ptr<Tensor
   return Status::OK();
 }

-Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
-  IO_CHECK(input, output);
-  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
+Status BasicTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
+  IO_CHECK_VECTOR(input, output);
+  CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
+  if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
     RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
   }
   std::shared_ptr<Tensor> cur_input;

@@ -145,10 +151,10 @@ Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shar
   if (lower_case_) {
     if (!preserve_unused_token_) {
       // to lower case
-      RETURN_IF_NOT_OK(case_fold_->Compute(input, &processed_tensor));
+      RETURN_IF_NOT_OK(case_fold_->Compute(input[0], &processed_tensor));
     } else {
       // to lower case except words in kUnusedWords
-      RETURN_IF_NOT_OK(CaseFoldWithoutUnusedWords(input, &processed_tensor));
+      RETURN_IF_NOT_OK(CaseFoldWithoutUnusedWords(input[0], &processed_tensor));
     }
     cur_input = processed_tensor;
     // strip accent characters

@@ -156,12 +162,12 @@ Status BasicTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shar
     cur_input = processed_tensor;
     RETURN_IF_NOT_OK(replace_accent_chars_->Compute(cur_input, &processed_tensor));
   } else {
-    RETURN_IF_NOT_OK(common_normalize_->Compute(input, &processed_tensor));
+    RETURN_IF_NOT_OK(common_normalize_->Compute(input[0], &processed_tensor));
   }
   // strip control characters
   cur_input = processed_tensor;
   RETURN_IF_NOT_OK(replace_control_chars_->Compute(cur_input, &processed_tensor));
-  return regex_tokenizer_->Compute(processed_tensor, output);
+  return regex_tokenizer_->Compute(TensorRow(0, {std::move(processed_tensor)}), output);
 }
 }  // namespace dataset
 }  // namespace mindspore
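Changing Compute from a single Tensor to a TensorRow is what allows the op to return the extra offset tensors alongside the tokens. The invariant those tensors are meant to satisfy can be written down in a few lines of plain Python; this is a sketch of the property only, not MindSpore test code:

# Slicing the UTF-8 bytes of the input at [offsets_start[i], offsets_limit[i])
# must reproduce token i.
def check_offsets(text, tokens, offsets_start, offsets_limit):
    data = text.encode("utf-8")
    for token, start, limit in zip(tokens, offsets_start, offsets_limit):
        assert data[start:limit].decode("utf-8") == token

check_offsets("北京 welcome", ["北京", "welcome"], [0, 7], [6, 14])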
mindspore/ccsrc/dataset/text/kernels/basic_tokenizer_op.h

@@ -36,15 +36,18 @@ class BasicTokenizerOp : public TensorOp {
   static const bool kDefKeepWhitespace;
   static const NormalizeForm kDefNormalizationForm;
   static const bool kDefPreserveUnusedToken;
+  static const bool kDefWithOffsets;

-  explicit BasicTokenizerOp(bool lower_case = kDefLowerCase, bool keep_whitespace = kDefKeepWhitespace,
-                            NormalizeForm normalization_form = kDefNormalizationForm,
-                            bool preserve_unused_token = kDefPreserveUnusedToken);
+  explicit BasicTokenizerOp(const bool &lower_case = kDefLowerCase, const bool &keep_whitespace = kDefKeepWhitespace,
+                            const NormalizeForm &normalization_form = kDefNormalizationForm,
+                            const bool &preserve_unused_token = kDefPreserveUnusedToken,
+                            const bool &with_offsets = kDefWithOffsets);

   ~BasicTokenizerOp() override = default;

   void Print(std::ostream &out) const override { out << "BasicTokenizerOp"; }

-  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
+  Status Compute(const TensorRow &input, TensorRow *output) override;

  protected:
   Status CaseFoldWithoutUnusedWords(const std::string_view &text, const std::unordered_set<std::string> &unused_words,

@@ -55,6 +58,7 @@ class BasicTokenizerOp : public TensorOp {
   static const char kCommonPattern[];
   static const char kUnusedPattern[];
   static const std::unordered_set<std::string> kUnusedWords;
+  bool with_offsets_;
   bool lower_case_;
   bool keep_whitespace_;
   NormalizeForm normalization_form_;
mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.cc

@@ -16,9 +16,9 @@
 #include "dataset/text/kernels/bert_tokenizer_op.h"
 namespace mindspore {
 namespace dataset {
-Status BertTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
-  IO_CHECK(input, output);
-  std::shared_ptr<Tensor> basic_tensor;
+Status BertTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
+  IO_CHECK_VECTOR(input, output);
+  TensorRow basic_tensor;
   RETURN_IF_NOT_OK(basic_tokenizer_.Compute(input, &basic_tensor));
   RETURN_IF_NOT_OK(wordpiece_tokenizer_.Compute(basic_tensor, output));
   return Status::OK();
mindspore/ccsrc/dataset/text/kernels/bert_tokenizer_op.h

@@ -32,18 +32,19 @@ class BertTokenizerOp : public TensorOp {
                   const std::string &suffix_indicator = WordpieceTokenizerOp::kDefSuffixIndicator,
                   const int &max_bytes_per_token = WordpieceTokenizerOp::kDefMaxBytesPerToken,
                   const std::string &unknown_token = WordpieceTokenizerOp::kDefUnknownToken,
-                  bool lower_case = BasicTokenizerOp::kDefLowerCase,
-                  bool keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace,
-                  NormalizeForm normalization_form = BasicTokenizerOp::kDefNormalizationForm,
-                  bool preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken)
-      : wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token),
-        basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token) {}
+                  const bool &lower_case = BasicTokenizerOp::kDefLowerCase,
+                  const bool &keep_whitespace = BasicTokenizerOp::kDefKeepWhitespace,
+                  const NormalizeForm &normalization_form = BasicTokenizerOp::kDefNormalizationForm,
+                  const bool &preserve_unused_token = BasicTokenizerOp::kDefPreserveUnusedToken,
+                  const bool &with_offsets = WordpieceTokenizerOp::kDefWithOffsets)
+      : wordpiece_tokenizer_(vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets),
+        basic_tokenizer_(lower_case, keep_whitespace, normalization_form, preserve_unused_token, with_offsets) {}

   ~BertTokenizerOp() override = default;

   void Print(std::ostream &out) const override { out << "BertTokenizerOp"; }

-  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
+  Status Compute(const TensorRow &input, TensorRow *output) override;

  private:
   WordpieceTokenizerOp wordpiece_tokenizer_;
mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.cc

@@ -23,35 +23,63 @@
 namespace mindspore {
 namespace dataset {
-JiebaTokenizerOp::JiebaTokenizerOp(const std::string &hmm_path, const std::string &dict_path, JiebaMode mode)
-    : jieba_mode_(mode), hmm_model_path_(hmm_path), mp_dict_path_(dict_path) {
+const bool JiebaTokenizerOp::kDefWithOffsets = false;
+
+JiebaTokenizerOp::JiebaTokenizerOp(const std::string &hmm_path, const std::string &dict_path, const JiebaMode &mode,
+                                   const bool &with_offsets)
+    : jieba_mode_(mode), hmm_model_path_(hmm_path), mp_dict_path_(dict_path), with_offsets_(with_offsets) {
   jieba_parser_ = std::make_unique<cppjieba::Jieba>(mp_dict_path_, hmm_model_path_, "");
 }

-Status JiebaTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
-  IO_CHECK(input, output);
+Status JiebaTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
+  IO_CHECK_VECTOR(input, output);
+  CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
   RETURN_UNEXPECTED_IF_NULL(jieba_parser_);
-  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
+  if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
     RETURN_STATUS_UNEXPECTED("the input tensor should be scalar string tensor");
   }
   std::string_view sentence_v;
-  RETURN_IF_NOT_OK(input->GetItemAt(&sentence_v, {}));
+  RETURN_IF_NOT_OK(input[0]->GetItemAt(&sentence_v, {}));
   std::string sentence{sentence_v};
   std::vector<std::string> words;
+  std::vector<uint32_t> offsets_start, offsets_limit;
+  std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
   if (sentence == "") {
     words.push_back("");
   } else {
+    std::vector<cppjieba::Word> tmp;
     if (jieba_mode_ == JiebaMode::kMp) {
-      jieba_parser_->CutSmall(sentence, words, MAX_WORD_LENGTH);
+      std::unique_ptr<cppjieba::MPSegment> mp_seg = std::make_unique<cppjieba::MPSegment>(jieba_parser_->GetDictTrie());
+      mp_seg->Cut(sentence, tmp, MAX_WORD_LENGTH);
     } else if (jieba_mode_ == JiebaMode::kHmm) {
-      jieba_parser_->CutHMM(sentence, words);
+      std::unique_ptr<cppjieba::HMMSegment> hmm_seg =
+        std::make_unique<cppjieba::HMMSegment>(jieba_parser_->GetHMMModel());
+      hmm_seg->Cut(sentence, tmp);
     } else {  // Mix
-      jieba_parser_->Cut(sentence, words, true);
+      std::unique_ptr<cppjieba::MixSegment> mix_seg =
+        std::make_unique<cppjieba::MixSegment>(jieba_parser_->GetDictTrie(), jieba_parser_->GetHMMModel());
+      mix_seg->Cut(sentence, tmp, true);
     }
+    GetStringsFromWords(tmp, words);
+    for (auto item : tmp) {
+      offsets_start.push_back(static_cast<uint32_t>(item.offset));
+      offsets_limit.push_back(static_cast<uint32_t>(item.offset + item.word.length()));
+    }
   }
-  *output = std::make_shared<Tensor>(words, TensorShape({(dsize_t)words.size()}));
+  token_tensor = std::make_shared<Tensor>(words, TensorShape({(dsize_t)words.size()}));
+  output->push_back(token_tensor);
+  if (with_offsets_) {
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_start[0])));
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_limit[0])));
+    output->push_back(offsets_start_tensor);
+    output->push_back(offsets_limit_tensor);
+  }
   return Status::OK();
 }
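A rough Python sketch of the bookkeeping introduced above, and not MindSpore code: the cppjieba segmenters now return word objects that carry a byte offset, and the op records offset and offset plus the word's UTF-8 byte length as offsets_start and offsets_limit. The example segmentation is illustrative only.

def words_to_row(words_with_offsets, with_offsets=True):
    tokens, starts, limits = [], [], []
    for word, offset in words_with_offsets:
        tokens.append(word)
        starts.append(offset)
        limits.append(offset + len(word.encode("utf-8")))   # byte length of the word
    return (tokens, starts, limits) if with_offsets else (tokens,)

# e.g. a segmentation of "今天天气太好了" given as (word, byte_offset) pairs:
print(words_to_row([("今天", 0), ("天气", 6), ("太好了", 12)]))
# (['今天', '天气', '太好了'], [0, 6, 12], [6, 12, 21])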
mindspore/ccsrc/dataset/text/kernels/jieba_tokenizer_op.h

@@ -30,15 +30,19 @@ enum class JiebaMode { kMix = 0, kMp = 1, kHmm = 2 };
 class JiebaTokenizerOp : public TensorOp {
  public:
-  // deffault constant for Jieba MPSegment algorithm.
+  // default constant for Jieba MPSegment algorithm.
   static constexpr size_t MAX_WORD_LENGTH = 512;
+  // default const for set whether Jieba output offsets tensor.
+  static const bool kDefWithOffsets;
   // Constructor for JiebaTokenizerOp.
   // @param hmm_path HMM model file.
   // @param mp_path MP model file.
   // @mode tokenization mode [Default "MIX"], "MP" model will tokenize with MPSegment algorithm, "HMM" mode will
   // tokenize with Hiddel Markov Model Segment algorithm, "MIx" model will tokenize with a mix of MPSegment and
   // HMMSegment algorithm.
-  JiebaTokenizerOp(const std::string &hmm_path, const std::string &mp_path, JiebaMode mode = JiebaMode::kMix);
+  // @with_offsets user set this value to choose whether output offset tensor.
+  JiebaTokenizerOp(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode = JiebaMode::kMix,
+                   const bool &with_offsets = kDefWithOffsets);
   ~JiebaTokenizerOp() override = default;

   void Print(std::ostream &out) const override {

@@ -46,7 +50,7 @@ class JiebaTokenizerOp : public TensorOp {
         << mp_dict_path_;
   }

-  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
+  Status Compute(const TensorRow &input, TensorRow *output) override;

   // @word the word to be added to the JiebaTokenizer.
   // @freq [Default 0] the frequency fo the word to be added.

@@ -58,6 +62,7 @@ class JiebaTokenizerOp : public TensorOp {
   std::string mp_dict_path_;
   std::unique_ptr<cppjieba::Jieba> jieba_parser_;
   JiebaMode jieba_mode_;
+  bool with_offsets_;
 };
 }  // namespace dataset
 }  // namespace mindspore
mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.cc

@@ -22,8 +22,11 @@
 namespace mindspore {
 namespace dataset {
-Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
-                                          icu::UnicodeString *out_unicode) const {
+const bool RegexTokenizerOp::kDefWithOffsets = false;
+
+Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, const int &start, const int &len,
+                                          std::string *out_utf8, icu::UnicodeString *out_unicode) const {
   CHECK_FAIL_RETURN_UNEXPECTED((out_utf8 != nullptr || out_unicode != nullptr), "Wrong input");
   int total_len = input.length();
   int end = start + len;

@@ -39,7 +42,9 @@ Status RegexTokenizerOp::GetUnicodeSubstr(const icu::UnicodeString &input, int s
   return Status::OK();
 }

-Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const {
+Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens,
+                                        std::vector<uint32_t> *offsets_start,
+                                        std::vector<uint32_t> *offsets_limit) const {
   UErrorCode status = U_ZERO_ERROR;
   out_tokens->clear();
   icu::RegexMatcher token_matcher(delim_pattern_, 0, status);

@@ -50,6 +55,7 @@ Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std
   icu::UnicodeString utext(icu::UnicodeString::fromUTF8(text));
   token_matcher.reset(utext);

+  int text_start_index = 0;
   int token_start_index = 0;
   status = U_ZERO_ERROR;
   while (token_matcher.find(status) && U_SUCCESS(status)) {

@@ -62,41 +68,70 @@ Status RegexTokenizerOp::GetRegexTokens(const std::string &text, std::vector<std
     int token_len = deli_start_index - token_start_index;
     if (token_len > 0) {
       std::string token;
+      uint32_t token_offset = 0;
       RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, token_len, &token));
+      token_offset = token.length();
       out_tokens->emplace_back(std::move(token));
+      offsets_start->push_back(static_cast<uint32_t>(text_start_index));
+      offsets_limit->push_back(static_cast<uint32_t>(text_start_index + token_offset));
+      text_start_index += token_offset;
     }

     int delim_len = deli_end_index - deli_start_index;
-    if (keep_delim_ && delim_len > 0) {
+    if (delim_len > 0) {
       icu::UnicodeString delim_str;
       std::string delim_utf8_str;
+      uint32_t delim_str_offset = 0;
       RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, deli_start_index, delim_len, &delim_utf8_str, &delim_str));
       delim_matcher.reset(delim_str);
-      if (delim_matcher.matches(status) && U_SUCCESS(status)) {
+      delim_str_offset = delim_utf8_str.length();
+      if (keep_delim_ && delim_matcher.matches(status) && U_SUCCESS(status)) {
         out_tokens->emplace_back(std::move(delim_utf8_str));
+        offsets_start->push_back(static_cast<uint32_t>(text_start_index));
+        offsets_limit->push_back(static_cast<uint32_t>(text_start_index + delim_str_offset));
       }
+      text_start_index += delim_str_offset;
     }
     token_start_index = deli_end_index;
   }

   if (token_start_index < utext.length()) {
     std::string temp;
+    uint32_t temp_offset = 0;
     RETURN_IF_NOT_OK(GetUnicodeSubstr(utext, token_start_index, utext.length() - token_start_index, &temp));
+    temp_offset = temp.length();
     out_tokens->emplace_back(std::move(temp));
+    offsets_start->push_back(static_cast<uint32_t>(text_start_index));
+    offsets_limit->push_back(static_cast<uint32_t>(text_start_index + temp_offset));
   }
   return Status::OK();
 }

-Status RegexTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
-  IO_CHECK(input, output);
-  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
+Status RegexTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
+  IO_CHECK_VECTOR(input, output);
+  CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
+  if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
     RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
   }
   std::string_view text;
-  RETURN_IF_NOT_OK(input->GetItemAt(&text, {}));
   std::vector<std::string> tokens;
-  RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens));
-  *output = std::make_shared<Tensor>(std::move(tokens), TensorShape({(dsize_t)tokens.size()}));
+  std::vector<uint32_t> offsets_start;
+  std::vector<uint32_t> offsets_limit;
+  std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
+  RETURN_IF_NOT_OK(input[0]->GetItemAt(&text, {}));
+  RETURN_IF_NOT_OK(GetRegexTokens(std::string(text.data(), text.size()), &tokens, &offsets_start, &offsets_limit));
+  token_tensor = std::make_shared<Tensor>(std::move(tokens), TensorShape({(dsize_t)tokens.size()}));
+  output->push_back(token_tensor);
+  if (with_offsets_) {
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_start[0])));
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_limit[0])));
+    output->push_back(offsets_start_tensor);
+    output->push_back(offsets_limit_tensor);
+  }
   return Status::OK();
 }
 }  // namespace dataset
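For intuition, here is a simplified Python analogue of GetRegexTokens that uses the standard re module instead of ICU. It is a sketch under the assumption that byte-level re patterns are close enough to ICU regex semantics for illustration; the empty-input handling of the real op is not reproduced.

import re

def regex_tokenize(text, delim_pattern, keep_delim_pattern="", with_offsets=False):
    data = text.encode("utf-8")
    tokens, starts, limits = [], [], []

    def emit(start, end):
        tokens.append(data[start:end].decode("utf-8"))
        starts.append(start)
        limits.append(end)

    pos = 0
    for match in re.finditer(delim_pattern.encode("utf-8"), data):
        if match.start() > pos:
            emit(pos, match.start())          # token before the delimiter
        if keep_delim_pattern and re.fullmatch(keep_delim_pattern.encode("utf-8"), match.group()):
            emit(match.start(), match.end())  # delimiter kept as its own token
        pos = match.end()                     # delimiter bytes are always consumed
    if pos < len(data):
        emit(pos, len(data))                  # trailing token
    return (tokens, starts, limits) if with_offsets else (tokens,)

print(regex_tokenize("Welcome  to China", r"\s+", with_offsets=True))
# (['Welcome', 'to', 'China'], [0, 9, 12], [7, 11, 17])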
mindspore/ccsrc/dataset/text/kernels/regex_tokenizer_op.h

@@ -32,25 +32,31 @@ namespace dataset {
 class RegexTokenizerOp : public TensorOp {
  public:
-  RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern)
+  static const bool kDefWithOffsets;
+
+  RegexTokenizerOp(const std::string &delim_pattern, const std::string &keep_delim_pattern,
+                   const bool &with_offsets = kDefWithOffsets)
       : delim_pattern_(icu::UnicodeString::fromUTF8(delim_pattern)),
         keep_delim_pattern_(icu::UnicodeString::fromUTF8(keep_delim_pattern)),
+        with_offsets_(with_offsets),
        keep_delim_(!keep_delim_pattern.empty()) {}

   ~RegexTokenizerOp() override = default;

   void Print(std::ostream &out) const override { out << "RegexTokenizerOp"; }

-  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
+  Status Compute(const TensorRow &input, TensorRow *output) override;

  protected:
-  Status GetUnicodeSubstr(const icu::UnicodeString &input, int start, int len, std::string *out_utf8,
+  Status GetUnicodeSubstr(const icu::UnicodeString &input, const int &start, const int &len, std::string *out_utf8,
                           icu::UnicodeString *out_unicode = nullptr) const;
-  Status GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens) const;
+  Status GetRegexTokens(const std::string &text, std::vector<std::string> *out_tokens,
+                        std::vector<uint32_t> *offsets_start, std::vector<uint32_t> *offsets_limit) const;

  private:
   const icu::UnicodeString delim_pattern_;
   const icu::UnicodeString keep_delim_pattern_;
+  bool with_offsets_;
   const bool keep_delim_;
 };
 }  // namespace dataset
mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.cc

@@ -27,26 +27,46 @@ using cppjieba::RuneStrArray;
 namespace mindspore {
 namespace dataset {
-Status UnicodeCharTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
-  IO_CHECK(input, output);
-  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
+const bool UnicodeCharTokenizerOp::kDefWithOffsets = false;
+
+Status UnicodeCharTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
+  IO_CHECK_VECTOR(input, output);
+  CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
+  if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
     RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
   }
   std::string_view str;
-  RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
+  RETURN_IF_NOT_OK(input[0]->GetItemAt(&str, {}));
   RuneStrArray runes;
   if (!DecodeRunesInString(str.data(), str.size(), runes)) {
     RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
   }
+  std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
   std::vector<std::string> splits(runes.size());
+  std::vector<uint32_t> offsets_start, offsets_limit;
   for (size_t i = 0; i < runes.size(); i++) {
+    offsets_start.push_back(runes[i].offset);
+    offsets_limit.push_back(runes[i].offset + runes[i].len);
     splits[i] = str.substr(runes[i].offset, runes[i].len);
   }
   if (splits.empty()) {
     splits.emplace_back("");
+    offsets_start.push_back(0);
+    offsets_limit.push_back(0);
   }
-  *output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
+  token_tensor = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
+  output->push_back(token_tensor);
+  if (with_offsets_) {
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_start[0])));
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_limit[0])));
+    output->push_back(offsets_start_tensor);
+    output->push_back(offsets_limit_tensor);
+  }
   return Status::OK();
 }
 }  // namespace dataset
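The rune loop above is easy to mirror in plain Python, which also makes the offset convention visible: offsets_limit[i] - offsets_start[i] is the UTF-8 byte length of character i. A sketch, not MindSpore code:

def unicode_char_tokenize(text, with_offsets=False):
    tokens, starts, limits = [], [], []
    pos = 0
    for ch in text:
        nbytes = len(ch.encode("utf-8"))
        tokens.append(ch)
        starts.append(pos)
        limits.append(pos + nbytes)
        pos += nbytes
    if not tokens:                       # empty input yields one empty token at (0, 0)
        tokens, starts, limits = [""], [0], [0]
    return (tokens, starts, limits) if with_offsets else (tokens,)

print(unicode_char_tokenize("a北", with_offsets=True))
# (['a', '北'], [0, 1], [1, 4])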
mindspore/ccsrc/dataset/text/kernels/unicode_char_tokenizer_op.h

@@ -26,13 +26,18 @@ namespace dataset {
 class UnicodeCharTokenizerOp : public TensorOp {
  public:
-  UnicodeCharTokenizerOp() {}
+  static const bool kDefWithOffsets;
+
+  explicit UnicodeCharTokenizerOp(const bool &with_offsets = kDefWithOffsets) : with_offsets_(with_offsets) {}

   ~UnicodeCharTokenizerOp() override = default;

   void Print(std::ostream &out) const override { out << "UnicodeCharTokenizerOp"; }

-  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
+  Status Compute(const TensorRow &input, TensorRow *output) override;
+
+ private:
+  bool with_offsets_;
 };
 }  // namespace dataset
mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.cc

@@ -32,24 +32,28 @@ namespace mindspore {
 namespace dataset {
 const bool UnicodeScriptTokenizerOp::kDefKeepWhitespace = false;
+const bool UnicodeScriptTokenizerOp::kDefWithOffsets = false;

-Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
-  IO_CHECK(input, output);
-  if (input->Rank() != 0 || input->type() != DataType::DE_STRING) {
+Status UnicodeScriptTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
+  IO_CHECK_VECTOR(input, output);
+  CHECK_FAIL_RETURN_UNEXPECTED(input.size() == 1, "Input should be one tensor");
+  if (input[0]->Rank() != 0 || input[0]->type() != DataType::DE_STRING) {
     RETURN_STATUS_UNEXPECTED("The input tensor should be scalar string tensor");
   }
   std::string_view str;
-  RETURN_IF_NOT_OK(input->GetItemAt(&str, {}));
+  RETURN_IF_NOT_OK(input[0]->GetItemAt(&str, {}));
   RuneStrArray runes;
   if (!DecodeRunesInString(str.data(), str.size(), runes)) {
     RETURN_STATUS_UNEXPECTED("Decode utf8 string failed.");
   }
+  std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
   UScriptCode last_script = USCRIPT_INVALID_CODE;
   icu::ErrorCode status;
   int start = 0;
   int len = 0;
   std::vector<std::string> splits;
+  std::vector<uint32_t> offsets_start, offsets_limit;

   bool was_space = false;
   for (size_t i = 0; i < runes.size(); i++) {

@@ -66,6 +70,8 @@ Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, s
     if (len > 0 && (script != last_script || is_space != was_space)) {
       // 3) If keep_whitespace_ is false, all the whitespace characters will be discard
       if (keep_whitespace_ || !was_space) {
+        offsets_start.push_back(static_cast<uint32_t>(start));
+        offsets_limit.push_back(static_cast<uint32_t>(start + len));
         std::string temp(str.substr(start, len));
         splits.emplace_back(std::move(temp));
       }

@@ -79,14 +85,29 @@ Status UnicodeScriptTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, s
   }
   if (len > 0 && (keep_whitespace_ || !was_space)) {
+    offsets_start.push_back(static_cast<uint32_t>(start));
+    offsets_limit.push_back(static_cast<uint32_t>(start + len));
     std::string temp(str.substr(start, len));
     splits.emplace_back(std::move(temp));
   }
   // 4) If the input is empty scalar string, the output will be 1-D empty string.
   if (splits.empty()) {
     splits.emplace_back("");
+    offsets_start.push_back(0);
+    offsets_limit.push_back(0);
   }
-  *output = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
+  token_tensor = std::make_shared<Tensor>(splits, TensorShape({(dsize_t)splits.size()}));
+  output->push_back(token_tensor);
+  if (with_offsets_) {
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_start[0])));
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
+                                          TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
+                                          reinterpret_cast<unsigned char *>(&offsets_limit[0])));
+    output->push_back(offsets_start_tensor);
+    output->push_back(offsets_limit_tensor);
+  }
   return Status::OK();
 }
 }  // namespace dataset
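A coarse Python stand-in for the loop above. Python's unicodedata module has no script property, so the first letter of the general category is used here as a proxy for ICU's script lookup; this is a simplification for illustration, not an equivalent implementation.

import unicodedata

def script_like_tokenize(text, keep_whitespace=False, with_offsets=False):
    data = text.encode("utf-8")
    tokens, starts, limits = [], [], []
    seg_start = pos = 0
    last_key = None
    last_is_space = False

    def emit(start, end, is_space):
        # drop whitespace segments unless keep_whitespace is set
        if end > start and (keep_whitespace or not is_space):
            tokens.append(data[start:end].decode("utf-8"))
            starts.append(start)
            limits.append(end)

    for ch in text:
        is_space = ch.isspace()
        key = "SPACE" if is_space else unicodedata.category(ch)[0]
        if pos > seg_start and key != last_key:   # class change ends the segment
            emit(seg_start, pos, last_is_space)
            seg_start = pos
        last_key, last_is_space = key, is_space
        pos += len(ch.encode("utf-8"))
    emit(seg_start, pos, last_is_space)
    if not tokens:
        tokens, starts, limits = [""], [0], [0]
    return (tokens, starts, limits) if with_offsets else (tokens,)

print(script_like_tokenize("北京 welcome", with_offsets=True))
# (['北京', 'welcome'], [0, 7], [6, 14])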
mindspore/ccsrc/dataset/text/kernels/unicode_script_tokenizer_op.h

@@ -27,17 +27,21 @@ namespace dataset {
 class UnicodeScriptTokenizerOp : public TensorOp {
  public:
   static const bool kDefKeepWhitespace;
+  static const bool kDefWithOffsets;

-  explicit UnicodeScriptTokenizerOp(bool keep_whitespace = kDefKeepWhitespace) : keep_whitespace_(keep_whitespace) {}
+  explicit UnicodeScriptTokenizerOp(const bool &keep_whitespace = kDefKeepWhitespace,
+                                    const bool &with_offsets = kDefWithOffsets)
+      : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}

   ~UnicodeScriptTokenizerOp() override = default;

   void Print(std::ostream &out) const override { out << "UnicodeScriptTokenizerOp"; }

-  Status Compute(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) override;
+  Status Compute(const TensorRow &input, TensorRow *output) override;

  private:
   bool keep_whitespace_;  // If or not keep whitespace tokens
+  bool with_offsets_;
 };
 }  // namespace dataset
 }  // namespace mindspore
mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.cc
浏览文件 @
47060631
...
@@ -30,24 +30,33 @@ using cppjieba::RuneStrArray;
...
@@ -30,24 +30,33 @@ using cppjieba::RuneStrArray;
namespace
mindspore
{
namespace
mindspore
{
namespace
dataset
{
namespace
dataset
{
Status
WhitespaceTokenizerOp
::
Compute
(
const
std
::
shared_ptr
<
Tensor
>
&
input
,
std
::
shared_ptr
<
Tensor
>
*
output
)
{
IO_CHECK
(
input
,
output
);
const
bool
WhitespaceTokenizerOp
::
kDefWithOffsets
=
false
;
if
(
input
->
Rank
()
!=
0
||
input
->
type
()
!=
DataType
::
DE_STRING
)
{
Status
WhitespaceTokenizerOp
::
Compute
(
const
TensorRow
&
input
,
TensorRow
*
output
)
{
IO_CHECK_VECTOR
(
input
,
output
);
CHECK_FAIL_RETURN_UNEXPECTED
(
input
.
size
()
==
1
,
"Input should be one tensor"
);
if
(
input
[
0
]
->
Rank
()
!=
0
||
input
[
0
]
->
type
()
!=
DataType
::
DE_STRING
)
{
RETURN_STATUS_UNEXPECTED
(
"The input tensor should be scalar string tensor"
);
RETURN_STATUS_UNEXPECTED
(
"The input tensor should be scalar string tensor"
);
}
}
std
::
string_view
str
;
std
::
string_view
str
;
RETURN_IF_NOT_OK
(
input
->
GetItemAt
(
&
str
,
{}));
RETURN_IF_NOT_OK
(
input
[
0
]
->
GetItemAt
(
&
str
,
{}));
RuneStrArray
runes
;
RuneStrArray
runes
;
if
(
!
DecodeRunesInString
(
str
.
data
(),
str
.
size
(),
runes
))
{
if
(
!
DecodeRunesInString
(
str
.
data
(),
str
.
size
(),
runes
))
{
RETURN_STATUS_UNEXPECTED
(
"Decode utf8 string failed."
);
RETURN_STATUS_UNEXPECTED
(
"Decode utf8 string failed."
);
}
}
std
::
shared_ptr
<
Tensor
>
token_tensor
,
offsets_start_tensor
,
offsets_limit_tensor
;
std
::
vector
<
uint32_t
>
offsets_start
,
offsets_limit
;
std
::
vector
<
std
::
string
>
splits
;
std
::
vector
<
std
::
string
>
splits
;
int
start
=
0
;
int
start
=
0
;
int
len
=
0
;
int
len
=
0
;
for
(
size_t
i
=
0
;
i
<
runes
.
size
();
i
++
)
{
for
(
size_t
i
=
0
;
i
<
runes
.
size
();
i
++
)
{
if
(
u_isUWhiteSpace
(
runes
[
i
].
rune
))
{
if
(
u_isUWhiteSpace
(
runes
[
i
].
rune
))
{
if
(
len
>
0
)
{
if
(
len
>
0
)
{
offsets_start
.
push_back
(
static_cast
<
uint32_t
>
(
start
));
offsets_limit
.
push_back
(
static_cast
<
uint32_t
>
(
start
+
len
));
std
::
string
temp
(
str
.
substr
(
start
,
len
));
std
::
string
temp
(
str
.
substr
(
start
,
len
));
splits
.
emplace_back
(
std
::
move
(
temp
));
splits
.
emplace_back
(
std
::
move
(
temp
));
len
=
0
;
len
=
0
;
...
@@ -60,13 +69,28 @@ Status WhitespaceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std:
...
@@ -60,13 +69,28 @@ Status WhitespaceTokenizerOp::Compute(const std::shared_ptr<Tensor> &input, std:
}
}
}
}
if
(
len
>
0
)
{
if
(
len
>
0
)
{
offsets_start
.
push_back
(
static_cast
<
uint32_t
>
(
start
));
offsets_limit
.
push_back
(
static_cast
<
uint32_t
>
(
start
+
len
));
std
::
string
temp
(
str
.
substr
(
start
,
len
));
std
::
string
temp
(
str
.
substr
(
start
,
len
));
splits
.
emplace_back
(
std
::
move
(
temp
));
splits
.
emplace_back
(
std
::
move
(
temp
));
}
}
if
(
splits
.
empty
())
{
if
(
splits
.
empty
())
{
splits
.
emplace_back
(
""
);
splits
.
emplace_back
(
""
);
offsets_start
.
push_back
(
0
);
offsets_limit
.
push_back
(
0
);
}
token_tensor
=
std
::
make_shared
<
Tensor
>
(
splits
,
TensorShape
({(
dsize_t
)
splits
.
size
()}));
output
->
push_back
(
token_tensor
);
if
(
with_offsets_
)
{
RETURN_IF_NOT_OK
(
Tensor
::
CreateTensor
(
&
offsets_start_tensor
,
TensorImpl
::
kFlexible
,
TensorShape
({(
dsize_t
)
offsets_start
.
size
()}),
DataType
(
DataType
::
DE_UINT32
),
reinterpret_cast
<
unsigned
char
*>
(
&
offsets_start
[
0
])));
RETURN_IF_NOT_OK
(
Tensor
::
CreateTensor
(
&
offsets_limit_tensor
,
TensorImpl
::
kFlexible
,
TensorShape
({(
dsize_t
)
offsets_limit
.
size
()}),
DataType
(
DataType
::
DE_UINT32
),
reinterpret_cast
<
unsigned
char
*>
(
&
offsets_limit
[
0
])));
output
->
push_back
(
offsets_start_tensor
);
output
->
push_back
(
offsets_limit_tensor
);
}
}
*
output
=
std
::
make_shared
<
Tensor
>
(
splits
,
TensorShape
({(
dsize_t
)
splits
.
size
()}));
return
Status
::
OK
();
return
Status
::
OK
();
}
}
}
// namespace dataset
}
// namespace dataset
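The offsets added above are byte positions into the original UTF-8 string, with offsets_limit holding start + length of each token. A standalone Python sketch of the same bookkeeping, for illustration only (not MindSpore code; it only treats ASCII whitespace, while the C++ walks UTF-8 runes and uses u_isUWhiteSpace):

# Standalone sketch of the whitespace-offset bookkeeping (illustration, not MindSpore code).
def whitespace_tokenize_with_offsets(text):
    data = text.encode("utf-8")
    tokens, offsets_start, offsets_limit = [], [], []
    start, length = 0, 0
    for i, byte in enumerate(data):
        if chr(byte) in " \t\r\n":          # crude stand-in for u_isUWhiteSpace
            if length > 0:
                tokens.append(data[start:start + length].decode("utf-8"))
                offsets_start.append(start)
                offsets_limit.append(start + length)
                length = 0
        else:
            if length == 0:
                start = i
            length += 1
    if length > 0:                           # flush the trailing token
        tokens.append(data[start:start + length].decode("utf-8"))
        offsets_start.append(start)
        offsets_limit.append(start + length)
    if not tokens:                           # empty input still yields one empty token
        tokens, offsets_start, offsets_limit = [""], [0], [0]
    return tokens, offsets_start, offsets_limit

print(whitespace_tokenize_with_offsets("Welcome to China."))
# (['Welcome', 'to', 'China.'], [0, 8, 11], [7, 10, 17])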
...
mindspore/ccsrc/dataset/text/kernels/whitespace_tokenizer_op.h
...
@@ -26,13 +26,18 @@ namespace dataset {
class WhitespaceTokenizerOp : public TensorOp {
 public:
  static const bool kDefWithOffsets;

  explicit WhitespaceTokenizerOp(const bool &with_offsets = kDefWithOffsets) : with_offsets_(with_offsets) {}

  ~WhitespaceTokenizerOp() override = default;

  void Print(std::ostream &out) const override { out << "WhitespaceTokenizerOp"; }

  Status Compute(const TensorRow &input, TensorRow *output) override;

 private:
  bool with_offsets_;
};
}  // namespace dataset
}  // namespace mindspore
...
mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.cc
...
@@ -24,13 +24,16 @@ namespace dataset {
const char WordpieceTokenizerOp::kDefSuffixIndicator[] = "##";
const int WordpieceTokenizerOp::kDefMaxBytesPerToken = 100;
const char WordpieceTokenizerOp::kDefUnknownToken[] = "[UNK]";
const bool WordpieceTokenizerOp::kDefWithOffsets = false;

WordpieceTokenizerOp::WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator,
                                           const int &max_bytes_per_token, const std::string &unknown_token,
                                           const bool &with_offsets)
    : vocab_(vocab),
      suffix_indicator_(suffix_indicator),
      max_bytes_per_token_(max_bytes_per_token),
      unknown_token_(unknown_token),
      with_offsets_(with_offsets) {}

Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start,
                                        bool *out_found, int *out_end) const {
...
@@ -52,17 +55,22 @@ Status WordpieceTokenizerOp::LookupWord(const std::string &input_token, const Ru
  return Status::OK();
}

Status WordpieceTokenizerOp::FoundNoToken(const std::string &input_token, const uint32_t &basic_start,
                                          std::vector<std::string> *out_tokens, std::vector<uint32_t> *offsets_start,
                                          std::vector<uint32_t> *offsets_limit) const {
  out_tokens->clear();
  offsets_start->push_back(basic_start);
  if (unknown_token_.empty()) {
    out_tokens->emplace_back(input_token);
    offsets_limit->push_back(basic_start + input_token.length());
  } else {
    out_tokens->emplace_back(unknown_token_);
    offsets_limit->push_back(basic_start + input_token.length());
  }
  return Status::OK();
}

Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const int &start, const int &end,
                                        std::vector<std::string> *out_tokens) const {
  CHECK_FAIL_RETURN_UNEXPECTED(start >= 0 && end > start && end <= input_token.size(), "Out of range");
  std::string subword = input_token.substr(start, end - start);
...
@@ -73,9 +81,19 @@ Status WordpieceTokenizerOp::AddSubword(const std::string &input_token, const in
  return Status::OK();
}

Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, const uint32_t &basic_start,
                                       std::vector<std::string> *out_tokens, std::vector<uint32_t> *offsets_start,
                                       std::vector<uint32_t> *offsets_limit) const {
  if (input_token.size() > max_bytes_per_token_) {
    offsets_start->push_back(basic_start);
    if (!unknown_token_.empty()) {
      offsets_limit->push_back(basic_start + unknown_token_.size());
      out_tokens->emplace_back(unknown_token_);
    } else {
      out_tokens->emplace_back(input_token);
      offsets_limit->push_back(basic_start + input_token.size());
    }
    return Status::OK();
  }
  RuneStrArray runes;
  if (!DecodeRunesInString(input_token.data(), input_token.size(), runes)) {
...
@@ -87,29 +105,52 @@ Status WordpieceTokenizerOp::GetTokens(const std::string &input_token, std::vect
    RETURN_IF_NOT_OK(LookupWord(input_token, runes, start, &found, &end));
    if (found) {
      RETURN_IF_NOT_OK(AddSubword(input_token, start, end, out_tokens));
      offsets_start->push_back(static_cast<uint32_t>(basic_start + start));
      offsets_limit->push_back(static_cast<uint32_t>(basic_start + end));
      start = end;
    } else {
      return FoundNoToken(input_token, basic_start, out_tokens, offsets_start, offsets_limit);
    }
  }
  return Status::OK();
}

Status WordpieceTokenizerOp::Compute(const TensorRow &input, TensorRow *output) {
  IO_CHECK_VECTOR(input, output);
  if (input[0]->Rank() > 1 || input[0]->type() != DataType::DE_STRING) {
    RETURN_STATUS_UNEXPECTED("The input tensor should be scalar or 1-D string tensor");
  }
  dsize_t count = 0;
  std::vector<std::string> out_tokens;
  std::vector<uint32_t> offsets_start, offsets_limit;
  std::shared_ptr<Tensor> token_tensor, offsets_start_tensor, offsets_limit_tensor;
  for (auto iter = input[0]->begin<std::string_view>(); iter != input[0]->end<std::string_view>(); iter++) {
    uint32_t basic_start = 0;
    std::vector<std::string> temp_tokens;
    if (with_offsets_ && input.size() == 3) {
      RETURN_IF_NOT_OK(input[1]->GetItemAt<uint32_t>(&basic_start, {count, 0}));
    }
    RETURN_IF_NOT_OK(GetTokens(std::string(*iter), basic_start, &temp_tokens, &offsets_start, &offsets_limit));
    out_tokens.insert(out_tokens.end(), temp_tokens.begin(), temp_tokens.end());
    count++;
  }
  if (out_tokens.empty()) {
    out_tokens.emplace_back("");
    offsets_start.push_back(0);
    offsets_limit.push_back(0);
  }
  token_tensor = std::make_shared<Tensor>(out_tokens, TensorShape({(dsize_t)out_tokens.size()}));
  output->push_back(token_tensor);
  if (with_offsets_) {
    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_start_tensor, TensorImpl::kFlexible,
                                          TensorShape({(dsize_t)offsets_start.size()}), DataType(DataType::DE_UINT32),
                                          reinterpret_cast<unsigned char *>(&offsets_start[0])));
    RETURN_IF_NOT_OK(Tensor::CreateTensor(&offsets_limit_tensor, TensorImpl::kFlexible,
                                          TensorShape({(dsize_t)offsets_limit.size()}), DataType(DataType::DE_UINT32),
                                          reinterpret_cast<unsigned char *>(&offsets_limit[0])));
    output->push_back(offsets_start_tensor);
    output->push_back(offsets_limit_tensor);
  }
  return Status::OK();
}
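GetTokens above applies greedy longest-match lookup over the vocabulary and reports offsets relative to basic_start, the byte offset of the whole word produced by the upstream BasicTokenizer. A byte-level Python sketch of that strategy with a made-up toy vocabulary (the real op walks UTF-8 runes and looks subwords up in a Vocab object, so this is only an approximation for ASCII input):

# Sketch of greedy longest-match WordPiece with offsets (toy vocabulary, illustration only).
def wordpiece_with_offsets(word, vocab, basic_start=0, suffix="##", unknown="[UNK]"):
    tokens, offsets_start, offsets_limit = [], [], []
    data = word.encode("utf-8")
    start = 0
    while start < len(data):
        end = len(data)
        found = None
        while end > start:                               # try the longest piece first
            piece = data[start:end].decode("utf-8", errors="ignore")
            candidate = piece if start == 0 else suffix + piece
            if candidate in vocab:
                found = (candidate, end)
                break
            end -= 1
        if found is None:                                # mirrors FoundNoToken
            return [unknown or word], [basic_start], [basic_start + len(data)]
        tokens.append(found[0])
        offsets_start.append(basic_start + start)
        offsets_limit.append(basic_start + found[1])
        start = found[1]
    return tokens, offsets_start, offsets_limit

vocab = {"un", "##affable"}                              # hypothetical vocabulary
print(wordpiece_with_offsets("unaffable", vocab, basic_start=10))
# (['un', '##affable'], [10, 12], [12, 19])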
...
mindspore/ccsrc/dataset/text/kernels/wordpiece_tokenizer_op.h
...
@@ -37,27 +37,31 @@ class WordpieceTokenizerOp : public TensorOp {
  static const char kDefSuffixIndicator[];
  static const int kDefMaxBytesPerToken;
  static const char kDefUnknownToken[];
  static const bool kDefWithOffsets;

  WordpieceTokenizerOp(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = kDefSuffixIndicator,
                       const int &max_bytes_per_token = kDefMaxBytesPerToken,
                       const std::string &unknown_token = kDefUnknownToken,
                       const bool &with_offsets = kDefWithOffsets);

  ~WordpieceTokenizerOp() override = default;

  void Print(std::ostream &out) const override { out << "WordpieceTokenizerOp"; }

  Status Compute(const TensorRow &input, TensorRow *output) override;

 protected:
  Status AddSubword(const std::string &input_token, const int &start, const int &end,
                    std::vector<std::string> *out_token) const;
  Status FoundNoToken(const std::string &input_token, const uint32_t &basic_start,
                      std::vector<std::string> *out_tokens, std::vector<uint32_t> *offsets_start,
                      std::vector<uint32_t> *offsets_limit) const;
  Status LookupWord(const std::string &input_token, const RuneStrArray &runes, const int start, bool *out_found,
                    int *out_end) const;
  Status GetTokens(const std::string &input_token, const uint32_t &basic_start, std::vector<std::string> *out_tokens,
                   std::vector<uint32_t> *offsets_start, std::vector<uint32_t> *offsets_limit) const;

 private:
  const std::shared_ptr<Vocab> vocab_;
  const std::string suffix_indicator_;
  const bool with_offsets_;
  const int max_bytes_per_token_;
  const std::string unknown_token_;
};
...
mindspore/dataset/text/transforms.py
...
@@ -52,8 +52,9 @@ import mindspore._c_dataengine as cde
from .utils import JiebaMode, NormalizeForm, to_str
from .validators import check_lookup, check_jieba_add_dict, \
    check_jieba_add_word, check_jieba_init, check_with_offsets, check_unicode_script_tokenizer, \
    check_wordpiece_tokenizer, check_regex_tokenizer, check_basic_tokenizer, check_ngram, check_pair_truncate, \
    check_to_number, check_bert_tokenizer, check_python_tokenizer
from ..core.datatypes import mstype_to_detype
...
@@ -125,15 +126,31 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
            - JiebaMode.MP, tokenize with MPSegment algorithm.
            - JiebaMode.HMM, tokenize with Hidden Markov Model Segment algorithm.
            - JiebaMode.MIX, tokenize with a mix of MPSegment and HMMSegment algorithm.
        with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).

    Examples:
        >>> # If with_offsets=False, default output one column {["text", dtype=str]}
        >>> tokenizer_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=False)
        >>> data = data.map(operations=tokenizer_op)
        >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
        >>> #                                                  ["offsets_limit", dtype=uint32]}
        >>> tokenizer_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
        >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
        >>>                 columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
    """

    @check_jieba_init
    def __init__(self, hmm_path, mp_path, mode=JiebaMode.MIX, with_offsets=False):
        if not isinstance(mode, JiebaMode):
            raise TypeError("Wrong input type for mode, should be JiebaMode.")
        self.mode = mode
        self.__check_path__(hmm_path)
        self.__check_path__(mp_path)
        self.with_offsets = with_offsets
        super().__init__(hmm_path, mp_path,
                         DE_C_INTER_JIEBA_MODE[mode], self.with_offsets)

    @check_jieba_add_word
    def add_word(self, word, freq=None):
...
@@ -226,8 +243,26 @@ class JiebaTokenizer(cde.JiebaTokenizerOp):
class UnicodeCharTokenizer(cde.UnicodeCharTokenizerOp):
    """
    Tokenize a scalar tensor of UTF-8 string to Unicode characters.

    Args:
        with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).

    Examples:
        >>> # If with_offsets=False, default output one column {["text", dtype=str]}
        >>> tokenizer_op = text.UnicodeCharTokenizer()
        >>> dataset = dataset.map(operations=tokenizer_op)
        >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
        >>> #                                                  ["offsets_limit", dtype=uint32]}
        >>> tokenizer_op = text.UnicodeCharTokenizer(True)
        >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
        >>>                 columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
    """

    @check_with_offsets
    def __init__(self, with_offsets=False):
        self.with_offsets = with_offsets
        super().__init__(self.with_offsets)


class WordpieceTokenizer(cde.WordpieceTokenizerOp):
    """
...
@@ -239,22 +274,58 @@ class WordpieceTokenizer(cde.WordpieceTokenizerOp):
        max_bytes_per_token (int, optional): Tokens exceeding this length will not be further split (default=100).
        unknown_token (str, optional): When a token cannot be found: if 'unknown_token' is an empty string,
            return the token directly, else return 'unknown_token' (default='[UNK]').
        with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).

    Examples:
        >>> # If with_offsets=False, default output one column {["text", dtype=str]}
        >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
        >>>                                        max_bytes_per_token=100, with_offsets=False)
        >>> dataset = dataset.map(operations=tokenizer_op)
        >>> # If with_offsets=True, then output three columns {["token", dtype=str], ["offsets_start", dtype=uint32],
        >>> #                                                  ["offsets_limit", dtype=uint32]}
        >>> tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token='[UNK]',
        >>>                                        max_bytes_per_token=100, with_offsets=True)
        >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
        >>>                 columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
    """

    @check_wordpiece_tokenizer
    def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]',
                 with_offsets=False):
        self.vocab = vocab
        self.suffix_indicator = suffix_indicator
        self.max_bytes_per_token = max_bytes_per_token
        self.unknown_token = unknown_token
        self.with_offsets = with_offsets
        super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token,
                         self.with_offsets)


if platform.system().lower() != 'windows':
    class WhitespaceTokenizer(cde.WhitespaceTokenizerOp):
        """
        Tokenize a scalar tensor of UTF-8 string on ICU defined whitespaces (such as: ' ', '\\t', '\\r', '\\n').

        Args:
            with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).

        Examples:
            >>> # If with_offsets=False, default output one column {["text", dtype=str]}
            >>> tokenizer_op = text.WhitespaceTokenizer()
            >>> dataset = dataset.map(operations=tokenizer_op)
            >>> # If with_offsets=True, then output three columns {["token", dtype=str],
            >>> #                                                  ["offsets_start", dtype=uint32],
            >>> #                                                  ["offsets_limit", dtype=uint32]}
            >>> tokenizer_op = text.WhitespaceTokenizer(True)
            >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
            >>>                 columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
        """

        @check_with_offsets
        def __init__(self, with_offsets=False):
            self.with_offsets = with_offsets
            super().__init__(self.with_offsets)

    class UnicodeScriptTokenizer(cde.UnicodeScriptTokenizerOp):
        """
...
@@ -262,11 +333,25 @@ if platform.system().lower() != 'windows':
        Args:
            keep_whitespace (bool, optional): Whether or not to emit whitespace tokens (default=False).
            with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).

        Examples:
            >>> # If with_offsets=False, default output one column {["text", dtype=str]}
            >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=False)
            >>> dataset = dataset.map(operations=tokenizer_op)
            >>> # If with_offsets=True, then output three columns {["token", dtype=str],
            >>> #                                                  ["offsets_start", dtype=uint32],
            >>> #                                                  ["offsets_limit", dtype=uint32]}
            >>> tokenizer_op = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
            >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
            >>>                 columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
        """

        @check_unicode_script_tokenizer
        def __init__(self, keep_whitespace=False, with_offsets=False):
            self.keep_whitespace = keep_whitespace
            self.with_offsets = with_offsets
            super().__init__(self.keep_whitespace, self.with_offsets)

    class CaseFold(cde.CaseFoldOp):
...
@@ -302,6 +387,9 @@ if platform.system().lower() != 'windows':
        """

        def __init__(self, normalize_form=NormalizeForm.NFKC):
            if not isinstance(normalize_form, NormalizeForm):
                raise TypeError("Wrong input type for normalization_form, should be NormalizeForm.")
            self.normalize_form = DE_C_INTER_NORMALIZE_FORM[normalize_form]
            super().__init__(self.normalize_form)
...
@@ -338,12 +426,26 @@ if platform.system().lower() != 'windows':
            keep_delim_pattern (str, optional): The string matched by 'delim_pattern' can be kept as a token
                if it can be matched by 'keep_delim_pattern'. The default value is an empty string (''),
                in which case delimiters will not be kept as output tokens (default='').
            with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).

        Examples:
            >>> # If with_offsets=False, default output one column {["text", dtype=str]}
            >>> tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=False)
            >>> dataset = dataset.map(operations=tokenizer_op)
            >>> # If with_offsets=True, then output three columns {["token", dtype=str],
            >>> #                                                  ["offsets_start", dtype=uint32],
            >>> #                                                  ["offsets_limit", dtype=uint32]}
            >>> tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True)
            >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
            >>>                 columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
        """

        @check_regex_tokenizer
        def __init__(self, delim_pattern, keep_delim_pattern='', with_offsets=False):
            self.delim_pattern = delim_pattern
            self.keep_delim_pattern = keep_delim_pattern
            self.with_offsets = with_offsets
            super().__init__(self.delim_pattern, self.keep_delim_pattern, self.with_offsets)

    class BasicTokenizer(cde.BasicTokenizerOp):
...
@@ -359,16 +461,41 @@ if platform.system().lower() != 'windows':
                only effective when 'lower_case' is False. See NormalizeUTF8 for details (default='NONE').
            preserve_unused_token (bool, optional): If True, do not split special tokens like
                '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
            with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).

        Examples:
            >>> # If with_offsets=False, default output one column {["text", dtype=str]}
            >>> tokenizer_op = text.BasicTokenizer(lower_case=False,
            >>>                                    keep_whitespace=False,
            >>>                                    normalization_form=NormalizeForm.NONE,
            >>>                                    preserve_unused_token=True,
            >>>                                    with_offsets=False)
            >>> dataset = dataset.map(operations=tokenizer_op)
            >>> # If with_offsets=True, then output three columns {["token", dtype=str],
            >>> #                                                  ["offsets_start", dtype=uint32],
            >>> #                                                  ["offsets_limit", dtype=uint32]}
            >>> tokenizer_op = text.BasicTokenizer(lower_case=False,
            >>>                                    keep_whitespace=False,
            >>>                                    normalization_form=NormalizeForm.NONE,
            >>>                                    preserve_unused_token=True,
            >>>                                    with_offsets=True)
            >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
            >>>                 columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
        """

        @check_basic_tokenizer
        def __init__(self, lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE,
                     preserve_unused_token=True, with_offsets=False):
            if not isinstance(normalization_form, NormalizeForm):
                raise TypeError("Wrong input type for normalization_form, should be NormalizeForm.")
            self.lower_case = lower_case
            self.keep_whitespace = keep_whitespace
            self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
            self.preserve_unused_token = preserve_unused_token
            self.with_offsets = with_offsets
            super().__init__(self.lower_case, self.keep_whitespace, self.normalization_form,
                             self.preserve_unused_token, self.with_offsets)

    class BertTokenizer(cde.BertTokenizerOp):
...
@@ -389,11 +516,33 @@ if platform.system().lower() != 'windows':
                only effective when 'lower_case' is False. See NormalizeUTF8 for details (default='NONE').
            preserve_unused_token (bool, optional): If True, do not split special tokens like
                '[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]' (default=True).
            with_offsets (bool, optional): Whether or not to output offsets of tokens (default=False).

        Examples:
            >>> # If with_offsets=False, default output one column {["text", dtype=str]}
            >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
            >>>                                   unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
            >>>                                   normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
            >>>                                   with_offsets=False)
            >>> dataset = dataset.map(operations=tokenizer_op)
            >>> # If with_offsets=True, then output three columns {["token", dtype=str],
            >>> #                                                  ["offsets_start", dtype=uint32],
            >>> #                                                  ["offsets_limit", dtype=uint32]}
            >>> tokenizer_op = text.BertTokenizer(vocab=vocab, suffix_indicator='##', max_bytes_per_token=100,
            >>>                                   unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
            >>>                                   normalization_form=NormalizeForm.NONE, preserve_unused_token=True,
            >>>                                   with_offsets=True)
            >>> data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
            >>>                 columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
        """

        @check_bert_tokenizer
        def __init__(self, vocab, suffix_indicator='##', max_bytes_per_token=100, unknown_token='[UNK]',
                     lower_case=False, keep_whitespace=False, normalization_form=NormalizeForm.NONE,
                     preserve_unused_token=True, with_offsets=False):
            if not isinstance(normalization_form, NormalizeForm):
                raise TypeError("Wrong input type for normalization_form, should be NormalizeForm.")
            self.vocab = vocab
            self.suffix_indicator = suffix_indicator
            self.max_bytes_per_token = max_bytes_per_token
...
@@ -402,8 +551,10 @@ if platform.system().lower() != 'windows':
            self.keep_whitespace = keep_whitespace
            self.normalization_form = DE_C_INTER_NORMALIZE_FORM[normalization_form]
            self.preserve_unused_token = preserve_unused_token
            self.with_offsets = with_offsets
            super().__init__(self.vocab, self.suffix_indicator, self.max_bytes_per_token, self.unknown_token,
                             self.lower_case, self.keep_whitespace, self.normalization_form,
                             self.preserve_unused_token, self.with_offsets)

    class TruncateSequencePair(cde.TruncateSequencePairOp):
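The docstrings above all follow the same pattern: with_offsets=False keeps the single text column, while with_offsets=True yields three columns (token, offsets_start, offsets_limit). A minimal end-to-end sketch of that usage, assuming a plain text file readable by TextFileDataset and a non-Windows platform; DATA_FILE is a placeholder path:

# Minimal sketch of the three-column output added in this commit (paths are placeholders).
import mindspore.dataset as ds
import mindspore.dataset.text as text

DATA_FILE = "/path/to/corpus.txt"          # placeholder text file, one sentence per line

data = ds.TextFileDataset(DATA_FILE, shuffle=False)
tokenizer_op = text.WhitespaceTokenizer(with_offsets=True)
data = data.map(input_columns=["text"],
                output_columns=["token", "offsets_start", "offsets_limit"],
                columns_order=["token", "offsets_start", "offsets_limit"],
                operations=tokenizer_op)

for row in data.create_dict_iterator():
    # "token" is a string tensor; "offsets_start"/"offsets_limit" are uint32 byte offsets
    print(row["token"], row["offsets_start"], row["offsets_limit"])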
...
mindspore/dataset/text/validators.py
...
@@ -25,7 +25,6 @@ from mindspore._c_expression import typing
from ..core.validator_helpers import parse_user_args, type_check, type_check_list, check_uint32, check_positive, \
    INT32_MAX, check_value


def check_unique_list_of_words(words, arg_name):
    """Check that words is a list and each element is a str without any duplication"""
...
@@ -116,11 +115,22 @@ def check_from_dict(method):
def check_jieba_init(method):
    """Wrapper method to check the parameters of jieba init."""

    @wraps(method)
    def new_method(self, *args, **kwargs):
        [hmm_path, mp_path, _, with_offsets], _ = parse_user_args(method, *args, **kwargs)
        if hmm_path is None:
            raise ValueError("The dict of HMMSegment in cppjieba is not provided.")
        if not isinstance(hmm_path, str):
            raise TypeError("Wrong input type for hmm_path, should be string.")
        if mp_path is None:
            raise ValueError("The dict of MPSegment in cppjieba is not provided.")
        if not isinstance(mp_path, str):
            raise TypeError("Wrong input type for mp_path, should be string.")
        if not isinstance(with_offsets, bool):
            raise TypeError("Wrong input type for with_offsets, should be boolean.")
        return method(self, *args, **kwargs)

    return new_method
...
@@ -152,6 +162,128 @@ def check_jieba_add_dict(method):
    return new_method


def check_with_offsets(method):
    """Wrapper method to check if with_offsets is the only parameter."""

    @wraps(method)
    def new_method(self, *args, **kwargs):
        [with_offsets], _ = parse_user_args(method, *args, **kwargs)
        if not isinstance(with_offsets, bool):
            raise TypeError("Wrong input type for with_offsets, should be boolean.")
        return method(self, *args, **kwargs)

    return new_method


def check_unicode_script_tokenizer(method):
    """Wrapper method to check the parameters of UnicodeScriptTokenizer."""

    @wraps(method)
    def new_method(self, *args, **kwargs):
        [keep_whitespace, with_offsets], _ = parse_user_args(method, *args, **kwargs)
        if not isinstance(keep_whitespace, bool):
            raise TypeError("Wrong input type for keep_whitespace, should be boolean.")
        if not isinstance(with_offsets, bool):
            raise TypeError("Wrong input type for with_offsets, should be boolean.")
        return method(self, *args, **kwargs)

    return new_method


def check_wordpiece_tokenizer(method):
    """Wrapper method to check the parameters of WordpieceTokenizer."""

    @wraps(method)
    def new_method(self, *args, **kwargs):
        [vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets], _ = \
            parse_user_args(method, *args, **kwargs)
        if vocab is None:
            raise ValueError("vocab is not provided.")
        if not isinstance(vocab, cde.Vocab):
            raise TypeError("Wrong input type for vocab, should be Vocab object.")
        if not isinstance(suffix_indicator, str):
            raise TypeError("Wrong input type for suffix_indicator, should be string.")
        if not isinstance(unknown_token, str):
            raise TypeError("Wrong input type for unknown_token, should be string.")
        if not isinstance(with_offsets, bool):
            raise TypeError("Wrong input type for with_offsets, should be boolean.")
        check_uint32(max_bytes_per_token)
        return method(self, *args, **kwargs)

    return new_method


def check_regex_tokenizer(method):
    """Wrapper method to check the parameters of RegexTokenizer."""

    @wraps(method)
    def new_method(self, *args, **kwargs):
        [delim_pattern, keep_delim_pattern, with_offsets], _ = parse_user_args(method, *args, **kwargs)
        if delim_pattern is None:
            raise ValueError("delim_pattern is not provided.")
        if not isinstance(delim_pattern, str):
            raise TypeError("Wrong input type for delim_pattern, should be string.")
        if not isinstance(keep_delim_pattern, str):
            raise TypeError("Wrong input type for keep_delim_pattern, should be string.")
        if not isinstance(with_offsets, bool):
            raise TypeError("Wrong input type for with_offsets, should be boolean.")
        return method(self, *args, **kwargs)

    return new_method


def check_basic_tokenizer(method):
    """Wrapper method to check the parameters of BasicTokenizer."""

    @wraps(method)
    def new_method(self, *args, **kwargs):
        [lower_case, keep_whitespace, _, preserve_unused, with_offsets], _ = \
            parse_user_args(method, *args, **kwargs)
        if not isinstance(lower_case, bool):
            raise TypeError("Wrong input type for lower_case, should be boolean.")
        if not isinstance(keep_whitespace, bool):
            raise TypeError("Wrong input type for keep_whitespace, should be boolean.")
        if not isinstance(preserve_unused, bool):
            raise TypeError("Wrong input type for preserve_unused_token, should be boolean.")
        if not isinstance(with_offsets, bool):
            raise TypeError("Wrong input type for with_offsets, should be boolean.")
        return method(self, *args, **kwargs)

    return new_method


def check_bert_tokenizer(method):
    """Wrapper method to check the parameters of BertTokenizer."""

    @wraps(method)
    def new_method(self, *args, **kwargs):
        [vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case, keep_whitespace, _,
         preserve_unused_token, with_offsets], _ = parse_user_args(method, *args, **kwargs)
        if vocab is None:
            raise ValueError("vocab is not provided.")
        if not isinstance(vocab, cde.Vocab):
            raise TypeError("Wrong input type for vocab, should be Vocab object.")
        if not isinstance(suffix_indicator, str):
            raise TypeError("Wrong input type for suffix_indicator, should be string.")
        if not isinstance(max_bytes_per_token, int):
            raise TypeError("Wrong input type for max_bytes_per_token, should be int.")
        check_uint32(max_bytes_per_token)
        if not isinstance(unknown_token, str):
            raise TypeError("Wrong input type for unknown_token, should be string.")
        if not isinstance(lower_case, bool):
            raise TypeError("Wrong input type for lower_case, should be boolean.")
        if not isinstance(keep_whitespace, bool):
            raise TypeError("Wrong input type for keep_whitespace, should be boolean.")
        if not isinstance(preserve_unused_token, bool):
            raise TypeError("Wrong input type for preserve_unused_token, should be boolean.")
        if not isinstance(with_offsets, bool):
            raise TypeError("Wrong input type for with_offsets, should be boolean.")
        return method(self, *args, **kwargs)

    return new_method


def check_from_dataset(method):
    """A wrapper that wraps a parameter checker around the original function."""
...
tests/ut/cpp/dataset/jieba_tokenizer_op_test.cc
...
@@ -39,21 +39,22 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opFuntions) {
  std::string dataset_path = datasets_root_path_ + "/jiebadict";
  std::string hmm_path = dataset_path + "/hmm_model.utf8";
  std::string mp_path = dataset_path + "/jieba.dict.utf8";
  TensorRow input, output;
  std::unique_ptr<JiebaTokenizerOp> op(new JiebaTokenizerOp(hmm_path, mp_path));

  std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>("今天天气太好了我们一起去外面玩吧");
  input.push_back(input_tensor);
  Status s = op->Compute(input, &output);
  EXPECT_TRUE(s.IsOk());
  EXPECT_EQ(output[0]->Rank(), 1);
  EXPECT_EQ(output[0]->Size(), 7);
  CheckEqual(output[0], {0}, "今天天气");
  CheckEqual(output[0], {1}, "太好了");
  CheckEqual(output[0], {2}, "我们");
  CheckEqual(output[0], {3}, "一起");
  CheckEqual(output[0], {4}, "去");
  CheckEqual(output[0], {5}, "外面");
  CheckEqual(output[0], {6}, "玩吧");
}

TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opAdd) {
...
@@ -61,16 +62,17 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opAdd) {
  std::string dataset_path = datasets_root_path_ + "/jiebadict";
  std::string hmm_path = dataset_path + "/hmm_model.utf8";
  std::string mp_path = dataset_path + "/jieba.dict.utf8";
  TensorRow input, output;
  std::unique_ptr<JiebaTokenizerOp> op(new JiebaTokenizerOp(hmm_path, mp_path));

  op->AddWord("男默女泪");
  std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>("男默女泪");
  input.push_back(input_tensor);
  Status s = op->Compute(input, &output);
  EXPECT_TRUE(s.IsOk());
  EXPECT_EQ(output[0]->Rank(), 1);
  EXPECT_EQ(output[0]->Size(), 1);
  CheckEqual(output[0], {0}, "男默女泪");
}

TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opEmpty) {
...
@@ -78,14 +80,15 @@ TEST_F(MindDataTestJiebaTokenizerOp, TestJieba_opEmpty) {
  std::string dataset_path = datasets_root_path_ + "/jiebadict";
  std::string hmm_path = dataset_path + "/hmm_model.utf8";
  std::string mp_path = dataset_path + "/jieba.dict.utf8";
  TensorRow input, output;
  std::unique_ptr<JiebaTokenizerOp> op(new JiebaTokenizerOp(hmm_path, mp_path));

  op->AddWord("男默女泪");
  std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>("");
  input.push_back(input_tensor);
  Status s = op->Compute(input, &output);
  EXPECT_TRUE(s.IsOk());
  EXPECT_EQ(output[0]->Rank(), 1);
  EXPECT_EQ(output[0]->Size(), 1);
  CheckEqual(output[0], {0}, "");
}
\ No newline at end of file
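A Python-level counterpart of the C++ Jieba test above, following the JiebaTokenizer docstring; the dictionary and corpus paths are placeholders and the JiebaMode import path is assumed here. Since offsets count UTF-8 bytes, a token such as "今天天气" should span bytes [0, 12).

# Sketch only: dictionary/corpus paths and the JiebaMode import path are assumptions.
import mindspore.dataset as ds
import mindspore.dataset.text as text
from mindspore.dataset.text import JiebaMode   # assumed import path

HMM_FILE = "/path/to/jiebadict/hmm_model.utf8"   # placeholder
MP_FILE = "/path/to/jiebadict/jieba.dict.utf8"   # placeholder

data = ds.TextFileDataset("/path/to/chinese_corpus.txt", shuffle=False)
tokenizer_op = text.JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
                columns_order=["token", "offsets_start", "offsets_limit"], operations=tokenizer_op)
for row in data.create_dict_iterator():
    # offsets are byte positions, so each 3-byte Chinese character advances them by 3
    print(row["token"], row["offsets_start"], row["offsets_limit"])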
tests/ut/cpp/dataset/tokenizer_op_test.cc
浏览文件 @
47060631
...
@@ -45,227 +45,245 @@ class MindDataTestTokenizerOp : public UT::Common {
...
@@ -45,227 +45,245 @@ class MindDataTestTokenizerOp : public UT::Common {
TEST_F
(
MindDataTestTokenizerOp
,
TestUnicodeCharTokenizerOp
)
{
TEST_F
(
MindDataTestTokenizerOp
,
TestUnicodeCharTokenizerOp
)
{
MS_LOG
(
INFO
)
<<
"Doing TestUnicodeCharTokenizerOp."
;
MS_LOG
(
INFO
)
<<
"Doing TestUnicodeCharTokenizerOp."
;
std
::
unique_ptr
<
UnicodeCharTokenizerOp
>
op
(
new
UnicodeCharTokenizerOp
());
std
::
unique_ptr
<
UnicodeCharTokenizerOp
>
op
(
new
UnicodeCharTokenizerOp
(
true
));
std
::
shared_ptr
<
Tensor
>
input
=
std
::
make_shared
<
Tensor
>
(
"Hello World!"
);
std
::
shared_ptr
<
Tensor
>
input
=
std
::
make_shared
<
Tensor
>
(
"Hello World!"
);
std
::
shared_ptr
<
Tensor
>
output
;
TensorRow
output
;
Status
s
=
op
->
Compute
(
input
,
&
output
);
Status
s
=
op
->
Compute
(
TensorRow
(
0
,
{
input
})
,
&
output
);
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_EQ
(
output
->
Size
(),
12
);
EXPECT_EQ
(
output
[
0
]
->
Size
(),
12
);
EXPECT_EQ
(
output
->
Rank
(),
1
);
EXPECT_EQ
(
output
[
0
]
->
Rank
(),
1
);
MS_LOG
(
INFO
)
<<
"Out tensor1: "
<<
output
->
ToString
();
MS_LOG
(
INFO
)
<<
"Out tensor1: "
<<
output
[
0
]
->
ToString
();
CheckEqual
(
output
,
{
0
},
"H"
);
CheckEqual
(
output
[
0
]
,
{
0
},
"H"
);
CheckEqual
(
output
,
{
1
},
"e"
);
CheckEqual
(
output
[
0
]
,
{
1
},
"e"
);
CheckEqual
(
output
,
{
2
},
"l"
);
CheckEqual
(
output
[
0
]
,
{
2
},
"l"
);
CheckEqual
(
output
,
{
3
},
"l"
);
CheckEqual
(
output
[
0
]
,
{
3
},
"l"
);
CheckEqual
(
output
,
{
4
},
"o"
);
CheckEqual
(
output
[
0
]
,
{
4
},
"o"
);
CheckEqual
(
output
,
{
5
},
" "
);
CheckEqual
(
output
[
0
]
,
{
5
},
" "
);
CheckEqual
(
output
,
{
6
},
"W"
);
CheckEqual
(
output
[
0
]
,
{
6
},
"W"
);
CheckEqual
(
output
,
{
7
},
"o"
);
CheckEqual
(
output
[
0
]
,
{
7
},
"o"
);
CheckEqual
(
output
,
{
8
},
"r"
);
CheckEqual
(
output
[
0
]
,
{
8
},
"r"
);
CheckEqual
(
output
,
{
9
},
"l"
);
CheckEqual
(
output
[
0
]
,
{
9
},
"l"
);
CheckEqual
(
output
,
{
10
},
"d"
);
CheckEqual
(
output
[
0
]
,
{
10
},
"d"
);
CheckEqual
(
output
,
{
11
},
"!"
);
CheckEqual
(
output
[
0
]
,
{
11
},
"!"
);
input
=
std
::
make_shared
<
Tensor
>
(
"中国 你好!"
);
input
=
std
::
make_shared
<
Tensor
>
(
"中国 你好!"
);
s
=
op
->
Compute
(
input
,
&
output
);
output
.
clear
();
s
=
op
->
Compute
(
TensorRow
(
0
,
{
input
}),
&
output
);
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_EQ
(
output
->
Size
(),
6
);
EXPECT_EQ
(
output
[
0
]
->
Size
(),
6
);
EXPECT_EQ
(
output
->
Rank
(),
1
);
EXPECT_EQ
(
output
[
0
]
->
Rank
(),
1
);
MS_LOG
(
INFO
)
<<
"Out tensor2: "
<<
output
->
ToString
();
MS_LOG
(
INFO
)
<<
"Out tensor2: "
<<
output
[
0
]
->
ToString
();
CheckEqual
(
output
,
{
0
},
"中"
);
CheckEqual
(
output
[
0
]
,
{
0
},
"中"
);
CheckEqual
(
output
,
{
1
},
"国"
);
CheckEqual
(
output
[
0
]
,
{
1
},
"国"
);
CheckEqual
(
output
,
{
2
},
" "
);
CheckEqual
(
output
[
0
]
,
{
2
},
" "
);
CheckEqual
(
output
,
{
3
},
"你"
);
CheckEqual
(
output
[
0
]
,
{
3
},
"你"
);
CheckEqual
(
output
,
{
4
},
"好"
);
CheckEqual
(
output
[
0
]
,
{
4
},
"好"
);
CheckEqual
(
output
,
{
5
},
"!"
);
CheckEqual
(
output
[
0
]
,
{
5
},
"!"
);
input
=
std
::
make_shared
<
Tensor
>
(
"中"
);
input
=
std
::
make_shared
<
Tensor
>
(
"中"
);
s
=
op
->
Compute
(
input
,
&
output
);
output
.
clear
();
s
=
op
->
Compute
(
TensorRow
(
0
,
{
input
}),
&
output
);
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_EQ
(
output
->
Size
(),
1
);
EXPECT_EQ
(
output
[
0
]
->
Size
(),
1
);
EXPECT_EQ
(
output
->
Rank
(),
1
);
EXPECT_EQ
(
output
[
0
]
->
Rank
(),
1
);
MS_LOG
(
INFO
)
<<
"Out tensor3: "
<<
output
->
ToString
();
MS_LOG
(
INFO
)
<<
"Out tensor3: "
<<
output
[
0
]
->
ToString
();
CheckEqual
(
output
,
{
0
},
"中"
);
CheckEqual
(
output
[
0
]
,
{
0
},
"中"
);
input
=
std
::
make_shared
<
Tensor
>
(
"H"
);
input
=
std
::
make_shared
<
Tensor
>
(
"H"
);
s
=
op
->
Compute
(
input
,
&
output
);
output
.
clear
();
s
=
op
->
Compute
(
TensorRow
(
0
,
{
input
}),
&
output
);
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_EQ
(
output
->
Size
(),
1
);
EXPECT_EQ
(
output
[
0
]
->
Size
(),
1
);
EXPECT_EQ
(
output
->
Rank
(),
1
);
EXPECT_EQ
(
output
[
0
]
->
Rank
(),
1
);
MS_LOG
(
INFO
)
<<
"Out tensor4: "
<<
output
->
ToString
();
MS_LOG
(
INFO
)
<<
"Out tensor4: "
<<
output
[
0
]
->
ToString
();
CheckEqual
(
output
,
{
0
},
"H"
);
CheckEqual
(
output
[
0
]
,
{
0
},
"H"
);
input
=
std
::
make_shared
<
Tensor
>
(
" "
);
input
=
std
::
make_shared
<
Tensor
>
(
" "
);
s
=
op
->
Compute
(
input
,
&
output
);
output
.
clear
();
s
=
op
->
Compute
(
TensorRow
(
0
,
{
input
}),
&
output
);
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_EQ
(
output
->
Size
(),
2
);
EXPECT_EQ
(
output
[
0
]
->
Size
(),
2
);
EXPECT_EQ
(
output
->
Rank
(),
1
);
EXPECT_EQ
(
output
[
0
]
->
Rank
(),
1
);
MS_LOG
(
INFO
)
<<
"Out tensor5: "
<<
output
->
ToString
();
MS_LOG
(
INFO
)
<<
"Out tensor5: "
<<
output
[
0
]
->
ToString
();
CheckEqual
(
output
,
{
0
},
" "
);
CheckEqual
(
output
[
0
]
,
{
0
},
" "
);
CheckEqual
(
output
,
{
1
},
" "
);
CheckEqual
(
output
[
0
]
,
{
1
},
" "
);
input
=
std
::
make_shared
<
Tensor
>
(
""
);
input
=
std
::
make_shared
<
Tensor
>
(
""
);
s
=
op
->
Compute
(
input
,
&
output
);
output
.
clear
();
s
=
op
->
Compute
(
TensorRow
(
0
,
{
input
}),
&
output
);
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_EQ
(
output
->
Size
(),
1
);
EXPECT_EQ
(
output
[
0
]
->
Size
(),
1
);
EXPECT_EQ
(
output
->
Rank
(),
1
);
EXPECT_EQ
(
output
[
0
]
->
Rank
(),
1
);
MS_LOG
(
INFO
)
<<
"Out tensor6: "
<<
output
->
ToString
();
MS_LOG
(
INFO
)
<<
"Out tensor6: "
<<
output
[
0
]
->
ToString
();
CheckEqual
(
output
,
{
0
},
""
);
CheckEqual
(
output
[
0
]
,
{
0
},
""
);
}
}
TEST_F
(
MindDataTestTokenizerOp
,
TestWhitespaceTokenizerOp
)
{
TEST_F
(
MindDataTestTokenizerOp
,
TestWhitespaceTokenizerOp
)
{
MS_LOG
(
INFO
)
<<
"Doing TestWhitespaceTokenizerOp."
;
MS_LOG
(
INFO
)
<<
"Doing TestWhitespaceTokenizerOp."
;
std
::
unique_ptr
<
WhitespaceTokenizerOp
>
op
(
new
WhitespaceTokenizerOp
());
std
::
unique_ptr
<
WhitespaceTokenizerOp
>
op
(
new
WhitespaceTokenizerOp
(
true
));
std
::
shared_ptr
<
Tensor
>
input
=
std
::
make_shared
<
Tensor
>
(
"Welcome to China."
);
std
::
shared_ptr
<
Tensor
>
input
=
std
::
make_shared
<
Tensor
>
(
"Welcome to China."
);
std
::
shared_ptr
<
Tensor
>
output
;
TensorRow
output
;
Status
s
=
op
->
Compute
(
input
,
&
output
);
Status
s
=
op
->
Compute
(
TensorRow
(
0
,
{
input
})
,
&
output
);
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_EQ
(
output
->
Size
(),
3
);
EXPECT_EQ
(
output
[
0
]
->
Size
(),
3
);
EXPECT_EQ
(
output
->
Rank
(),
1
);
EXPECT_EQ
(
output
[
0
]
->
Rank
(),
1
);
MS_LOG
(
INFO
)
<<
"Out tensor1: "
<<
output
->
ToString
();
MS_LOG
(
INFO
)
<<
"Out tensor1: "
<<
output
[
0
]
->
ToString
();
CheckEqual
(
output
,
{
0
},
"Welcome"
);
CheckEqual
(
output
[
0
]
,
{
0
},
"Welcome"
);
CheckEqual
(
output
,
{
1
},
"to"
);
CheckEqual
(
output
[
0
]
,
{
1
},
"to"
);
CheckEqual
(
output
,
{
2
},
"China."
);
CheckEqual
(
output
[
0
]
,
{
2
},
"China."
);
input
=
std
::
make_shared
<
Tensor
>
(
" hello"
);
input
=
std
::
make_shared
<
Tensor
>
(
" hello"
);
s
=
op
->
Compute
(
input
,
&
output
);
output
.
clear
();
s
=
op
->
Compute
(
TensorRow
(
0
,
{
input
}),
&
output
);
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_EQ
(
output
->
Size
(),
1
);
EXPECT_EQ
(
output
[
0
]
->
Size
(),
1
);
EXPECT_EQ
(
output
->
Rank
(),
1
);
EXPECT_EQ
(
output
[
0
]
->
Rank
(),
1
);
MS_LOG
(
INFO
)
<<
"Out tensor2: "
<<
output
->
ToString
();
MS_LOG
(
INFO
)
<<
"Out tensor2: "
<<
output
[
0
]
->
ToString
();
CheckEqual
(
output
,
{
0
},
"hello"
);
CheckEqual
(
output
[
0
]
,
{
0
},
"hello"
);
input
=
std
::
make_shared
<
Tensor
>
(
"hello"
);
input
=
std
::
make_shared
<
Tensor
>
(
"hello"
);
s
=
op
->
Compute
(
input
,
&
output
);
output
.
clear
();
s
=
op
->
Compute
(
TensorRow
(
0
,
{
input
}),
&
output
);
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_EQ
(
output
->
Size
(),
1
);
EXPECT_EQ
(
output
[
0
]
->
Size
(),
1
);
EXPECT_EQ
(
output
->
Rank
(),
1
);
EXPECT_EQ
(
output
[
0
]
->
Rank
(),
1
);
MS_LOG
(
INFO
)
<<
"Out tensor3: "
<<
output
->
ToString
();
MS_LOG
(
INFO
)
<<
"Out tensor3: "
<<
output
[
0
]
->
ToString
();
CheckEqual
(
output
,
{
0
},
"hello"
);
CheckEqual
(
output
[
0
]
,
{
0
},
"hello"
);
input
=
std
::
make_shared
<
Tensor
>
(
"hello "
);
input
=
std
::
make_shared
<
Tensor
>
(
"hello "
);
s
=
op
->
Compute
(
input
,
&
output
);
output
.
clear
();
s
=
op
->
Compute
(
TensorRow
(
0
,
{
input
}),
&
output
);
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_EQ
(
output
->
Size
(),
1
);
EXPECT_EQ
(
output
[
0
]
->
Size
(),
1
);
EXPECT_EQ
(
output
->
Rank
(),
1
);
EXPECT_EQ
(
output
[
0
]
->
Rank
(),
1
);
MS_LOG
(
INFO
)
<<
"Out tensor4: "
<<
output
->
ToString
();
MS_LOG
(
INFO
)
<<
"Out tensor4: "
<<
output
[
0
]
->
ToString
();
CheckEqual
(
output
,
{
0
},
"hello"
);
CheckEqual
(
output
[
0
]
,
{
0
},
"hello"
);
input
=
std
::
make_shared
<
Tensor
>
(
" "
);
input
=
std
::
make_shared
<
Tensor
>
(
" "
);
s
=
op
->
Compute
(
input
,
&
output
);
output
.
clear
();
s
=
op
->
Compute
(
TensorRow
(
0
,
{
input
}),
&
output
);
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_TRUE
(
s
.
IsOk
());
EXPECT_EQ
(
output
->
Size
(),
1
);
EXPECT_EQ
(
output
[
0
]
->
Size
(),
1
);
EXPECT_EQ
(
output
->
Rank
(),
1
);
EXPECT_EQ
(
output
[
0
]
->
Rank
(),
1
);
MS_LOG
(
INFO
)
<<
"Out tensor5: "
<<
output
->
ToString
();
MS_LOG
(
INFO
)
<<
"Out tensor5: "
<<
output
[
0
]
->
ToString
();
CheckEqual
(
output
,
{
0
},
""
);
CheckEqual
(
output
[
0
]
,
{
0
},
""
);
}
}
TEST_F(MindDataTestTokenizerOp, TestUnicodeScriptTokenizer) {
  MS_LOG(INFO) << "Doing TestUnicodeScriptTokenizer.";
  std::unique_ptr<UnicodeScriptTokenizerOp> keep_whitespace_op(new UnicodeScriptTokenizerOp(true, true));
  std::unique_ptr<UnicodeScriptTokenizerOp> skip_whitespace_op(new UnicodeScriptTokenizerOp(false, true));
  std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n中国\t北京");
  TensorRow output;
  Status s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
  EXPECT_TRUE(s.IsOk());
  EXPECT_EQ(output[0]->Size(), 10);
  EXPECT_EQ(output[0]->Rank(), 1);
  MS_LOG(INFO) << "Out tensor1: " << output[0]->ToString();
  CheckEqual(output[0], {0}, "Welcome");
  CheckEqual(output[0], {1}, " ");
  CheckEqual(output[0], {2}, "to");
  CheckEqual(output[0], {3}, " ");
  CheckEqual(output[0], {4}, "China");
  CheckEqual(output[0], {5}, ".");
  CheckEqual(output[0], {6}, " \n");
  CheckEqual(output[0], {7}, "中国");
  CheckEqual(output[0], {8}, "\t");
  CheckEqual(output[0], {9}, "北京");
  output.clear();
  s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output);
  EXPECT_TRUE(s.IsOk());
  EXPECT_EQ(output[0]->Size(), 6);
  EXPECT_EQ(output[0]->Rank(), 1);
  MS_LOG(INFO) << "Out tensor2: " << output[0]->ToString();
  CheckEqual(output[0], {0}, "Welcome");
  CheckEqual(output[0], {1}, "to");
  CheckEqual(output[0], {2}, "China");
  CheckEqual(output[0], {3}, ".");
  CheckEqual(output[0], {4}, "中国");
  CheckEqual(output[0], {5}, "北京");
  input = std::make_shared<Tensor>(" Welcome to 中国. ");
  output.clear();
  s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output);
  EXPECT_TRUE(s.IsOk());
  EXPECT_EQ(output[0]->Size(), 4);
  EXPECT_EQ(output[0]->Rank(), 1);
  MS_LOG(INFO) << "Out tensor3: " << output[0]->ToString();
  CheckEqual(output[0], {0}, "Welcome");
  CheckEqual(output[0], {1}, "to");
  CheckEqual(output[0], {2}, "中国");
  CheckEqual(output[0], {3}, ".");
  output.clear();
  s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
  EXPECT_TRUE(s.IsOk());
  EXPECT_EQ(output[0]->Size(), 8);
  EXPECT_EQ(output[0]->Rank(), 1);
  MS_LOG(INFO) << "Out tensor4: " << output[0]->ToString();
  CheckEqual(output[0], {0}, " ");
  CheckEqual(output[0], {1}, "Welcome");
  CheckEqual(output[0], {2}, " ");
  CheckEqual(output[0], {3}, "to");
  CheckEqual(output[0], {4}, " ");
  CheckEqual(output[0], {5}, "中国");
  CheckEqual(output[0], {6}, ".");
  CheckEqual(output[0], {7}, " ");
  input = std::make_shared<Tensor>("Hello");
  output.clear();
  s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
  EXPECT_TRUE(s.IsOk());
  EXPECT_EQ(output[0]->Size(), 1);
  EXPECT_EQ(output[0]->Rank(), 1);
  MS_LOG(INFO) << "Out tensor5: " << output[0]->ToString();
  CheckEqual(output[0], {0}, "Hello");
  input = std::make_shared<Tensor>("H");
  output.clear();
  s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
  EXPECT_TRUE(s.IsOk());
  EXPECT_EQ(output[0]->Size(), 1);
  EXPECT_EQ(output[0]->Rank(), 1);
  MS_LOG(INFO) << "Out tensor6: " << output[0]->ToString();
  CheckEqual(output[0], {0}, "H");
  input = std::make_shared<Tensor>("");
  output.clear();
  s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
  EXPECT_TRUE(s.IsOk());
  EXPECT_EQ(output[0]->Size(), 1);
  EXPECT_EQ(output[0]->Rank(), 1);
  MS_LOG(INFO) << "Out tensor7: " << output[0]->ToString();
  CheckEqual(output[0], {0}, "");
  input = std::make_shared<Tensor>("Hello中国Hello世界");
  output.clear();
  s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
  EXPECT_TRUE(s.IsOk());
  EXPECT_EQ(output[0]->Size(), 4);
  EXPECT_EQ(output[0]->Rank(), 1);
  MS_LOG(INFO) << "Out tensor8: " << output[0]->ToString();
  CheckEqual(output[0], {0}, "Hello");
  CheckEqual(output[0], {1}, "中国");
  CheckEqual(output[0], {2}, "Hello");
  CheckEqual(output[0], {3}, "世界");
  input = std::make_shared<Tensor>(" ");
  output.clear();
  s = keep_whitespace_op->Compute(TensorRow(0, {input}), &output);
  EXPECT_TRUE(s.IsOk());
  EXPECT_EQ(output[0]->Size(), 1);
  EXPECT_EQ(output[0]->Rank(), 1);
  MS_LOG(INFO) << "Out tensor10: " << output[0]->ToString();
  CheckEqual(output[0], {0}, " ");
  input = std::make_shared<Tensor>(" ");
  output.clear();
  s = skip_whitespace_op->Compute(TensorRow(0, {input}), &output);
  EXPECT_TRUE(s.IsOk());
  EXPECT_EQ(output[0]->Size(), 1);
  EXPECT_EQ(output[0]->Rank(), 1);
  MS_LOG(INFO) << "Out tensor11: " << output[0]->ToString();
  CheckEqual(output[0], {0}, "");
}
TEST_F(MindDataTestTokenizerOp, TestCaseFold) {
...
@@ -321,10 +339,10 @@ TEST_F(MindDataTestTokenizerOp, TestRegexReplace) {
TEST_F(MindDataTestTokenizerOp, TestRegexTokenizer) {
  MS_LOG(INFO) << "Doing TestRegexTokenizerOp.";
  std::unique_ptr<RegexTokenizerOp> regex_tokenizer_op(new RegexTokenizerOp("\\p{Cc}|\\p{Cf}|\\s+", "", true));
  std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. \n中国\t北京");
  TensorRow output;
  Status s = regex_tokenizer_op->Compute(TensorRow(0, {input}), &output);
  EXPECT_TRUE(s.IsOk());
}
@@ -332,9 +350,10 @@ TEST_F(MindDataTestTokenizerOp, TestBasicTokenizer) {
  MS_LOG(INFO) << "Doing TestBasicTokenizer.";
  //bool lower_case, bool keep_whitespace,
  // NormalizeForm normalization_form, bool preserve_unused_token
  std::unique_ptr<BasicTokenizerOp> basic_tokenizer(new BasicTokenizerOp(true, true, NormalizeForm::kNone, false, true));
  std::shared_ptr<Tensor> input = std::make_shared<Tensor>("Welcome to China. 中国\t北京");
  TensorRow output;
  Status s = basic_tokenizer->Compute(TensorRow(0, {input}), &output);
  EXPECT_TRUE(s.IsOk());
}
\ No newline at end of file
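The C++ changes above move the tokenizer kernels from returning a single tensor to returning a TensorRow, so a token tensor can travel together with offsets_start and offsets_limit tensors. In the Python tests that follow, those offsets are byte positions into the UTF-8 encoded input line (multi-byte characters such as Chinese advance by 3 bytes). A minimal pure-Python sketch of that contract, using the "Welcome to Beijing!" row and the whitespace-tokenizer offsets expected by the tests; the helper name is illustrative only, not part of the patch:

def check_offsets(line, tokens, offsets_start, offsets_limit):
    # Each (start, limit) pair should slice the UTF-8 bytes of the input back to its token.
    data = line.encode("utf-8")
    for token, start, limit in zip(tokens, offsets_start, offsets_limit):
        assert data[start:limit].decode("utf-8") == token

check_offsets("Welcome to Beijing!", ["Welcome", "to", "Beijing!"], [0, 8, 11], [7, 10, 19])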
tests/ut/python/dataset/test_basic_tokenizer.py → tests/ut/python/dataset/test_text_basic_tokenizer.py
@@ -18,7 +18,7 @@ Testing BasicTokenizer op in DE
 import numpy as np
 import mindspore.dataset as ds
 from mindspore import log as logger
-import mindspore.dataset.text as nlp
+import mindspore.dataset.text as text

 BASIC_TOKENIZER_FILE = "../data/dataset/testTokenizerData/basic_tokenizer.txt"
@@ -37,47 +37,102 @@ test_paras = [
          '(', '1644', '-', '1911', ')', 'は', '、', '中', '国', 'の', '封',
          '建', '王', '朝', 'の', '歴', '史', 'における', '最', '後', 'の2つの', '王', '朝', 'でした'],
         ['명나라', '(', '1368', '-', '1644', ')', '와', '청나라', '(', '1644', '-', '1911', ')', '는',
-         '중국', '봉건', '왕조의', '역사에서', '마지막', '두', '왕조였다']]
+         '중국', '봉건', '왕조의', '역사에서', '마지막', '두', '왕조였다']],
+        expected_offsets_start=[[0, 8, 11, 18, 21, 24, 27, 30],
+                                [0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42],
+                                [0, 4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37],
+                                [0, 3, 6, 9, 13, 16, 20, 23, 26, 29, 32, 35, 38, 42, 45, 49, 52, 55, 58, 61, 64, 67,
+                                 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100],
+                                [0, 3, 6, 9, 13, 14, 18, 21, 24, 27, 30, 33, 37, 38, 42, 45, 48, 51, 54, 57, 60, 63,
+                                 66, 69, 72, 75, 78, 81, 93, 96, 99, 109, 112, 115],
+                                [0, 10, 11, 15, 16, 20, 21, 25, 35, 36, 40, 41, 45, 46, 50, 57, 64, 74, 87, 97, 101]],
+        expected_offsets_limit=[[7, 10, 18, 21, 24, 27, 30, 33],
+                                [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45],
+                                [4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37, 40],
+                                [3, 6, 9, 13, 16, 20, 23, 26, 29, 32, 35, 38, 42, 45, 49, 52, 55, 58, 61, 64, 67, 70,
+                                 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 103],
+                                [3, 6, 9, 13, 14, 18, 21, 24, 27, 30, 33, 37, 38, 42, 45, 48, 51, 54, 57, 60, 63, 66,
+                                 69, 72, 75, 78, 81, 93, 96, 99, 109, 112, 115, 124],
+                                [9, 11, 15, 16, 20, 21, 24, 34, 36, 40, 41, 45, 46, 49, 56, 63, 73, 86, 96, 100, 113]]
     ),
     dict(
         first=7,
         last=7,
         expected_tokens=[['this', 'is', 'a', 'funky', 'string']],
+        expected_offsets_start=[[0, 5, 8, 10, 16]],
+        expected_offsets_limit=[[4, 7, 9, 15, 22]],
         lower_case=True
     ),
 ]

-def check_basic_tokenizer(first, last, expected_tokens, lower_case=False, keep_whitespace=False,
-                          normalization_form=nlp.utils.NormalizeForm.NONE, preserve_unused_token=False):
+def check_basic_tokenizer_default(first, last, expected_tokens, expected_offsets_start, expected_offsets_limit,
+                                  lower_case=False, keep_whitespace=False,
+                                  normalization_form=text.utils.NormalizeForm.NONE, preserve_unused_token=False):
     dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False)
     if first > 1:
         dataset = dataset.skip(first - 1)
     if last >= first:
         dataset = dataset.take(last - first + 1)
-    basic_tokenizer = nlp.BasicTokenizer(lower_case=lower_case,
+    basic_tokenizer = text.BasicTokenizer(lower_case=lower_case,
                                           keep_whitespace=keep_whitespace,
                                           normalization_form=normalization_form,
                                           preserve_unused_token=preserve_unused_token)
     dataset = dataset.map(operations=basic_tokenizer)
     count = 0
     for i in dataset.create_dict_iterator():
-        text = nlp.to_str(i['text'])
-        logger.info("Out:", text)
+        token = text.to_str(i['text'])
+        logger.info("Out:", token)
         logger.info("Exp:", expected_tokens[count])
-        np.testing.assert_array_equal(text, expected_tokens[count])
+        np.testing.assert_array_equal(token, expected_tokens[count])
         count = count + 1

-def test_basic_tokenizer():
+def check_basic_tokenizer_with_offsets(first, last, expected_tokens, expected_offsets_start, expected_offsets_limit,
+                                       lower_case=False, keep_whitespace=False,
+                                       normalization_form=text.utils.NormalizeForm.NONE, preserve_unused_token=False):
+    dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False)
+    if first > 1:
+        dataset = dataset.skip(first - 1)
+    if last >= first:
+        dataset = dataset.take(last - first + 1)
+    basic_tokenizer = text.BasicTokenizer(lower_case=lower_case,
+                                          keep_whitespace=keep_whitespace,
+                                          normalization_form=normalization_form,
+                                          preserve_unused_token=preserve_unused_token,
+                                          with_offsets=True)
+    dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
+                          columns_order=['token', 'offsets_start', 'offsets_limit'], operations=basic_tokenizer)
+    count = 0
+    for i in dataset.create_dict_iterator():
+        token = text.to_str(i['token'])
+        logger.info("Out:", token)
+        logger.info("Exp:", expected_tokens[count])
+        np.testing.assert_array_equal(token, expected_tokens[count])
+        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
+        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
+        count = count + 1

+def test_basic_tokenizer_with_offsets():
+    """
+    Test BasicTokenizer
+    """
+    for paras in test_paras:
+        check_basic_tokenizer_with_offsets(**paras)

+def test_basic_tokenizer_default():
     """
     Test BasicTokenizer
     """
     for paras in test_paras:
-        check_basic_tokenizer(**paras)
+        check_basic_tokenizer_default(**paras)

 if __name__ == '__main__':
-    test_basic_tokenizer()
+    test_basic_tokenizer_default()
+    test_basic_tokenizer_with_offsets()
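Every with_offsets test above maps the single 'text' column to three output columns. A hedged sketch of that pattern outside the test harness; the input file path "data.txt" is hypothetical, while the operator, keyword arguments, and column names mirror the tests in this file:

import mindspore.dataset as ds
import mindspore.dataset.text as text

dataset = ds.TextFileDataset("data.txt", shuffle=False)  # hypothetical file, one sentence per line
tokenizer = text.BasicTokenizer(lower_case=True, with_offsets=True)
# With with_offsets=True the op emits three columns, so map() must rename them explicitly.
dataset = dataset.map(input_columns=['text'],
                      output_columns=['token', 'offsets_start', 'offsets_limit'],
                      columns_order=['token', 'offsets_start', 'offsets_limit'],
                      operations=tokenizer)
for row in dataset.create_dict_iterator():
    tokens = text.to_str(row['token'])
    # offsets_start/offsets_limit are byte offsets into the UTF-8 input line.
    print(tokens, row['offsets_start'], row['offsets_limit'])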
tests/ut/python/dataset/test_bert_tokenizer.py → tests/ut/python/dataset/test_text_bert_tokenizer.py
@@ -18,7 +18,7 @@ Testing BertTokenizer op in DE
 import numpy as np
 import mindspore.dataset as ds
 from mindspore import log as logger
-import mindspore.dataset.text as nlp
+import mindspore.dataset.text as text

 BERT_TOKENIZER_FILE = "../data/dataset/testTokenizerData/bert_tokenizer.txt"
@@ -39,6 +39,14 @@ test_paras = [
                     ['疑', '是', '地', '上', '霜'],
                     ['举', '头', '望', '明', '月'],
                     ['低', '头', '思', '故', '乡']],
+        expected_offsets_start=[[0, 3, 6, 9, 12], [0, 3, 6, 9, 12], [0, 3, 6, 9, 12], [0, 3, 6, 9, 12]],
+        expected_offsets_limit=[[3, 6, 9, 12, 15], [3, 6, 9, 12, 15], [3, 6, 9, 12, 15], [3, 6, 9, 12, 15]],
         vocab_list=vocab_bert
     ),
     # test english text
@@ -46,6 +54,8 @@ test_paras = [
         first=5,
         last=5,
         expect_str=[['i', 'am', 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
+        expected_offsets_start=[[0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46]],
+        expected_offsets_limit=[[1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47]],
         lower_case=True,
         vocab_list=vocab_bert
     ),
@@ -53,6 +63,8 @@ test_paras = [
         first=5,
         last=5,
         expect_str=[['I', "am", 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']],
+        expected_offsets_start=[[0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46]],
+        expected_offsets_limit=[[1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47]],
         lower_case=False,
         vocab_list=vocab_bert
     ),
@@ -63,7 +75,9 @@ test_paras = [
         expect_str=[
             ['😀', '嘿', '嘿', '😃', '哈', '哈', '😄', '大', '笑', '😁', '嘻', '嘻'],
             ['繁', '體', '字']],
-        normalization_form=nlp.utils.NormalizeForm.NFKC,
+        expected_offsets_start=[[0, 4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37], [0, 3, 6]],
+        expected_offsets_limit=[[4, 7, 10, 14, 17, 20, 24, 27, 30, 34, 37, 40], [3, 6, 9]],
+        normalization_form=text.utils.NormalizeForm.NFKC,
         vocab_list=vocab_bert
     ),
     # test preserved tokens
@@ -79,6 +93,8 @@ test_paras = [
             ['[unused1]'],
             ['[unused10]']
         ],
+        expected_offsets_start=[[0, 7], [0, 7], [0, 7], [0, 7], [0, 7], [0], [0]],
+        expected_offsets_limit=[[6, 12], [6, 12], [6, 12], [6, 12], [6, 13], [9], [10]],
         lower_case=False,
         vocab_list=vocab_bert,
         preserve_unused_token=True,
@@ -95,6 +111,8 @@ test_paras = [
             ['[unused1]'],
             ['[unused10]']
         ],
+        expected_offsets_start=[[0, 7], [0, 7], [0, 7], [0, 7], [0, 7], [0], [0]],
+        expected_offsets_limit=[[6, 12], [6, 12], [6, 12], [6, 12], [6, 13], [9], [10]],
         lower_case=True,
         vocab_list=vocab_bert,
         preserve_unused_token=True,
@@ -104,6 +122,8 @@ test_paras = [
         first=15,
         last=15,
         expect_str=[['12', '+', '/', '-', '28', '=', '40', '/', '-', '16']],
+        expected_offsets_start=[[0, 2, 3, 4, 5, 7, 8, 10, 11, 12]],
+        expected_offsets_limit=[[2, 3, 4, 5, 7, 8, 10, 11, 12, 14]],
         preserve_unused_token=True,
         vocab_list=vocab_bert
     ),
@@ -112,6 +132,8 @@ test_paras = [
         first=8,
         last=8,
         expect_str=[['[UNK]', ' ', '[CLS]']],
+        expected_offsets_start=[[0, 6, 7]],
+        expected_offsets_limit=[[6, 7, 12]],
         lower_case=False,
         vocab_list=vocab_bert,
         preserve_unused_token=True,
@@ -121,6 +143,8 @@ test_paras = [
         first=8,
         last=8,
         expect_str=[['unused', ' ', '[CLS]']],
+        expected_offsets_start=[[0, 6, 7]],
+        expected_offsets_limit=[[6, 7, 12]],
         lower_case=False,
         vocab_list=vocab_bert,
         preserve_unused_token=True,
@@ -131,6 +155,8 @@ test_paras = [
         first=8,
         last=8,
         expect_str=[['unused', ' ', '[', 'CLS', ']']],
+        expected_offsets_start=[[0, 6, 7, 8, 11]],
+        expected_offsets_limit=[[6, 7, 8, 11, 12]],
         lower_case=False,
         vocab_list=vocab_bert,
         preserve_unused_token=False,
@@ -140,20 +166,20 @@ test_paras = [
 ]

-def check_bert_tokenizer(first, last, expect_str,
-                         vocab_list, suffix_indicator='##',
-                         max_bytes_per_token=100, unknown_token='[UNK]',
-                         lower_case=False, keep_whitespace=False,
-                         normalization_form=nlp.utils.NormalizeForm.NONE,
-                         preserve_unused_token=False):
+def check_bert_tokenizer_default(first, last, expect_str,
+                                 expected_offsets_start, expected_offsets_limit,
+                                 vocab_list, suffix_indicator='##',
+                                 max_bytes_per_token=100, unknown_token='[UNK]',
+                                 lower_case=False, keep_whitespace=False,
+                                 normalization_form=text.utils.NormalizeForm.NONE,
+                                 preserve_unused_token=False):
     dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
     if first > 1:
         dataset = dataset.skip(first - 1)
     if last >= first:
         dataset = dataset.take(last - first + 1)
-    vocab = nlp.Vocab.from_list(vocab_list)
-    tokenizer_op = nlp.BertTokenizer(
+    vocab = text.Vocab.from_list(vocab_list)
+    tokenizer_op = text.BertTokenizer(
         vocab=vocab, suffix_indicator=suffix_indicator,
         max_bytes_per_token=max_bytes_per_token, unknown_token=unknown_token,
         lower_case=lower_case, keep_whitespace=keep_whitespace,
@@ -162,20 +188,59 @@ def check_bert_tokenizer(first, last, expect_str,
     dataset = dataset.map(operations=tokenizer_op)
     count = 0
     for i in dataset.create_dict_iterator():
-        text = nlp.to_str(i['text'])
-        logger.info("Out:", text)
+        token = text.to_str(i['text'])
+        logger.info("Out:", token)
         logger.info("Exp:", expect_str[count])
-        np.testing.assert_array_equal(text, expect_str[count])
+        np.testing.assert_array_equal(token, expect_str[count])
         count = count + 1

-def test_bert_tokenizer():
+def check_bert_tokenizer_with_offsets(first, last, expect_str,
+                                      expected_offsets_start, expected_offsets_limit,
+                                      vocab_list, suffix_indicator='##',
+                                      max_bytes_per_token=100, unknown_token='[UNK]',
+                                      lower_case=False, keep_whitespace=False,
+                                      normalization_form=text.utils.NormalizeForm.NONE,
+                                      preserve_unused_token=False):
+    dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
+    if first > 1:
+        dataset = dataset.skip(first - 1)
+    if last >= first:
+        dataset = dataset.take(last - first + 1)
+    vocab = text.Vocab.from_list(vocab_list)
+    tokenizer_op = text.BertTokenizer(
+        vocab=vocab, suffix_indicator=suffix_indicator,
+        max_bytes_per_token=max_bytes_per_token, unknown_token=unknown_token,
+        lower_case=lower_case, keep_whitespace=keep_whitespace,
+        normalization_form=normalization_form,
+        preserve_unused_token=preserve_unused_token,
+        with_offsets=True)
+    dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
+                          columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer_op)
+    count = 0
+    for i in dataset.create_dict_iterator():
+        token = text.to_str(i['token'])
+        logger.info("Out:", token)
+        logger.info("Exp:", expect_str[count])
+        np.testing.assert_array_equal(token, expect_str[count])
+        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
+        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
+        count = count + 1

+def test_bert_tokenizer_default():
+    """
+    Test WordpieceTokenizer when with_offsets=False
+    """
+    for paras in test_paras:
+        check_bert_tokenizer_default(**paras)

+def test_bert_tokenizer_with_offsets():
     """
-    Test WordpieceTokenizer
+    Test WordpieceTokenizer when with_offsets=True
     """
     for paras in test_paras:
-        check_bert_tokenizer(**paras)
+        check_bert_tokenizer_with_offsets(**paras)

 if __name__ == '__main__':
-    test_bert_tokenizer()
+    test_bert_tokenizer_default()
+    test_bert_tokenizer_with_offsets()
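For subword pieces, the expected offsets above point back at the original substring without the '##' suffix indicator; for example '##ing' covers bytes 8–11, i.e. the "ing" of "making". A small pure-Python check of that behaviour, using the expected values from the English test case; the exact sentence is an assumption reconstructed from those tokens and offsets, not quoted from the data file:

line = "I am making small mistakes during working hours"  # assumed content of row 5 of bert_tokenizer.txt
tokens = ['i', 'am', 'mak', '##ing', 'small', 'mistake', '##s', 'during', 'work', '##ing', 'hour', '##s']
starts = [0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46]
limits = [1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47]
data = line.encode("utf-8")
for token, start, limit in zip(tokens, starts, limits):
    piece = token[2:] if token.startswith('##') else token
    # Offsets address the original text, so compare case-insensitively (the test uses lower_case=True).
    assert data[start:limit].decode("utf-8").lower() == piece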
tests/ut/python/dataset/test_nlp_jieop.py → tests/ut/python/dataset/test_text_jieba_tokenizer.py
@@ -197,6 +197,229 @@ def test_jieba_5():
         assert item == expect[index]

+def test_jieba_with_offsets_1():
+    """Test jieba tokenizer with MP mode"""
+    data = ds.TextFileDataset(DATA_FILE)
+    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
+    data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
+                    columns_order=["token", "offsets_start", "offsets_limit"], operations=jieba_op,
+                    num_parallel_workers=1)
+    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
+    expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
+    expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
+    ret = []
+    for i in data.create_dict_iterator():
+        ret = to_str(i["token"])
+        for index, item in enumerate(ret):
+            assert item == expect[index]
+        for index, item in enumerate(i["offsets_start"]):
+            assert item == expected_offsets_start[index]
+        for index, item in enumerate(i["offsets_limit"]):
+            assert item == expected_offsets_limit[index]

+def test_jieba_with_offsets_1_1():
+    """Test jieba tokenizer with HMM mode"""
+    data = ds.TextFileDataset(DATA_FILE)
+    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM, with_offsets=True)
+    data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
+                    columns_order=["token", "offsets_start", "offsets_limit"], operations=jieba_op,
+                    num_parallel_workers=1)
+    expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧']
+    expected_offsets_start = [0, 6, 12, 15, 18, 21, 27, 33, 36, 42, 45]
+    expected_offsets_limit = [6, 12, 15, 18, 21, 27, 33, 36, 42, 45, 48]
+    for i in data.create_dict_iterator():
+        ret = to_str(i["token"])
+        for index, item in enumerate(ret):
+            assert item == expect[index]
+        for index, item in enumerate(i["offsets_start"]):
+            assert item == expected_offsets_start[index]
+        for index, item in enumerate(i["offsets_limit"]):
+            assert item == expected_offsets_limit[index]

+def test_jieba_with_offsets_1_2():
+    """Test jieba tokenizer with HMM MIX"""
+    data = ds.TextFileDataset(DATA_FILE)
+    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX, with_offsets=True)
+    data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
+                    columns_order=["token", "offsets_start", "offsets_limit"], operations=jieba_op,
+                    num_parallel_workers=1)
+    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
+    expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
+    expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
+    for i in data.create_dict_iterator():
+        ret = to_str(i["token"])
+        for index, item in enumerate(ret):
+            assert item == expect[index]
+        for index, item in enumerate(i["offsets_start"]):
+            assert item == expected_offsets_start[index]
+        for index, item in enumerate(i["offsets_limit"]):
+            assert item == expected_offsets_limit[index]

+def test_jieba_with_offsets_2():
+    """Test add_word"""
+    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
+    data = ds.TextFileDataset(DATA_FILE4)
+    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
+    jieba_op.add_word("男默女泪")
+    expect = ['男默女泪', '市', '长江大桥']
+    data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
+                    columns_order=["token", "offsets_start", "offsets_limit"], operations=jieba_op,
+                    num_parallel_workers=2)
+    expected_offsets_start = [0, 12, 15]
+    expected_offsets_limit = [12, 15, 27]
+    for i in data.create_dict_iterator():
+        ret = to_str(i["token"])
+        for index, item in enumerate(ret):
+            assert item == expect[index]
+        for index, item in enumerate(i["offsets_start"]):
+            assert item == expected_offsets_start[index]
+        for index, item in enumerate(i["offsets_limit"]):
+            assert item == expected_offsets_limit[index]

+def test_jieba_with_offsets_2_1():
+    """Test add_word with freq"""
+    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
+    data = ds.TextFileDataset(DATA_FILE4)
+    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
+    jieba_op.add_word("男默女泪", 10)
+    data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
+                    columns_order=["token", "offsets_start", "offsets_limit"], operations=jieba_op,
+                    num_parallel_workers=2)
+    expect = ['男默女泪', '市', '长江大桥']
+    expected_offsets_start = [0, 12, 15]
+    expected_offsets_limit = [12, 15, 27]
+    for i in data.create_dict_iterator():
+        ret = to_str(i["token"])
+        for index, item in enumerate(ret):
+            assert item == expect[index]
+        for index, item in enumerate(i["offsets_start"]):
+            assert item == expected_offsets_start[index]
+        for index, item in enumerate(i["offsets_limit"]):
+            assert item == expected_offsets_limit[index]

+def test_jieba_with_offsets_2_2():
+    """Test add_word with freq, the value of freq affects the result of segmentation"""
+    DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
+    data = ds.TextFileDataset(DATA_FILE4)
+    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
+    jieba_op.add_word("江大桥", 20000)
+    data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
+                    columns_order=["token", "offsets_start", "offsets_limit"], operations=jieba_op,
+                    num_parallel_workers=2)
+    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
+    expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
+    expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
+    for i in data.create_dict_iterator():
+        ret = to_str(i["token"])
+        for index, item in enumerate(ret):
+            assert item == expect[index]
+        for index, item in enumerate(i["offsets_start"]):
+            assert item == expected_offsets_start[index]
+        for index, item in enumerate(i["offsets_limit"]):
+            assert item == expected_offsets_limit[index]

+def test_jieba_with_offsets_3():
+    """Test add_dict with dict"""
+    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
+    user_dict = {"男默女泪": 10}
+    data = ds.TextFileDataset(DATA_FILE4)
+    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
+    jieba_op.add_dict(user_dict)
+    data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
+                    columns_order=["token", "offsets_start", "offsets_limit"], operations=jieba_op,
+                    num_parallel_workers=1)
+    expect = ['男默女泪', '市', '长江大桥']
+    expected_offsets_start = [0, 12, 15]
+    expected_offsets_limit = [12, 15, 27]
+    for i in data.create_dict_iterator():
+        ret = to_str(i["token"])
+        for index, item in enumerate(ret):
+            assert item == expect[index]
+        for index, item in enumerate(i["offsets_start"]):
+            assert item == expected_offsets_start[index]
+        for index, item in enumerate(i["offsets_limit"]):
+            assert item == expected_offsets_limit[index]

+def test_jieba_with_offsets_3_1():
+    """Test add_dict with dict"""
+    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
+    user_dict = {"男默女泪": 10, "江大桥": 20000}
+    data = ds.TextFileDataset(DATA_FILE4)
+    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
+    jieba_op.add_dict(user_dict)
+    data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
+                    columns_order=["token", "offsets_start", "offsets_limit"], operations=jieba_op,
+                    num_parallel_workers=1)
+    expect = ['男默女泪', '市长', '江大桥']
+    expected_offsets_start = [0, 12, 18]
+    expected_offsets_limit = [12, 18, 27]
+    for i in data.create_dict_iterator():
+        ret = to_str(i["token"])
+        for index, item in enumerate(ret):
+            assert item == expect[index]
+        for index, item in enumerate(i["offsets_start"]):
+            assert item == expected_offsets_start[index]
+        for index, item in enumerate(i["offsets_limit"]):
+            assert item == expected_offsets_limit[index]

+def test_jieba_with_offsets_4():
+    DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
+    DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"
+    data = ds.TextFileDataset(DATA_FILE4)
+    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
+    jieba_op.add_dict(DICT_FILE)
+    data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
+                    columns_order=["token", "offsets_start", "offsets_limit"], operations=jieba_op,
+                    num_parallel_workers=1)
+    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
+    expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
+    expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
+    for i in data.create_dict_iterator():
+        ret = to_str(i["token"])
+        for index, item in enumerate(ret):
+            assert item == expect[index]
+        for index, item in enumerate(i["offsets_start"]):
+            assert item == expected_offsets_start[index]
+        for index, item in enumerate(i["offsets_limit"]):
+            assert item == expected_offsets_limit[index]

+def test_jieba_with_offsets_5():
+    """Test add dict with file path"""
+    DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"
+    data = ds.TextFileDataset(DATA_FILE4)
+    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
+    jieba_op.add_word("江大桥", 20000)
+    data = data.map(input_columns=["text"], output_columns=["token", "offsets_start", "offsets_limit"],
+                    columns_order=["token", "offsets_start", "offsets_limit"], operations=jieba_op,
+                    num_parallel_workers=1)
+    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
+    expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
+    expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
+    for i in data.create_dict_iterator():
+        ret = to_str(i["token"])
+        for index, item in enumerate(ret):
+            assert item == expect[index]
+        for index, item in enumerate(i["offsets_start"]):
+            assert item == expected_offsets_start[index]
+        for index, item in enumerate(i["offsets_limit"]):
+            assert item == expected_offsets_limit[index]

 def gen():
     text = np.array("今天天气太好了我们一起去外面玩吧".encode("UTF8"), dtype='S')
     yield (text,)
@@ -236,3 +459,13 @@ if __name__ == "__main__":
     test_jieba_5()
     test_jieba_5()
     test_jieba_6()
+    test_jieba_with_offsets_1()
+    test_jieba_with_offsets_1_1()
+    test_jieba_with_offsets_1_2()
+    test_jieba_with_offsets_2()
+    test_jieba_with_offsets_2_1()
+    test_jieba_with_offsets_2_2()
+    test_jieba_with_offsets_3()
+    test_jieba_with_offsets_3_1()
+    test_jieba_with_offsets_4()
+    test_jieba_with_offsets_5()
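The Jieba cases segment the whole line, so consecutive offsets tile the input: each token's offsets_limit equals the next token's offsets_start, and the last limit equals the byte length of the line (48 bytes for the 16-character sentence used above). A small sketch checking that invariant against the MP-mode expectations; it only uses values already present in this file:

line = "今天天气太好了我们一起去外面玩吧"
tokens = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
starts = [0, 12, 21, 27, 33, 36, 42]
limits = [12, 21, 27, 33, 36, 42, 48]
data = line.encode("utf-8")
assert limits[-1] == len(data)                          # the segmentation covers the whole line
assert all(l == s for l, s in zip(limits, starts[1:]))  # tokens are contiguous, with no gaps
for token, start, limit in zip(tokens, starts, limits):
    assert data[start:limit].decode("utf-8") == token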
tests/ut/python/dataset/test_tokenizer.py
→
tests/ut/python/dataset/test_t
ext_t
okenizer.py
浏览文件 @
47060631
...
@@ -18,7 +18,7 @@ Testing UnicodeCharTokenizer op in DE
...
@@ -18,7 +18,7 @@ Testing UnicodeCharTokenizer op in DE
import
numpy
as
np
import
numpy
as
np
import
mindspore.dataset
as
ds
import
mindspore.dataset
as
ds
from
mindspore
import
log
as
logger
from
mindspore
import
log
as
logger
import
mindspore.dataset.text
as
nlp
import
mindspore.dataset.text
as
text
DATA_FILE
=
"../data/dataset/testTokenizerData/1.txt"
DATA_FILE
=
"../data/dataset/testTokenizerData/1.txt"
NORMALIZE_FILE
=
"../data/dataset/testTokenizerData/normalize.txt"
NORMALIZE_FILE
=
"../data/dataset/testTokenizerData/normalize.txt"
...
@@ -36,23 +36,48 @@ def split_by_unicode_char(input_strs):
...
@@ -36,23 +36,48 @@ def split_by_unicode_char(input_strs):
return
out
return
out
def
test_unicode_char_tokenizer
():
def
test_unicode_char_tokenizer
_default
():
"""
"""
Test UnicodeCharTokenizer
Test UnicodeCharTokenizer
"""
"""
input_strs
=
(
"Welcome to Beijing!"
,
"北京欢迎您!"
,
"我喜欢English!"
,
" "
)
input_strs
=
(
"Welcome to Beijing!"
,
"北京欢迎您!"
,
"我喜欢English!"
,
" "
)
dataset
=
ds
.
TextFileDataset
(
DATA_FILE
,
shuffle
=
False
)
dataset
=
ds
.
TextFileDataset
(
DATA_FILE
,
shuffle
=
False
)
tokenizer
=
nlp
.
UnicodeCharTokenizer
()
tokenizer
=
text
.
UnicodeCharTokenizer
()
dataset
=
dataset
.
map
(
operations
=
tokenizer
)
dataset
=
dataset
.
map
(
operations
=
tokenizer
)
tokens
=
[]
tokens
=
[]
for
i
in
dataset
.
create_dict_iterator
():
for
i
in
dataset
.
create_dict_iterator
():
t
ext
=
nlp
.
to_str
(
i
[
'text'
]).
tolist
()
t
oken
=
text
.
to_str
(
i
[
'text'
]).
tolist
()
tokens
.
append
(
t
ext
)
tokens
.
append
(
t
oken
)
logger
.
info
(
"The out tokens is : {}"
.
format
(
tokens
))
logger
.
info
(
"The out tokens is : {}"
.
format
(
tokens
))
assert
split_by_unicode_char
(
input_strs
)
==
tokens
assert
split_by_unicode_char
(
input_strs
)
==
tokens
def
test_whitespace_tokenizer
():
def
test_unicode_char_tokenizer_with_offsets
():
"""
Test UnicodeCharTokenizer
"""
input_strs
=
(
"Welcome to Beijing!"
,
"北京欢迎您!"
,
"我喜欢English!"
,
" "
)
dataset
=
ds
.
TextFileDataset
(
DATA_FILE
,
shuffle
=
False
)
tokenizer
=
text
.
UnicodeCharTokenizer
(
with_offsets
=
True
)
dataset
=
dataset
.
map
(
input_columns
=
[
'text'
],
output_columns
=
[
'token'
,
'offsets_start'
,
'offsets_limit'
],
columns_order
=
[
'token'
,
'offsets_start'
,
'offsets_limit'
],
operations
=
tokenizer
)
tokens
=
[]
expected_offsets_start
=
[[
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
],
[
0
,
3
,
6
,
9
,
12
,
15
],
[
0
,
3
,
6
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
],
[
0
,
1
]]
expected_offsets_limit
=
[[
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
],
[
3
,
6
,
9
,
12
,
15
,
18
],
[
3
,
6
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
],
[
1
,
2
]]
count
=
0
for
i
in
dataset
.
create_dict_iterator
():
token
=
text
.
to_str
(
i
[
'token'
]).
tolist
()
tokens
.
append
(
token
)
np
.
testing
.
assert_array_equal
(
i
[
'offsets_start'
],
expected_offsets_start
[
count
])
np
.
testing
.
assert_array_equal
(
i
[
'offsets_limit'
],
expected_offsets_limit
[
count
])
count
+=
1
logger
.
info
(
"The out tokens is : {}"
.
format
(
tokens
))
assert
split_by_unicode_char
(
input_strs
)
==
tokens
def
test_whitespace_tokenizer_default
():
"""
"""
Test WhitespaceTokenizer
Test WhitespaceTokenizer
"""
"""
...
@@ -61,17 +86,44 @@ def test_whitespace_tokenizer():
...
@@ -61,17 +86,44 @@ def test_whitespace_tokenizer():
[
"我喜欢English!"
],
[
"我喜欢English!"
],
[
""
]]
[
""
]]
dataset
=
ds
.
TextFileDataset
(
DATA_FILE
,
shuffle
=
False
)
dataset
=
ds
.
TextFileDataset
(
DATA_FILE
,
shuffle
=
False
)
tokenizer
=
nlp
.
WhitespaceTokenizer
()
tokenizer
=
text
.
WhitespaceTokenizer
()
dataset
=
dataset
.
map
(
operations
=
tokenizer
)
dataset
=
dataset
.
map
(
operations
=
tokenizer
)
tokens
=
[]
tokens
=
[]
for
i
in
dataset
.
create_dict_iterator
():
for
i
in
dataset
.
create_dict_iterator
():
t
ext
=
nlp
.
to_str
(
i
[
'text'
]).
tolist
()
t
oken
=
text
.
to_str
(
i
[
'text'
]).
tolist
()
tokens
.
append
(
t
ext
)
tokens
.
append
(
t
oken
)
logger
.
info
(
"The out tokens is : {}"
.
format
(
tokens
))
logger
.
info
(
"The out tokens is : {}"
.
format
(
tokens
))
assert
whitespace_strs
==
tokens
assert
whitespace_strs
==
tokens
def
test_unicode_script_tokenizer
():
def
test_whitespace_tokenizer_with_offsets
():
"""
Test WhitespaceTokenizer
"""
whitespace_strs
=
[[
"Welcome"
,
"to"
,
"Beijing!"
],
[
"北京欢迎您!"
],
[
"我喜欢English!"
],
[
""
]]
dataset
=
ds
.
TextFileDataset
(
DATA_FILE
,
shuffle
=
False
)
tokenizer
=
text
.
WhitespaceTokenizer
(
with_offsets
=
True
)
dataset
=
dataset
.
map
(
input_columns
=
[
'text'
],
output_columns
=
[
'token'
,
'offsets_start'
,
'offsets_limit'
],
columns_order
=
[
'token'
,
'offsets_start'
,
'offsets_limit'
],
operations
=
tokenizer
)
tokens
=
[]
expected_offsets_start
=
[[
0
,
8
,
11
],
[
0
],
[
0
],
[
0
]]
expected_offsets_limit
=
[[
7
,
10
,
19
],
[
18
],
[
17
],
[
0
]]
count
=
0
for
i
in
dataset
.
create_dict_iterator
():
token
=
text
.
to_str
(
i
[
'token'
]).
tolist
()
tokens
.
append
(
token
)
np
.
testing
.
assert_array_equal
(
i
[
'offsets_start'
],
expected_offsets_start
[
count
])
np
.
testing
.
assert_array_equal
(
i
[
'offsets_limit'
],
expected_offsets_limit
[
count
])
count
+=
1
logger
.
info
(
"The out tokens is : {}"
.
format
(
tokens
))
assert
whitespace_strs
==
tokens
def
test_unicode_script_tokenizer_default
():
"""
"""
Test UnicodeScriptTokenizer when para keep_whitespace=False
Test UnicodeScriptTokenizer when para keep_whitespace=False
"""
"""
...
@@ -80,18 +132,18 @@ def test_unicode_script_tokenizer():
...
@@ -80,18 +132,18 @@ def test_unicode_script_tokenizer():
[
"我喜欢"
,
"English"
,
"!"
],
[
"我喜欢"
,
"English"
,
"!"
],
[
""
]]
[
""
]]
dataset
=
ds
.
TextFileDataset
(
DATA_FILE
,
shuffle
=
False
)
dataset
=
ds
.
TextFileDataset
(
DATA_FILE
,
shuffle
=
False
)
tokenizer
=
nlp
.
UnicodeScriptTokenizer
(
keep_whitespace
=
False
)
tokenizer
=
text
.
UnicodeScriptTokenizer
(
keep_whitespace
=
False
)
dataset
=
dataset
.
map
(
operations
=
tokenizer
)
dataset
=
dataset
.
map
(
operations
=
tokenizer
)
tokens
=
[]
tokens
=
[]
for
i
in
dataset
.
create_dict_iterator
():
for
i
in
dataset
.
create_dict_iterator
():
t
ext
=
nlp
.
to_str
(
i
[
'text'
]).
tolist
()
t
oken
=
text
.
to_str
(
i
[
'text'
]).
tolist
()
tokens
.
append
(
t
ext
)
tokens
.
append
(
t
oken
)
logger
.
info
(
"The out tokens is : {}"
.
format
(
tokens
))
logger
.
info
(
"The out tokens is : {}"
.
format
(
tokens
))
assert
unicode_script_strs
==
tokens
assert
unicode_script_strs
==
tokens
def
test_unicode_script_tokenizer2
():
def
test_unicode_script_tokenizer
_default
2
():
"""
"""
Test UnicodeScriptTokenizer when para keep_whitespace=True
Test UnicodeScriptTokenizer when para keep_whitespace=True
"""
"""
...
@@ -100,12 +152,64 @@ def test_unicode_script_tokenizer2():
...
@@ -100,12 +152,64 @@ def test_unicode_script_tokenizer2():
[
"我喜欢"
,
"English"
,
"!"
],
[
"我喜欢"
,
"English"
,
"!"
],
[
" "
]]
[
" "
]]
dataset
=
ds
.
TextFileDataset
(
DATA_FILE
,
shuffle
=
False
)
dataset
=
ds
.
TextFileDataset
(
DATA_FILE
,
shuffle
=
False
)
tokenizer
=
nlp
.
UnicodeScriptTokenizer
(
keep_whitespace
=
True
)
tokenizer
=
text
.
UnicodeScriptTokenizer
(
keep_whitespace
=
True
)
dataset
=
dataset
.
map
(
operations
=
tokenizer
)
dataset
=
dataset
.
map
(
operations
=
tokenizer
)
tokens
=
[]
tokens
=
[]
for
i
in
dataset
.
create_dict_iterator
():
for
i
in
dataset
.
create_dict_iterator
():
text
=
nlp
.
to_str
(
i
[
'text'
]).
tolist
()
token
=
text
.
to_str
(
i
[
'text'
]).
tolist
()
tokens
.
append
(
text
)
tokens
.
append
(
token
)
logger
.
info
(
"The out tokens is :"
,
tokens
)
assert
unicode_script_strs2
==
tokens
def
test_unicode_script_tokenizer_with_offsets
():
"""
Test UnicodeScriptTokenizer when para keep_whitespace=False and with_offsets=True
"""
unicode_script_strs
=
[[
"Welcome"
,
"to"
,
"Beijing"
,
"!"
],
[
"北京欢迎您"
,
"!"
],
[
"我喜欢"
,
"English"
,
"!"
],
[
""
]]
dataset
=
ds
.
TextFileDataset
(
DATA_FILE
,
shuffle
=
False
)
tokenizer
=
text
.
UnicodeScriptTokenizer
(
keep_whitespace
=
False
,
with_offsets
=
True
)
dataset
=
dataset
.
map
(
input_columns
=
[
'text'
],
output_columns
=
[
'token'
,
'offsets_start'
,
'offsets_limit'
],
columns_order
=
[
'token'
,
'offsets_start'
,
'offsets_limit'
],
operations
=
tokenizer
)
tokens
=
[]
expected_offsets_start
=
[[
0
,
8
,
11
,
18
],
[
0
,
15
],
[
0
,
9
,
16
],
[
0
]]
expected_offsets_limit
=
[[
7
,
10
,
18
,
19
],
[
15
,
18
],
[
9
,
16
,
17
],
[
0
]]
count
=
0
for
i
in
dataset
.
create_dict_iterator
():
token
=
text
.
to_str
(
i
[
'token'
]).
tolist
()
tokens
.
append
(
token
)
np
.
testing
.
assert_array_equal
(
i
[
'offsets_start'
],
expected_offsets_start
[
count
])
np
.
testing
.
assert_array_equal
(
i
[
'offsets_limit'
],
expected_offsets_limit
[
count
])
count
+=
1
logger
.
info
(
"The out tokens is : {}"
.
format
(
tokens
))
assert
unicode_script_strs
==
tokens
def
test_unicode_script_tokenizer_with_offsets2
():
"""
Test UnicodeScriptTokenizer when para keep_whitespace=True and with_offsets=True
"""
unicode_script_strs2
=
[[
"Welcome"
,
" "
,
"to"
,
" "
,
"Beijing"
,
"!"
],
[
"北京欢迎您"
,
"!"
],
[
"我喜欢"
,
"English"
,
"!"
],
[
" "
]]
dataset
=
ds
.
TextFileDataset
(
DATA_FILE
,
shuffle
=
False
)
tokenizer
=
text
.
UnicodeScriptTokenizer
(
keep_whitespace
=
True
,
with_offsets
=
True
)
dataset
=
dataset
.
map
(
input_columns
=
[
'text'
],
output_columns
=
[
'token'
,
'offsets_start'
,
'offsets_limit'
],
columns_order
=
[
'token'
,
'offsets_start'
,
'offsets_limit'
],
operations
=
tokenizer
)
tokens
=
[]
expected_offsets_start
=
[[
0
,
7
,
8
,
10
,
11
,
18
],
[
0
,
15
],
[
0
,
9
,
16
],
[
0
]]
expected_offsets_limit
=
[[
7
,
8
,
10
,
11
,
18
,
19
],
[
15
,
18
],
[
9
,
16
,
17
],
[
2
]]
count
=
0
for
i
in
dataset
.
create_dict_iterator
():
token
=
text
.
to_str
(
i
[
'token'
]).
tolist
()
tokens
.
append
(
token
)
np
.
testing
.
assert_array_equal
(
i
[
'offsets_start'
],
expected_offsets_start
[
count
])
np
.
testing
.
assert_array_equal
(
i
[
'offsets_limit'
],
expected_offsets_limit
[
count
])
count
+=
1
logger
.
info
(
"The out tokens is :"
,
tokens
)
logger
.
info
(
"The out tokens is :"
,
tokens
)
assert
unicode_script_strs2
==
tokens
assert
unicode_script_strs2
==
tokens
...
@@ -116,13 +220,13 @@ def test_case_fold():
...
@@ -116,13 +220,13 @@ def test_case_fold():
"""
"""
expect_strs
=
[
"welcome to beijing!"
,
"北京欢迎您!"
,
"我喜欢english!"
,
" "
]
expect_strs
=
[
"welcome to beijing!"
,
"北京欢迎您!"
,
"我喜欢english!"
,
" "
]
dataset
=
ds
.
TextFileDataset
(
DATA_FILE
,
shuffle
=
False
)
dataset
=
ds
.
TextFileDataset
(
DATA_FILE
,
shuffle
=
False
)
op
=
nlp
.
CaseFold
()
op
=
text
.
CaseFold
()
dataset
=
dataset
.
map
(
operations
=
op
)
dataset
=
dataset
.
map
(
operations
=
op
)
lower_strs
=
[]
lower_strs
=
[]
for
i
in
dataset
.
create_dict_iterator
():
for
i
in
dataset
.
create_dict_iterator
():
t
ext
=
nlp
.
to_str
(
i
[
'text'
]).
tolist
()
t
oken
=
text
.
to_str
(
i
[
'text'
]).
tolist
()
lower_strs
.
append
(
t
ext
)
lower_strs
.
append
(
t
oken
)
assert
lower_strs
==
expect_strs
assert
lower_strs
==
expect_strs
...
@@ -133,13 +237,13 @@ def test_normalize_utf8():
...
@@ -133,13 +237,13 @@ def test_normalize_utf8():
def
normalize
(
normalize_form
):
def
normalize
(
normalize_form
):
dataset
=
ds
.
TextFileDataset
(
NORMALIZE_FILE
,
shuffle
=
False
)
dataset
=
ds
.
TextFileDataset
(
NORMALIZE_FILE
,
shuffle
=
False
)
normalize
=
nlp
.
NormalizeUTF8
(
normalize_form
=
normalize_form
)
normalize
=
text
.
NormalizeUTF8
(
normalize_form
=
normalize_form
)
dataset
=
dataset
.
map
(
operations
=
normalize
)
dataset
=
dataset
.
map
(
operations
=
normalize
)
out_bytes
=
[]
out_bytes
=
[]
out_texts
=
[]
out_texts
=
[]
for
i
in
dataset
.
create_dict_iterator
():
for
i
in
dataset
.
create_dict_iterator
():
out_bytes
.
append
(
i
[
'text'
])
out_bytes
.
append
(
i
[
'text'
])
out_texts
.
append
(
nlp
.
to_str
(
i
[
'text'
]).
tolist
())
out_texts
.
append
(
text
.
to_str
(
i
[
'text'
]).
tolist
())
logger
.
info
(
"The out bytes is : "
,
out_bytes
)
logger
.
info
(
"The out bytes is : "
,
out_bytes
)
logger
.
info
(
"The out texts is: "
,
out_texts
)
logger
.
info
(
"The out texts is: "
,
out_texts
)
return
out_bytes
return
out_bytes
...
@@ -158,10 +262,10 @@ def test_normalize_utf8():
...
@@ -158,10 +262,10 @@ def test_normalize_utf8():
[
b
's
\xcc\xa3\xcc\x87
'
,
b
'd
\xcc\xa3\xcc\x87
'
,
b
'q
\xcc\xa3\xcc\x87
'
,
[
b
's
\xcc\xa3\xcc\x87
'
,
b
'd
\xcc\xa3\xcc\x87
'
,
b
'q
\xcc\xa3\xcc\x87
'
,
b
'fi'
,
b
'25'
,
b
's
\xcc\xa3\xcc\x87
'
]
b
'fi'
,
b
'25'
,
b
's
\xcc\xa3\xcc\x87
'
]
]
]
assert
normalize
(
nlp
.
utils
.
NormalizeForm
.
NFC
)
==
expect_normlize_data
[
0
]
assert
normalize
(
text
.
utils
.
NormalizeForm
.
NFC
)
==
expect_normlize_data
[
0
]
assert
normalize
(
nlp
.
utils
.
NormalizeForm
.
NFKC
)
==
expect_normlize_data
[
1
]
assert
normalize
(
text
.
utils
.
NormalizeForm
.
NFKC
)
==
expect_normlize_data
[
1
]
assert
normalize
(
nlp
.
utils
.
NormalizeForm
.
NFD
)
==
expect_normlize_data
[
2
]
assert
normalize
(
text
.
utils
.
NormalizeForm
.
NFD
)
==
expect_normlize_data
[
2
]
assert
normalize
(
nlp
.
utils
.
NormalizeForm
.
NFKD
)
==
expect_normlize_data
[
3
]
assert
normalize
(
text
.
utils
.
NormalizeForm
.
NFKD
)
==
expect_normlize_data
[
3
]
def
test_regex_replace
():
def
test_regex_replace
():
...
@@ -175,12 +279,12 @@ def test_regex_replace():
...
@@ -175,12 +279,12 @@ def test_regex_replace():
dataset
=
dataset
.
skip
(
first
-
1
)
dataset
=
dataset
.
skip
(
first
-
1
)
if
last
>=
first
:
if
last
>=
first
:
dataset
=
dataset
.
take
(
last
-
first
+
1
)
dataset
=
dataset
.
take
(
last
-
first
+
1
)
replace_op
=
nlp
.
RegexReplace
(
pattern
,
replace
)
replace_op
=
text
.
RegexReplace
(
pattern
,
replace
)
dataset
=
dataset
.
map
(
operations
=
replace_op
)
dataset
=
dataset
.
map
(
operations
=
replace_op
)
out_text
=
[]
out_text
=
[]
for
i
in
dataset
.
create_dict_iterator
():
for
i
in
dataset
.
create_dict_iterator
():
t
ext
=
nlp
.
to_str
(
i
[
'text'
]).
tolist
()
t
oken
=
text
.
to_str
(
i
[
'text'
]).
tolist
()
out_text
.
append
(
t
ext
)
out_text
.
append
(
t
oken
)
logger
.
info
(
"Out:"
,
out_text
)
logger
.
info
(
"Out:"
,
out_text
)
logger
.
info
(
"Exp:"
,
expect_str
)
logger
.
info
(
"Exp:"
,
expect_str
)
assert
expect_str
==
out_text
assert
expect_str
==
out_text
...
@@ -191,7 +295,7 @@ def test_regex_replace():
...
@@ -191,7 +295,7 @@ def test_regex_replace():
regex_replace
(
7
,
8
,
[
'我不想长大'
,
'WelcometoShenzhen!'
],
"
\\
p{Cc}|
\\
p{Cf}|
\\
s+"
,
""
)
regex_replace
(
7
,
8
,
[
'我不想长大'
,
'WelcometoShenzhen!'
],
"
\\
p{Cc}|
\\
p{Cf}|
\\
s+"
,
""
)
def
test_regex_tokenizer
():
def
test_regex_tokenizer
_default
():
"""
"""
Test RegexTokenizer
Test RegexTokenizer
"""
"""
...
@@ -202,15 +306,15 @@ def test_regex_tokenizer():
...
@@ -202,15 +306,15 @@ def test_regex_tokenizer():
dataset
=
dataset
.
skip
(
first
-
1
)
dataset
=
dataset
.
skip
(
first
-
1
)
if
last
>=
first
:
if
last
>=
first
:
dataset
=
dataset
.
take
(
last
-
first
+
1
)
dataset
=
dataset
.
take
(
last
-
first
+
1
)
tokenizer_op
=
nlp
.
RegexTokenizer
(
delim_pattern
,
keep_delim_pattern
)
tokenizer_op
=
text
.
RegexTokenizer
(
delim_pattern
,
keep_delim_pattern
)
dataset
=
dataset
.
map
(
operations
=
tokenizer_op
)
dataset
=
dataset
.
map
(
operations
=
tokenizer_op
)
out_text
=
[]
out_text
=
[]
count
=
0
count
=
0
for
i
in
dataset
.
create_dict_iterator
():
for
i
in
dataset
.
create_dict_iterator
():
t
ext
=
nlp
.
to_str
(
i
[
'text'
]).
tolist
()
t
oken
=
text
.
to_str
(
i
[
'text'
]).
tolist
()
np
.
testing
.
assert_array_equal
(
t
ext
,
expect_str
[
count
])
np
.
testing
.
assert_array_equal
(
t
oken
,
expect_str
[
count
])
count
+=
1
count
+=
1
out_text
.
append
(
t
ext
)
out_text
.
append
(
t
oken
)
logger
.
info
(
"Out:"
,
out_text
)
logger
.
info
(
"Out:"
,
out_text
)
logger
.
info
(
"Exp:"
,
expect_str
)
logger
.
info
(
"Exp:"
,
expect_str
)
...
@@ -222,12 +326,55 @@ def test_regex_tokenizer():
...
@@ -222,12 +326,55 @@ def test_regex_tokenizer():
regex_tokenizer
(
3
,
3
,
[[
'¥+'
,
'¥=?'
]],
r
"[\p{N}]+"
,
""
)
regex_tokenizer
(
3
,
3
,
[[
'¥+'
,
'¥=?'
]],
r
"[\p{N}]+"
,
""
)
+def test_regex_tokenizer_with_offsets():
+    """
+    Test RegexTokenizer
+    """
+    def regex_tokenizer(first, last, expect_str, expected_offsets_start, expected_offsets_limit,
+                        delim_pattern, keep_delim_pattern):
+        dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
+        if first > 1:
+            dataset = dataset.skip(first - 1)
+        if last >= first:
+            dataset = dataset.take(last - first + 1)
+        tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True)
+        dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
+                              columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer_op)
+        out_text = []
+        count = 0
+        for i in dataset.create_dict_iterator():
+            token = text.to_str(i['token']).tolist()
+            np.testing.assert_array_equal(token, expect_str[count])
+            np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
+            np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
+            count += 1
+            out_text.append(token)
+        logger.info("Out:", out_text)
+        logger.info("Exp:", expect_str)
+
+    regex_tokenizer(1, 1, [['Welcome', 'to', 'Shenzhen!']], [[0, 8, 11]], [[7, 10, 20]], "\\s+", "")
+    regex_tokenizer(1, 1, [['Welcome', ' ', 'to', ' ', 'Shenzhen!']], [[0, 7, 8, 10, 11]], [[7, 8, 10, 11, 20]],
+                    "\\s+", "\\s+")
+    regex_tokenizer(2, 2, [['北', '京', '欢', '迎', '您', '!Welcome to Beijing!']], [[0, 3, 6, 9, 12, 15]],
+                    [[3, 6, 9, 12, 15, 35]], r"\p{Han}", r"\p{Han}")
+    regex_tokenizer(3, 3, [['12', '¥+', '36', '¥=?']], [[0, 2, 6, 8]], [[2, 6, 8, 13]],
+                    r"[\p{P}|\p{S}]+", r"[\p{P}|\p{S}]+")
+    regex_tokenizer(3, 3, [['12', '36']], [[0, 6]], [[2, 8]], r"[\p{P}|\p{S}]+", "")
+    regex_tokenizer(3, 3, [['¥+', '¥=?']], [[2, 8]], [[6, 13]], r"[\p{N}]+", "")
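
The offsets_start and offsets_limit columns checked above are byte positions into the UTF-8 encoded input line, so multi-byte characters advance the offsets by more than one. A small pure-Python sketch (the helper name is made up for illustration) reproduces the expected values of the first case:

def utf8_offsets(line, tokens):
    """Return (starts, limits) byte offsets of each token inside line."""
    data = line.encode('utf-8')
    starts, limits, pos = [], [], 0
    for tok in tokens:
        piece = tok.encode('utf-8')
        pos = data.index(piece, pos)
        starts.append(pos)
        limits.append(pos + len(piece))
        pos += len(piece)
    return starts, limits

# Matches the expected [[0, 8, 11]] and [[7, 10, 20]] above.
print(utf8_offsets('Welcome to Shenzhen!', ['Welcome', 'to', 'Shenzhen!']))
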
if __name__ == '__main__':
-    test_unicode_char_tokenizer()
-    test_whitespace_tokenizer()
-    test_unicode_script_tokenizer()
-    test_unicode_script_tokenizer2()
+    test_unicode_char_tokenizer_default()
+    test_unicode_char_tokenizer_with_offsets()
+    test_whitespace_tokenizer_default()
+    test_whitespace_tokenizer_with_offsets()
+    test_unicode_script_tokenizer_default()
+    test_unicode_script_tokenizer_default2()
+    test_unicode_script_tokenizer_with_offsets()
+    test_unicode_script_tokenizer_with_offsets2()
    test_case_fold()
    test_normalize_utf8()
    test_regex_replace()
-    test_regex_tokenizer()
+    test_regex_tokenizer_default()
+    test_regex_tokenizer_with_offsets()
tests/ut/python/dataset/test_wordpiece_tokenizer.py → tests/ut/python/dataset/test_text_wordpiece_tokenizer.py
...
@@ -18,7 +18,7 @@ Testing WordpieceTokenizer op in DE
import numpy as np
import mindspore.dataset as ds
from mindspore import log as logger
-import mindspore.dataset.text as nlp
+import mindspore.dataset.text as text

WORDPIECE_TOKENIZER_FILE = "../data/dataset/testTokenizerData/wordpiece_tokenizer.txt"
...
@@ -38,6 +38,8 @@ test_paras = [
        last=10,
        expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'],
                    ['era'], ['[UNK]']],
+        expected_offsets_start=[[0], [0, 5], [0], [0], [0], [0, 3], [0], [0], [0], [0]],
+        expected_offsets_limit=[[2], [5, 8], [4], [2], [4], [3, 6], [3], [7], [3], [4]],
        vocab_list=vocab_english
    ),
    dict(
...
@@ -45,6 +47,8 @@ test_paras = [
        last=10,
        expect_str=[['my'], ['favor', '##ite'], ['book'], ['is'], ['love'], ['dur', '##ing'], ['the'], ['cholera'],
                    ['era'], ['what']],
+        expected_offsets_start=[[0], [0, 5], [0], [0], [0], [0, 3], [0], [0], [0], [0]],
+        expected_offsets_limit=[[2], [5, 8], [4], [2], [4], [3, 6], [3], [7], [3], [4]],
        vocab_list=vocab_english,
        unknown_token=""
    ),
...
@@ -52,6 +56,8 @@ test_paras = [
        first=1,
        last=10,
        expect_str=[['my'], ['[UNK]'], ['book'], ['is'], ['love'], ['[UNK]'], ['the'], ['[UNK]'], ['era'], ['[UNK]']],
+        expected_offsets_start=[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
+        expected_offsets_limit=[[2], [5], [4], [2], [4], [5], [3], [5], [3], [4]],
        vocab_list=vocab_english,
        max_bytes_per_token=4
    ),
...
@@ -60,12 +66,16 @@ test_paras = [
        last=25,
        expect_str=[['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'],
                    ['[UNK]']],
+        expected_offsets_start=[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
+        expected_offsets_limit=[[3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3]],
        vocab_list=vocab_chinese,
    ),
    dict(
        first=25,
        last=25,
        expect_str=[['您']],
+        expected_offsets_start=[[0]],
+        expected_offsets_limit=[[3]],
        vocab_list=vocab_chinese,
        unknown_token=""
    ),
...
@@ -77,37 +87,74 @@ test_paras = [
                    ['[UNK]'],
                    ['我'], ['最'], ['喜'], ['欢'], ['的'], ['书'], ['是'], ['霍'], ['乱'], ['时'], ['期'], ['的'], ['爱'], ['情'],
                    ['[UNK]']],
+        expected_offsets_start=[[0], [0, 5], [0], [0], [0], [0, 3], [0], [0], [0], [0], [0], [0], [0], [0], [0],
+                                [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]],
+        expected_offsets_limit=[[2], [5, 8], [4], [2], [4], [3, 6], [3], [7], [3], [4], [3], [3], [3], [3], [3],
+                                [3], [3], [3], [3], [3], [3], [3], [3], [3], [3]],
        vocab_list=vocab_mix,
    ),
]
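
The offset expectations added to test_paras follow directly from the UTF-8 byte length of each piece: the subwords of 'favorite' give start/limit pairs [0, 5] and [5, 8], and every Chinese character occupies 3 bytes, hence the repeated limit of 3. A quick illustrative check (not part of the test):

# The '##' continuation marker is not counted toward the offsets.
for piece in ('favor', '##ite', '我', '您'):
    raw = piece[2:] if piece.startswith('##') else piece
    print(piece, len(raw.encode('utf-8')))
# favor -> 5, ##ite -> 3, 我 -> 3, 您 -> 3
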
-def check_wordpiece_tokenizer(first, last, expect_str, vocab_list, unknown_token='[UNK]', max_bytes_per_token=100):
+def check_wordpiece_tokenizer_default(first, last, expect_str, expected_offsets_start, expected_offsets_limit,
+                                      vocab_list, unknown_token='[UNK]', max_bytes_per_token=100):
    dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
-    vocab = nlp.Vocab.from_list(vocab_list)
+    vocab = text.Vocab.from_list(vocab_list)
-    tokenizer_op = nlp.WordpieceTokenizer(vocab=vocab, unknown_token=unknown_token,
+    tokenizer_op = text.WordpieceTokenizer(vocab=vocab, unknown_token=unknown_token,
                                           max_bytes_per_token=max_bytes_per_token)
    dataset = dataset.map(operations=tokenizer_op)
    count = 0
    for i in dataset.create_dict_iterator():
-        text = nlp.to_str(i['text'])
+        token = text.to_str(i['text'])
-        logger.info("Out:", text)
+        logger.info("Out:", token)
        logger.info("Exp:", expect_str[count])
-        np.testing.assert_array_equal(text, expect_str[count])
+        np.testing.assert_array_equal(token, expect_str[count])
        count = count + 1


-def test_wordpiece_tokenizer():
+def check_wordpiece_tokenizer_with_offsets(first, last, expect_str, expected_offsets_start, expected_offsets_limit,
+                                           vocab_list, unknown_token='[UNK]', max_bytes_per_token=100):
+    dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
+    if first > 1:
+        dataset = dataset.skip(first - 1)
+    if last >= first:
+        dataset = dataset.take(last - first + 1)
+    vocab = text.Vocab.from_list(vocab_list)
+    tokenizer_op = text.WordpieceTokenizer(vocab=vocab, with_offsets=True, unknown_token=unknown_token,
+                                           max_bytes_per_token=max_bytes_per_token)
+    dataset = dataset.map(input_columns=['text'], output_columns=['token', 'offsets_start', 'offsets_limit'],
+                          columns_order=['token', 'offsets_start', 'offsets_limit'], operations=tokenizer_op)
+    count = 0
+    for i in dataset.create_dict_iterator():
+        token = text.to_str(i['token'])
+        logger.info("Out:", token)
+        logger.info("Exp:", expect_str[count])
+        np.testing.assert_array_equal(token, expect_str[count])
+        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
+        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
+        count = count + 1
+
+
+def test_wordpiece_tokenizer_default():
+    """
+    Test WordpieceTokenizer
+    """
+    for paras in test_paras:
+        check_wordpiece_tokenizer_default(**paras)
+
+
+def test_wordpiece_tokenizer_with_offsets():
    """
    Test WordpieceTokenizer
    """
    for paras in test_paras:
-        check_wordpiece_tokenizer(**paras)
+        check_wordpiece_tokenizer_with_offsets(**paras)


if __name__ == '__main__':
-    test_wordpiece_tokenizer()
+    test_wordpiece_tokenizer_default()
+    test_wordpiece_tokenizer_with_offsets()
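
All of the *_with_offsets tests introduced by this commit follow the same pattern shown in the diffs above: build the tokenizer with with_offsets=True and map the single 'text' column onto three output columns. A condensed outline of that usage (WORDPIECE_TOKENIZER_FILE and vocab_english refer to the definitions in the test file; this is a sketch, not an additional test):

import mindspore.dataset as ds
import mindspore.dataset.text as text

vocab = text.Vocab.from_list(vocab_english)
tokenizer_op = text.WordpieceTokenizer(vocab=vocab, with_offsets=True,
                                       unknown_token='[UNK]', max_bytes_per_token=100)
dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
dataset = dataset.map(input_columns=['text'],
                      output_columns=['token', 'offsets_start', 'offsets_limit'],
                      columns_order=['token', 'offsets_start', 'offsets_limit'],
                      operations=tokenizer_op)
for row in dataset.create_dict_iterator():
    # Each row now carries the tokens plus their byte offsets into the input line.
    print(text.to_str(row['token']), row['offsets_start'], row['offsets_limit'])
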