magicwindyyd / mindspore (forked from MindSpore / mindspore)

Commit 1ea38eb6
Authored on June 22, 2020 by mindspore-ci-bot; committed via Gitee on June 22, 2020.

!2375 Add Python Tokenizer

Merge pull request !2375 from h.farahat/python_tokenizer

Parents: 72194b23, e981c67a

Showing 4 changed files with 102 additions and 4 deletions (+102, -4):
mindspore/dataset/text/__init__.py                   +3   -2
mindspore/dataset/text/transforms.py                 +25  -2
mindspore/dataset/text/validators.py                 +22  -0
tests/ut/python/dataset/test_python_tokenizer.py     +52  -0
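
For orientation, here is a sketch of how the new transform is used from the Python API, adapted from the docstring and unit test added in this commit. The corpus path is a placeholder, and on Windows the class is not re-exported from mindspore.dataset.text (see the __init__.py guard below):

    import mindspore.dataset as ds
    import mindspore.dataset.text as text

    def my_tokenizer(line):
        # user-defined tokenization: whitespace split, with a placeholder for empty lines
        return line.split() or [""]

    # placeholder path: any plain-text file with one sentence per line
    DATA_FILE = "path/to/corpus.txt"

    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=text.PythonTokenizer(my_tokenizer))
    for row in dataset.create_dict_iterator():
        print(text.to_str(row['text']).tolist())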
mindspore/dataset/text/__init__.py

@@ -22,12 +22,13 @@ from .utils import to_str, to_bytes, JiebaMode, Vocab, NormalizeForm

 __all__ = [
     "Lookup", "JiebaTokenizer", "UnicodeCharTokenizer", "Ngram",
-    "to_str", "to_bytes", "JiebaMode", "Vocab", "WordpieceTokenizer", "TruncateSequencePair", "ToNumber"
+    "to_str", "to_bytes", "JiebaMode", "Vocab", "WordpieceTokenizer", "TruncateSequencePair", "ToNumber",
+    "PythonTokenizer"
 ]

 if platform.system().lower() != 'windows':
     from .transforms import UnicodeScriptTokenizer, WhitespaceTokenizer, CaseFold, NormalizeUTF8, \
-        RegexReplace, RegexTokenizer, BasicTokenizer, BertTokenizer
+        RegexReplace, RegexTokenizer, BasicTokenizer, BertTokenizer, PythonTokenizer
     __all__.append(["UnicodeScriptTokenizer", "WhitespaceTokenizer", "CaseFold", "NormalizeUTF8",
                     "RegexReplace", "RegexTokenizer", "BasicTokenizer", "BertTokenizer", "NormalizeForm"])
mindspore/dataset/text/transforms.py

@@ -18,13 +18,14 @@ c transforms for all text related operators

 import os
 import re
 import platform
+import numpy as np

 import mindspore._c_dataengine as cde

-from .utils import JiebaMode, NormalizeForm
+from .utils import JiebaMode, NormalizeForm, to_str
 from .validators import check_lookup, check_jieba_add_dict, \
     check_jieba_add_word, check_jieba_init, check_ngram, check_pair_truncate, \
-    check_to_number
+    check_to_number, check_python_tokenizer
 from ..core.datatypes import mstype_to_detype

@@ -418,3 +419,25 @@ class ToNumber(cde.ToNumberOp):
         data_type = mstype_to_detype(data_type)
         self.data_type = str(data_type)
         super().__init__(data_type)
+
+
+class PythonTokenizer:
+    """
+    Callable class to be used for user-defined string tokenizer.
+
+    Args:
+        tokenizer (Callable): Python function that takes a `str` and returns a list of `str` as tokens.
+
+    Examples:
+        >>> def my_tokenizer(line):
+        >>>     return line.split()
+        >>> data = data.map(operations=PythonTokenizer(my_tokenizer))
+    """
+
+    @check_python_tokenizer
+    def __init__(self, tokenizer):
+        self.tokenizer = np.vectorize(lambda x: np.array(tokenizer(x), dtype='U'), signature='()->(n)')
+
+    def __call__(self, in_array):
+        in_array = to_str(in_array)
+        tokens = self.tokenizer(in_array)
+        return tokens
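
The core of the new transform is the np.vectorize call above. A plain-NumPy sketch (names chosen for illustration, not taken from the commit) shows how signature='()->(n)' maps a scalar-to-token-list function across an array of strings, with the caveat that every element in one call must yield the same number of tokens n:

    import numpy as np

    def split_words(line):
        # toy whitespace tokenizer; returns at least one token
        return line.split() or [""]

    # signature='()->(n)' means: consume one scalar element, emit a 1-D vector.
    # NumPy applies the function element-wise and stacks the per-element vectors,
    # so within a single call every element must produce the same length n.
    vectorized = np.vectorize(lambda x: np.array(split_words(x), dtype='U'),
                              signature='()->(n)')

    print(vectorized(np.array(["Welcome to Beijing!", "hello wide world"])))
    # [['Welcome' 'to' 'Beijing!']
    #  ['hello' 'wide' 'world']]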
mindspore/dataset/text/validators.py

@@ -411,3 +411,25 @@ def check_to_number(method):
             return method(self, **kwargs)

         return new_method
+
+
+def check_python_tokenizer(method):
+    """A wrapper that wraps a parameter check to the original function (PythonTokenizer)."""
+
+    @wraps(method)
+    def new_method(self, *args, **kwargs):
+        tokenizer = (list(args) + [None])[0]
+        if "tokenizer" in kwargs:
+            tokenizer = kwargs.get("tokenizer")
+
+        if tokenizer is None:
+            raise ValueError("tokenizer is a mandatory parameter.")
+
+        if not callable(tokenizer):
+            raise TypeError("tokenizer is not a callable python function")
+
+        kwargs["tokenizer"] = tokenizer
+
+        return method(self, **kwargs)
+
+    return new_method
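
The validator normalizes a positional tokenizer argument into kwargs and rejects missing or non-callable values. A quick spot-check sketch of that behaviour through the public class (assuming a non-Windows install where PythonTokenizer is exported):

    import mindspore.dataset.text as text

    try:
        text.PythonTokenizer(None)               # no usable tokenizer supplied
    except ValueError as e:
        print(e)                                 # tokenizer is a mandatory parameter.

    try:
        text.PythonTokenizer("not a function")   # value given but not callable
    except TypeError as e:
        print(e)                                 # tokenizer is not a callable python function

    text.PythonTokenizer(str.split)              # any callable is accepted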
tests/ut/python/dataset/test_python_tokenizer.py (new file, mode 100644)

# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing PythonTokenizer op in DE
"""
import mindspore.dataset as ds
import mindspore.dataset.text as text
from mindspore import log as logger

DATA_FILE = "../data/dataset/testTokenizerData/1.txt"


def test_whitespace_tokenizer_ch():
    """
    Test PythonTokenizer
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"],
                       ["北京欢迎您!"],
                       ["我喜欢English!"],
                       [""]]

    def my_tokenizer(line):
        words = line.split()
        if not words:
            return [""]
        return words

    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.PythonTokenizer(my_tokenizer)
    dataset = dataset.map(operations=tokenizer, num_parallel_workers=1)
    tokens = []
    for i in dataset.create_dict_iterator():
        s = text.to_str(i['text']).tolist()
        tokens.append(s)
    logger.info("The out tokens is : {}".format(tokens))
    assert whitespace_strs == tokens


if __name__ == '__main__':
    test_whitespace_tokenizer_ch()
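
Outside a dataset pipeline, the tokenizer object is a plain callable. A rough sketch (not part of the commit, and assuming text.to_str accepts a NumPy bytes array, as the __call__ implementation above relies on) of exercising it directly:

    import numpy as np
    import mindspore.dataset.text as text

    tok = text.PythonTokenizer(lambda line: line.split() or [""])

    # dataset.map() hands __call__ a NumPy array of UTF-8 bytes, which
    # __call__ decodes via to_str before applying the user function
    print(tok(np.array("Welcome to Beijing!".encode())))
    # expected: ['Welcome' 'to' 'Beijing!']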