magicwindyyd / mindspore (forked from MindSpore / mindspore)
Commit 48d3bc46
Authored June 29, 2020 by yuchaojie
Parent: f7610a6c

modify tokenization for transformer model

3 changed files with 116 additions and 136 deletions:
    model_zoo/Transformer/create_data.py                +6    -6
    model_zoo/Transformer/scripts/replace-quote.perl    +15   -0
    model_zoo/Transformer/src/tokenization.py           +95   -130
model_zoo/Transformer/create_data.py
@@ -37,13 +37,13 @@ class SampleInstance():
     def __str__(self):
         s = ""
-        s += "source sos tokens: %s\n" % (" ".join([tokenization.printable_text(x) for x in self.source_sos_tokens]))
+        s += "source sos tokens: %s\n" % (" ".join([tokenization.convert_to_printable(x) for x in self.source_sos_tokens]))
-        s += "source eos tokens: %s\n" % (" ".join([tokenization.printable_text(x) for x in self.source_eos_tokens]))
+        s += "source eos tokens: %s\n" % (" ".join([tokenization.convert_to_printable(x) for x in self.source_eos_tokens]))
-        s += "target sos tokens: %s\n" % (" ".join([tokenization.printable_text(x) for x in self.target_sos_tokens]))
+        s += "target sos tokens: %s\n" % (" ".join([tokenization.convert_to_printable(x) for x in self.target_sos_tokens]))
-        s += "target eos tokens: %s\n" % (" ".join([tokenization.printable_text(x) for x in self.target_eos_tokens]))
+        s += "target eos tokens: %s\n" % (" ".join([tokenization.convert_to_printable(x) for x in self.target_eos_tokens]))
         s += "\n"
         return s

@@ -185,9 +185,9 @@ def main():
         if total_written <= 20:
             logging.info("*** Example ***")
-            logging.info("source tokens: %s", " ".join([tokenization.printable_text(x) for x in instance.source_eos_tokens]))
+            logging.info("source tokens: %s", " ".join([tokenization.convert_to_printable(x) for x in instance.source_eos_tokens]))
-            logging.info("target tokens: %s", " ".join([tokenization.printable_text(x) for x in instance.target_sos_tokens]))
+            logging.info("target tokens: %s", " ".join([tokenization.convert_to_printable(x) for x in instance.target_sos_tokens]))
             for feature_name in features.keys():
                 feature = features[feature_name]
...
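The only change in create_data.py is the switch from tokenization.printable_text to the renamed tokenization.convert_to_printable when logging sample instances. A minimal sketch of the logging pattern used above; the token values and the import path are invented for illustration, only the function name comes from this commit:

# Hypothetical illustration of the logging pattern shown in the hunks above.
# The import path and the token values are assumptions, not part of the commit.
from src import tokenization

tokens = ["<s>", "Hello", b"world", "</s>"]    # a mix of str and bytes tokens
line = " ".join([tokenization.convert_to_printable(x) for x in tokens])
print("source sos tokens: %s" % line)          # bytes are decoded as UTF-8 for printing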
model_zoo/Transformer/scripts/replace-quote.perl
#!/usr/bin/env perl
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
use warnings;
use strict;
...
model_zoo/Transformer/src/tokenization.py
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###############################################################################
# Modified by Huawei Technologies Co., Ltd, May, 2020, with following changes:
# - Remove some unused classes and functions
# - Modify load_vocab, convert_to_unicode, printable_text function
# - Modify BasicTokenizer class
# - Add WhiteSpaceTokenizer class
###############################################################################
# ============================================================================
"""Tokenization utilities."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

+import sys
import collections
import unicodedata
-import six
-def convert_to_unicode(text):
-    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
-    if six.PY3:
-        if isinstance(text, str):
-            return text
-        if isinstance(text, bytes):
-            return text.decode("utf-8", "ignore")
-        raise ValueError("Unsupported string type: %s" % (type(text)))
-    if six.PY2:
-        if isinstance(text, str):
-            return text.decode("utf-8", "ignore")
-        if isinstance(text, unicode):
-            return text
-        raise ValueError("Unsupported string type: %s" % (type(text)))
-    raise ValueError("Not running on Python2 or Python 3?")
+def convert_to_printable(text):
+    """
+    Converts `text` to a printable coding format.
+    """
+    if sys.version_info[0] == 3:
+        if isinstance(text, str):
+            return text
+        if isinstance(text, bytes):
+            return text.decode("utf-8", "ignore")
+        raise ValueError("Only support type `str` or `bytes`, while text type is `%s`" % (type(text)))
+    if sys.version_info[0] == 2:
+        if isinstance(text, str):
+            return text
+        if isinstance(text, unicode):
+            return text.encode("utf-8")
+        raise ValueError("Only support type `str` or `unicode`, while text type is `%s`" % (type(text)))
+    raise ValueError("Only supported when running on Python2 or Python3.")
-def printable_text(text):
-    """Returns text encoded in a way suitable for print or `logging`."""
-    # These functions want `str` for both Python2 and Python3, but in one case
-    # it's a Unicode string and in the other it's a byte string.
-    if six.PY3:
-        if isinstance(text, str):
-            return text
-        if isinstance(text, bytes):
-            return text.decode("utf-8", "ignore")
-        raise ValueError("Unsupported string type: %s" % (type(text)))
-    if six.PY2:
-        if isinstance(text, str):
-            return text
-        if isinstance(text, unicode):
-            return text.encode("utf-8")
-        raise ValueError("Unsupported string type: %s" % (type(text)))
-    raise ValueError("Not running on Python2 or Python 3?")
+def convert_to_unicode(text):
+    """
+    Converts `text` to Unicode format.
+    """
+    if sys.version_info[0] == 3:
+        if isinstance(text, str):
+            return text
+        if isinstance(text, bytes):
+            return text.decode("utf-8", "ignore")
+        raise ValueError("Only support type `str` or `bytes`, while text type is `%s`" % (type(text)))
+    if sys.version_info[0] == 2:
+        if isinstance(text, str):
+            return text.decode("utf-8", "ignore")
+        if isinstance(text, unicode):
+            return text
+        raise ValueError("Only support type `str` or `unicode`, while text type is `%s`" % (type(text)))
+    raise ValueError("Only supported when running on Python2 or Python3.")
-def load_vocab(vocab_file):
-    """Loads a vocabulary file into a dictionary."""
-    vocab = collections.OrderedDict()
-    index = 0
-    with open(vocab_file, "r") as reader:
-        while True:
-            token = convert_to_unicode(reader.readline())
-            if not token:
-                break
-            token = token.strip()
-            vocab[token] = index
-            index += 1
-    return vocab
+def load_vocab_file(vocab_file):
+    """
+    Loads a vocabulary file and turns into a {token:id} dictionary.
+    """
+    vocab_dict = collections.OrderedDict()
+    index = 0
+    with open(vocab_file, "r") as vocab:
+        while True:
+            token = convert_to_unicode(vocab.readline())
+            if not token:
+                break
+            token = token.strip()
+            vocab_dict[token] = index
+            index += 1
+    return vocab_dict
-def convert_by_vocab(vocab, items):
-    """Converts a sequence of [tokens|ids] using the vocab."""
-    output = []
-    for item in items:
-        if item in vocab:
-            output.append(vocab[item])
-        else:
-            output.append(vocab["<unk>"])
-    return output
+def convert_by_vocab_dict(vocab_dict, items):
+    """
+    Converts a sequence of [tokens|ids] according to the vocab dict.
+    """
+    output = []
+    for item in items:
+        if item in vocab_dict:
+            output.append(vocab_dict[item])
+        else:
+            output.append(vocab_dict["<unk>"])
+    return output
-def convert_tokens_to_ids(vocab, tokens):
-    return convert_by_vocab(vocab, tokens)
-
-
-def convert_ids_to_tokens(inv_vocab, ids):
-    return convert_by_vocab(inv_vocab, ids)
-
-
-def whitespace_tokenize(text):
-    """Runs basic whitespace cleaning and splitting on a piece of text."""
-    text = text.strip()
-    if not text:
-        return []
-    tokens = text.split()
-    return tokens
-class WhiteSpaceTokenizer():
-    """Runs end-to-end tokenziation."""
-
-    def __init__(self, vocab_file):
-        self.vocab = load_vocab(vocab_file)
-        self.inv_vocab = {v: k for k, v in self.vocab.items()}
-        self.basic_tokenizer = BasicTokenizer()
-
-    def tokenize(self, text):
-        return self.basic_tokenizer.tokenize(text)
-
-    def convert_tokens_to_ids(self, tokens):
-        return convert_by_vocab(self.vocab, tokens)
-
-    def convert_ids_to_tokens(self, ids):
-        return convert_by_vocab(self.inv_vocab, ids)
-
-
-class BasicTokenizer():
-    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
-
-    def __init__(self):
-        """Constructs a BasicTokenizer."""
-
-    def tokenize(self, text):
-        """Tokenizes a piece of text."""
-        text = convert_to_unicode(text)
-        text = self._clean_text(text)
-        return whitespace_tokenize(text)
-
-    def _clean_text(self, text):
-        """Performs invalid character removal and whitespace cleanup on text."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if cp == 0 or cp == 0xfffd or _is_control(char):
-                continue
-            if _is_whitespace(char):
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-
-def _is_whitespace(char):
-    """Checks whether `chars` is a whitespace character."""
-    # \t, \n, and \r are technically contorl characters but we treat them
-    # as whitespace since they are generally considered as such.
-    if char in (" ", "\t", "\n", "\r"):
-        return True
-    cat = unicodedata.category(char)
-    if cat == "Zs":
-        return True
-    return False
-
-
-def _is_control(char):
-    """Checks whether `chars` is a control character."""
-    # These are technically control characters but we count them as whitespace
-    # characters.
-    if char in ("\t", "\n", "\r"):
-        return False
-    cat = unicodedata.category(char)
-    if cat in ("Cc", "Cf"):
-        return True
-    return False
-
-
-def _is_punctuation(char):
-    """Checks whether `chars` is a punctuation character."""
-    cp = ord(char)
-    # We treat all non-letter/number ASCII as punctuation.
-    # Characters such as "^", "$", and "`" are not in the Unicode
-    # Punctuation class but we treat them as punctuation anyways, for
-    # consistency.
-    if ((33 <= cp <= 47) or (58 <= cp <= 64) or
-            (91 <= cp <= 96) or (123 <= cp <= 126)):
-        return True
-    cat = unicodedata.category(char)
-    if cat.startswith("P"):
-        return True
-    return False
+class WhiteSpaceTokenizer():
+    """
+    Whitespace tokenizer.
+    """
+    def __init__(self, vocab_file):
+        self.vocab_dict = load_vocab_file(vocab_file)
+        self.inv_vocab_dict = {index: token for token, index in self.vocab_dict.items()}
+
+    def _is_whitespace_char(self, char):
+        """
+        Checks if it is a whitespace character(regard "\t", "\n", "\r" as whitespace here).
+        """
+        if char in (" ", "\t", "\n", "\r"):
+            return True
+        uni = unicodedata.category(char)
+        if uni == "Zs":
+            return True
+        return False
+
+    def _is_control_char(self, char):
+        """
+        Checks if it is a control character.
+        """
+        if char in ("\t", "\n", "\r"):
+            return False
+        uni = unicodedata.category(char)
+        if uni in ("Cc", "Cf"):
+            return True
+        return False
+
+    def _clean_text(self, text):
+        """
+        Remove invalid characters and cleanup whitespace.
+        """
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xfffd or self._is_control_char(char):
+                continue
+            if self._is_whitespace_char(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _whitespace_tokenize(self, text):
+        """
+        Clean whitespace and split text into tokens.
+        """
+        text = text.strip()
+        if not text:
+            tokens = []
+        else:
+            tokens = text.split()
+        return tokens
+
+    def tokenize(self, text):
+        """
+        Tokenizes text.
+        """
+        text = convert_to_unicode(text)
+        text = self._clean_text(text)
+        tokens = self._whitespace_tokenize(text)
+        return tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        return convert_by_vocab_dict(self.vocab_dict, tokens)
+
+    def convert_ids_to_tokens(self, ids):
+        return convert_by_vocab_dict(self.inv_vocab_dict, ids)
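For reference, a minimal usage sketch of the WhiteSpaceTokenizer introduced above. The vocabulary file name, its contents, and the sample sentence are invented for illustration; the class, its methods, and the <unk> fallback come from the code in this commit:

# Hypothetical usage of the new WhiteSpaceTokenizer (vocab file and sentence are made up).
from src.tokenization import WhiteSpaceTokenizer   # assumed import path

# A toy vocabulary: one token per line, ids assigned by line order.
with open("sample_vocab.txt", "w") as f:
    f.write("<unk>\nhello\nworld\n")

tokenizer = WhiteSpaceTokenizer(vocab_file="sample_vocab.txt")

tokens = tokenizer.tokenize("hello   world\tfoo")   # cleans control chars, splits on whitespace
print(tokens)                                        # ['hello', 'world', 'foo']

ids = tokenizer.convert_tokens_to_ids(tokens)        # out-of-vocabulary tokens map to <unk>
print(ids)                                           # [1, 2, 0]

print(tokenizer.convert_ids_to_tokens(ids))          # ['hello', 'world', '<unk>']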