magicwindyyd / mindspore (fork of MindSpore / mindspore)

Commit b5dda129
Authored on Jul 14, 2020 by mindspore-ci-bot; committed via Gitee on Jul 14, 2020
!3009 Add a sample script of data processing for fine-tuning BERT on CLUE classification dataset
Merge pull request !3009 from dessyang/master
Parents: eadcb341, 82426851
Showing 1 changed file with 153 additions and 0 deletions (+153, -0)
model_zoo/bert/src/clue_classification_dataset_process.py (new file, mode 100755, +153, -0)
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
sample script of processing CLUE classification dataset using mindspore.dataset.text for fine-tuning bert
"""
import os
import numpy as np
import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.text as text
import mindspore.dataset.transforms.c_transforms as ops
def process_tnews_clue_dataset(data_dir, label_list, bert_vocab_path, data_usage='train', shuffle_dataset=False,
                               max_seq_len=128, batch_size=64):
    """Process TNEWS dataset"""
    ### Loading TNEWS from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    if data_usage == 'train':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "train.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    elif data_usage == 'eval':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "dev.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    else:
        dataset = ds.CLUEDataset(os.path.join(data_dir, "test.json"), task='TNEWS',
                                 usage=data_usage, shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        dataset = dataset.map(input_columns=["id"], output_columns=["id", "label_id"],
                              columns_order=["id", "label_id", "sentence"], operations=ops.Duplicate())
        dataset = dataset.map(input_columns=["label_id"], operations=ops.Fill(0))
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(input_columns="label_desc", output_columns="label_id", operations=label_lookup)
    ### Processing sentence
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    dataset = dataset.map(input_columns=["sentence"], operations=tokenizer)
    dataset = dataset.map(input_columns=["sentence"], operations=ops.Slice(slice(0, max_seq_len)))
    dataset = dataset.map(input_columns=["sentence"],
                          operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')))
    dataset = dataset.map(input_columns=["sentence"], output_columns=["text_ids"], operations=lookup)
    dataset = dataset.map(input_columns=["text_ids"], operations=ops.PadEnd([max_seq_len], 0))
    dataset = dataset.map(input_columns=["text_ids"], output_columns=["text_ids", "mask_ids"],
                          columns_order=["label_id", "text_ids", "mask_ids"], operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["mask_ids"], operations=ops.Mask(ops.Relational.NE, 0, mstype.int32))
    dataset = dataset.map(input_columns=["text_ids"], output_columns=["text_ids", "segment_ids"],
                          columns_order=["label_id", "text_ids", "mask_ids", "segment_ids"],
                          operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["segment_ids"], operations=ops.Fill(0))
    dataset = dataset.batch(batch_size)
    label = []
    text_ids = []
    mask_ids = []
    segment_ids = []
    for data in dataset:
        label.append(data[0])
        text_ids.append(data[1])
        mask_ids.append(data[2])
        segment_ids.append(data[3])
    return label, text_ids, mask_ids, segment_ids
def process_cmnli_clue_dataset(data_dir, label_list, bert_vocab_path, data_usage='train', shuffle_dataset=False,
                               max_seq_len=128, batch_size=64):
    """Process CMNLI dataset"""
    ### Loading CMNLI from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    if data_usage == 'train':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "train.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    elif data_usage == 'eval':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "dev.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    else:
        dataset = ds.CLUEDataset(os.path.join(data_dir, "test.json"), task='CMNLI',
                                 usage=data_usage, shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        dataset = dataset.map(input_columns=["id"], output_columns=["id", "label_id"],
                              columns_order=["id", "label_id", "sentence1", "sentence2"],
                              operations=ops.Duplicate())
        dataset = dataset.map(input_columns=["label_id"], operations=ops.Fill(0))
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(input_columns="label", output_columns="label_id", operations=label_lookup)
    ### Processing sentence pairs
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    ### Tokenizing sentences and truncate sequence pair
    dataset = dataset.map(input_columns=["sentence1"], operations=tokenizer)
    dataset = dataset.map(input_columns=["sentence2"], operations=tokenizer)
    dataset = dataset.map(input_columns=["sentence1", "sentence2"],
                          operations=text.TruncateSequencePair(max_seq_len - 3))
    ### Adding special tokens
    dataset = dataset.map(input_columns=["sentence1"],
                          operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')))
    dataset = dataset.map(input_columns=["sentence2"],
                          operations=ops.Concatenate(append=np.array(["[SEP]"], dtype='S')))
    ### Generating segment_ids
    dataset = dataset.map(input_columns=["sentence1"], output_columns=["sentence1", "type_sentence1"],
                          columns_order=["sentence1", "type_sentence1", "sentence2", "label_id"],
                          operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["sentence2"], output_columns=["sentence2", "type_sentence2"],
                          columns_order=["sentence1", "type_sentence1", "sentence2", "type_sentence2", "label_id"],
                          operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["type_sentence1"], operations=[lookup, ops.Fill(0)])
    dataset = dataset.map(input_columns=["type_sentence2"], operations=[lookup, ops.Fill(1)])
    dataset = dataset.map(input_columns=["type_sentence1", "type_sentence2"], output_columns=["segment_ids"],
                          columns_order=["sentence1", "sentence2", "segment_ids", "label_id"],
                          operations=ops.Concatenate())
    dataset = dataset.map(input_columns=["segment_ids"], operations=ops.PadEnd([max_seq_len], 0))
    ### Generating text_ids
    dataset = dataset.map(input_columns=["sentence1", "sentence2"], output_columns=["text_ids"],
                          columns_order=["text_ids", "segment_ids", "label_id"],
                          operations=ops.Concatenate())
    dataset = dataset.map(input_columns=["text_ids"], operations=lookup)
    dataset = dataset.map(input_columns=["text_ids"], operations=ops.PadEnd([max_seq_len], 0))
    ### Generating mask_ids
    dataset = dataset.map(input_columns=["text_ids"], output_columns=["text_ids", "mask_ids"],
                          columns_order=["label_id", "text_ids", "mask_ids", "segment_ids"],
                          operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["mask_ids"], operations=ops.Mask(ops.Relational.NE, 0, mstype.int32))
    dataset = dataset.batch(batch_size)
    label = []
    text_ids = []
    mask_ids = []
    segment_ids = []
    for data in dataset:
        label.append(data[0])
        text_ids.append(data[1])
        mask_ids.append(data[2])
        segment_ids.append(data[3])
    return label, text_ids, mask_ids, segment_ids
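
For context, below is a minimal usage sketch of the new script. It is not part of commit b5dda129; the data directory, vocabulary path, and the shortened TNEWS label list are illustrative assumptions, since the real label set and file locations depend on how the CLUE data and the pretrained BERT vocabulary were downloaded.

# Illustrative usage only -- not part of commit b5dda129.
# Paths and the label list below are assumptions for demonstration.
from clue_classification_dataset_process import process_tnews_clue_dataset

tnews_labels = ["news_story", "news_culture", "news_entertainment"]  # hypothetical subset of TNEWS label_desc values

label, text_ids, mask_ids, segment_ids = process_tnews_clue_dataset(
    data_dir="/path/to/tnews_public",           # folder containing train.json / dev.json / test.json
    label_list=tnews_labels,
    bert_vocab_path="/path/to/bert/vocab.txt",  # vocabulary file of the pretrained BERT model
    data_usage="eval",
    shuffle_dataset=False,
    max_seq_len=128,
    batch_size=64)

# Each returned list holds one entry per batch; text_ids, mask_ids and segment_ids
# are padded to max_seq_len and can be fed to a BERT fine-tuning loop.
print(len(label))

process_cmnli_clue_dataset is called the same way, but on the sentence-pair CMNLI data and with the CMNLI label list (typically "entailment", "neutral", "contradiction").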