机器未来 / Paddle (forked from PaddlePaddle / Paddle)

Commit 06915d0a
Authored on Mar 06, 2017 by qiaolongfei
add wmt14_util.py and a small dataset on bos for test
Parent: f6f444ff
Showing 2 changed files with 174 additions and 1 deletion (+174 −1)
python/paddle/v2/dataset/wmt14.py (+2 −1)
python/paddle/v2/dataset/wmt14_util.py (+172 −0)
python/paddle/v2/dataset/wmt14.py
@@ -25,7 +25,8 @@ __all__ = ['train', 'test', 'build_dict']
 URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz'
 MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
-URL_TRAIN = 'http://localhost:8989/wmt14.tgz'
+# this is a small set of data for test. The original data is too large and will be added later.
+URL_TRAIN = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
 MD5_TRAIN = '7373473f86016f1f48037c9c340a2d5b'
 START = "<s>"
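
The change above swaps a localhost placeholder for the shrunken dataset hosted on BOS, keyed by an MD5 checksum. Below is a minimal, self-contained sketch of how such a URL/MD5 pair is typically consumed: download once, verify the checksum, and re-fetch on mismatch. The `fetch` helper and cache directory are hypothetical, not part of this commit; the Python 2 style matches the surrounding code.

import hashlib
import os
import urllib

URL_TRAIN = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
MD5_TRAIN = '7373473f86016f1f48037c9c340a2d5b'


def md5_of(path):
    # stream the file so a large archive is never held in memory at once
    m = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(4096), ''):
            m.update(chunk)
    return m.hexdigest()


def fetch(url, md5sum, cache_dir='/tmp/paddle_dataset'):  # hypothetical helper
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    path = os.path.join(cache_dir, os.path.basename(url))
    # download only when the file is missing or its checksum does not match
    if not os.path.exists(path) or md5_of(path) != md5sum:
        urllib.urlretrieve(url, path)
        assert md5_of(path) == md5sum, 'md5 mismatch for ' + url
    return path


print fetch(URL_TRAIN, MD5_TRAIN)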
python/paddle/v2/dataset/wmt14_util.py (new file, mode 100644)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from paddle.utils.preprocess_util import save_list, DatasetCreater


class SeqToSeqDatasetCreater(DatasetCreater):
    """
    A class to process data for sequence-to-sequence applications.
    """

    def __init__(self, data_path, output_path):
        """
        data_path: the path that stores the train, test and gen data
        output_path: the path to store the processed dataset
        """
        DatasetCreater.__init__(self, data_path)
        self.gen_dir_name = 'gen'
        self.gen_list_name = 'gen.list'
        self.output_path = output_path

    def concat_file(self, file_path, file1, file2, output_path, output):
        """
        Concatenate file1 and file2 into one output file.
        The i-th line of output = i-th line of file1 + '\t' + i-th line of file2
        file_path: the path that stores file1 and file2
        output_path: the path to store the output file
        """
        file1 = os.path.join(file_path, file1)
        file2 = os.path.join(file_path, file2)
        output = os.path.join(output_path, output)
        if not os.path.exists(output):
            os.system('paste ' + file1 + ' ' + file2 + ' > ' + output)

    def cat_file(self, dir_path, suffix, output_path, output):
        """
        Cat all the files in dir_path with the given suffix into one output file.
        dir_path: the base directory that stores the input files
        suffix: suffix of the file names
        output_path: the path to store the output file
        """
        cmd = 'cat '
        file_list = os.listdir(dir_path)
        file_list.sort()
        for file in file_list:
            if file.endswith(suffix):
                cmd += os.path.join(dir_path, file) + ' '
        output = os.path.join(output_path, output)
        if not os.path.exists(output):
            os.system(cmd + '> ' + output)

    def build_dict(self, file_path, dict_path, dict_size=-1):
        """
        Create the dictionary for the file. Note that
        1. Valid characters include all printable characters
        2. There is a distinction between uppercase and lowercase letters
        3. There are 3 special tokens:
           <s>: the start of a sequence
           <e>: the end of a sequence
           <unk>: a word not included in the dictionary
        file_path: the path of the input file
        dict_path: the path to store the dictionary
        dict_size: word count of the dictionary;
                   if -1, the dictionary will contain all the words in the file
        """
        if not os.path.exists(dict_path):
            dictory = dict()
            # count word frequencies over all tab-separated fields
            with open(file_path, "r") as fdata:
                for line in fdata:
                    line = line.split('\t')
                    for line_split in line:
                        words = line_split.strip().split()
                        for word in words:
                            if word not in dictory:
                                dictory[word] = 1
                            else:
                                dictory[word] += 1
            output = open(dict_path, "w+")
            output.write('<s>\n<e>\n<unk>\n')
            count = 3
            # write words in descending order of frequency
            for key, value in sorted(
                    dictory.items(), key=lambda d: d[1], reverse=True):
                output.write(key + "\n")
                count += 1
                if count == dict_size:
                    break
            self.dict_size = count

    def create_dataset(self,
                       dict_size=-1,
                       mergeDict=False,
                       suffixes=['.src', '.trg']):
        """
        Create the seqToseq dataset
        """
        # dataset_list and dir_list have a one-to-one relationship
        train_dataset = os.path.join(self.data_path, self.train_dir_name)
        test_dataset = os.path.join(self.data_path, self.test_dir_name)
        gen_dataset = os.path.join(self.data_path, self.gen_dir_name)
        dataset_list = [train_dataset, test_dataset, gen_dataset]

        train_dir = os.path.join(self.output_path, self.train_dir_name)
        test_dir = os.path.join(self.output_path, self.test_dir_name)
        gen_dir = os.path.join(self.output_path, self.gen_dir_name)
        dir_list = [train_dir, test_dir, gen_dir]

        # create the output directories
        for dir in dir_list:
            if not os.path.exists(dir):
                os.makedirs(dir)

        # check that each dataset is a parallel corpus
        suffix_len = len(suffixes[0])
        for dataset in dataset_list:
            file_list = os.listdir(dataset)
            if len(file_list) % 2 == 1:
                raise RuntimeError("dataset should be parallel corpora")
            file_list.sort()
            for i in range(0, len(file_list), 2):
                if file_list[i][:-suffix_len] != file_list[i + 1][:-suffix_len]:
                    raise RuntimeError(
                        "source and target file name should be equal")

        # cat all the files with the same suffix in each dataset
        for suffix in suffixes:
            for dataset in dataset_list:
                outname = os.path.basename(dataset) + suffix
                self.cat_file(dataset, suffix, dataset, outname)

        # concat the parallel corpora and create file.list
        print 'concat parallel corpora for dataset'
        id = 0
        list = ['train.list', 'test.list', 'gen.list']
        for dataset in dataset_list:
            outname = os.path.basename(dataset)
            self.concat_file(dataset, outname + suffixes[0],
                             outname + suffixes[1], dir_list[id], outname)
            save_list([os.path.join(dir_list[id], outname)],
                      os.path.join(self.output_path, list[id]))
            id += 1

        # build the dictionaries for the train data
        dict = ['src.dict', 'trg.dict']
        dict_path = [
            os.path.join(self.output_path, dict[0]),
            os.path.join(self.output_path, dict[1])
        ]
        if mergeDict:
            outname = os.path.join(train_dir, train_dataset.split('/')[-1])
            print 'build src dictionary for train data'
            self.build_dict(outname, dict_path[0], dict_size)
            print 'build trg dictionary for train data'
            os.system('cp ' + dict_path[0] + ' ' + dict_path[1])
        else:
            outname = os.path.join(train_dataset, self.train_dir_name)
            for id in range(0, 2):
                suffix = suffixes[id]
                print 'build ' + suffix[1:] + ' dictionary for train data'
                self.build_dict(outname + suffix, dict_path[id], dict_size)
        print 'dictionary size is', self.dict_size
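
A hypothetical usage sketch of the new class, for orientation. It assumes DatasetCreater.__init__ (from paddle.utils.preprocess_util) sets self.data_path, self.train_dir_name and self.test_dir_name, which create_dataset relies on, and that the raw data directory holds train/, test/ and gen/ subdirectories of parallel *.src / *.trg files. All paths are illustrative, not from this commit.

from paddle.v2.dataset.wmt14_util import SeqToSeqDatasetCreater

# assumed raw layout (illustrative paths):
#   data/wmt14/train/xxx.src + data/wmt14/train/xxx.trg
#   data/wmt14/test/xxx.src  + data/wmt14/test/xxx.trg
#   data/wmt14/gen/xxx.src   + data/wmt14/gen/xxx.trg
creater = SeqToSeqDatasetCreater('data/wmt14', 'data/pre-wmt14')
# keep the 30000 most frequent words per side; mergeDict=True would
# instead copy the source dictionary to the target side
creater.create_dataset(dict_size=30000, mergeDict=False)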