机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Commit f6f444ff
Authored on Mar 05, 2017 by qiaolongfei

optimize code

Parent: 98522dcb
Showing 2 changed files with 2 additions and 160 deletions (+2, -160)
demo/seqToseq/api_train_v2.py    +1  -1
demo/seqToseq/preprocess.py      +1  -159
demo/seqToseq/api_train_v2.py

@@ -115,7 +115,7 @@ def main():
         'target_language_word': 1,
         'target_language_next_word': 2
     }
-    wmt14_reader = paddle.reader.batched(
+    wmt14_reader = paddle.batch(
         paddle.reader.shuffle(
             paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192),
         batch_size=5)
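The only change in this file is a rename of the batching helper, from paddle.reader.batched to paddle.batch; the composed reader itself is untouched. As a minimal sketch of what this reader composition does (assuming PaddlePaddle v2's convention that a reader is a zero-argument callable returning an iterator of samples; the shuffle and batch functions below are illustrative stand-ins, not Paddle's implementations):

    import random

    def shuffle(reader, buf_size):
        # Buffer up to buf_size samples, shuffle the buffer, then drain it.
        def shuffled():
            buf = []
            for sample in reader():
                buf.append(sample)
                if len(buf) >= buf_size:
                    random.shuffle(buf)
                    for s in buf:
                        yield s
                    buf = []
            random.shuffle(buf)
            for s in buf:
                yield s
        return shuffled

    def batch(reader, batch_size):
        # Group consecutive samples into lists of batch_size items.
        def batched():
            b = []
            for sample in reader():
                b.append(sample)
                if len(b) == batch_size:
                    yield b
                    b = []
            if b:  # final partial batch
                yield b
        return batched

With these stand-ins, batch(shuffle(train_reader, 8192), 5)() would yield lists of five shuffled samples, matching the call shape in the hunk above.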
demo/seqToseq/preprocess.py

@@ -23,167 +23,9 @@ Options:
     -m --mergeDict    merge source and target dictionary
 """
 import os
 import sys
-import string
 from optparse import OptionParser
-from paddle.utils.preprocess_util import save_list, DatasetCreater
-
-
-class SeqToSeqDatasetCreater(DatasetCreater):
-    """
-    A class to process data for sequence to sequence application.
-    """
-
-    def __init__(self, data_path, output_path):
-        """
-        data_path: the path to store the train data, test data and gen data
-        output_path: the path to store the processed dataset
-        """
-        DatasetCreater.__init__(self, data_path)
-        self.gen_dir_name = 'gen'
-        self.gen_list_name = 'gen.list'
-        self.output_path = output_path
-
-    def concat_file(self, file_path, file1, file2, output_path, output):
-        """
-        Concat file1 and file2 to be one output file
-        The i-th line of output = i-th line of file1 + '\t' + i-th line of file2
-        file_path: the path to store file1 and file2
-        output_path: the path to store output file
-        """
-        file1 = os.path.join(file_path, file1)
-        file2 = os.path.join(file_path, file2)
-        output = os.path.join(output_path, output)
-        if not os.path.exists(output):
-            os.system('paste ' + file1 + ' ' + file2 + ' > ' + output)
-
-    def cat_file(self, dir_path, suffix, output_path, output):
-        """
-        Cat all the files in dir_path with suffix to be one output file
-        dir_path: the base directory to store input file
-        suffix: suffix of file name
-        output_path: the path to store output file
-        """
-        cmd = 'cat '
-        file_list = os.listdir(dir_path)
-        file_list.sort()
-        for file in file_list:
-            if file.endswith(suffix):
-                cmd += os.path.join(dir_path, file) + ' '
-        output = os.path.join(output_path, output)
-        if not os.path.exists(output):
-            os.system(cmd + '> ' + output)
-
-    def build_dict(self, file_path, dict_path, dict_size=-1):
-        """
-        Create the dictionary for the file, Note that
-        1. Valid characters include all printable characters
-        2. There is distinction between uppercase and lowercase letters
-        3. There is 3 special token:
-           <s>: the start of a sequence
-           <e>: the end of a sequence
-           <unk>: a word not included in dictionary
-        file_path: the path to store file
-        dict_path: the path to store dictionary
-        dict_size: word count of dictionary
-                   if is -1, dictionary will contains all the words in file
-        """
-        if not os.path.exists(dict_path):
-            dictory = dict()
-            with open(file_path, "r") as fdata:
-                for line in fdata:
-                    line = line.split('\t')
-                    for line_split in line:
-                        words = line_split.strip().split()
-                        for word in words:
-                            if word not in dictory:
-                                dictory[word] = 1
-                            else:
-                                dictory[word] += 1
-            output = open(dict_path, "w+")
-            output.write('<s>\n<e>\n<unk>\n')
-            count = 3
-            for key, value in sorted(
-                    dictory.items(), key=lambda d: d[1], reverse=True):
-                output.write(key + "\n")
-                count += 1
-                if count == dict_size:
-                    break
-            self.dict_size = count
-
-    def create_dataset(self,
-                       dict_size=-1,
-                       mergeDict=False,
-                       suffixes=['.src', '.trg']):
-        """
-        Create seqToseq dataset
-        """
-        # dataset_list and dir_list has one-to-one relationship
-        train_dataset = os.path.join(self.data_path, self.train_dir_name)
-        test_dataset = os.path.join(self.data_path, self.test_dir_name)
-        gen_dataset = os.path.join(self.data_path, self.gen_dir_name)
-        dataset_list = [train_dataset, test_dataset, gen_dataset]
-
-        train_dir = os.path.join(self.output_path, self.train_dir_name)
-        test_dir = os.path.join(self.output_path, self.test_dir_name)
-        gen_dir = os.path.join(self.output_path, self.gen_dir_name)
-        dir_list = [train_dir, test_dir, gen_dir]
-
-        # create directory
-        for dir in dir_list:
-            if not os.path.exists(dir):
-                os.mkdir(dir)
-
-        # checkout dataset should be parallel corpora
-        suffix_len = len(suffixes[0])
-        for dataset in dataset_list:
-            file_list = os.listdir(dataset)
-            if len(file_list) % 2 == 1:
-                raise RuntimeError("dataset should be parallel corpora")
-
-            file_list.sort()
-            for i in range(0, len(file_list), 2):
-                if file_list[i][:-suffix_len] != file_list[i + 1][:-suffix_len]:
-                    raise RuntimeError(
-                        "source and target file name should be equal")
-
-        # cat all the files with the same suffix in dataset
-        for suffix in suffixes:
-            for dataset in dataset_list:
-                outname = os.path.basename(dataset) + suffix
-                self.cat_file(dataset, suffix, dataset, outname)
-
-        # concat parallel corpora and create file.list
-        print 'concat parallel corpora for dataset'
-        id = 0
-        list = ['train.list', 'test.list', 'gen.list']
-        for dataset in dataset_list:
-            outname = os.path.basename(dataset)
-            self.concat_file(dataset, outname + suffixes[0],
-                             outname + suffixes[1], dir_list[id], outname)
-            save_list([os.path.join(dir_list[id], outname)],
-                      os.path.join(self.output_path, list[id]))
-            id += 1
-
-        # build dictionary for train data
-        dict = ['src.dict', 'trg.dict']
-        dict_path = [
-            os.path.join(self.output_path, dict[0]),
-            os.path.join(self.output_path, dict[1])
-        ]
-        if mergeDict:
-            outname = os.path.join(train_dir, train_dataset.split('/')[-1])
-            print 'build src dictionary for train data'
-            self.build_dict(outname, dict_path[0], dict_size)
-            print 'build trg dictionary for train data'
-            os.system('cp ' + dict_path[0] + ' ' + dict_path[1])
-        else:
-            outname = os.path.join(train_dataset, self.train_dir_name)
-            for id in range(0, 2):
-                suffix = suffixes[id]
-                print 'build ' + suffix[1:] + ' dictionary for train data'
-                self.build_dict(outname + suffix, dict_path[id], dict_size)
-        print 'dictionary size is', self.dict_size
+from paddle.v2.dataset.wmt14_util import SeqToSeqDatasetCreater


 def main():
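Only the import changes in this demo: the local SeqToSeqDatasetCreater definition is deleted and the class is pulled in from paddle.v2.dataset.wmt14_util, which must already exist outside this diff. A rough usage sketch under that assumption (the 'data' and 'pre-wmt14' paths and the dict_size value are illustrative placeholders, not taken from this commit):

    from paddle.v2.dataset.wmt14_util import SeqToSeqDatasetCreater

    # 'data' is assumed to contain train/, test/ and gen/ subdirectories of
    # parallel corpus files named alike except for the .src/.trg suffixes.
    creater = SeqToSeqDatasetCreater('data', 'pre-wmt14')

    # Concatenates each corpus pair into one tab-separated file per split,
    # writes train.list/test.list/gen.list, and builds src.dict and trg.dict
    # (each starting with <s>, <e>, <unk>); dict_size=-1 keeps every word,
    # and mergeDict=True copies the source dictionary over the target one.
    creater.create_dataset(dict_size=30000, mergeDict=False)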