Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
6b52ec51
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
6b52ec51
编写于
2月 28, 2017
作者:
W
wangkuiyi
提交者:
GitHub
2月 28, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1490 from wangkuiyi/imdb-dataset
Add IMDB dataset without need of NLTK
上级
c6bfb712
eb1e34d2
变更
3
显示空白变更内容
内联
并排
Showing 3 changed files with 170 additions and 0 deletions
+170
-0
python/paddle/v2/dataset/common.py
python/paddle/v2/dataset/common.py
+7
-0
python/paddle/v2/dataset/imdb.py
python/paddle/v2/dataset/imdb.py
+120
-0
python/paddle/v2/dataset/tests/imdb_test.py
python/paddle/v2/dataset/tests/imdb_test.py
+43
-0
未找到文件。
python/paddle/v2/dataset/common.py
浏览文件 @
6b52ec51
...
...
@@ -32,3 +32,10 @@ def download(url, module_name, md5sum):
shutil
.
copyfileobj
(
r
.
raw
,
f
)
return
filename
def dict_add(a_dict, ele):
    """Increment the occurrence count of *ele* in *a_dict* in place.

    Args:
        a_dict: mutable mapping from element to its integer count.
        ele: the (hashable) element that was seen once more.

    Returns:
        None. *a_dict* is modified in place; a missing key starts at 1.
    """
    # dict.get avoids the separate membership test + indexed update.
    a_dict[ele] = a_dict.get(ele, 0) + 1
python/paddle/v2/dataset/imdb.py
0 → 100644
浏览文件 @
6b52ec51
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
"""
import
paddle.v2.dataset.common
import
tarfile
import
Queue
import
re
import
string
import
threading
# Names exported by `from paddle.v2.dataset.imdb import *`.
__all__ = ['build_dict', 'train', 'test']

# Location of the IMDB tarball and the MD5 checksum that tokenize() passes,
# together with the module name 'imdb', to paddle.v2.dataset.common.download.
URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
# Read files that match pattern. Tokenize and yield each file.
def tokenize(pattern):
    """Yield one token list per archive member whose name matches *pattern*.

    The tarball is the file returned by
    ``paddle.v2.dataset.common.download(URL, 'imdb', MD5)``; its members are
    visited one after another in archive order.

    Args:
        pattern: compiled regular expression (any object with a ``match``
            method) applied to each member's path inside the archive.

    Yields:
        list: the member's content with trailing ``\\n``/``\\r`` stripped,
        every ``string.punctuation`` character deleted, lowercased and
        split on whitespace.
    """
    with tarfile.open(
            paddle.v2.dataset.common.download(URL, 'imdb', MD5)) as tarf:
        # Walk the archive with tarf.next() so members are visited
        # sequentially, in the order they are stored.  Each member is read
        # right after next() returned it, rather than being looked up by
        # name later, which would mean random access into the tarball.
        tf = tarf.next()
        while tf is not None:
            if pattern.match(tf.name):
                # Newline and punctuation removal and ad-hoc tokenization.
                # NOTE: translate(None, chars) is the Python 2 ``str`` API
                # (delete *chars* without a translation table).
                yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
                    None, string.punctuation).lower().split()
            tf = tarf.next()
def build_dict(pattern, cutoff):
    """Build a word -> index dictionary from the documents matching *pattern*.

    Every word of every document yielded by ``tokenize(pattern)`` is counted.
    Only words whose count is strictly greater than *cutoff* are kept; they
    are numbered from 0 by decreasing count, ties broken by ascending word.
    The extra key ``'<unk>'`` always receives the next free index.

    Args:
        pattern: compiled regular expression selecting archive members.
        cutoff: words seen ``cutoff`` times or fewer are dropped.

    Returns:
        dict: maps each kept word to its index, plus ``'<unk>'`` mapped to
        the number of kept words.  If no word survives the cutoff the result
        is ``{'<unk>': 0}``.
    """
    word_freq = {}
    for doc in tokenize(pattern):
        for word in doc:
            paddle.v2.dataset.common.dict_add(word_freq, word)

    # Not sure if we should prune less-frequent words here.
    frequent = [item for item in word_freq.items() if item[1] > cutoff]
    frequent.sort(key=lambda item: (-item[1], item[0]))

    # Collect the words directly instead of unpacking zip(*frequent): the
    # unpacking form raises ValueError when nothing survives the cutoff.
    words = [word for word, _ in frequent]
    word_idx = {word: idx for idx, word in enumerate(words)}
    word_idx['<unk>'] = len(words)
    return word_idx
def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size):
    """Return a generator of ``(word_ids, label)`` samples.

    Two daemon threads tokenize the documents matching *pos_pattern* and
    *neg_pattern* into two bounded queues.  The generator reads the queues
    alternately, starting with the positive one, and once one of them is
    exhausted it drains the other.

    Note that, despite the name, this returns the generator object itself
    (``reader()``), so the result can be iterated only once.  The loader
    threads are started when the first sample is requested.

    Args:
        pos_pattern: compiled regex selecting the documents labelled 0.
        neg_pattern: compiled regex selecting the documents labelled 1.
        word_idx: dict mapping word to integer id; must contain ``'<unk>'``,
            whose id is used for words missing from the dict.
        buffer_size: ``maxsize`` of each of the two queues.

    Returns:
        generator yielding ``(list of word ids, label)`` tuples.

    Raises:
        KeyError: if *word_idx* has no ``'<unk>'`` entry.
    """
    UNK = word_idx['<unk>']

    qs = [Queue.Queue(maxsize=buffer_size), Queue.Queue(maxsize=buffer_size)]

    def load(pattern, queue):
        # Producer: push every matching document, then None to signal the
        # end of the stream to the consumer.
        for doc in tokenize(pattern):
            queue.put(doc)
        queue.put(None)

    def to_sample(doc, label):
        # Map tokens to ids, falling back to the '<unk>' id.
        return [word_idx.get(w, UNK) for w in doc], label

    def reader():
        # Creates two threads that load positive and negative samples
        # into qs.
        for pattern, queue in zip((pos_pattern, neg_pattern), qs):
            worker = threading.Thread(target=load, args=(pattern, queue))
            worker.daemon = True
            worker.start()

        # Read alternately from qs[0] and qs[1].
        i = 0
        doc = qs[i].get()
        while doc is not None:
            yield to_sample(doc, i % 2)
            i += 1
            doc = qs[i % 2].get()

        # The queue just read is exhausted; drain the other one.
        i += 1
        doc = qs[i % 2].get()
        while doc is not None:
            yield to_sample(doc, i % 2)
            doc = qs[i % 2].get()

    return reader()
def train(word_idx):
    """Return a generator over the IMDB training samples.

    Args:
        word_idx: dict mapping word to integer id (must contain ``'<unk>'``),
            e.g. the result of ``build_dict``.

    Returns:
        The generator produced by ``reader_creator``: ``(word_ids, label)``
        tuples, label 0 for ``aclImdb/train/pos`` documents and 1 for
        ``aclImdb/train/neg`` documents.
    """
    # Raw strings keep ``\.`` a regex escape instead of an invalid string
    # escape; 1000 is the per-queue buffer size handed to reader_creator.
    return reader_creator(
        re.compile(r"aclImdb/train/pos/.*\.txt$"),
        re.compile(r"aclImdb/train/neg/.*\.txt$"), word_idx, 1000)
def test(word_idx):
    """Return a generator over the IMDB test samples.

    Args:
        word_idx: dict mapping word to integer id (must contain ``'<unk>'``),
            e.g. the result of ``build_dict``.

    Returns:
        The generator produced by ``reader_creator``: ``(word_ids, label)``
        tuples, label 0 for ``aclImdb/test/pos`` documents and 1 for
        ``aclImdb/test/neg`` documents.
    """
    # Raw strings keep ``\.`` a regex escape instead of an invalid string
    # escape; 1000 is the per-queue buffer size handed to reader_creator.
    return reader_creator(
        re.compile(r"aclImdb/test/pos/.*\.txt$"),
        re.compile(r"aclImdb/test/neg/.*\.txt$"), word_idx, 1000)
python/paddle/v2/dataset/tests/imdb_test.py
0 → 100644
浏览文件 @
6b52ec51
import
paddle.v2.dataset.imdb
import
unittest
import
re
# Member-path patterns inside the IMDB tarball.  Raw strings are used so that
# ``\.`` reaches the regex engine as an escaped dot rather than being an
# invalid string escape sequence.
TRAIN_POS_PATTERN = re.compile(r"aclImdb/train/pos/.*\.txt$")
TRAIN_NEG_PATTERN = re.compile(r"aclImdb/train/neg/.*\.txt$")
# Any .txt file below aclImdb/train/, whatever its sub-directory.
TRAIN_PATTERN = re.compile(r"aclImdb/train/.*\.txt$")

TEST_POS_PATTERN = re.compile(r"aclImdb/test/pos/.*\.txt$")
TEST_NEG_PATTERN = re.compile(r"aclImdb/test/neg/.*\.txt$")
# Any .txt file below aclImdb/test/, whatever its sub-directory.
TEST_PATTERN = re.compile(r"aclImdb/test/.*\.txt$")
class TestIMDB(unittest.TestCase):
    """Checks the IMDB vocabulary size and the train/test sample streams."""

    # Vocabulary built from TRAIN_PATTERN, cached on the class.
    word_idx = None

    @classmethod
    def _get_word_idx(cls):
        """Return the cached vocabulary, building it on first use.

        unittest creates a fresh TestCase instance for every test method, so
        the cache has to be stored on the class; storing it on ``self`` would
        rebuild the dictionary for each test.
        """
        if cls.word_idx is None:
            cls.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
                                                             150)
        return cls.word_idx

    def test_build_dict(self):
        """The dictionary built with cutoff 150 has 7036 entries."""
        self.assertEqual(len(self._get_word_idx()), 7036)

    def check_dataset(self, dataset, expected_size):
        """Check label alternation and the total number of samples.

        Args:
            dataset: callable taking the vocabulary and returning an iterable
                of ``(word_ids, label)`` samples.
            expected_size: number of samples the iterable must produce.
        """
        count = 0
        for sample in dataset(self._get_word_idx()):
            # Samples are expected to alternate between label 0 and label 1.
            self.assertEqual(sample[1], count % 2)
            count += 1
        self.assertEqual(count, expected_size)

    def test_train(self):
        """The training reader yields 25000 samples."""
        self.check_dataset(paddle.v2.dataset.imdb.train, 25000)

    def test_test(self):
        """The test reader yields 25000 samples."""
        self.check_dataset(paddle.v2.dataset.imdb.test, 25000)
# Run the test cases when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录