Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
吃玉米的猫
models
提交
bbf4aa7f
M
models
项目概览
吃玉米的猫
/
models
与 Fork 源项目一致
Fork自
PaddlePaddle / models
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
models
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
bbf4aa7f
编写于
1月 07, 2019
作者:
J
JiabinYang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix reader bug
上级
41351679
变更
4
隐藏空白更改
内联
并排
Showing
4 changed files
with
56 additions
and
36 deletions
+56
-36
fluid/PaddleRec/word2vec/infer.py
fluid/PaddleRec/word2vec/infer.py
+19
-5
fluid/PaddleRec/word2vec/preprocess.py
fluid/PaddleRec/word2vec/preprocess.py
+11
-14
fluid/PaddleRec/word2vec/reader.py
fluid/PaddleRec/word2vec/reader.py
+26
-16
fluid/PaddleRec/word2vec/train.py
fluid/PaddleRec/word2vec/train.py
+0
-1
未找到文件。
fluid/PaddleRec/word2vec/infer.py
浏览文件 @
bbf4aa7f
...
...
@@ -131,9 +131,25 @@ def build_small_test_case(emb):
desc5
=
"old - older + deeper = deep"
label5
=
word_to_id
[
"deep"
]
test_cases
=
[
emb1
,
emb2
,
emb3
,
emb4
,
emb5
]
test_case_desc
=
[
desc1
,
desc2
,
desc3
,
desc4
,
desc5
]
test_labels
=
[
label1
,
label2
,
label3
,
label4
,
label5
]
emb6
=
emb
[
word_to_id
[
'boy'
]]
desc6
=
"boy"
label6
=
word_to_id
[
"boy"
]
emb7
=
emb
[
word_to_id
[
'king'
]]
desc7
=
"king"
label7
=
word_to_id
[
"king"
]
emb8
=
emb
[
word_to_id
[
'sun'
]]
desc8
=
"sun"
label8
=
word_to_id
[
"sun"
]
emb9
=
emb
[
word_to_id
[
'key'
]]
desc9
=
"key"
label9
=
word_to_id
[
"key"
]
test_cases
=
[
emb1
,
emb2
,
emb3
,
emb4
,
emb5
,
emb6
,
emb7
,
emb8
,
emb9
]
test_case_desc
=
[
desc1
,
desc2
,
desc3
,
desc4
,
desc5
,
desc6
,
desc7
,
desc8
,
desc9
]
test_labels
=
[
label1
,
label2
,
label3
,
label4
,
label5
,
label6
,
label7
,
label8
,
label9
]
return
norm
(
np
.
array
(
test_cases
)),
test_case_desc
,
test_labels
...
...
@@ -229,8 +245,6 @@ def infer_during_train(args):
while
True
:
time
.
sleep
(
60
)
current_list
=
os
.
listdir
(
args
.
model_output_dir
)
# logger.info("current_list is : {}".format(current_list))
# logger.info("model_file_list is : {}".format(model_file_list))
if
set
(
model_file_list
)
==
set
(
current_list
):
if
solved_new
:
solved_new
=
False
...
...
fluid/PaddleRec/word2vec/preprocess.py
浏览文件 @
bbf4aa7f
...
...
@@ -3,6 +3,7 @@
import
re
import
six
import
argparse
import
io
prog
=
re
.
compile
(
"[^a-z ]"
,
flags
=
0
)
word_count
=
dict
()
...
...
@@ -83,7 +84,6 @@ def native_to_unicode(s):
return
_to_unicode
(
s
)
except
UnicodeDecodeError
:
res
=
_to_unicode
(
s
,
ignore_errors
=
True
)
tf
.
logging
.
info
(
"Ignoring Unicode error, outputting: %s"
%
res
)
return
res
...
...
@@ -199,14 +199,15 @@ def preprocess(args):
# word to count
if
args
.
with_other_dict
:
with
open
(
args
.
other_dict_path
,
'r
'
)
as
f
:
with
io
.
open
(
args
.
other_dict_path
,
'r'
,
encoding
=
'utf-8
'
)
as
f
:
for
line
in
f
:
word_count
[
native_to_unicode
(
line
.
strip
())]
=
1
if
args
.
is_local
:
for
i
in
range
(
1
,
100
):
with
open
(
args
.
data_path
+
"/news.en-000{:0>2d}-of-00100"
.
format
(
i
))
as
f
:
with
io
.
open
(
args
.
data_path
+
"/news.en-000{:0>2d}-of-00100"
.
format
(
i
),
encoding
=
'utf-8'
)
as
f
:
for
line
in
f
:
line
=
strip_lines
(
line
)
words
=
line
.
split
()
...
...
@@ -231,21 +232,17 @@ def preprocess(args):
path_table
,
path_code
,
word_code_len
=
build_Huffman
(
word_count
,
40
)
with
open
(
args
.
dict_path
,
'w+
'
)
as
f
:
with
io
.
open
(
args
.
dict_path
,
'w+'
,
encoding
=
'utf-8
'
)
as
f
:
for
k
,
v
in
word_count
.
items
():
f
.
write
(
k
.
encode
(
"utf-8"
)
+
" "
+
str
(
v
).
encode
(
"utf-8"
)
+
'
\n
'
)
f
.
write
(
k
+
" "
+
str
(
v
)
+
'
\n
'
)
with
open
(
args
.
dict_path
+
"_ptable"
,
'w+
'
)
as
f2
:
with
io
.
open
(
args
.
dict_path
+
"_ptable"
,
'w+'
,
encoding
=
'utf-8
'
)
as
f2
:
for
pk
,
pv
in
path_table
.
items
():
f2
.
write
(
pk
.
encode
(
"utf-8"
)
+
'
\t
'
+
' '
.
join
((
str
(
x
).
encode
(
"utf-8"
)
for
x
in
pv
))
+
'
\n
'
)
f2
.
write
(
pk
+
'
\t
'
+
' '
.
join
((
str
(
x
)
for
x
in
pv
))
+
'
\n
'
)
with
open
(
args
.
dict_path
+
"_pcode"
,
'w+
'
)
as
f3
:
with
io
.
open
(
args
.
dict_path
+
"_pcode"
,
'w+'
,
encoding
=
'utf-8
'
)
as
f3
:
for
pck
,
pcv
in
path_code
.
items
():
f3
.
write
(
pck
.
encode
(
"utf-8"
)
+
'
\t
'
+
' '
.
join
((
str
(
x
).
encode
(
"utf-8"
)
for
x
in
pcv
))
+
'
\n
'
)
f3
.
write
(
pck
+
'
\t
'
+
' '
.
join
((
str
(
x
)
for
x
in
pcv
))
+
'
\n
'
)
if
__name__
==
"__main__"
:
...
...
fluid/PaddleRec/word2vec/reader.py
浏览文件 @
bbf4aa7f
...
...
@@ -2,8 +2,8 @@
import
numpy
as
np
import
preprocess
import
logging
import
io
logging
.
basicConfig
(
format
=
'%(asctime)s - %(levelname)s - %(message)s'
)
logger
=
logging
.
getLogger
(
"fluid"
)
...
...
@@ -42,6 +42,7 @@ class Word2VecReader(object):
self
.
num_non_leaf
=
0
self
.
word_to_id_
=
dict
()
self
.
id_to_word
=
dict
()
self
.
word_count
=
dict
()
self
.
word_to_path
=
dict
()
self
.
word_to_code
=
dict
()
self
.
trainer_id
=
trainer_id
...
...
@@ -51,20 +52,19 @@ class Word2VecReader(object):
word_counts
=
[]
word_id
=
0
with
open
(
dict_path
,
'r
'
)
as
f
:
with
io
.
open
(
dict_path
,
'r'
,
encoding
=
'utf-8
'
)
as
f
:
for
line
in
f
:
line
=
line
.
decode
(
encoding
=
'UTF-8'
)
word
,
count
=
line
.
split
()[
0
],
int
(
line
.
split
()[
1
])
self
.
word_count
[
word
]
=
count
self
.
word_to_id_
[
word
]
=
word_id
self
.
id_to_word
[
word_id
]
=
word
#build id to word dict
word_id
+=
1
word_counts
.
append
(
count
)
word_all_count
+=
count
with
open
(
dict_path
+
"_word_to_id_"
,
'w+
'
)
as
f6
:
with
io
.
open
(
dict_path
+
"_word_to_id_"
,
'w+'
,
encoding
=
'utf-8
'
)
as
f6
:
for
k
,
v
in
self
.
word_to_id_
.
items
():
f6
.
write
(
k
.
encode
(
"utf-8"
)
+
" "
+
str
(
v
).
encode
(
"utf-8"
)
+
'
\n
'
)
f6
.
write
(
k
+
" "
+
str
(
v
)
+
'
\n
'
)
self
.
dict_size
=
len
(
self
.
word_to_id_
)
self
.
word_frequencys
=
[
...
...
@@ -73,7 +73,7 @@ class Word2VecReader(object):
print
(
"dict_size = "
+
str
(
self
.
dict_size
))
+
" word_all_count = "
+
str
(
word_all_count
)
with
open
(
dict_path
+
"_ptable"
,
'r
'
)
as
f2
:
with
io
.
open
(
dict_path
+
"_ptable"
,
'r'
,
encoding
=
'utf-8
'
)
as
f2
:
for
line
in
f2
:
self
.
word_to_path
[
line
.
split
(
'
\t
'
)[
0
]]
=
np
.
fromstring
(
line
.
split
(
'
\t
'
)[
1
],
dtype
=
int
,
sep
=
' '
)
...
...
@@ -81,9 +81,8 @@ class Word2VecReader(object):
line
.
split
(
'
\t
'
)[
1
],
dtype
=
int
,
sep
=
' '
)[
0
]
print
(
"word_ptable dict_size = "
+
str
(
len
(
self
.
word_to_path
)))
with
open
(
dict_path
+
"_pcode"
,
'r
'
)
as
f3
:
with
io
.
open
(
dict_path
+
"_pcode"
,
'r'
,
encoding
=
'utf-8
'
)
as
f3
:
for
line
in
f3
:
line
=
line
.
decode
(
encoding
=
'UTF-8'
)
self
.
word_to_code
[
line
.
split
(
'
\t
'
)[
0
]]
=
np
.
fromstring
(
line
.
split
(
'
\t
'
)[
1
],
dtype
=
int
,
sep
=
' '
)
print
(
"word_pcode dict_size = "
+
str
(
len
(
self
.
word_to_code
)))
...
...
@@ -109,13 +108,15 @@ class Word2VecReader(object):
def
train
(
self
,
with_hs
):
def
_reader
():
for
file
in
self
.
filelist
:
with
open
(
self
.
data_path_
+
"/"
+
file
,
'r'
)
as
f
:
with
io
.
open
(
self
.
data_path_
+
"/"
+
file
,
'r'
,
encoding
=
'utf-8'
)
as
f
:
logger
.
info
(
"running data in {}"
.
format
(
self
.
data_path_
+
"/"
+
file
))
count
=
1
for
line
in
f
:
if
self
.
trainer_id
==
count
%
self
.
trainer_num
:
line
=
preprocess
.
strip_lines
(
line
)
line
=
preprocess
.
strip_lines
(
line
,
self
.
word_count
)
word_ids
=
[
self
.
word_to_id_
[
word
]
for
word
in
line
.
split
()
if
word
in
self
.
word_to_id_
...
...
@@ -131,13 +132,15 @@ class Word2VecReader(object):
def
_reader_hs
():
for
file
in
self
.
filelist
:
with
open
(
self
.
data_path_
+
"/"
+
file
,
'r'
)
as
f
:
with
io
.
open
(
self
.
data_path_
+
"/"
+
file
,
'r'
,
encoding
=
'utf-8'
)
as
f
:
logger
.
info
(
"running data in {}"
.
format
(
self
.
data_path_
+
"/"
+
file
))
count
=
1
for
line
in
f
:
if
self
.
trainer_id
==
count
%
self
.
trainer_num
:
line
=
preprocess
.
strip_lines
(
line
)
line
=
preprocess
.
strip_lines
(
line
,
self
.
word_count
)
word_ids
=
[
self
.
word_to_id_
[
word
]
for
word
in
line
.
split
()
if
word
in
self
.
word_to_id_
...
...
@@ -164,13 +167,20 @@ class Word2VecReader(object):
if
__name__
==
"__main__"
:
window_size
=
10
window_size
=
5
reader
=
Word2VecReader
(
"./data/1-billion_dict"
,
"./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/"
,
[
"news.en-00001-of-00100"
],
0
,
1
)
reader
=
Word2VecReader
(
"data/enwik9_dict"
,
"data/enwik9"
,
window_size
)
i
=
0
for
x
,
y
in
reader
.
train
()():
# print(reader.train(True))
for
x
,
y
,
z
,
f
in
reader
.
train
(
True
)():
print
(
"x: "
+
str
(
x
))
print
(
"y: "
+
str
(
y
))
print
(
"path: "
+
str
(
z
))
print
(
"code: "
+
str
(
f
))
print
(
"
\n
"
)
if
i
==
10
:
exit
(
0
)
...
...
fluid/PaddleRec/word2vec/train.py
浏览文件 @
bbf4aa7f
...
...
@@ -135,7 +135,6 @@ def convert_python_to_tensor(batch_size, sample_reader, is_hs):
for
sample
in
sample_reader
():
for
i
,
fea
in
enumerate
(
sample
):
result
[
i
].
append
(
fea
)
if
len
(
result
[
0
])
==
batch_size
:
tensor_result
=
[]
for
tensor
in
result
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录