Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
models
提交
13a5f259
M
models
项目概览
PaddlePaddle
/
models
大约 2 年 前同步成功
通知
232
Star
6828
Fork
2962
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
602
列表
看板
标记
里程碑
合并请求
255
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
models
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
602
Issue
602
列表
看板
标记
里程碑
合并请求
255
合并请求
255
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
13a5f259
编写于
6月 13, 2017
作者:
Z
zhaopu
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
delete old file
上级
aa7cd753
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
0 addition
and
595 deletion
+0
-595
language_model/data_util.py
language_model/data_util.py
+0
-157
language_model/lm_ngram.py
language_model/lm_ngram.py
+0
-184
language_model/lm_rnn.py
language_model/lm_rnn.py
+0
-254
未找到文件。
language_model/data_util.py
已删除
100644 → 0
浏览文件 @
aa7cd753
# coding=utf-8
import
collections
import
os
# -- function --
def save_vocab(word_id_dict, vocab_file_name):
    """
    Save vocab to a tab-separated text file, one "word<TAB>id" pair per line.

    :param word_id_dict: dictionary with content of '{word, id}', 'word' is
        string type, 'id' is int type.
    :param vocab_file_name: vocab file name.
    """
    import io  # local import keeps this fix self-contained
    # 'with' guarantees the handle is closed even if a write fails; an
    # encoded text stream replaces the Python-2-only bytes concatenation
    # (word.encode('utf-8') + '\t' + ...), so this also runs under Python 3.
    with io.open(vocab_file_name, 'w', encoding='utf-8') as f:
        for word, word_id in word_id_dict.items():
            f.write(word + u'\t' + str(word_id) + u'\n')
    print('save vocab to ' + vocab_file_name)
def load_vocab(vocab_file_name):
    """
    Load vocab from a tab-separated file written by save_vocab.

    :param vocab_file_name: vocab file name.
    :return: dictionary with content of '{word, id}', 'word' is string type,
        'id' is int type.
    :raises Exception: if the vocab file does not exist.
    """
    import io  # local import keeps this fix self-contained
    if not os.path.isfile(vocab_file_name):
        raise Exception('vocab file does not exist!')
    # renamed from `dict`: the original shadowed the builtin
    word_id_dict = {}
    # 'with' closes the handle (the original never closed it); decoding via
    # io.open replaces the Python-2-only line.decode('utf-8').
    with io.open(vocab_file_name, 'r', encoding='utf-8') as f:
        for line in f:
            if len(line) < 2:  # skip blank / truncated lines
                continue
            kv = line.strip().split(u'\t')
            word_id_dict[kv[0]] = int(kv[1])
    return word_id_dict
def build_vocab(file_name, vocab_max_size):
    """
    Build vocab from a whitespace-tokenized corpus file.

    Words are ranked by descending frequency and assigned ids starting at 2;
    id 0 is reserved for '<UNK>' and id 1 for '<EOS>'.

    :param file_name: corpus file name, one sentence per line.
    :param vocab_max_size: vocab's max size (excluding '<UNK>'/'<EOS>').
    :return: dictionary with content of '{word, id}', 'word' is string type,
        'id' is int type.
    """
    import io  # local import keeps this fix self-contained
    words = []
    # 'with' closes the handle; io.open replaces the Python-2-only
    # line.decode('utf-8', 'ignore').
    with io.open(file_name, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            words += line.strip().split()
    # most_common() replaces the manual sort-by-negative-count + slice; the
    # dict comprehension also handles an empty corpus gracefully (the old
    # `zip(*counter)` raised on no words).
    top_words = collections.Counter(words).most_common(vocab_max_size)
    word_id_dict = {word: word_id
                    for word_id, (word, _) in enumerate(top_words, 2)}
    word_id_dict['<UNK>'] = 0
    word_id_dict['<EOS>'] = 1
    return word_id_dict
def
_read_by_fixed_length
(
file_name
,
word_id_dict
,
sentence_len
=
10
):
"""
create reader, each sample with fixed length.
:param file_name: file name.
:param word_id_dict: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type.
:param sentence_len: each sample's length.
:return: data reader.
"""
def
reader
():
words
=
[]
UNK
=
word_id_dict
[
'<UNK>'
]
for
line
in
open
(
file_name
):
words
+=
line
.
decode
(
'utf-8'
,
'ignore'
).
strip
().
split
()
ids
=
[
word_id_dict
.
get
(
w
,
UNK
)
for
w
in
words
]
words_len
=
len
(
words
)
sentence_num
=
(
words_len
-
1
)
//
sentence_len
count
=
0
while
count
<
sentence_num
:
start
=
count
*
sentence_len
count
+=
1
yield
ids
[
start
:
start
+
sentence_len
],
ids
[
start
+
1
:
start
+
sentence_len
+
1
]
return
reader
def
_read_by_line
(
file_name
,
min_sentence_length
,
max_sentence_length
,
word_id_dict
):
"""
create reader, each line is a sample.
:param file_name: file name.
:param min_sentence_length: sentence's min length.
:param max_sentence_length: sentence's max length.
:param word_id_dict: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type.
:return: data reader.
"""
def
reader
():
UNK
=
word_id_dict
[
'<UNK>'
]
for
line
in
open
(
file_name
):
words
=
line
.
decode
(
'utf-8'
,
'ignore'
).
strip
().
split
()
if
len
(
words
)
<
min_sentence_length
or
len
(
words
)
>
max_sentence_length
:
continue
ids
=
[
word_id_dict
.
get
(
w
,
UNK
)
for
w
in
words
]
ids
.
append
(
word_id_dict
[
'<EOS>'
])
target
=
ids
[
1
:]
target
.
append
(
word_id_dict
[
'<EOS>'
])
yield
ids
[:],
target
[:]
return
reader
def
_reader_creator_for_NGram
(
file_name
,
N
,
word_id_dict
):
"""
create reader for ngram.
:param file_name: file name.
:param N: ngram's n.
:param word_id_dict: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type.
:return: data reader.
"""
assert
N
>=
2
def
reader
():
words
=
[]
UNK
=
word_id_dict
[
'<UNK>'
]
for
line
in
open
(
file_name
):
words
+=
line
.
decode
(
'utf-8'
,
'ignore'
).
strip
().
split
()
ids
=
[
word_id_dict
.
get
(
w
,
UNK
)
for
w
in
words
]
words_len
=
len
(
words
)
for
i
in
range
(
words_len
-
N
-
1
):
yield
tuple
(
ids
[
i
:
i
+
N
])
return
reader
def train_data(train_file, min_sentence_length, max_sentence_length,
               word_id_dict):
    """Create the training-set reader: one sample per corpus line."""
    return _read_by_line(train_file, min_sentence_length,
                         max_sentence_length, word_id_dict)
def test_data(test_file, min_sentence_length, max_sentence_length,
              word_id_dict):
    """Create the test-set reader: one sample per corpus line."""
    return _read_by_line(test_file, min_sentence_length,
                         max_sentence_length, word_id_dict)
def train_data_for_NGram(train_file, N, word_id_dict):
    """Create the training-set reader yielding N-gram id tuples."""
    return _reader_creator_for_NGram(train_file, N, word_id_dict)
def test_data_for_NGram(test_file, N, word_id_dict):
    """Create the test-set reader yielding N-gram id tuples."""
    return _reader_creator_for_NGram(test_file, N, word_id_dict)
language_model/lm_ngram.py
已删除
100644 → 0
浏览文件 @
aa7cd753
# coding=utf-8
import
sys
import
paddle.v2
as
paddle
import
data_util
as
reader
import
gzip
import
numpy
as
np
def lm(vocab_size, emb_dim, hidden_size, num_layer):
    """
    ngram language model definition.

    Four context words ("firstw".."fourthw") are embedded through one shared
    projection table, concatenated, passed through `num_layer` ReLU fc
    layers, and projected to a softmax over the vocabulary that predicts the
    fifth word ("fifthw").

    :param vocab_size: size of vocab.
    :param emb_dim: embedding vector's dimension.
    :param hidden_size: size of unit.
    :param num_layer: layer number.
    :return: cost and output layer of model.
    """
    assert vocab_size > 0
    assert emb_dim > 0
    assert hidden_size > 0
    assert num_layer > 0

    def embed(word_layer):
        # all context positions share one projection parameter "_proj"
        return paddle.layer.table_projection(
            input=word_layer,
            size=emb_dim,
            param_attr=paddle.attr.Param(
                name="_proj",
                initial_std=0.001,
                learning_rate=1,
                l2_rate=0, ))

    # one integer data layer per context word, then its embedding
    input_layers = [
        paddle.layer.data(
            name=input_name,
            type=paddle.data_type.integer_value(vocab_size))
        for input_name in ["firstw", "secondw", "thirdw", "fourthw"]
    ]
    context = [embed(layer) for layer in input_layers]

    # concatenated context followed by num_layer fully-connected ReLU layers
    hidden = paddle.layer.concat(input=context)
    for _ in range(num_layer):
        hidden = paddle.layer.fc(
            input=hidden, size=hidden_size, act=paddle.activation.Relu())

    # softmax over the vocabulary for the next word
    predictword = paddle.layer.fc(
        input=[hidden], size=vocab_size, act=paddle.activation.Softmax())

    # classification cost against the true fifth word
    nextword = paddle.layer.data(
        name="fifthw", type=paddle.data_type.integer_value(vocab_size))
    cost = paddle.layer.classification_cost(
        input=predictword, label=nextword)
    return cost, predictword
def train():
    """
    train ngram language model.

    Relies on module-level configuration set in __main__: train_file,
    test_file, N, vocab_max_size, vocab_file, emb_dim, hidden_size,
    num_layer, num_passs and model_file_name_prefix.
    :return: none, but this function will save the training model each epoch.
    """
    # prepare word dictionary
    print('prepare vocab...')
    word_id_dict = reader.build_vocab(train_file, vocab_max_size)  # build vocab
    reader.save_vocab(word_id_dict, vocab_file)  # save vocab

    # define data reader
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            reader.train_data_for_NGram(train_file, N, word_id_dict),
            buf_size=65536),
        batch_size=32)
    # BUG FIX: the test reader previously read train_file, so "test" metrics
    # were computed on training data; evaluate on the held-out test_file.
    test_reader = paddle.batch(
        paddle.reader.shuffle(
            reader.test_data_for_NGram(test_file, N, word_id_dict),
            buf_size=65536),
        batch_size=8)

    # network config
    print('prepare model...')
    cost, _ = lm(len(word_id_dict), emb_dim, hidden_size, num_layer)

    # create parameters
    parameters = paddle.parameters.create(cost)

    # create optimizer: Adam with L2 regularization and model averaging
    adam_optimizer = paddle.optimizer.Adam(
        learning_rate=1e-3,
        regularization=paddle.optimizer.L2Regularization(rate=1e-3),
        model_average=paddle.optimizer.ModelAverage(average_window=0.5))

    # create trainer
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=adam_optimizer)

    # define event_handler callback
    def event_handler(event):
        # per-batch: detailed progress line every 100 batches, dots otherwise
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                print("\nPass %d, Batch %d, Cost %f, %s" %
                      (event.pass_id, event.batch_id, event.cost,
                       event.metrics))
            else:
                sys.stdout.write('.')
                sys.stdout.flush()

        # save model each pass, after evaluating on the test reader
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=test_reader)
            print("\nTest with Pass %d, %s" % (event.pass_id, result.metrics))
            with gzip.open(model_file_name_prefix + str(event.pass_id) +
                           '.tar.gz', 'w') as f:
                parameters.to_tar(f)

    # start to train
    print('start training...')
    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        num_passes=num_passs)
    print("Training finished.")
if __name__ == '__main__':
    # Script entry point: configure, train the n-gram LM on PTB, then run a
    # one-shot prediction demo. All names below are read as globals by
    # train() and lm().
    # -- config : model --
    emb_dim = 200
    hidden_size = 200
    num_passs = 2
    num_layer = 2
    N = 5
    model_file_name_prefix = 'lm_ngram_pass_'
    # -- config : data --
    train_file = 'data/ptb.train.txt'
    test_file = 'data/ptb.test.txt'
    vocab_file = 'data/vocab_ptb.txt'  # the file to save vocab
    vocab_max_size = 3000
    min_sentence_length = 3
    max_sentence_length = 60
    # -- train --
    paddle.init(use_gpu=False, trainer_count=1)
    train()
    # -- predict --
    text = 'the end of the'  # use 4 words to predict the 5th word
    # prepare model: reload the vocab written by train() and rebuild the
    # network topology (only the output layer is needed for inference)
    word_id_dict = reader.load_vocab(vocab_file)  # load word dictionary
    _, output_layer = lm(
        len(word_id_dict), emb_dim, hidden_size, num_layer)  # network config
    # load the parameters saved after the last pass
    model_file_name = model_file_name_prefix + str(num_passs - 1) + '.tar.gz'
    parameters = paddle.parameters.Parameters.from_tar(
        gzip.open(model_file_name))  # load parameters
    # generate: map the 4 prefix words to ids (unknown words -> <UNK>)
    input = [[
        word_id_dict.get(w, word_id_dict['<UNK>']) for w in text.split()
    ]]
    predictions = paddle.infer(
        output_layer=output_layer,
        parameters=parameters,
        input=input,
        field=['value'])
    id_word_dict = dict(
        [(v, k)
         for k, v in word_id_dict.items()])  # dictionary with type {id : word}
    # suppress <UNK> so argmax never picks it
    predictions[-1][word_id_dict['<UNK>']] = -1  # filter <UNK>
    # highest-probability id of the last prediction row is the 5th word
    next_word = id_word_dict[np.argmax(predictions[-1])]
    print(next_word.encode('utf-8'))
language_model/lm_rnn.py
已删除
100644 → 0
浏览文件 @
aa7cd753
# coding=utf-8
import
sys
import
paddle.v2
as
paddle
import
data_util
as
reader
import
gzip
import
os
import
numpy
as
np
def lm(vocab_size, emb_dim, rnn_type, hidden_size, num_layer):
    """
    rnn language model definition.

    An input word sequence is embedded, fed through `num_layer` stacked
    recurrent layers of the requested cell type, and projected to a per-step
    softmax over the vocabulary.

    :param vocab_size: size of vocab.
    :param emb_dim: embedding vector's dimension.
    :param rnn_type: the type of RNN cell: 'lstm' or 'gru'.
    :param hidden_size: number of unit.
    :param num_layer: layer number.
    :return: cost and output layer of model.
    :raises Exception: if rnn_type is neither 'lstm' nor 'gru'.
    """
    assert vocab_size > 0
    assert emb_dim > 0
    assert hidden_size > 0
    assert num_layer > 0

    # input layers: the word sequence and its shifted label sequence
    word = paddle.layer.data(
        name="word",
        type=paddle.data_type.integer_value_sequence(vocab_size))
    label = paddle.layer.data(
        "label", paddle.data_type.integer_value_sequence(vocab_size))

    # embedding layer
    stack = paddle.layer.embedding(input=word, size=emb_dim)

    # pick the recurrent cell factory, then stack num_layer of them
    if rnn_type == 'lstm':
        make_rnn = paddle.networks.simple_lstm
    elif rnn_type == 'gru':
        make_rnn = paddle.networks.simple_gru
    else:
        raise Exception('rnn_type error!')
    for _ in range(num_layer):
        stack = make_rnn(input=stack, size=hidden_size)

    # fc(full connected) and output layer: per-step vocabulary softmax
    output = paddle.layer.fc(
        input=[stack], size=vocab_size, act=paddle.activation.Softmax())

    # loss
    cost = paddle.layer.classification_cost(input=output, label=label)
    return cost, output
def train():
    """
    train rnn language model.

    Configuration (file paths, model sizes, rnn_type, pass count) is read
    from module-level globals set in __main__.
    :return: none, but this function will save the training model each epoch.
    """
    # vocabulary: build it from the training corpus, then persist it
    print('prepare vocab...')
    vocab = reader.build_vocab(train_file, vocab_max_size)
    reader.save_vocab(vocab, vocab_file)

    # shuffled, batched readers over the train and test corpora
    batched_train = paddle.batch(
        paddle.reader.shuffle(
            reader.train_data(train_file, min_sentence_length,
                              max_sentence_length, vocab),
            buf_size=65536),
        batch_size=32)
    batched_test = paddle.batch(
        paddle.reader.shuffle(
            reader.test_data(test_file, min_sentence_length,
                             max_sentence_length, vocab),
            buf_size=65536),
        batch_size=8)

    # model topology and its trainable parameters
    print('prepare model...')
    cost, _ = lm(len(vocab), emb_dim, rnn_type, hidden_size, num_layer)
    params = paddle.parameters.create(cost)

    # Adam with L2 regularization and model averaging
    optimizer = paddle.optimizer.Adam(
        learning_rate=1e-3,
        regularization=paddle.optimizer.L2Regularization(rate=1e-3),
        model_average=paddle.optimizer.ModelAverage(average_window=0.5))

    sgd_trainer = paddle.trainer.SGD(
        cost=cost, parameters=params, update_equation=optimizer)

    def on_event(event):
        # per-batch progress: a dot normally, a detailed line every 100th
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100:
                sys.stdout.write('.')
                sys.stdout.flush()
            else:
                print("\nPass %d, Batch %d, Cost %f, %s" %
                      (event.pass_id, event.batch_id, event.cost,
                       event.metrics))
        # per-pass: evaluate on the test set and snapshot the parameters
        if isinstance(event, paddle.event.EndPass):
            eval_result = sgd_trainer.test(reader=batched_test)
            print("\nTest with Pass %d, %s" % (event.pass_id,
                                               eval_result.metrics))
            snapshot_name = (model_file_name_prefix + str(event.pass_id) +
                             '.tar.gz')
            with gzip.open(snapshot_name, 'w') as f:
                params.to_tar(f)

    print('start training...')
    sgd_trainer.train(
        reader=batched_train, event_handler=on_event, num_passes=num_passs)
    print("Training finished.")
def
_generate_with_beamSearch
(
inferer
,
word_id_dict
,
input
,
num_words
,
beam_size
):
"""
Demo: generate 'num_words' words using "beam search" algorithm.
:param inferer: paddle's inferer
:type inferer: paddle.inference.Inference
:param word_id_dict: vocab.
:type word_id_dict: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type.
:param input: prefix text.
:type input: string.
:param num_words: the number of the words to generate.
:type num_words: int
:param beam_size: beam with.
:type beam_size: int
:return: text with generated words. dictionary with content of '{text, probability}'
"""
assert
beam_size
>
0
and
num_words
>
0
# load word dictionary
id_word_dict
=
dict
(
[(
v
,
k
)
for
k
,
v
in
word_id_dict
.
items
()])
# {id : word}
# tools
def
str2ids
(
str
):
return
[[[
word_id_dict
.
get
(
w
,
word_id_dict
[
'<UNK>'
])
for
w
in
str
.
split
()
]]]
def
ids2str
(
ids
):
return
[[[
id_word_dict
.
get
(
id
,
' '
)
for
id
in
ids
]]]
# generate
texts
=
{}
# type: {text : prob}
texts
[
input
]
=
1
for
_
in
range
(
num_words
):
texts_new
=
{}
for
(
text
,
prob
)
in
texts
.
items
():
# next word's prob distubution
predictions
=
inferer
.
infer
(
input
=
str2ids
(
text
))
predictions
[
-
1
][
word_id_dict
[
'<UNK>'
]]
=
-
1
# filter <UNK>
# find next beam_size words
for
_
in
range
(
beam_size
):
cur_maxProb_index
=
np
.
argmax
(
predictions
[
-
1
])
# next word's id
text_new
=
text
+
' '
+
id_word_dict
[
cur_maxProb_index
]
# text append nextWord
texts_new
[
text_new
]
=
texts
[
text
]
*
predictions
[
-
1
][
cur_maxProb_index
]
predictions
[
-
1
][
cur_maxProb_index
]
=
-
1
texts
.
clear
()
if
len
(
texts_new
)
<=
beam_size
:
texts
=
texts_new
else
:
# cutting
texts
=
dict
(
sorted
(
texts_new
.
items
(),
key
=
lambda
d
:
d
[
1
],
reverse
=
True
)
[:
beam_size
])
return
texts
def predict():
    """
    demo: use model to do prediction.

    Loops forever reading a prefix from stdin and printing beam-search
    continuations. Reads configuration (vocab_file, train_file,
    vocab_max_size, emb_dim, rnn_type, hidden_size, num_layer, num_passs,
    model_file_name_prefix) from module-level globals set in __main__.
    :return: print result to console.
    """
    # prepare and cache vocab: reuse the saved file when present, otherwise
    # rebuild it from the training corpus and save it
    if os.path.isfile(vocab_file):
        word_id_dict = reader.load_vocab(vocab_file)  # load word dictionary
    else:
        word_id_dict = reader.build_vocab(train_file,
                                          vocab_max_size)  # build vocab
        reader.save_vocab(word_id_dict, vocab_file)  # save vocab

    # prepare and cache model: rebuild the topology and load the parameters
    # snapshotted after the last training pass
    _, output = lm(
        len(word_id_dict), emb_dim, rnn_type, hidden_size,
        num_layer)  # network config
    model_file_name = model_file_name_prefix + str(num_passs - 1) + '.tar.gz'
    parameters = paddle.parameters.Parameters.from_tar(
        gzip.open(model_file_name))  # load parameters
    inferer = paddle.inference.Inference(
        output_layer=output, parameters=parameters)

    # generate text: interactive loop (Ctrl-C to quit).
    # NOTE: raw_input/str.decode are Python-2-only.
    while True:
        input_str = raw_input('input:')
        input_str_uft8 = input_str.decode('utf-8')
        generate_sentences = _generate_with_beamSearch(
            inferer=inferer,
            word_id_dict=word_id_dict,
            input=input_str_uft8,
            num_words=5,
            beam_size=5)
        # print result: each candidate continuation with its probability
        for (sentence, prob) in generate_sentences.items():
            print(sentence.encode('utf-8', 'replace'))
            print('prob: ', prob)
            print('-------')
if __name__ == '__main__':
    # Script entry point: configure, train the RNN LM on PTB, then start the
    # interactive prediction demo. All names below are read as globals by
    # train(), lm() and predict().
    # -- config : model --
    rnn_type = 'gru'  # or 'lstm'
    emb_dim = 200
    hidden_size = 200
    num_passs = 2
    num_layer = 2
    model_file_name_prefix = 'lm_' + rnn_type + '_params_pass_'
    # -- config : data --
    train_file = 'data/ptb.train.txt'
    test_file = 'data/ptb.test.txt'
    vocab_file = 'data/vocab_ptb.txt'  # the file to save vocab
    vocab_max_size = 3000
    min_sentence_length = 3
    max_sentence_length = 60
    # -- train --
    paddle.init(use_gpu=False, trainer_count=1)
    train()
    # -- predict --
    predict()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录