Commit 7d6f6d74: dssm model ready
Authored by Superjom on Jul 01, 2017
Parent: 5c5a64a3
Showing 4 changed files with 197 additions and 77 deletions (+197 -77):

    dssm/network_conf.py   +121  -50
    dssm/reader.py          +10   -8
    dssm/train.py           +62  -14
    dssm/utils.py            +4   -5
dssm/network_conf.py
 from paddle import v2 as paddle
 from paddle.v2.attr import ParamAttr
-from utils import TaskType, logger, ModelType
+from utils import TaskType, logger, ModelType, ModelArch


 class DSSM(object):
     def __init__(self,
                  dnn_dims=[],
                  vocab_sizes=[],
-                 model_type=ModelType.CLASSIFICATION,
+                 model_type=ModelType.create_classification(),
+                 model_arch=ModelArch.create_cnn(),
                  share_semantic_generator=False,
                  class_num=None,
                  share_embed=False):
...
@@ -16,8 +17,10 @@ class DSSM(object):
             dimentions of each layer in semantic vector generator.
         @vocab_sizes: 2-d tuple
             size of both left and right items.
-        @model_type: str
-            type of task, should be 'rank', 'regression' or 'classification'
+        @model_type: int
+            type of task, should be 'rank: 0', 'regression: 1' or 'classification: 2'
+        @model_arch: int
+            model architecture
         @share_semantic_generator: bool
             whether to share the semantic vector generator for both left and right.
         @share_embed: bool
...
@@ -28,18 +31,36 @@ class DSSM(object):
         assert len(vocab_sizes) == 2, "vocab_sizes specify the sizes left and right inputs, and dim should be 2."
         assert len(dnn_dims) > 1, "more than two layers is needed."

         self.dnn_dims = dnn_dims
         self.vocab_sizes = vocab_sizes
         self.share_semantic_generator = share_semantic_generator
         self.share_embed = share_embed
-        self.model_type = model_type
+        self.model_type = ModelType(model_type)
+        self.model_arch = ModelArch(model_arch)
         self.class_num = class_num

+        logger.warning("build DSSM model with config of %s, %s" %
+                       (self.model_type, self.model_arch))
+        logger.info("vocabulary sizes: %s" % str(self.vocab_sizes))
+
+        # bind model architecture
+        _model_arch = {
+            'cnn': self.create_cnn,
+            'fc': self.create_fc,
+        }
+        self.model_arch_creater = _model_arch[str(model_arch)]
+
+        # build model type
+        _model_type = {
+            'classification': self._build_classification_model,
+            'rank': self._build_rank_model,
+            'regression': self._build_regression_model,
+        }
+        self.model_type_creater = _model_type[str(self.model_type)]

     def __call__(self):
-        if self.model_type == ModelType.CLASSIFICATION:
+        if self.model_type.is_classification():
             return self._build_classification_model()
         return self._build_rank_model()
...
@@ -47,6 +68,8 @@ class DSSM(object):
         '''
         Create an embedding table whose name has a `prefix`.
         '''
+        logger.info("create embedding table [%s] which dimention is %d" %
+                    (prefix, self.dnn_dims[0]))
         emb = paddle.layer.embedding(
             input=input,
             size=self.dnn_dims[0],
...
@@ -66,6 +89,8 @@ class DSSM(object):
             input=emb, pooling_type=paddle.pooling.Max())

         for id, dim in enumerate(self.dnn_dims[1:]):
             name = "%s_fc_%d_%d" % (prefix, id, dim)
+            logger.info("create fc layer [%s] which dimention is %d" % (name, dim))
             fc = paddle.layer.fc(
                 name=name,
                 input=_input_layer,
...
@@ -85,53 +110,49 @@ class DSSM(object):
         @prefix: str
             prefix of layers' names, used to share parameters between more than one `cnn` parts.
         '''
-        pass
-
-    def _build_classification_model(self):
-        '''
-        Build a classification model, and the cost is returned.
-        A Classification has 3 inputs:
-          - source sentence
-          - target sentence
-          - classification label
-        '''
-        # prepare inputs.
-        assert self.class_num
-
-        source = paddle.layer.data(
-            name='source_input',
-            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
-        target = paddle.layer.data(
-            name='target_input',
-            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
-        label = paddle.layer.data(
-            name='label_input',
-            type=paddle.data_type.integer_value(self.class_num))
-
-        prefixs = '_ _'.split() if self.share_semantic_generator else 'left right'.split()
-        embed_prefixs = '_ _'.split() if self.share_embed else 'left right'.split()
-
-        word_vecs = []
-        for id, input in enumerate([source, target]):
-            x = self.create_embedding(input, prefix=embed_prefixs[id])
-            word_vecs.append(x)
-
-        semantics = []
-        for id, input in enumerate(word_vecs):
-            x = self.create_fc(input, prefix=prefixs[id])
-            semantics.append(x)
-
-        concated_vector = paddle.layer.concat(semantics)
-        prediction = paddle.layer.fc(
-            input=concated_vector,
-            size=self.class_num,
-            act=paddle.activation.Softmax())
-        cost = paddle.layer.classification_cost(input=prediction, label=label)
-        return cost, prediction, label
+        def create_conv(context_len, hidden_size, prefix):
+            key = "%s_%d_%d" % (prefix, context_len, hidden_size)
+            conv = paddle.networks.sequence_conv_pool(
+                input=emb,
+                context_len=context_len,
+                hidden_size=hidden_size,
+                # set parameter attr for parameter sharing
+                context_proj_param_attr=ParamAttr(name=key + 'contex_proj.w'),
+                fc_param_attr=ParamAttr(name=key + '_fc.w'),
+                fc_bias_attr=ParamAttr(name=key + '_fc.b'),
+                pool_bias_attr=ParamAttr(name=key + '_pool.b'))
+            return conv
+
+        logger.info('create a sequence_conv_pool which context width is 3')
+        conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
+        logger.info('create a sequence_conv_pool which context width is 4')
+        conv_4 = create_conv(4, self.dnn_dims[1], "cnn")
+
+        # if more than three layers, than a fc layer will be added.
+        if len(self.dnn_dims) > 2:
+            _input_layer = [conv_3, conv_4]
+            for id, dim in enumerate(self.dnn_dims[2:]):
+                name = "%s_fc_%d_%d" % (prefix, id, dim)
+                logger.info("create fc layer [%s] which dimention is %d" %
+                            (name, dim))
+                fc = paddle.layer.fc(
+                    name=name,
+                    input=_input_layer,
+                    size=dim,
+                    act=paddle.activation.Tanh(),
+                    param_attr=ParamAttr(name='%s.w' % name),
+                    bias_attr=ParamAttr(name='%s.b' % name))
+                _input_layer = fc
+        return _input_layer
+
+    def _build_classification_model(self):
+        return self._build_classification_or_regression_model(
+            is_classification=True)
+
+    def _build_regression_model(self):
+        return self._build_classification_or_regression_model(
+            is_classification=False)

     def _build_rank_model(self):
         '''
...
@@ -167,7 +188,7 @@ class DSSM(object):
         semantics = []
         for id, input in enumerate(word_vecs):
-            x = self.create_fc(input, prefix=prefixs[id])
+            x = self.model_arch_creater(input, prefix=prefixs[id])
             semantics.append(x)

         # cossim score of source and left_target
...
@@ -182,6 +203,56 @@ class DSSM(object):
         # so AUC will not used.
         return cost, None, None

+    def _build_classification_or_regression_model(self, is_classification):
+        '''
+        Build a classification model, and the cost is returned.
+        A Classification has 3 inputs:
+          - source sentence
+          - target sentence
+          - classification label
+        '''
+        # prepare inputs.
+        assert self.class_num
+
+        source = paddle.layer.data(
+            name='source_input',
+            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
+        target = paddle.layer.data(
+            name='target_input',
+            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
+        label = paddle.layer.data(
+            name='label_input',
+            type=paddle.data_type.integer_value(self.class_num)
+            if is_classification else paddle.data_type.dense_input)
+
+        prefixs = '_ _'.split() if self.share_semantic_generator else 'left right'.split()
+        embed_prefixs = '_ _'.split() if self.share_embed else 'left right'.split()
+
+        word_vecs = []
+        for id, input in enumerate([source, target]):
+            x = self.create_embedding(input, prefix=embed_prefixs[id])
+            word_vecs.append(x)
+
+        semantics = []
+        for id, input in enumerate(word_vecs):
+            x = self.model_arch_creater(input, prefix=prefixs[id])
+            semantics.append(x)
+
+        concated_vector = paddle.layer.concat(semantics)
+        prediction = paddle.layer.fc(
+            input=concated_vector,
+            size=self.class_num,
+            act=paddle.activation.Softmax())
+        cost = paddle.layer.classification_cost(
+            input=prediction, label=label) if is_classification \
+            else paddle.layer.mse_cost(prediction, label)
+        return cost, prediction, label


 class RankMetrics(object):
     '''
...
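A note on the sharing logic above: parameter sharing between the source and target towers is driven entirely by parameter names. When `share_semantic_generator` (or `share_embed`) is enabled, both inputs use the placeholder prefix `'_'`, so the `ParamAttr` names collide on purpose and PaddlePaddle binds both towers to a single set of weights; with sharing off, the `'left'`/`'right'` prefixes keep them distinct. A minimal sketch of the naming scheme (plain Python, names as in the diff, not repository code):

    share = True
    prefixs = '_ _'.split() if share else 'left right'.split()
    # name of the first fc layer (id 0, dim 128) in each tower
    print(["%s_fc_%d_%d" % (p, 0, 128) for p in prefixs])
    # ['__fc_0_128', '__fc_0_128'] -> identical names, weights are shared

    share = False
    prefixs = '_ _'.split() if share else 'left right'.split()
    print(["%s_fc_%d_%d" % (p, 0, 128) for p in prefixs])
    # ['left_fc_0_128', 'right_fc_0_128'] -> distinct names, separate weights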
dssm/reader.py
...
@@ -4,32 +4,34 @@ from utils import UNK, ModelType, TaskType, load_dic, sent2ids, logger, ModelTyp
 class Dataset(object):
-    def __init__(self, train_path, test_path, source_dic_path,
-                 target_dic_path, model_type=ModelType.RANK):
+    def __init__(self, train_path, test_path, source_dic_path,
+                 target_dic_path, model_type):
         self.train_path = train_path
         self.test_path = test_path
         self.source_dic_path = source_dic_path
         self.target_dic_path = target_dic_path
-        self.model_type = model_type
+        self.model_type = ModelType(model_type)

         self.source_dic = load_dic(self.source_dic_path)
         self.target_dic = load_dic(self.target_dic_path)

         self.record_reader = self._read_classification_record \
-            if self.model_type == ModelType.CLASSIFICATION \
+            if self.model_type.is_classification() \
             else self._read_rank_record

     def train(self):
         '''
         Load trainset.
         '''
         logger.info("[reader] load trainset from %s" % self.train_path)
         with open(self.train_path) as f:
             for line_id, line in enumerate(f):
                 yield self.record_reader(line)

     def test(self):
         '''
         Load testset.
         '''
         logger.info("[reader] load testset from %s" % self.test_path)
         with open(self.test_path) as f:
             for line_id, line in enumerate(f):
...
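`Dataset.model_type` is now a required argument normalized through `ModelType(...)`, so callers may pass either a raw int (e.g. straight from argparse) or an existing `ModelType` instance; the old rank default is gone. A hypothetical usage sketch, with paths following the defaults in train.py:

    from reader import Dataset
    from utils import ModelType

    dataset = Dataset(
        train_path='./data/classification/train.txt',
        test_path='./data/classification/test.txt',
        source_dic_path='./data/vocab.txt',
        target_dic_path='./data/vocab.txt',
        model_type=ModelType.create_classification())

    # train() and test() are generators over id-encoded records
    for record in dataset.train():
        pass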
dssm/train.py
...
@@ -6,21 +6,24 @@ import gzip
 import paddle.v2 as paddle

 from network_conf import DSSM
 import reader
-from utils import TaskType, load_dic, logger, ModelType
+from utils import TaskType, load_dic, logger, ModelType, ModelArch

 parser = argparse.ArgumentParser(description="PaddlePaddle DSSM example")

 parser.add_argument(
     '-i',
     '--train_data_path',
     type=str,
     required=False,
     help="path of training dataset")
 parser.add_argument(
     '-t',
     '--test_data_path',
     type=str,
     required=False,
     help="path of testing dataset")
 parser.add_argument(
     '-s',
     '--source_dic_path',
     type=str,
     required=False,
...
@@ -32,21 +35,32 @@ parser.add_argument(
     help="path of the target's word dic, if not set, the `source_dic_path` will be used")
 parser.add_argument(
     '-b',
     '--batch_size',
     type=int,
     default=10,
     help="size of mini-batch (default:10)")
 parser.add_argument(
     '-p',
     '--num_passes',
     type=int,
     default=10,
     help="number of passes to run(default:10)")
 parser.add_argument(
     '-y',
     '--model_type',
     type=int,
-    default=ModelType.CLASSIFICATION,
+    required=True,
+    default=ModelType.CLASSIFICATION_MODE,
     help="model type, %d for classification, %d for pairwise rank (default: classification)"
-    % (ModelType.CLASSIFICATION, ModelType.RANK))
+    % (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE))
+parser.add_argument(
+    '--model_arch',
+    type=int,
+    required=True,
+    default=ModelArch.CNN_MODE,
+    help="model architecture, %d for CNN, %d for FC" % (ModelArch.CNN_MODE,
+                                                        ModelArch.FC_MODE))
 parser.add_argument(
     '--share_network_between_source_target',
     type=bool,
...
@@ -61,36 +75,56 @@ parser.add_argument(
     '--dnn_dims',
     type=str,
     default='256,128,64,32',
-    help="dimentions of dnn layers, default is '256,128,64,32', which means create a 4-layer dnn, dementions of each layer is 256, 128, 64 and 32")
+    help="dimentions of dnn layers, default is '256,128,64,32', which means create a 4-layer dnn, demention of each layer is 256, 128, 64 and 32")
 parser.add_argument(
     '--num_workers', type=int, default=1, help="num worker threads, default 1")
+parser.add_argument(
+    '--use_gpu',
+    type=bool,
+    default=False,
+    help="whether to use GPU devices (default: False)")
 parser.add_argument(
     '-c',
     '--class_num',
     type=int,
     default=0,
     help="number of categories for classification task.")

 # arguments check.
 args = parser.parse_args()
+args.model_type = ModelType(args.model_type)
+args.model_arch = ModelArch(args.model_arch)
+if args.model_type.is_classification():
+    assert args.class_num > 1, "--class_num should be set in classification task."

 layer_dims = [int(i) for i in args.dnn_dims.split(',')]
 target_dic_path = args.source_dic_path if not args.target_dic_path else args.target_dic_path

+model_save_name_prefix = "dssm_pass_%s_%s" % (args.model_type,
+                                              args.model_arch, )


 def train(train_data_path=None,
           test_data_path=None,
           source_dic_path=None,
           target_dic_path=None,
-          model_type=ModelType.CLASSIFICATION,
+          model_type=ModelType.create_classification(),
+          model_arch=ModelArch.create_cnn(),
           batch_size=10,
           num_passes=10,
           share_semantic_generator=False,
           share_embed=False,
           class_num=None,
-          num_workers=1):
+          num_workers=1,
+          use_gpu=False):
     '''
     Train the DSSM.
     '''
     default_train_path = './data/rank/train.txt'
     default_test_path = './data/rank/test.txt'
     default_dic_path = './data/vocab.txt'
-    if model_type == ModelType.CLASSIFICATION:
+    if model_type.is_classification():
         default_train_path = './data/classification/train.txt'
         default_test_path = './data/classification/test.txt'
...
@@ -107,7 +141,7 @@ def train(train_data_path=None,
         test_path=test_data_path,
         source_dic_path=source_dic_path,
         target_dic_path=target_dic_path,
-        model_type=args.model_type, )
+        model_type=model_type, )

     train_reader = paddle.batch(
         paddle.reader.shuffle(dataset.train, buf_size=1000),
...
@@ -117,7 +151,7 @@ def train(train_data_path=None,
         paddle.reader.shuffle(dataset.test, buf_size=1000),
         batch_size=batch_size)

-    paddle.init(use_gpu=False, trainer_count=num_workers)
+    paddle.init(use_gpu=use_gpu, trainer_count=num_workers)

     cost, prediction, label = DSSM(
         dnn_dims=layer_dims,
...
@@ -125,6 +159,7 @@ def train(train_data_path=None,
             len(load_dic(path)) for path in [source_dic_path, target_dic_path]
         ],
         model_type=model_type,
+        model_arch=model_arch,
         share_semantic_generator=share_semantic_generator,
         class_num=class_num,
         share_embed=share_embed)()
...
@@ -144,7 +179,7 @@ def train(train_data_path=None,
         update_equation=adam_optimizer)

     feeding = {}
-    if model_type == ModelType.CLASSIFICATION:
+    if model_type.is_classification():
         feeding = {'source_input': 0, 'target_input': 1, 'label_input': 2}
     else:
         feeding = {
...
@@ -165,13 +200,14 @@ def train(train_data_path=None,
         if isinstance(event, paddle.event.EndPass):
             if test_reader is not None:
-                if model_type == ModelType.CLASSIFICATION:
+                if model_type.is_classification():
                     result = trainer.test(reader=test_reader, feeding=feeding)
                     logger.info("Test at Pass %d, %s \n" % (event.pass_id,
                                                             result.metrics))
                 else:
                     result = None
-            with gzip.open("dssm_pass_%05d.tar.gz" % event.pass_id, "w") as f:
+            with gzip.open("dssm_%s_pass_%05d.tar.gz" %
+                           (model_save_name_prefix, event.pass_id), "w") as f:
                 parameters.to_tar(f)

     trainer.train(
...
@@ -184,5 +220,17 @@ def train(train_data_path=None,
 if __name__ == '__main__':
     # train(class_num=2)
-    train(model_type=ModelType.RANK)
+    train(
+        train_data_path=args.train_data_path,
+        test_data_path=args.test_data_path,
+        source_dic_path=args.source_dic_path,
+        target_dic_path=args.target_dic_path,
+        model_type=ModelType(args.model_type),
+        model_arch=ModelArch(args.model_arch),
+        batch_size=args.batch_size,
+        num_passes=args.num_passes,
+        share_semantic_generator=args.share_network_between_source_target,
+        share_embed=args.share_embed,
+        class_num=args.class_num,
+        num_workers=args.num_workers,
+        use_gpu=args.use_gpu)
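One wrinkle in the new flags: `--model_type` and `--model_arch` combine `required=True` with a `default`, and argparse never consults the default once a flag is required, so both must always be passed explicitly and the defaults are dead. A minimal repro (pure Python, hypothetical flag subset):

    import argparse

    p = argparse.ArgumentParser()
    # mirrors --model_type above: required wins, the default is never used
    p.add_argument('-y', '--model_type', type=int, required=True, default=2)
    try:
        p.parse_args([])          # flag omitted -> argparse errors out
    except SystemExit:
        print('--model_type must be passed explicitly despite the default')
    print(p.parse_args(['-y', '2']).model_type)   # 2

Relatedly, `type=bool` on `--use_gpu` and `--share_network_between_source_target` converts the raw string with `bool(...)`, so any non-empty value, including the literal "False", parses as True; only an empty string yields False.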
dssm/utils.py
...
@@ -43,7 +43,7 @@ def make_create_method(cls):
         setattr(cls, 'create_' + mode, method(mode))


-def make_str_method(cls):
+def make_str_method(cls, type_name='unk'):
     def _str_(self):
         for mode in cls.modes:
             if self.mode == getattr(cls, mode_attr_name(mode)):
...
@@ -55,6 +55,7 @@ def make_str_method(cls):
     setattr(cls, '__str__', _str_)
     setattr(cls, '__repr__', _str_)
     setattr(cls, '__hash__', _hash_)
+    cls.__name__ = type_name


 def _init_(self, mode, cls):
...
@@ -63,7 +64,8 @@ def _init_(self, mode, cls):
     elif isinstance(mode, cls):
         self.mode = mode.mode
     else:
-        raise
+        raise Exception("wrong mode type, get type: %s, value: %s" %
+                        (type(mode), mode))


 def build_mode_class(cls):
...
@@ -74,9 +76,6 @@ def build_mode_class(cls):
 class TaskType(object):
-    # TRAIN_MODE = 0
-    # TEST_MODE = 1
-    # INFER_MODE = 2
     modes = 'train test infer'.split()

     def __init__(self, mode):
...
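For context on the `str(model_type)` dispatch and `is_classification()` calls used throughout this commit: `build_mode_class` (via `make_create_method`, `make_str_method` and `_init_` above) turns a class carrying a `modes` list into a small enum-like type. A self-contained approximation of the resulting behavior (a sketch, not the repository code; mode order follows the network_conf docstring, rank: 0, regression: 1, classification: 2):

    class ModelType(object):
        modes = 'rank regression classification'.split()

        def __init__(self, mode):
            if isinstance(mode, int):
                self.mode = mode
            elif isinstance(mode, ModelType):
                self.mode = mode.mode
            else:
                raise Exception("wrong mode type, get type: %s, value: %s" %
                                (type(mode), mode))

    # what the helpers wire up: <MODE>_MODE ints, create_*() factories,
    # is_*() predicates, and a __str__ that prints the mode name
    for _i, _m in enumerate(ModelType.modes):
        setattr(ModelType, _m.upper() + '_MODE', _i)
        setattr(ModelType, 'create_' + _m,
                staticmethod((lambda i: lambda: ModelType(i))(_i)))
        setattr(ModelType, 'is_' + _m, (lambda i: lambda s: s.mode == i)(_i))
    ModelType.__str__ = lambda self: ModelType.modes[self.mode]

    mt = ModelType.create_classification()
    assert mt.is_classification() and not mt.is_rank()
    print(mt)   # 'classification' -> usable as a dict key via str(...),
                # as in network_conf.py's _model_type lookup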