PaddlePaddle / PaddleHub

Commit 7ec25147
Authored Jan 02, 2019 by Zeyu Chen
Parent: e687af48

    fix senta runing issues
Showing 3 changed files with 124 additions and 71 deletions:

    Senta/nets.py                  +2   -2
    Senta/sentiment_classify.py    +86  -46
    paddle_hub/module.py           +36  -23
Senta/nets.py

@@ -28,8 +28,8 @@ def bow_net(data,
     fc_2 = fluid.layers.fc(
         input=fc_1, size=hid_dim2, act="tanh", name="bow_fc2")
     # softmax layer
-    prediction = fluid.layers.fc(
-        input=[fc_2], size=class_dim, act="softmax", name="fc_softmax")
+    prediction = fluid.layers.fc(
+        input=[fc_2], size=class_dim, act="softmax", name="fc_softmax")
     cost = fluid.layers.cross_entropy(input=prediction, label=label)
     avg_cost = fluid.layers.mean(x=cost)
     acc = fluid.layers.accuracy(input=prediction, label=label)
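For reference, the tail of bow_net is a plain softmax classifier head: an fc layer with act="softmax" produces class probabilities, cross_entropy turns them into a per-example cost, and accuracy compares the argmax against the label. A minimal NumPy sketch of the same math, using made-up logits and labels rather than anything from the Senta model:

    import numpy as np

    def softmax(x):
        # row-wise softmax with max subtraction for numerical stability
        e = np.exp(x - x.max(axis=1, keepdims=True))
        return e / e.sum(axis=1, keepdims=True)

    logits = np.array([[2.0, 0.5], [0.1, 1.2], [3.0, 2.9], [0.0, 4.0]])  # batch of 4, class_dim=2
    labels = np.array([0, 1, 1, 1])

    prediction = softmax(logits)                                # fc(..., act="softmax")
    cost = -np.log(prediction[np.arange(len(labels)), labels])  # cross_entropy(input, label)
    avg_cost = cost.mean()                                      # mean(x=cost)
    acc = (prediction.argmax(axis=1) == labels).mean()          # accuracy(input, label)
    print("avg_cost: %.4f, acc: %.2f" % (avg_cost, acc))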
Senta/sentiment_classify.py

 # coding: utf-8
 import sys
 # NOTE: just hack for fast test
 sys.path.append("../")
 sys.path.append("../paddle_hub/")
 import os
 import time
 import unittest
 import contextlib
 ...
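The sys.path lines above are flagged in the code itself as a quick hack, and they resolve relative to whatever directory the script is launched from. A more robust variant (a sketch, not part of this commit) anchors on the script's own location:

    import os
    import sys

    # assumes this file lives in Senta/ with paddle_hub/ as a sibling directory
    repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    sys.path.insert(0, repo_root)
    sys.path.insert(0, os.path.join(repo_root, "paddle_hub"))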
@@ -114,7 +117,7 @@ def remove_feed_fetch_op(program):
 def train_net(train_reader,
               word_dict,
-              network,
+              network_name,
               use_gpu,
               parallel,
               save_dirname,
@@ -124,39 +127,85 @@ def train_net(train_reader,
     """
     train network
     """
-    if network == "bilstm_net":
+    if network_name == "bilstm_net":
         network = bilstm_net
-    elif network == "bow_net":
+    elif network_name == "bow_net":
         network = bow_net
-    elif network == "cnn_net":
+    elif network_name == "cnn_net":
         network = cnn_net
-    elif network == "lstm_net":
+    elif network_name == "lstm_net":
         network = lstm_net
-    elif network == "gru_net":
+    elif network_name == "gru_net":
         network = gru_net
     else:
         print("unknown network type")
         return
     label = fluid.layers.data(name="label", shape=[1], dtype="int64")
     data = fluid.layers.data(
         name="words", shape=[1], dtype="int64", lod_level=1)
     cost, acc, pred, emb = network(data, label, len(word_dict) + 2)
     # set optimizer
     sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
     sgd_optimizer.minimize(cost)
     # set place, executor, datafeeder
     place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
     exe = fluid.Executor(place)
     feeder = fluid.DataFeeder(feed_list=["words", "label"], place=place)
     exe.run(fluid.default_startup_program())
     # start training...
     for pass_id in range(pass_num):
         data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
         for batch in train_reader():
             avg_cost_np, avg_acc_np = exe.run(
                 fluid.default_main_program(),
                 feed=feeder.feed(batch),
                 fetch_list=[cost, acc],
                 return_numpy=True)
             data_size = len(batch)
             total_acc += data_size * avg_acc_np
             total_cost += data_size * avg_cost_np
             data_count += data_size
         avg_cost = total_cost / data_count
         avg_acc = total_acc / data_count
         print("[train info]: pass_id: %d, avg_acc: %f, avg_cost: %f" %
               (pass_id, avg_acc, avg_cost))
+    # save the model
+    module_path = os.path.join(save_dirname, network_name)
+    hub.ModuleDesc.save_module_dict(
+        module_path=module_path, word_dict=word_dict)
+    fluid.io.save_inference_model(module_path, ["words"], emb, exe)
+
+
+def retrain_net(train_reader,
+                word_dict,
+                network_name,
+                use_gpu,
+                parallel,
+                save_dirname,
+                lr=0.002,
+                batch_size=128,
+                pass_num=30):
+    """
+    train network
+    """
+    if network_name == "bilstm_net":
+        network = bilstm_net
+    elif network_name == "bow_net":
+        network = bow_net
+    elif network_name == "cnn_net":
+        network = cnn_net
+    elif network_name == "lstm_net":
+        network = lstm_net
+    elif network_name == "gru_net":
+        network = gru_net
+    else:
+        print("unknown network type")
+        return
+    # word seq data
+    # data = fluid.layers.data(
+    #     name="words", shape=[1], dtype="int64", lod_level=1)
+    # if not parallel:
+    #     # set network
+    #     cost, acc, pred, emb = network(data, label, len(word_dict) + 2)
+    # else:
+    #     places = fluid.layers.get_places(device_count=2)
+    #     pd = fluid.layers.ParallelDo(places)
+    #     with pd.do():
+    #         # set network
+    #         cost, acc, prediction, emb = network(
+    #             pd.read_input(data), pd.read_input(label),
+    #             len(word_dict) + 2)
+    #         pd.write_output(cost)
+    #         pd.write_output(acc)
+    #     cost, acc = pd()
+    #     cost = fluid.layers.mean(cost)
+    #     acc = fluid.layers.mean(acc)
+    dict_dim = len(word_dict) + 2
+    emb_dim = 128
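Both train_net and the new retrain_net map the string network_name to one of five network constructors through the same if/elif chain. A table-driven equivalent (a hypothetical refactor, not what the commit does; it assumes the constructors imported from Senta/nets.py are in scope):

    NETWORKS = {
        "bilstm_net": bilstm_net,
        "bow_net": bow_net,
        "cnn_net": cnn_net,
        "lstm_net": lstm_net,
        "gru_net": gru_net,
    }

    def resolve_network(network_name):
        # raises instead of print-and-return, so a typo fails loudly
        try:
            return NETWORKS[network_name]
        except KeyError:
            raise ValueError("unknown network type: %s" % network_name)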
@@ -164,7 +213,8 @@ def train_net(train_reader,
     hid_dim2 = 96
     class_dim = 2
-    module_link = "https://paddlehub.cdn.bcebos.com/senta/bow_module_3.tar.gz"
+    # module_link = "https://paddlehub.cdn.bcebos.com/senta/bow_module_3.tar.gz"
+    module_link = "./models/bow_net/"
     module = hub.Module(module_link)
     main_program = fluid.Program()
@@ -178,6 +228,7 @@ def train_net(train_reader,
     label = fluid.layers.data(name="label", shape=[1], dtype="int64")
     data = fluid.default_main_program().global_block().var("words")
+    #TODO(ZeyuChen): how to get output paramter according to proto config
     emb = module.get_module_output()
     # # # embedding layer
@@ -198,20 +249,6 @@ def train_net(train_reader,
         fluid.layers.cross_entropy(input=pred, label=label))
     acc = fluid.layers.accuracy(input=pred, label=label)
-    # Original Senta BoW networks
-    # label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    # data = fluid.layers.data(
-    #     name="words", shape=[1], dtype="int64", lod_level=1)
-    # cost, acc, pred, emb = network(data, label, len(word_dict) + 2)
-
-    # print("new program")
-    # with open("program_senta.prototxt", "w") as fo:
-    #     fo.write(str(fluid.default_main_program()))
-    # print("program_senta", fluid.default_main_program())
     with open("senta_load_module.prototxt", "w") as fo:
         fo.write(str(fluid.default_main_program()))
     print("senta_load_module", fluid.default_main_program())
     # set optimizer
     sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
     sgd_optimizer.minimize(cost)
@@ -246,8 +283,8 @@ def train_net(train_reader,
     # print("senta_load_module", fluid.default_main_program())
     # save the model
-    bow_module_path = save_dirname + "/" + "bow_module"
-    fluid.io.save_inference_model(bow_module_path, ["words"], emb, exe)
+    module_path = os.path.join(save_dirname, network_name + "_retrain")
+    fluid.io.save_inference_model(module_path, ["words"], emb, exe)


 def eval_net(test_reader, use_gpu, model_path=None):
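The retrained module now lands in a path built with os.path.join instead of manual string concatenation; join normalizes separators, so a trailing slash in save_dirname no longer produces a doubled "/". A quick illustration:

    import os

    save_dirname, network_name = "models/", "bow_net"
    print(save_dirname + "/" + "bow_module")                      # "models//bow_module" (old style)
    print(os.path.join(save_dirname, network_name + "_retrain"))  # "models/bow_net_retrain"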
@@ -339,9 +376,12 @@ def main(args):
             args.word_dict_path, args.batch_size, args.mode)
-        train_net(train_reader, word_dict, args.model_type, args.use_gpu,
-                  args.is_parallel, args.model_path, args.lr, args.batch_size,
-                  args.num_passes)
+        # train_net(train_reader, word_dict, args.model_type, args.use_gpu,
+        #           args.is_parallel, args.model_path, args.lr, args.batch_size,
+        #           args.num_passes)
+        retrain_net(train_reader, word_dict, args.model_type, args.use_gpu,
+                    args.is_parallel, args.model_path, args.lr, args.batch_size,
+                    args.num_passes)
     # eval mode
     elif args.mode == "eval":
paddle_hub/module.py
@@ -19,18 +19,33 @@ from __future__ import print_function
 import paddle.fluid as fluid
 import numpy as np
 import tempfile
 import utils
 import os

 from collections import defaultdict
 from downloader import download_and_uncompress

 __all__ = ["Module", "ModuleDesc"]

+DICT_NAME = "dict.txt"
+ASSETS_PATH = "assets"
+
+
+def mkdir(path):
+    """ the same as the shell command mkdir -p "
+    """
+    if not os.path.exists(path):
+        os.makedirs(path)
+

 class Module(object):
     def __init__(self, module_url):
-        # donwload module
-        module_dir = download_and_uncompress(module_url)
+        if module_url.startswith("http"):  # if it's remote url links
+            # if it's remote url link, then download and uncompress it
+            module_dir = download_and_uncompress(module_url)
+        else:
+            # otherwise it's local path, no need to deal with it
+            module_dir = module_url

         # load paddle inference model
         place = fluid.CPUPlace()
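Module.__init__ now accepts either a remote URL or a local directory. Isolated as a sketch (download_and_uncompress is the helper this file imports from downloader), with the caveat that startswith("http") matches both "http://" and "https://", and would also misfire on a local directory whose name happens to begin with "http":

    def resolve_module_dir(module_url):
        if module_url.startswith("http"):
            # remote url: download and uncompress into a local module directory
            return download_and_uncompress(module_url)
        # otherwise assume it is already a local module directory
        return module_url

This is what lets retrain_net above pass module_link = "./models/bow_net/" to hub.Module instead of the CDN tarball.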
@@ -90,6 +105,7 @@ class Module(object):
     def get_module_output(self):
         for var in self.inference_program.list_vars():
             print(var)
+            # NOTE: just hack for load Senta's
             if var.name == "embedding_0.tmp_0":
                 return var
@@ -128,17 +144,22 @@ class Module(object):
     # load assets folder
     def _load_assets(self, module_dir):
-        assets_dir = os.path.join(module_dir, "assets")
-        tokens_path = os.path.join(assets_dir, "tokens.txt")
+        assets_dir = os.path.join(module_dir, ASSETS_PATH)
+        tokens_path = os.path.join(assets_dir, DICT_NAME)
         word_id = 0

         with open(tokens_path) as fi:
-            words = fi.readlines()
-            words = map(str.strip, words)
-            for w in words:
-                self.dict[w] = word_id
-                word_id += 1
-                print(w, word_id)
+            #TODO(ZeyuChen) check whether word id is duplicated and valid
+            for line in fi:
+                w, w_id = line.split()
+                self.dict[w] = int(w_id)
+            # words = map(str.strip, words)
+            # for w in words:
+            #     self.dict[w] = word_id
+            #     word_id += 1
+            #     print(w, word_id)

     def add_module_feed_list(self, feed_list):
         self.feed_list = feed_list
@@ -146,35 +167,27 @@ class Module(object):
     def add_module_output_list(self, output_list):
         self.output_list = output_list

-    def _mkdir(self, path):
-        if not os.path.exists(path):
-            os.makedirs(path)


 class ModuleDesc(object):
     def __init__(self):
         pass

-    @staticmethod
-    def _mkdir(path):
-        if not os.path.exists(path):
-            os.makedirs(path)

     @staticmethod
     def save_dict(path, word_dict, dict_name):
         """ Save dictionary for NLP module
         """
-        ModuleDesc._mkdir(path)
+        mkdir(path)
         with open(os.path.join(path, dict_name), "w") as fo:
-            print("tokens.txt path", os.path.join(path, "tokens.txt"))
-            dict_str = "\n".join(word_dict)
-            fo.write(dict_str)
+            print("tokens.txt path", os.path.join(path, DICT_NAME))
+            for w in word_dict:
+                w_id = word_dict[w]
+                fo.write("{}\t{}\n".format(w, w_id))

     @staticmethod
-    def save_module_dict(module_path, word_dict, dict_name="dict.txt"):
+    def save_module_dict(module_path, word_dict, dict_name=DICT_NAME):
         """ Save dictionary for NLP module
         """
-        assets_path = os.path.join(module_path, "assets")
+        assets_path = os.path.join(module_path, ASSETS_PATH)
         print("save_module_dict", assets_path)
         ModuleDesc.save_dict(assets_path, word_dict, dict_name)
         pass
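The net effect of the last two hunks is a new on-disk dictionary format: save_dict now writes one "word<TAB>id" pair per line into assets/dict.txt, and _load_assets parses it back with line.split(). A standalone round-trip sketch of that format with no Paddle dependency (the helper names here are mine, not the repo's):

    import os
    import tempfile

    DICT_NAME = "dict.txt"

    def save_dict(path, word_dict):
        # mirrors ModuleDesc.save_dict: one "word<TAB>id" pair per line
        if not os.path.exists(path):
            os.makedirs(path)
        with open(os.path.join(path, DICT_NAME), "w") as fo:
            for w, w_id in word_dict.items():
                fo.write("{}\t{}\n".format(w, w_id))

    def load_dict(path):
        # mirrors Module._load_assets: split on whitespace, cast the id back to int
        vocab = {}
        with open(os.path.join(path, DICT_NAME)) as fi:
            for line in fi:
                w, w_id = line.split()
                vocab[w] = int(w_id)
        return vocab

    assets = os.path.join(tempfile.mkdtemp(), "assets")
    save_dict(assets, {"the": 0, "movie": 1, "great": 2})
    assert load_dict(assets) == {"the": 0, "movie": 1, "great": 2}

One sharp edge the sketch inherits from the real parser: line.split() assumes tokens contain no whitespace.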