PaddlePaddle / PaddleHub

Commit 767b41b3, authored on Jan 08, 2019 by Zeyu Chen

add sentiment-classification example

Parent: a00bc6a9
Showing 12 changed files, with 44,341 additions and 1 deletion (+44341 −1):

.pre-commit-config.yaml                                        +1     −1
example/sentiment-classification/create_module.sh              +1     −0
example/sentiment-classification/data/test_data/corpus.test    +200   −0
example/sentiment-classification/data/train.vocab              +32896 −0
example/sentiment-classification/data/train_data/corpus.train  +10000 −0
example/sentiment-classification/finetune.sh                   +1     −0
example/sentiment-classification/nets.py                       +192   −0
example/sentiment-classification/sentiment_classify.py         +381   −0
example/sentiment-classification/test_create_module.py         +329   −0
example/sentiment-classification/test_finetune.py              +226   −0
example/sentiment-classification/train.sh                      +1     −0
example/sentiment-classification/utils.py                      +113   −0
.pre-commit-config.yaml

@@ -16,4 +16,4 @@
     - id: trailing-whitespace
     - id: detect-private-key
     - id: check-symlinks
-    - id: check-added-large-files
+#    - id: check-added-large-files
example/sentiment-classification/create_module.sh
0 → 100755

python test_create_module.py --train_data_path ./data/train_data/corpus.train --word_dict_path ./data/train.vocab --mode train --model_path ./models
example/sentiment-classification/data/test_data/corpus.test
0 → 100755
(Diff collapsed: a 200-line test corpus.)

example/sentiment-classification/data/train.vocab
0 → 100755
(Diff collapsed: a 32,896-entry vocabulary file.)

example/sentiment-classification/data/train_data/corpus.train
0 → 100755
(Diff collapsed: a 10,000-line training corpus.)
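Judging from data_reader in utils.py at the bottom of this diff, each corpus line holds an integer class label, a tab, and a space-tokenized sentence. The snippet below is a hypothetical illustration of that parse, not an actual line from the dataset:

    # Hypothetical corpus line in the "<label>\t<token> <token> ..." layout
    # that data_reader (utils.py) expects.
    line = "1\tthis movie is great"
    cols = line.strip().split("\t")
    label = int(cols[0])         # -> 1
    tokens = cols[1].split(" ")  # -> ['this', 'movie', 'is', 'great']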
example/sentiment-classification/finetune.sh
0 → 100755

python sentiment_classify.py --train_data_path ./data/train_data/corpus.train --word_dict_path ./data/train.vocab --mode finetune --model_path ./models
example/sentiment-classification/nets.py
0 → 100755

import sys
import time
import numpy as np
import paddle.fluid as fluid
import paddle
import paddle_hub as hub


def bow_net(data,
            label,
            dict_dim,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2):
    """
    Bow net
    """
    # embedding layer
    emb = fluid.layers.embedding(
        input=data, size=[dict_dim, emb_dim], param_attr="bow_embedding")
    # bow layer
    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
    bow_tanh = fluid.layers.tanh(bow)
    # full connect layer
    fc_1 = fluid.layers.fc(
        input=bow_tanh, size=hid_dim, act="tanh", name="bow_fc1")
    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh", name="bow_fc2")
    # softmax layer
    prediction = fluid.layers.fc(
        input=[fc_2], size=class_dim, act="softmax", name="fc_softmax")
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)
    return avg_cost, acc, prediction, bow_tanh


def cnn_net(data,
            label,
            dict_dim,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2,
            win_size=3):
    """
    Conv net
    """
    # embedding layer
    emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
    # convolution layer
    conv_3 = fluid.nets.sequence_conv_pool(
        input=emb,
        num_filters=hid_dim,
        filter_size=win_size,
        act="tanh",
        pool_type="max")
    # full connect layer
    fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2)
    # softmax layer
    prediction = fluid.layers.fc(input=[fc_1], size=class_dim, act="softmax")
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)
    return avg_cost, acc, prediction, [conv_3]


def lstm_net(data,
             label,
             dict_dim,
             emb_dim=128,
             hid_dim=128,
             hid_dim2=96,
             class_dim=2,
             emb_lr=30.0):
    """
    Lstm net
    """
    # embedding layer
    emb = fluid.layers.embedding(
        input=data,
        size=[dict_dim, emb_dim],
        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
    # Lstm layer
    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
    lstm_h, c = fluid.layers.dynamic_lstm(
        input=fc0, size=hid_dim * 4, is_reverse=False)
    # max pooling layer
    lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
    lstm_max_tanh = fluid.layers.tanh(lstm_max)
    # full connect layer
    fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
    # softmax layer
    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)
    return avg_cost, acc, prediction, lstm_max_tanh


def bilstm_net(data,
               label,
               dict_dim,
               emb_dim=128,
               hid_dim=128,
               hid_dim2=96,
               class_dim=2,
               emb_lr=30.0):
    """
    Bi-Lstm net
    """
    # embedding layer
    emb = fluid.layers.embedding(
        input=data,
        size=[dict_dim, emb_dim],
        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
    # bi-lstm layer
    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
    rfc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
    lstm_h, c = fluid.layers.dynamic_lstm(
        input=fc0, size=hid_dim * 4, is_reverse=False)
    rlstm_h, c = fluid.layers.dynamic_lstm(
        input=rfc0, size=hid_dim * 4, is_reverse=True)
    # extract last layer
    lstm_last = fluid.layers.sequence_last_step(input=lstm_h)
    rlstm_last = fluid.layers.sequence_last_step(input=rlstm_h)
    lstm_last_tanh = fluid.layers.tanh(lstm_last)
    rlstm_last_tanh = fluid.layers.tanh(rlstm_last)
    # concat layer
    lstm_concat = fluid.layers.concat(input=[lstm_last, rlstm_last], axis=1)
    # full connect layer
    fc1 = fluid.layers.fc(input=lstm_concat, size=hid_dim2, act='tanh')
    # softmax layer
    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)
    return avg_cost, acc, prediction, lstm_concat


def gru_net(data,
            label,
            dict_dim,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2,
            emb_lr=30.0):
    """
    gru net
    """
    emb = fluid.layers.embedding(
        input=data,
        size=[dict_dim, emb_dim],
        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3)
    gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False)
    gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max')
    gru_max_tanh = fluid.layers.tanh(gru_max)
    fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh')
    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)
    return avg_cost, acc, prediction, gru_max_tanh
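All five builders share the same calling convention: they take a LoD sequence of word ids plus a label variable and return (avg_cost, acc, prediction, feature). A minimal sketch of wiring one up, mirroring what train_net in sentiment_classify.py below does (the vocabulary size here is a made-up placeholder):

    import paddle.fluid as fluid
    from nets import bow_net

    # "words" is a variable-length sequence of word ids (lod_level=1);
    # "label" is the class id for the whole sequence.
    data = fluid.layers.data(name="words", shape=[1], dtype="int64", lod_level=1)
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    # dict_dim is vocabulary size + 2, as in sentiment_classify.py
    # (10000 is a placeholder vocabulary size).
    avg_cost, acc, prediction, sent_emb = bow_net(data, label, dict_dim=10000 + 2)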
example/sentiment-classification/sentiment_classify.py
0 → 100755

# coding: utf-8
import sys
import os
import time
import unittest
import contextlib
import logging
import argparse
import ast
import paddle.fluid as fluid
import paddle_hub as hub
import utils
from nets import bow_net
from nets import cnn_net
from nets import lstm_net
from nets import bilstm_net
from nets import gru_net

logger = logging.getLogger("paddle-fluid")
logger.setLevel(logging.INFO)


def parse_args():
    parser = argparse.ArgumentParser("Sentiment Classification.")
    # training data path
    parser.add_argument(
        "--train_data_path",
        type=str,
        required=False,
        help="The path of training data. Should be given in train mode!")
    # test data path
    parser.add_argument(
        "--test_data_path",
        type=str,
        required=False,
        help="The path of test data. Should be given in eval or infer mode!")
    # word_dict path
    parser.add_argument(
        "--word_dict_path",
        type=str,
        required=True,
        help="The path of word dictionary.")
    # current mode
    parser.add_argument(
        "--mode",
        type=str,
        required=True,
        choices=['train', 'eval', 'infer', 'finetune'],
        help="train/eval/infer mode")
    # model type
    parser.add_argument(
        "--model_type", type=str, default="bow_net", help="type of model")
    # model save path
    parser.add_argument(
        "--model_path",
        type=str,
        default="models",
        required=True,
        help="The path to save the trained models.")
    # Number of passes for the training task.
    parser.add_argument(
        "--num_passes",
        type=int,
        default=10,
        help="Number of passes for the training task.")
    # Batch size
    parser.add_argument(
        "--batch_size",
        type=int,
        default=256,
        help="The number of training examples in one forward/backward pass.")
    # lr value for training
    parser.add_argument(
        "--lr", type=float, default=0.002, help="The lr value for training.")
    # Whether to use gpu
    parser.add_argument(
        "--use_gpu",
        type=ast.literal_eval,
        default=False,
        help="Whether to use gpu to train the model.")
    # parallel train
    parser.add_argument(
        "--is_parallel",
        type=ast.literal_eval,
        default=False,
        help="Whether to train the model in parallel.")
    args = parser.parse_args()
    return args


def train_net(train_reader,
              word_dict,
              network_name,
              use_gpu,
              parallel,
              save_dirname,
              lr=0.002,
              batch_size=128,
              pass_num=30):
    """
    train network
    """
    if network_name == "bilstm_net":
        network = bilstm_net
    elif network_name == "bow_net":
        network = bow_net
    elif network_name == "cnn_net":
        network = cnn_net
    elif network_name == "lstm_net":
        network = lstm_net
    elif network_name == "gru_net":
        network = gru_net
    else:
        print("unknown network type")
        return

    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    cost, acc, pred, sent_emb = network(data, label, len(word_dict) + 2)

    # set optimizer
    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
    sgd_optimizer.minimize(cost)

    # set place, executor, datafeeder
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(feed_list=["words", "label"], place=place)
    exe.run(fluid.default_startup_program())

    # start training...
    for pass_id in range(pass_num):
        data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
        for batch in train_reader():
            avg_cost_np, avg_acc_np = exe.run(
                fluid.default_main_program(),
                feed=feeder.feed(batch),
                fetch_list=[cost, acc],
                return_numpy=True)
            data_size = len(batch)
            total_acc += data_size * avg_acc_np
            total_cost += data_size * avg_cost_np
            data_count += data_size
        avg_cost = total_cost / data_count
        avg_acc = total_acc / data_count
        print("[train info]: pass_id: %d, avg_acc: %f, avg_cost: %f" %
              (pass_id, avg_acc, avg_cost))

    # create Senta module
    module_dir = os.path.join(save_dirname, network_name)
    signature = hub.create_signature(
        "default", inputs=[data], outputs=[sent_emb])
    hub.create_module(
        sign_arr=signature,
        program=fluid.default_main_program(),
        path=module_dir)


def retrain_net(train_reader,
                word_dict,
                network_name,
                use_gpu,
                parallel,
                save_dirname,
                lr=0.002,
                batch_size=128,
                pass_num=30):
    """
    train network
    """
    if network_name == "bilstm_net":
        network = bilstm_net
    elif network_name == "bow_net":
        network = bow_net
    elif network_name == "cnn_net":
        network = cnn_net
    elif network_name == "lstm_net":
        network = lstm_net
    elif network_name == "gru_net":
        network = gru_net
    else:
        print("unknown network type")
        return

    emb_dim = 128
    hid_dim = 128
    hid_dim2 = 96
    class_dim = 2
    dict_dim = len(word_dict) + 2

    module_dir = os.path.join(save_dirname, network_name)
    print("module_dir", module_dir)
    module = hub.Module(module_dir=module_dir)
    main_program = fluid.Program()
    startup_program = fluid.Program()
    # use switch program to test fine-tuning
    fluid.framework.switch_main_program(module.get_inference_program())
    # remove feed fetch operator and variable
    hub.ModuleUtils.remove_feed_fetch_op(fluid.default_main_program())

    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    data = module.get_feed_var_by_index(0)
    # TODO(ZeyuChen): how to get output parameter according to proto config
    sent_emb = module.get_fetch_var_by_index(0)
    fc_2 = fluid.layers.fc(
        input=sent_emb, size=hid_dim2, act="tanh", name="bow_fc2")
    # softmax layer
    pred = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
    # print(fluid.default_main_program())
    cost = fluid.layers.mean(
        fluid.layers.cross_entropy(input=pred, label=label))
    acc = fluid.layers.accuracy(input=pred, label=label)

    # set optimizer
    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
    sgd_optimizer.minimize(cost)

    # set place, executor, datafeeder
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(feed_list=["words", "label"], place=place)
    exe.run(fluid.default_startup_program())

    # start training...
    for pass_id in range(pass_num):
        data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
        for batch in train_reader():
            avg_cost_np, avg_acc_np = exe.run(
                fluid.default_main_program(),
                feed=feeder.feed(batch),
                fetch_list=[cost, acc],
                return_numpy=True)
            data_size = len(batch)
            total_acc += data_size * avg_acc_np
            total_cost += data_size * avg_cost_np
            data_count += data_size
        avg_cost = total_cost / data_count
        avg_acc = total_acc / data_count
        print("[train info]: pass_id: %d, avg_acc: %f, avg_cost: %f" %
              (pass_id, avg_acc, avg_cost))

    # save the model
    module_dir = os.path.join(save_dirname, network_name)
    signature = hub.create_signature(
        "default", inputs=[data], outputs=[sent_emb])
    hub.create_module(
        sign_arr=signature,
        program=fluid.default_main_program(),
        path=module_dir)


def eval_net(test_reader, use_gpu, model_path=None):
    """
    Evaluation function
    """
    if model_path is None:
        print(str(model_path) + " can not be found")
        return
    # set place, executor
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        # load the saved model
        [inference_program, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(model_path, exe)
        # compute 2class and 3class accuracy
        class2_acc, class3_acc = 0.0, 0.0
        total_count, neu_count = 0, 0
        for data in test_reader():
            # infer a batch
            pred = exe.run(
                inference_program,
                feed=utils.data2tensor(data, place),
                fetch_list=fetch_targets,
                return_numpy=True)
            for i, val in enumerate(data):
                class3_label, class2_label = utils.get_predict_label(
                    pred[0][i, 1])
                true_label = val[1]
                if class2_label == true_label:
                    class2_acc += 1
                if class3_label == true_label:
                    class3_acc += 1
                if true_label == 1.0:
                    neu_count += 1
            total_count += len(data)

        class2_acc = class2_acc / (total_count - neu_count)
        class3_acc = class3_acc / total_count
        print("[test info] model_path: %s, class2_acc: %f, class3_acc: %f" %
              (model_path, class2_acc, class3_acc))


def infer_net(test_reader, use_gpu, model_path=None):
    """
    Inference function
    """
    if model_path is None:
        print(str(model_path) + " can not be found")
        return
    # set place, executor
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        # load the saved model
        [inference_program, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(model_path, exe)
        for data in test_reader():
            # infer a batch
            pred = exe.run(
                inference_program,
                feed=utils.data2tensor(data, place),
                fetch_list=fetch_targets,
                return_numpy=True)
            for i, val in enumerate(data):
                class3_label, class2_label = utils.get_predict_label(
                    pred[0][i, 1])
                pos_prob = pred[0][i, 1]
                neg_prob = 1 - pos_prob
                print("predict label: %d, pos_prob: %f, neg_prob: %f" %
                      (class3_label, pos_prob, neg_prob))


def main(args):
    # train mode
    if args.mode == "train":
        # prepare_data to get word_dict, train_reader
        word_dict, train_reader = utils.prepare_data(
            args.train_data_path, args.word_dict_path, args.batch_size,
            args.mode)
        train_net(train_reader, word_dict, args.model_type, args.use_gpu,
                  args.is_parallel, args.model_path, args.lr, args.batch_size,
                  args.num_passes)
    # finetune mode
    if args.mode == "finetune":
        # prepare_data to get word_dict, train_reader
        word_dict, train_reader = utils.prepare_data(
            args.train_data_path, args.word_dict_path, args.batch_size,
            args.mode)
        retrain_net(train_reader, word_dict, args.model_type, args.use_gpu,
                    args.is_parallel, args.model_path, args.lr,
                    args.batch_size, args.num_passes)
    # eval mode
    elif args.mode == "eval":
        # prepare_data to get word_dict, test_reader
        word_dict, test_reader = utils.prepare_data(
            args.test_data_path, args.word_dict_path, args.batch_size,
            args.mode)
        eval_net(test_reader, args.use_gpu, args.model_path)
    # infer mode
    elif args.mode == "infer":
        # prepare_data to get word_dict, test_reader
        word_dict, test_reader = utils.prepare_data(
            args.test_data_path, args.word_dict_path, args.batch_size,
            args.mode)
        infer_net(test_reader, args.use_gpu, args.model_path)


if __name__ == "__main__":
    args = parse_args()
    main(args)
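The four modes chain together: train builds a network from nets.py and exports the sentence-embedding subgraph as a module, finetune reloads that module and grafts a fresh classifier head onto it, and eval/infer run a saved inference model. A condensed sketch of the module round-trip, using only calls that appear in the script (paths follow the defaults in this commit's shell scripts):

    import paddle_hub as hub

    # train mode ends by exporting the sentence-embedding subgraph:
    #   signature = hub.create_signature("default", inputs=[data], outputs=[sent_emb])
    #   hub.create_module(sign_arr=signature, program=..., path="./models/bow_net")

    # finetune mode starts by loading it back and recovering the endpoints:
    module = hub.Module(module_dir="./models/bow_net")
    words = module.get_feed_var_by_index(0)      # the "words" input variable
    sent_emb = module.get_fetch_var_by_index(0)  # the exported sentence embedding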
example/sentiment-classification/test_create_module.py
0 → 100755

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# coding: utf-8
import sys
import os
import time
import unittest
import contextlib
import logging
import argparse
import ast
import utils
import paddle.fluid as fluid
import paddle_hub as hub
from nets import bow_net
from nets import cnn_net
from nets import lstm_net
from nets import bilstm_net
from nets import gru_net

logger = logging.getLogger("paddle-fluid")
logger.setLevel(logging.INFO)


def parse_args():
    parser = argparse.ArgumentParser("Sentiment Classification.")
    # training data path
    parser.add_argument(
        "--train_data_path",
        type=str,
        required=False,
        help="The path of training data. Should be given in train mode!")
    # test data path
    parser.add_argument(
        "--test_data_path",
        type=str,
        required=False,
        help="The path of test data. Should be given in eval or infer mode!")
    # word_dict path
    parser.add_argument(
        "--word_dict_path",
        type=str,
        required=True,
        help="The path of word dictionary.")
    # current mode
    parser.add_argument(
        "--mode",
        type=str,
        required=True,
        choices=['train', 'eval', 'infer'],
        help="train/eval/infer mode")
    # model type
    parser.add_argument(
        "--model_type", type=str, default="bow_net", help="type of model")
    # model save path
    parser.add_argument(
        "--model_path",
        type=str,
        default="models",
        required=True,
        help="The path to save the trained models.")
    # Number of passes for the training task.
    parser.add_argument(
        "--num_passes",
        type=int,
        default=3,
        help="Number of passes for the training task.")
    # Batch size
    parser.add_argument(
        "--batch_size",
        type=int,
        default=256,
        help="The number of training examples in one forward/backward pass.")
    # lr value for training
    parser.add_argument(
        "--lr", type=float, default=0.002, help="The lr value for training.")
    # Whether to use gpu
    parser.add_argument(
        "--use_gpu",
        type=ast.literal_eval,
        default=False,
        help="Whether to use gpu to train the model.")
    # parallel train
    parser.add_argument(
        "--is_parallel",
        type=ast.literal_eval,
        default=False,
        help="Whether to train the model in parallel.")
    args = parser.parse_args()
    return args


def bow_net_module(data,
                   label,
                   dict_dim,
                   emb_dim=128,
                   hid_dim=128,
                   hid_dim2=96,
                   class_dim=2):
    """
    Bow net
    """
    module_dir = "./model/test_create_module"
    # embedding layer
    emb = fluid.layers.embedding(
        input=data, size=[dict_dim, emb_dim], param_attr="bow_embedding")
    # bow layer
    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
    bow_tanh = fluid.layers.tanh(bow)
    # full connect layer
    fc_1 = fluid.layers.fc(
        input=bow_tanh, size=hid_dim, act="tanh", name="bow_fc1")
    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh", name="bow_fc2")
    # softmax layer
    prediction = fluid.layers.fc(
        input=[fc_2], size=class_dim, act="softmax", name="fc_softmax")
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)
    return avg_cost, acc, prediction, emb


def train_net(train_reader,
              word_dict,
              network_name,
              use_gpu,
              parallel,
              save_dirname,
              lr=0.002,
              batch_size=128,
              pass_num=10):
    """
    train network
    """
    if network_name == "bilstm_net":
        network = bilstm_net
    elif network_name == "bow_net":
        network = bow_net
    elif network_name == "cnn_net":
        network = cnn_net
    elif network_name == "lstm_net":
        network = lstm_net
    elif network_name == "gru_net":
        network = gru_net
    else:
        print("unknown network type")
        return

    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    cost, acc, pred, emb = network(data, label, len(word_dict) + 2)

    # set optimizer
    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
    sgd_optimizer.minimize(cost)

    # set place, executor, datafeeder
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(feed_list=["words", "label"], place=place)
    exe.run(fluid.default_startup_program())

    # start training...
    for pass_id in range(pass_num):
        data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
        for batch in train_reader():
            avg_cost_np, avg_acc_np = exe.run(
                fluid.default_main_program(),
                feed=feeder.feed(batch),
                fetch_list=[cost, acc],
                return_numpy=True)
            data_size = len(batch)
            total_acc += data_size * avg_acc_np
            total_cost += data_size * avg_cost_np
            data_count += data_size
        avg_cost = total_cost / data_count
        avg_acc = total_acc / data_count
        print("[train info]: pass_id: %d, avg_acc: %f, avg_cost: %f" %
              (pass_id, avg_acc, avg_cost))

    # save the model
    module_dir = os.path.join(save_dirname, network_name)
    config = hub.ModuleConfig(module_dir)
    config.save_dict(word_dict=word_dict)

    # saving config
    input_desc = {"words": data.name}
    output_desc = {"emb": emb.name}
    config.register_feed_signature(input_desc)
    config.register_fetch_signature(output_desc)
    config.dump()
    feed_var_name = config.feed_var_name("words")
    fluid.io.save_inference_model(module_dir, [feed_var_name], emb, exe)


def retrain_net(train_reader,
                word_dict,
                network_name,
                use_gpu,
                parallel,
                save_dirname,
                lr=0.002,
                batch_size=128,
                pass_num=30):
    """
    train network
    """
    if network_name == "bilstm_net":
        network = bilstm_net
    elif network_name == "bow_net":
        network = bow_net
    elif network_name == "cnn_net":
        network = cnn_net
    elif network_name == "lstm_net":
        network = lstm_net
    elif network_name == "gru_net":
        network = gru_net
    else:
        print("unknown network type")
        return

    dict_dim = len(word_dict) + 2
    emb_dim = 128
    hid_dim = 128
    hid_dim2 = 96
    class_dim = 2

    module_path = "./models/bow_net"
    module = hub.Module(module_dir=module_path)
    main_program = fluid.Program()
    startup_program = fluid.Program()
    # use switch program to test fine-tuning
    fluid.framework.switch_main_program(module.get_inference_program())
    # remove feed fetch operator and variable
    hub.ModuleUtils.remove_feed_fetch_op(fluid.default_main_program())

    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    data = module.get_feed_var("words")
    emb = module.get_fetch_var("emb")

    # bow layer
    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
    bow_tanh = fluid.layers.tanh(bow)
    # full connect layer
    fc_1 = fluid.layers.fc(
        input=bow_tanh, size=hid_dim, act="tanh", name="bow_fc1")
    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh", name="bow_fc2")
    # softmax layer
    pred = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
    cost = fluid.layers.mean(
        fluid.layers.cross_entropy(input=pred, label=label))
    acc = fluid.layers.accuracy(input=pred, label=label)

    # set optimizer
    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
    sgd_optimizer.minimize(cost)

    # set place, executor, datafeeder
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(feed_list=["words", "label"], place=place)
    exe.run(fluid.default_startup_program())

    # start training...
    for pass_id in range(pass_num):
        data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
        for batch in train_reader():
            avg_cost_np, avg_acc_np = exe.run(
                fluid.default_main_program(),
                feed=feeder.feed(batch),
                fetch_list=[cost, acc],
                return_numpy=True)
            data_size = len(batch)
            total_acc += data_size * avg_acc_np
            total_cost += data_size * avg_cost_np
            data_count += data_size
        avg_cost = total_cost / data_count
        avg_acc = total_acc / data_count
        print("[train info]: pass_id: %d, avg_acc: %f, avg_cost: %f" %
              (pass_id, avg_acc, avg_cost))

    # save the model
    module_dir = os.path.join(save_dirname, network_name + "_retrain")
    fluid.io.save_inference_model(module_dir, ["words"], emb, exe)
    config = hub.ModuleConfig(module_dir)
    config.save_dict(word_dict=word_dict)
    config.dump()


def main(args):
    # prepare_data to get word_dict, train_reader
    word_dict, train_reader = utils.prepare_data(
        args.train_data_path, args.word_dict_path, args.batch_size, args.mode)
    train_net(train_reader, word_dict, args.model_type, args.use_gpu,
              args.is_parallel, args.model_path, args.lr, args.batch_size,
              args.num_passes)

    # NOTE(ZeyuChen): can't run train_net and retrain_net together
    # retrain_net(train_reader, word_dict, args.model_type, args.use_gpu,
    #             args.is_parallel, args.model_path, args.lr, args.batch_size,
    #             args.num_passes)


if __name__ == "__main__":
    args = parse_args()
    main(args)
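Unlike sentiment_classify.py, which addresses module variables by index, this test registers named feed/fetch signatures through ModuleConfig; that is what lets test_finetune.py below look the variables up by name. A brief sketch under the same assumptions (the module must already have been saved to ./models/bow_net):

    import paddle_hub as hub

    module = hub.Module(module_dir="./models/bow_net")
    data = module.get_feed_var("words")  # name registered via register_feed_signature
    emb = module.get_fetch_var("emb")    # name registered via register_fetch_signature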
example/sentiment-classification/test_finetune.py
0 → 100755

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# coding: utf-8
import sys
import os
import time
import unittest
import contextlib
import logging
import argparse
import ast
import utils
import paddle.fluid as fluid
import paddle_hub as hub
from nets import bow_net
from nets import cnn_net
from nets import lstm_net
from nets import bilstm_net
from nets import gru_net

logger = logging.getLogger("paddle-fluid")
logger.setLevel(logging.INFO)


def parse_args():
    parser = argparse.ArgumentParser("Sentiment Classification.")
    # training data path
    parser.add_argument(
        "--train_data_path",
        type=str,
        required=False,
        help="The path of training data. Should be given in train mode!")
    # test data path
    parser.add_argument(
        "--test_data_path",
        type=str,
        required=False,
        help="The path of test data. Should be given in eval or infer mode!")
    # word_dict path
    parser.add_argument(
        "--word_dict_path",
        type=str,
        required=True,
        help="The path of word dictionary.")
    # current mode
    parser.add_argument(
        "--mode",
        type=str,
        required=True,
        choices=['train', 'eval', 'infer'],
        help="train/eval/infer mode")
    # model type
    parser.add_argument(
        "--model_type", type=str, default="bow_net", help="type of model")
    # model save path
    parser.add_argument(
        "--model_path",
        type=str,
        default="models",
        required=True,
        help="The path to save the trained models.")
    # Number of passes for the training task.
    parser.add_argument(
        "--num_passes",
        type=int,
        default=10,
        help="Number of passes for the training task.")
    # Batch size
    parser.add_argument(
        "--batch_size",
        type=int,
        default=256,
        help="The number of training examples in one forward/backward pass.")
    # lr value for training
    parser.add_argument(
        "--lr", type=float, default=0.002, help="The lr value for training.")
    # Whether to use gpu
    parser.add_argument(
        "--use_gpu",
        type=ast.literal_eval,
        default=False,
        help="Whether to use gpu to train the model.")
    # parallel train
    parser.add_argument(
        "--is_parallel",
        type=ast.literal_eval,
        default=False,
        help="Whether to train the model in parallel.")
    args = parser.parse_args()
    return args


def retrain_net(train_reader,
                word_dict,
                network_name,
                use_gpu,
                parallel,
                save_dirname,
                lr=0.002,
                batch_size=128,
                pass_num=30):
    """
    train network
    """
    if network_name == "bilstm_net":
        network = bilstm_net
    elif network_name == "bow_net":
        network = bow_net
    elif network_name == "cnn_net":
        network = cnn_net
    elif network_name == "lstm_net":
        network = lstm_net
    elif network_name == "gru_net":
        network = gru_net
    else:
        print("unknown network type")
        return

    dict_dim = len(word_dict) + 2
    emb_dim = 128
    hid_dim = 128
    hid_dim2 = 96
    class_dim = 2

    module_path = "./models/bow_net"
    module = hub.Module(module_dir=module_path)
    main_program = fluid.Program()
    startup_program = fluid.Program()
    # use switch program to test fine-tuning
    fluid.framework.switch_main_program(module.get_inference_program())
    # remove feed fetch operator and variable
    hub.ModuleUtils.remove_feed_fetch_op(fluid.default_main_program())
    # remove_feed_fetch_op(fluid.default_main_program())

    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    # data = fluid.default_main_program().global_block().var("words")
    data = module.get_feed_var("words")
    # TODO(ZeyuChen): how to get output parameter according to proto config
    emb = module.get_fetch_var("emb")

    # # embedding layer
    # emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
    # # input=data, size=[dict_dim, emb_dim], param_attr="bow_embedding")
    # bow layer
    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
    bow_tanh = fluid.layers.tanh(bow)
    # full connect layer
    fc_1 = fluid.layers.fc(
        input=bow_tanh, size=hid_dim, act="tanh", name="bow_fc1")
    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh", name="bow_fc2")
    # softmax layer
    pred = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
    # print(fluid.default_main_program())
    cost = fluid.layers.mean(
        fluid.layers.cross_entropy(input=pred, label=label))
    acc = fluid.layers.accuracy(input=pred, label=label)

    # set optimizer
    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
    sgd_optimizer.minimize(cost)

    # set place, executor, datafeeder
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(feed_list=["words", "label"], place=place)
    exe.run(fluid.default_startup_program())

    # start training...
    for pass_id in range(pass_num):
        data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
        for batch in train_reader():
            avg_cost_np, avg_acc_np = exe.run(
                fluid.default_main_program(),
                feed=feeder.feed(batch),
                fetch_list=[cost, acc],
                return_numpy=True)
            data_size = len(batch)
            total_acc += data_size * avg_acc_np
            total_cost += data_size * avg_cost_np
            data_count += data_size
        avg_cost = total_cost / data_count
        avg_acc = total_acc / data_count
        print("[train info]: pass_id: %d, avg_acc: %f, avg_cost: %f" %
              (pass_id, avg_acc, avg_cost))

    # save the model
    module_dir = os.path.join(save_dirname, network_name + "_retrain")
    fluid.io.save_inference_model(module_dir, ["words"], emb, exe)

    input_desc = {"words": data.name}
    output_desc = {"emb": emb.name}
    config = hub.ModuleConfig(module_dir)
    config.save_dict(word_dict=word_dict)
    config.dump()


def main(args):
    # prepare_data to get word_dict, train_reader
    word_dict, train_reader = utils.prepare_data(
        args.train_data_path, args.word_dict_path, args.batch_size, args.mode)
    retrain_net(train_reader, word_dict, args.model_type, args.use_gpu,
                args.is_parallel, args.model_path, args.lr, args.batch_size,
                args.num_passes)


if __name__ == "__main__":
    args = parse_args()
    main(args)
example/sentiment-classification/train.sh
0 → 100755

python sentiment_classify.py --train_data_path ./data/train_data/corpus.train --word_dict_path ./data/train.vocab --mode train --model_path ./models
example/sentiment-classification/utils.py
0 → 100755

import os
import sys
import time
import numpy as np
import random
import paddle.fluid as fluid
import paddle


def get_predict_label(pos_prob):
    neg_prob = 1 - pos_prob
    # threshold should be (1, 0.5)
    neu_threshold = 0.55
    if neg_prob > neu_threshold:
        class3_label = 0
    elif pos_prob > neu_threshold:
        class3_label = 2
    else:
        class3_label = 1
    if pos_prob >= neg_prob:
        class2_label = 2
    else:
        class2_label = 0
    return class3_label, class2_label


def to_lodtensor(data, place):
    """
    convert to LoDTensor
    """
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res


def data2tensor(data, place):
    """
    data2tensor
    """
    input_seq = to_lodtensor(map(lambda x: x[0], data), place)
    return {"words": input_seq}


def data_reader(file_path, word_dict, is_shuffle=True):
    """
    Convert word sequence into slot
    """
    unk_id = len(word_dict)
    all_data = []
    with open(file_path, "r") as fin:
        for line in fin:
            cols = line.strip().split("\t")
            label = int(cols[0])
            wids = [
                word_dict[x] if x in word_dict else unk_id
                for x in cols[1].split(" ")
            ]
            all_data.append((wids, label))
    if is_shuffle:
        random.shuffle(all_data)

    def reader():
        for doc, label in all_data:
            yield doc, label

    return reader


def load_vocab(file_path):
    """
    load the given vocabulary
    """
    vocab = {}
    with open(file_path) as f:
        wid = 0
        for line in f:
            vocab[line.strip()] = wid
            wid += 1
    vocab["<unk>"] = len(vocab)
    return vocab


def prepare_data(data_path, word_dict_path, batch_size, mode):
    """
    prepare data
    """
    assert os.path.exists(
        word_dict_path), "The given word dictionary does not exist."
    if mode == "train":
        assert os.path.exists(
            data_path), "The given training data does not exist."
    if mode == "eval" or mode == "infer":
        assert os.path.exists(data_path), "The given test data does not exist."

    word_dict = load_vocab(word_dict_path)
    if mode == "train":
        train_reader = paddle.batch(
            data_reader(data_path, word_dict, True), batch_size)
        return word_dict, train_reader
    else:
        test_reader = paddle.batch(
            data_reader(data_path, word_dict, False), batch_size)
        return word_dict, test_reader
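To see how get_predict_label's 0.55 neutral band behaves, a small illustration (the probability values are chosen arbitrarily):

    from utils import get_predict_label

    # Returns (class3_label, class2_label); 0 = negative, 1 = neutral
    # (3-class scheme only), 2 = positive.
    print(get_predict_label(0.80))  # (2, 2): positive in both schemes
    print(get_predict_label(0.50))  # (1, 2): neutral in 3-class, positive in 2-class
    print(get_predict_label(0.10))  # (0, 0): negative in both schemes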