Cannot run operator on place CUDAPlace(0) at [/paddle/paddle/fluid/framework/operator.cc:104]
Created by: Angus07
Training script:
import os
import gzip
import paddle
import paddle.fluid as fluid
import read_short as reader
import utils
from functools import partial
import network_conf_short5 as network
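# NOTE: with CUDA_VISIBLE_DEVICES='8', physical GPU 8 is the only visible
# device, so CUDAPlace(0) below addresses it as logical device 0.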
os.environ["CUDA_VISIBLE_DEVICES"] = '8'
GPU_CNT = 1
BUFFER_SIZE = 12800
step = 0
def load_initial_model(model_path, parameters):
    """Initialize parameters in the network from a trained model.

    This is useful in resuming the training from previously saved models.

    Arguments:
        - model_path: The path of a trained model.
        - parameters: The parameters in a network which will be initialized
          from the specified model.
    """
    with gzip.open(model_path, "rb") as f:
        parameters.init_from_tar(f)
def train(topology,
          train_data_dir=None,
          test_data_dir=None,
          word_dict_path=None,
          l1_dict_path=None,
          label_dict_path=None,
          model_save_dir="models",
          batch_size=128,
          num_passes=20):
    """
    Train the DNN model.
    """
    place = fluid.CUDAPlace(0)
    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)
    if word_dict_path is None or not os.path.exists(word_dict_path):
        utils.logger.info(("word dictionary is not given, the dictionary "
                           "is automatically built from the training data."))
        utils.build_dict(
            data_dir=train_data_dir,
            save_path=word_dict_path,
            use_col=0,
            cutoff_fre=10,
            insert_extra_words=["<UNK>"])
    if not os.path.exists(label_dict_path):
        utils.logger.info(("label dictionary is not given, the dictionary "
                           "is automatically built from the training data."))
        utils.build_dict(
            data_dir=train_data_dir, save_path=label_dict_path, use_col=2)
    word_dict = utils.load_dict(word_dict_path)
    # l1_dict is a feature of the topic tagger task
    l1_dict = utils.load_dict(l1_dict_path)
    lbl_dict = utils.load_dict(label_dict_path)
    class_num = len(lbl_dict)
    utils.logger.info("class number is : %d." % (len(lbl_dict)))
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.reader.buffered(
                reader.train_reader(train_data_dir, word_dict, l1_dict, lbl_dict),
                BUFFER_SIZE),
            buf_size=BUFFER_SIZE),
        batch_size=batch_size)
    if test_data_dir is not None:
        test_reader = paddle.batch(
            paddle.reader.shuffle(
                reader.train_reader(test_data_dir, word_dict, l1_dict, lbl_dict),
                buf_size=BUFFER_SIZE),
            batch_size=batch_size)
    else:
        test_reader = None
    dict_dim = len(word_dict)
    utils.logger.info("length of word dictionary is : %d." % (dict_dim))
    def optimizer_func():
        return fluid.optimizer.Adam(
            learning_rate=1e-3,
            regularization=fluid.regularizer.L2DecayRegularizer(8e-4))

    # create the trainer
    trainer = fluid.Trainer(
        train_func=partial(topology, dict_dim, class_num),
        place=place,
        optimizer_func=optimizer_func)
    feed_order = ["title1", "title2", "title3", "l1", "label"]
    def _event_handler(event):
        """
        Define end batch and end pass event handler
        """
        global step
        if isinstance(event, fluid.EndStepEvent):
            if step % 100 == 0:
                utils.logger.info(
                    "Pass %d, Batch %d, Step %d, Cost %f\n" %
                    (event.pass_id, event.batch_id, step, event.cost))
            if step % 10000 == 0:
                if test_reader is not None:
                    result = trainer.test(
                        reader=test_reader, feed_order=feed_order)
                    utils.logger.info(
                        "Test at Pass %d, Step %d, Cost %f\n" %
                        (event.pass_id, step, result.cost))
                with gzip.open(
                        os.path.join(model_save_dir,
                                     "dnn_params_step_%d.tar.gz" % step),
                        "w") as f:
                    trainer.save_params(model_save_dir)
            step += 1

    trainer.train(
        num_passes,
        reader=train_reader,
        event_handler=_event_handler,
        feed_order=feed_order)
    utils.logger.info("Training has finished.")
def main(args):
    """
    main
    """
    if args.nn_type == "cnn":
        topology = network.convolution_net
    train(
        topology=topology,
        train_data_dir=args.train_data_dir,
        test_data_dir=args.test_data_dir,
        word_dict_path=args.word_dict,
        label_dict_path=args.label_dict,
        l1_dict_path="../conf/level_1_tag_25_raw",
        batch_size=args.batch_size,
        num_passes=args.num_passes,
        model_save_dir=args.model_save_dir)
if __name__ == "__main__":
    args = utils.parse_train_cmd()
    if args.train_data_dir is not None:
        assert args.word_dict and args.label_dict, (
            "the parameters train_data_dir, word_dict_path, and label_dict_path "
            "should be set at the same time.")
    main(args)
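Side note on the checkpointing code: the `gzip.open(...)` context in `_event_handler` is never used, since `trainer.save_params(model_save_dir)` writes the parameters directly into `model_save_dir`; each checkpoint therefore overwrites the previous one and the opened `.tar.gz` file stays empty. A minimal sketch of one way to keep per-step checkpoints, assuming Fluid's `Trainer.save_params` accepts a directory path:

    # hypothetical per-step checkpoint directory; replaces the unused gzip handle
    ckpt_dir = os.path.join(model_save_dir, "dnn_params_step_%d" % step)
    trainer.save_params(ckpt_dir)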
Error message:

Traceback (most recent call last):
  File "train_short5.py", line 193, in <module>
    main(args)
  File "train_short5.py", line 184, in main
    model_save_dir=args.model_save_dir)
  File "train_short5.py", line 104, in train
    optimizer_func=optimizer_func)
  File "/home/du/chenliangyu/python27-gcc482/lib/python2.7/site-packages/paddle/fluid/trainer.py", line 284, in __init__
    exe.run(self.startup_program)
  File "/home/du/chenliangyu/python27-gcc482/lib/python2.7/site-packages/paddle/fluid/executor.py", line 443, in run
    self.executor.run(program.desc, scope, 0, True, True)
paddle.fluid.core.EnforceNotMet: Cannot run operator on place CUDAPlace(0) at [/paddle/paddle/fluid/framework/operator.cc:104]
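In Fluid of this era, the check at operator.cc:104 throws "Cannot run operator on place CUDAPlace(0)" when an operator is asked to run on a GPU place but the installed PaddlePaddle binary was built without CUDA support. That would explain why it fails already in `exe.run(self.startup_program)` while constructing the `fluid.Trainer`, before a single training step runs: no operator at all can execute on `CUDAPlace`. The usual fix is installing the `paddlepaddle-gpu` wheel instead of the CPU-only `paddlepaddle` one. A minimal guard, assuming the Fluid-era helper `fluid.core.is_compiled_with_cuda()`:

    import paddle.fluid as fluid

    # Fall back to CPU when the installed wheel has no CUDA support;
    # a CPU-only build cannot run any operator on CUDAPlace.
    if fluid.core.is_compiled_with_cuda():
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()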
Network structure:
"""
Define the network
"""
import paddle.fluid as fluid
import sys
__all__ = ["fc_net", "convolution_net"]
LAYER1_SIZE = 896
LAYER2_SIZE = 448
LAYER3_SIZE = 224
def convolution_net(dict_dim, class_dim, emb_dim=128,
                    hid_dim=128, is_infer=False):
    data1 = fluid.layers.data(
        name="title1", shape=[1], dtype="int64", lod_level=1)
    data2 = fluid.layers.data(
        name="title2", shape=[1], dtype="int64", lod_level=1)
    data3 = fluid.layers.data(
        name="title3", shape=[1], dtype="int64", lod_level=1)
    data4 = fluid.layers.data(
        name="l1", shape=[1], dtype="int64", lod_level=1)
    if not is_infer:
        # lbl = paddle.layer.data("label", paddle.data_type.dense_vector(class_num, int))
        # label = fluid.layers.data(name="label", shape=[class_dim], dtype="int64")
        label = fluid.layers.data(
            name="label", shape=[class_dim], dtype="int64")
    # define the embedding layers
    emb1 = fluid.layers.embedding(
        input=data1, size=[dict_dim, emb_dim], is_sparse=True,
        param_attr=fluid.ParamAttr(name='emb'))
    emb2 = fluid.layers.embedding(
        input=data2, size=[dict_dim, emb_dim], is_sparse=True,
        param_attr=fluid.ParamAttr(name='emb'))
    emb3 = fluid.layers.embedding(
        input=data3, size=[dict_dim, emb_dim], is_sparse=True,
        param_attr=fluid.ParamAttr(name='emb'))
    emb4 = fluid.layers.embedding(
        input=data4, size=[27, emb_dim], is_sparse=True,
        param_attr=fluid.ParamAttr(name='l1_emb'))
    # average pooling to reduce each input sequence into a vector (non-sequence)
    seq_pool1 = fluid.layers.sequence_pool(input=emb1, pool_type='average')
    seq_pool2 = fluid.layers.sequence_pool(input=emb2, pool_type='average')
    seq_pool3 = fluid.layers.sequence_pool(input=emb3, pool_type='average')
    seq_pool4 = fluid.layers.sequence_pool(input=emb4, pool_type='average')
    # convolution layers with max pooling
    conv_1_win3 = fluid.nets.sequence_conv_pool(
        input=emb1,
        num_filters=hid_dim,
        filter_size=3,
        act="relu",
        pool_type="max")
    conv_1_win4 = fluid.nets.sequence_conv_pool(
        input=emb1,
        num_filters=hid_dim,
        filter_size=4,
        act="relu",
        pool_type="max")
    conv_1_win5 = fluid.nets.sequence_conv_pool(
        input=emb1,
        num_filters=hid_dim,
        filter_size=5,
        act="relu",
        pool_type="max")
    conv_2_win3 = fluid.nets.sequence_conv_pool(
        input=emb2,
        num_filters=hid_dim,
        filter_size=3,
        act="relu",
        pool_type="max")
    conv_2_win4 = fluid.nets.sequence_conv_pool(
        input=emb2,
        num_filters=hid_dim,
        filter_size=4,
        act="relu",
        pool_type="max")
    conv_2_win5 = fluid.nets.sequence_conv_pool(
        input=emb2,
        num_filters=hid_dim,
        filter_size=5,
        act="relu",
        pool_type="max")
    concat_vec = fluid.layers.concat(
        input=[conv_1_win3, conv_1_win4, conv_1_win5,
               conv_2_win3, conv_2_win4, conv_2_win5,
               seq_pool1, seq_pool2, seq_pool3, seq_pool4],
        axis=1)
    print >> sys.stderr, concat_vec
    bn = fluid.layers.batch_norm(input=concat_vec, act='relu')
    prediction = fluid.layers.fc(input=bn, size=2000, act="relu")
    # hidden = paddle.layer.dropout(input=hidden, dropout_rate=0.5)
    prob = fluid.layers.fc(input=prediction, size=class_dim, act='sigmoid')
    cost = fluid.layers.cross_entropy(input=prob, label=label, soft_label=True)
    cost = fluid.layers.mean(x=cost)
    if is_infer:
        return prob
    else:
        return [cost, prob, label]
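Separately from the placement error, `cross_entropy(..., soft_label=True)` expects the label to be a float probability distribution over the classes, while the `label` layer above is declared as `int64`; as far as I know there is no kernel for that combination, so the forward pass would likely fail once training starts even on a working GPU build. A minimal sketch of the matching declaration (only the dtype changes):

    # soft labels are per-class probabilities, so they must be float and
    # match the dtype of `prob`; int64 labels are only for soft_label=False
    label = fluid.layers.data(name="label", shape=[class_dim], dtype="float32")
    cost = fluid.layers.cross_entropy(input=prob, label=label, soft_label=True)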