关于gpu加速问题
Created by: ARDUJS
环境
- paddle 1.7.2
- python 3.7.5
问题
网络中使用多个 CRF 层(linear_chain_crf)时,GPU 使用率明显下降。请问 CRF 运算是否无法在 GPU 上加速,还是我有哪里没有设置正确?
复现代码
# 导入 PaddlePaddle 函数库.
import paddle
from paddle import fluid
import paddle.fluid.layers as layers
# In[2]:
# Sequence geometry.
maxlen = 256            # padded length: second dim of every id / label input

# Input feature sizes.
char2id_num = 3172      # char vocabulary size (the embedding table adds 2 rows)
word_vector_dim = 300   # last dim of the `t_word` float input
char_size = 128         # common width of the char, word and position features

# Output size.
class_num = 110         # width of the emission layer in front of each CRF
# In[3]:
# Stacked LSTM encoder whose levels alternate direction.
def stacked_lstm_net(emb, hid_dim, stacked_num):
    """Build a stack of fc + dynamic_lstm levels on top of ``emb``.

    Level 1 projects ``emb`` with an fc layer and runs a dynamic_lstm over
    it using the layer's default direction.  Every further level
    (2..``stacked_num``) feeds both outputs of the previous level into a new
    fc layer followed by another dynamic_lstm; even-numbered levels are
    created with ``is_reverse=True``.

    Args:
        emb: input sequence variable fed to the first fc layer.
        hid_dim: ``size`` passed to every fc and dynamic_lstm layer.
        stacked_num: total number of fc/LSTM levels; values below 2 leave
            only the first level.

    Returns:
        Tuple ``(fc_out, lstm_out)`` with the outputs of the last level.
        No sequence pooling or softmax is applied here.
    """
    projection = fluid.layers.fc(input=emb, size=hid_dim)
    # dynamic_lstm returns a pair; only its first element is consumed here.
    recurrent, _ = fluid.layers.dynamic_lstm(input=projection, size=hid_dim)

    # Remaining levels of the stack.
    for level in range(2, stacked_num + 1):
        projection = fluid.layers.fc(
            input=[projection, recurrent], size=hid_dim)
        recurrent, _ = fluid.layers.dynamic_lstm(
            input=projection,
            size=hid_dim,
            is_reverse=level % 2 == 0,
        )

    return projection, recurrent
# Padded batch inputs; the leading None is the batch dimension.
t_char = fluid.data(
    name='t_char', shape=[None, maxlen], dtype='int64')
t_word = fluid.data(
    name='t_word', shape=[None, maxlen, word_vector_dim], dtype='float32')
t_posid = fluid.data(
    name='pos_id', shape=[None, maxlen], dtype='int64')
seq_len = fluid.data(
    name='seq_len', shape=[None, 1], dtype='int64', lod_level=0)
# Drop the trailing axis so the lengths can be passed to sequence_unpad.
seq_len_used = fluid.layers.squeeze(seq_len, axes=[1])

# Three feature streams, each batch x 256 x 128 while still padded.
char_em = fluid.embedding(
    input=t_char, size=(char2id_num + 2, char_size))
word_em = fluid.layers.fc(
    input=t_word, size=char_size, num_flatten_dims=2)
pv = fluid.embedding(
    input=t_posid, size=(maxlen, char_size))

# Un-pad every stream with the per-sample lengths (same order as created).
char_em, word_em, pv = [
    fluid.layers.sequence_unpad(padded, length=seq_len_used)
    for padded in (char_em, word_em, pv)
]

# Sum the streams, apply dropout, then run the stacked LSTM encoder.
t = fluid.layers.dropout(char_em + word_em + pv, dropout_prob=0.25)
hidden = stacked_lstm_net(emb=t, hid_dim=256, stacked_num=2)
# One padded int64 label input per tagging head, created in name order.
labels = [
    fluid.layers.data(name=label_name, shape=[None, maxlen, 1], dtype='int64')
    for label_name in ("l1", "l2", "l3", "l4", "l5")
]
l1, l2, l3, l4, l5 = labels

# Number of CRF heads to build; only the first `crf_num` label inputs are
# consumed by the CRF loop.
crf_num = 3

# Per-head slots indexed 1..crf_num; index 0 is reserved for the encoder
# output. Unfilled slots hold "" as a placeholder.
emissions = [""] * (crf_num + 1)
ret_infers = [""] * (crf_num + 1)
infers = [""] * (crf_num + 1)
losses = [""] * (crf_num + 1)
emissions[0] = hidden
# In[5]:
# Build `crf_num` chained CRF heads and accumulate their mean losses.
#
# Head i projects the previous head's emissions (or, for i == 1, the encoder
# outputs stored in emissions[0]) to `class_num` scores, registers a
# transition parameter named 'crfw_<i>', and computes the linear-chain CRF
# loss against the un-padded labels of task i.
#
# NOTE(review): the falling GPU utilisation reported for larger `crf_num` may
# come from linear_chain_crf not having a GPU kernel in this Paddle
# version -- confirm against the Paddle operator documentation.
res = []
Loss = 0
for i in range(1, crf_num + 1):
    emissions[i] = fluid.layers.fc(
        size=class_num,
        input=emissions[i - 1],
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(low=-0.1, high=0.1),
            regularizer=fluid.regularizer.L2DecayRegularizer(
                regularization_coeff=1e-4),
        ),
    )

    # Size the transition matrix from THIS head's emissions. The original
    # always read emissions[1], which is only right while every head has
    # the same width.
    size = emissions[i].shape[1]
    crf_param_name = 'crfw_' + str(i)
    fluid.layers.create_parameter(
        shape=[size + 2, size],
        dtype=emissions[i].dtype,
        name=crf_param_name,
    )

    # Strip padding from the labels so they line up with the emissions.
    labels[i - 1] = fluid.layers.sequence_unpad(labels[i - 1], seq_len_used)
    losses[i] = fluid.layers.linear_chain_crf(
        input=emissions[i],
        label=labels[i - 1],
        param_attr=fluid.ParamAttr(name=crf_param_name),
    )
    losses[i] = fluid.layers.mean(losses[i])
    Loss += losses[i]
# In[8]:
def optimizer_func():
    """Return the Adam optimizer used for training (learning rate 1e-4)."""
    return fluid.optimizer.Adam(learning_rate=1e-4)
# ### 创建执行器
# In[10]:
# Clone the program for evaluation BEFORE minimize() is called on it.
test_program = fluid.default_main_program().clone(for_test=True)

# Despite its name this is the Adam optimizer returned by optimizer_func().
sgd_optimizer = optimizer_func()
sgd_optimizer.minimize(Loss)

# Pick the device and create the executor.
use_cuda = True
if use_cuda:
    place = fluid.CUDAPlace(0)
else:
    place = fluid.CPUPlace()
exe = fluid.Executor(place)
main_program = fluid.default_main_program()

# Run the startup program once before training.
exe.run(fluid.default_startup_program())

# Feeder for the named inputs, in the order the batch tuple is assembled.
feeder = fluid.DataFeeder(
    feed_list=[
        't_char', 't_word', 'pos_id', 'seq_len',
        'l1', 'l2', 'l3', 'l4', 'l5',
    ],
    place=place,
)
import numpy as np
# Synthetic batch: 32 samples padded to 256 steps, with 300-dim word vectors.
# The original experiment read its batches from a project-local `reader_5`
# data generator; random arrays stand in for it in this repro.
T1 = np.random.randint(0, 10, (32, 256))
T2 = np.random.rand(32, 256, 300)
L1 = np.random.randint(0, 10, (32, 256, 1))
L2 = np.random.randint(0, 10, (32, 256, 1))
L3 = np.random.randint(0, 10, (32, 256, 1))
L4 = np.random.randint(0, 10, (32, 256, 1))
L5 = np.random.randint(0, 10, (32, 256, 1))
Posid = np.random.randint(0, 10, (32, 256))
# NOTE(review): lengths are drawn from [0, 255], so a zero-length sequence
# can occur -- confirm the sequence ops accept that.
Seq_len = np.random.randint(0, 256, (32, 1))

# The batch never changes, so convert it to feed tensors once instead of on
# every iteration; this keeps per-step host work out of the measurement.
feed_data = feeder.feed([[T1, T2, Posid, Seq_len, L1, L2, L3, L4, L5]])

batch_id = 0
for i in range(1000):
    metrics = exe.run(main_program,
                      feed=feed_data,
                      fetch_list=[Loss])
    # Report the fetched loss every 10th batch.
    if batch_id % 10 == 0:
        print("loss {}".format(metrics[0][0]))
    batch_id += 1
修改 crf_num 变量(取值范围为 [1, 5]),其值越大,GPU 使用率越低。