Paddle v2 multi-GPU training is slower than single GPU.
Created by: phonism
Text classification using a CNN.
import paddle.v2 as paddle


def convolution_net(dict_dim, class_dim=2, emb_dim=128, hid_dim=128):
    # input layers
    title_data = paddle.layer.data(
        "title", paddle.data_type.integer_value_sequence(dict_dim))
    lbl = paddle.layer.data("label", paddle.data_type.integer_value(class_dim))
    # embedding layer
    title_emb = paddle.layer.embedding(input=title_data, size=emb_dim)
    # convolution layers with max pooling
    title_conv_max_2 = paddle.networks.sequence_conv_pool(
        input=title_emb, context_len=2, hidden_size=hid_dim)  # , pool_type=paddle.pooling.Max())
    title_conv_max_3 = paddle.networks.sequence_conv_pool(
        input=title_emb, context_len=3, hidden_size=hid_dim)  # , pool_type=paddle.pooling.Max())
    title_conv_max_4 = paddle.networks.sequence_conv_pool(
        input=title_emb, context_len=4, hidden_size=hid_dim)  # , pool_type=paddle.pooling.Max())
    concat_layer = paddle.layer.concat(
        input=[title_conv_max_2, title_conv_max_3, title_conv_max_4])
    dropout_layer = paddle.layer.dropout(input=concat_layer, dropout_rate=0.5)
    # fc and output layer
    output = paddle.layer.fc(
        input=dropout_layer, size=class_dim, act=paddle.activation.Softmax())
    cost = paddle.layer.classification_cost(input=output, label=lbl)
    return cost, output, lbl
def train_cnn_model(num_pass):
    dict_dim = 640000
    class_dim = 4225
    # define data readers
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            lambda: data_reader(flag=True), buf_size=1000),
        batch_size=64)
    test_reader = paddle.batch(
        lambda: data_reader(flag=False), batch_size=64)
    # network config
    [cost, output, label] = convolution_net(dict_dim, class_dim=class_dim)
    # create parameters
    parameters = paddle.parameters.create(cost)
    # create optimizer
    adam_optimizer = paddle.optimizer.Adam(
        learning_rate=1e-3,
        regularization=paddle.optimizer.L2Regularization(rate=1e-3),
        model_average=paddle.optimizer.ModelAverage(average_window=0.5))
    # add auc evaluator
    paddle.evaluator.auc(input=output, label=label)
    # create trainer
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=adam_optimizer)
    feeding = {'title': 0, 'label': 1}
    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        feeding=feeding,
        # use_sparse_updater=1,
        num_passes=num_pass)
if __name__ == "__main__":
    paddle.init(use_gpu=True, trainer_count=1)
    # paddle.init(use_gpu=True, trainer_count=4)
    num_pass = 500
    # train_cnn_model(num_pass=num_pass)
    cnn_infer("in")
GPU info: Tesla K40m * 4
Run with trainer_count=1: 100 batches take 6s.
+------------------------------------------------------+
| NVIDIA-SMI 352.39 Driver Version: 352.39 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla K40m On | 0000:03:00.0 Off | 0 |
| N/A 37C P0 98W / 235W | 2723MiB / 11519MiB | 30% Default |
+-------------------------------+----------------------+----------------------+
| 1 Tesla K40m On | 0000:04:00.0 Off | 0 |
| N/A 32C P0 61W / 235W | 143MiB / 11519MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 2 Tesla K40m On | 0000:83:00.0 Off | 0 |
| N/A 32C P0 62W / 235W | 143MiB / 11519MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 3 Tesla K40m On | 0000:84:00.0 Off | 0 |
| N/A 32C P0 62W / 235W | 143MiB / 11519MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 16645 C python 2697MiB |
| 1 16645 C python 118MiB |
| 2 16645 C python 118MiB |
| 3 16645 C python 118MiB |
+-----------------------------------------------------------------------------+
Run with trainer_count=4: 100 batches take 25s.
+------------------------------------------------------+
| NVIDIA-SMI 352.39 Driver Version: 352.39 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla K40m On | 0000:03:00.0 Off | 0 |
| N/A 26C P0 66W / 235W | 2683MiB / 11519MiB | 35% Default |
+-------------------------------+----------------------+----------------------+
| 1 Tesla K40m On | 0000:04:00.0 Off | 0 |
| N/A 25C P0 70W / 235W | 1139MiB / 11519MiB | 28% Default |
+-------------------------------+----------------------+----------------------+
| 2 Tesla K40m On | 0000:83:00.0 Off | 0 |
| N/A 26C P0 66W / 235W | 1117MiB / 11519MiB | 44% Default |
+-------------------------------+----------------------+----------------------+
| 3 Tesla K40m On | 0000:84:00.0 Off | 0 |
| N/A 25C P0 63W / 235W | 1153MiB / 11519MiB | 47% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 9520 C python 2658MiB |
| 1 9520 C python 1113MiB |
| 2 9520 C python 1091MiB |
| 3 9520 C python 1127MiB |
+-----------------------------------------------------------------------------+