动态图forward内存泄露
Created by: Gaffey
-
版本、环境信息: 1)PaddlePaddle版本:paddle 1.5 post 97 3)GPU:CUDA 9.0 CUDNN 7.1
-
训练信息 1)单机,单卡
-
复现信息: 复现代码
import os
import objgraph
from PIL import Image
import paddle
from paddle import fluid
from paddle.fluid.layer_helper import LayerHelper
import numpy as np
class CIFAR(fluid.dygraph.Layer):
    """Small dygraph network used to reproduce the forward-pass dict growth.

    Two convolution layers, one pooling layer and one softmax FC layer are
    constructed, but ``forward`` only runs the pooling and FC layers.
    """

    def __init__(self, name_scope):
        """Create all sub-layers under ``name_scope``."""
        super(CIFAR, self).__init__(name_scope)

        # Both convolutions are built but never invoked by forward().
        self._conv1 = fluid.dygraph.Conv2D(
            self.full_name(), 64, 3, 1, 1, act=None)
        self._conv2 = fluid.dygraph.Conv2D(
            self.full_name(), 64, 3, 1, 1, act=None)

        self.global_pooling = fluid.dygraph.Pool2D(
            self.full_name(), 32, "avg", 32, 0, True)

        # An alternative init scale was left disabled in the repro:
        #   scale = (2.0 / (512**2*10))**0.5
        fc_scope = self.full_name()
        fc_weight_attr = fluid.param_attr.ParamAttr(
            initializer=fluid.initializer.NormalInitializer(
                loc=0.0, scale=0.01))
        self._fc = fluid.dygraph.FC(
            fc_scope, 10, param_attr=fc_weight_attr, act="softmax")

    def forward(self, inputs):
        """Return the FC output computed on the pooled ``inputs``."""
        # _conv1 / _conv2 are deliberately bypassed; their calls were
        # commented out in the original repro.
        pooled = self.global_pooling(inputs)
        return self._fc(pooled)
def train(train_reader, test_reader, model):
    """Train ``model`` for 100 epochs, printing objgraph stats around forward.

    Before and after every forward pass the most common live Python object
    types are printed via ``objgraph.show_most_common_types()`` so that
    growth between iterations can be observed.

    Args:
        train_reader: Callable returning an iterable of batches. Each sample
            in a batch is indexed as ``sample[0]`` (image, reshaped to
            3x32x32) and ``sample[1]`` (label).
        test_reader: Accepted for interface symmetry; not used here.
        model: Dygraph layer to train.
    """
    # L2 weight decay was left disabled in the repro:
    #   regularization=fluid.regularizer.L2DecayRegularizer(5e-4)
    momentum_opt = fluid.optimizer.MomentumOptimizer(
        learning_rate=0.1, momentum=0.9)

    for epoch in range(100):
        # Never read; kept so the set of live objects matches the repro.
        acc_list = []
        model.train()

        for batch_id, data in enumerate(train_reader()):
            image_batch = np.array(
                [sample[0].reshape(3, 32, 32) for sample in data]
            ).astype('float32')
            label_batch = np.array(
                [sample[1] for sample in data]
            ).astype('int64').reshape(-1, 1)

            img = fluid.dygraph.to_variable(image_batch)
            label = fluid.dygraph.to_variable(label_batch)
            label.stop_gradient = True

            # Snapshot of live object types right before the forward pass.
            print("before forward")
            objgraph.show_most_common_types()

            probs = model(img)

            # Snapshot right after the forward pass, for comparison.
            print("after forward")
            objgraph.show_most_common_types()

            loss = fluid.layers.cross_entropy(probs, label)
            avg_loss = fluid.layers.mean(loss)
            avg_loss.backward()
            momentum_opt.minimize(avg_loss)

            # NOTE(review): the pasted source lost its indentation and had a
            # "save checkpoint" placeholder comment here with no code. The
            # gradient reset is assumed to run once per batch - confirm.
            model.clear_gradients()
def main():
    """Build the model and the cifar readers, then train in dygraph mode."""
    batch_size = 128

    with fluid.dygraph.guard():
        net = CIFAR("cifar10")

        # Both readers yield fixed-size batches; incomplete tails are dropped.
        test_reader = paddle.batch(
            paddle.dataset.cifar.test10(),
            batch_size=batch_size,
            drop_last=True)
        train_reader = paddle.batch(
            paddle.dataset.cifar.train10(),
            batch_size=batch_size,
            drop_last=True)

        train(train_reader, test_reader, net)


# Run the repro only when executed as a script.
if __name__ == "__main__":
    main()
- 问题描述: 训练时发现进程占用的内存不断增长,从最初的0.9%逐步上升到40%,导致无法正常提交集群训练。用objgraph查看后发现,dict类型对象的数量在不断增长:上述代码运行稳定后,每经过一次前向,dict增加5个,到下一次前向之前会释放3个,净增长2个。并且dict的增长数量与conv、fc的数量相关(fc和conv各增加3个、释放1个,pool增加2个、释放2个)。因此怀疑动态图前向过程中存在内存泄露问题。