Why does dygraph run faster than the executor (static graph) in each training iteration?
Created by: larenzhang
如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息:
- 标题:简洁、精准概括您的问题,例如“Insufficient Memory xxx”
- 版本、环境信息: 1)PaddlePaddle版本:1.7.0 2)GPU: titan xp, cuda 10.1, cudnn 7.6.1 3)系统环境:ubuntu 16.04, python3.5
- 训练信息 1)单机, 单卡
dygraph训练代码:
import argparse
import re
import subprocess
import time
import numpy as np
import models.dynamic_resnet as res_models
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
import paddle
def parse_args():
    """Build and parse the command-line options for the dygraph benchmark."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--arch', type=str, default='ResNet18')
    # Numeric hyper-parameters: (flag, value type, default).
    for flag, typ, default in (
            ('--batch-size', int, 64),          # 128
            ('--learning-rate', float, 0.025),
            ('--momentum', float, 0.9),
            ('--weight-decay', float, 1e-4),
    ):
        parser.add_argument(flag, type=typ, default=default)
    # Boolean switches, all off unless passed on the command line.
    for switch in ('--dynamic', '--single', '--use_data_parallel'):
        parser.add_argument(switch, action='store_true')
    parser.add_argument('--step', type=int, default=100)
    return parser.parse_args()
def main():
    """Entry point: parse CLI options and hand them to the benchmark worker."""
    worker(parse_args())
def worker(args):
    """Benchmark dygraph training for one ResNet model.

    Builds the model, warms up for 10 iterations, then times ``args.step``
    iterations on a single synthetic batch and prints the average
    per-iteration latency plus GPU memory reported by ``nvidia-smi``.

    Args:
        args: parsed CLI namespace (arch, batch_size, learning_rate,
            momentum, weight_decay, dynamic, step).

    Raises:
        NotImplementedError: if ``args.arch`` is not a ResNet variant.
    """

    def train_func(data):
        """Run one forward/backward/update step on a list of (image, label)."""
        model.train()
        image = np.array([x[0].reshape(3, 224, 224) for x in data]).astype('float32')
        label = np.array([x[1] for x in data]).astype('int64').reshape(-1, 1)
        img = to_variable(image)
        lab = to_variable(label)
        logits = model(img)
        loss = fluid.layers.softmax_with_cross_entropy(logits, lab)
        avg_loss = fluid.layers.mean(loss)
        avg_loss.backward()
        optimizer.minimize(avg_loss)
        model.clear_gradients()

    device_num = fluid.core.get_cuda_device_count()
    use_cuda = device_num > 0

    # Synthetic data: one random batch, reused for every timed iteration.
    image = np.random.random([args.batch_size, 3, 224, 224]).astype(np.float32)
    label = np.random.randint(1000, size=[args.batch_size, 1])

    def reader_generator():
        def reader():
            for i in range(len(image)):
                yield image[i, :], label[i]
        return reader

    # BUG FIX: the original unconditionally used CUDAPlace(0) even when no
    # GPU is visible (use_cuda was computed but never used); fall back to
    # CPUPlace so the script still runs on CPU-only machines.
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        if 'ResNet' in args.arch:
            model = res_models.__dict__[args.arch]()
        else:
            raise NotImplementedError
        optimizer = fluid.optimizer.Momentum(
            parameter_list=model.parameters(),
            learning_rate=args.learning_rate,
            momentum=args.momentum,
            regularization=fluid.regularizer.L2Decay(args.weight_decay))
        train_reader = paddle.batch(
            reader_generator(), batch_size=args.batch_size, drop_last=False)
        data = next(iter(train_reader()))
        # Warm-up: keep one-time CUDA/allocator costs out of the timing.
        for _ in range(10):
            train_func(data)
        start_time = time.time()
        for _ in range(args.step):
            train_func(data)
        avg_time = (time.time() - start_time) / args.step
        # ROBUSTNESS: nvidia-smi may be missing or print no usage table;
        # avoid an IndexError on an empty findall() result.
        stdout = subprocess.getoutput('nvidia-smi')
        matches = re.findall(r'\| (.*?)MiB /', stdout)
        mem = matches[0].strip() if matches else 'N/A'
        print('Paddle,Model:{0},Dy:{1},#GPU:{2},batch_size:{3},mem:{4}M, avg_time:{5:.3f}ms'.
              format(args.arch, args.dynamic, device_num, args.batch_size, mem, avg_time * 1000))
# Run the benchmark only when executed as a script (not on import).
if __name__ == "__main__":
    main()
executor(静态图)训练代码:
import argparse
import os
import re
import subprocess
import time
import numpy as np
import models.static_resnet as res_models
import paddle.fluid as fluid
def parse_args():
    """Parse the CLI options for the static-graph (executor) benchmark."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--arch', type=str, default='ResNet18')
    # Numeric hyper-parameters, registered from a (flag, type, default) table.
    for flag, typ, default in (
            ('--batch-size', int, 64),          # 128
            ('--learning-rate', float, 0.025),
            ('--momentum', float, 0.9),
            ('--weight-decay', float, 1e-4),
    ):
        parser.add_argument(flag, type=typ, default=default)
    # Boolean switches, false unless given.
    for switch in ('--dynamic', '--single', '--compiled'):
        parser.add_argument(switch, action='store_true')
    parser.add_argument('--step', type=int, default=100)
    return parser.parse_args()
def main():
    """Entry point: parse CLI options and hand them to the benchmark worker."""
    worker(parse_args())
def worker(args):
    """Benchmark static-graph (executor) training for one ResNet model.

    Builds the program once, warms up for 10 iterations, then times
    ``args.step`` iterations on a single synthetic batch and prints the
    average per-iteration latency plus GPU memory from ``nvidia-smi``.

    Args:
        args: parsed CLI namespace (arch, batch_size, learning_rate,
            momentum, weight_decay, dynamic, compiled, step).

    Raises:
        NotImplementedError: if ``args.arch`` is not a ResNet variant.
    """
    if 'ResNet' in args.arch:
        model = res_models.__dict__[args.arch]()
    else:
        raise NotImplementedError

    def inference_program():
        """Declare the image placeholder and build the forward network."""
        data_shape = [None, 3, 224, 224]
        images = fluid.data(name='image', shape=data_shape, dtype='float32')
        predict = model.net(images)
        return predict

    def train_program():
        """Attach the label placeholder and the softmax cross-entropy loss."""
        logits = inference_program()
        label = fluid.data(name='label', shape=[None, 1], dtype='int64')
        cost = fluid.layers.softmax_with_cross_entropy(logits, label)
        avg_cost = fluid.layers.mean(cost)
        return [avg_cost, logits]

    device_num = fluid.core.get_cuda_device_count()
    use_cuda = device_num > 0

    loss, _ = train_program()  # logits are never fetched below
    # Synthetic data: one random batch, reused for every timed iteration.
    image = np.random.random([args.batch_size, 3, 224, 224]).astype(np.float32)  # pylint: disable=no-member
    label = np.random.randint(1000, size=[args.batch_size, 1])

    optimizer = fluid.optimizer.Momentum(
        learning_rate=args.learning_rate,
        momentum=args.momentum,
        regularization=fluid.regularizer.L2Decay(args.weight_decay))
    optimizer.minimize(loss)

    # BUG FIX: the original unconditionally used CUDAPlace(0) even when no
    # GPU is visible (use_cuda was computed but never used); fall back to
    # CPUPlace so the script still runs on CPU-only machines.
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    main_program = fluid.default_main_program()
    if args.compiled:
        main_program = fluid.compiler.CompiledProgram(main_program)
    exe.run(fluid.default_startup_program())
    # Warm-up: keep one-time compilation/allocation costs out of the timing.
    for _ in range(10):
        exe.run(program=main_program,
                feed={'image': image, 'label': label},
                fetch_list=[loss.name])
    start_time = time.time()
    for _ in range(args.step):
        exe.run(program=main_program,
                feed={'image': image, 'label': label},
                fetch_list=[loss.name])
    avg_time = (time.time() - start_time) / args.step
    # ROBUSTNESS: nvidia-smi may be missing or print no usage table;
    # avoid an IndexError on an empty findall() result.
    stdout = subprocess.getoutput('nvidia-smi')
    matches = re.findall(r'\| (.*?)MiB /', stdout)
    mem = matches[0].strip() if matches else 'N/A'
    print('PaddlePaddle, model:{0}, Dy:{1}, #GPU:{2}, batch_size:{3}, mem:{4}M, time:{5:.3f}ms'.
          format(args.arch, args.dynamic, device_num, args.batch_size, mem, avg_time * 1000))
    exe.close()
# Run the benchmark only when executed as a script (not on import).
if __name__ == "__main__":
    main()
在单卡titan XP, batch_size=64 训练100个iteration, 每个iteration平均测速时间为: dygraph:102.8ms executor:126.5ms executor with compile:108.6ms
按理说executor是静态图执行,dygraph是动态执行,executor应该比dygraph快才对,但是测试结果与预期相悖。是我代码写的有问题还是其他什么原因,期望得到官方解答。