layer_norm_grad GPU kernel throughput drops to 28% of the original at large batch sizes
Created by: mapingshuo
Background
1) PaddlePaddle version: develop  2) GPU: V100, CUDA 9, cuDNN 7.3  3) single machine, single card
Problem description
When running the unit test with the layer_norm_grad op, we observed:
- with shape set to [16, 512, 1024], the profile is:
------------------------- Event Summary -------------------------
Event Calls Total CPU Time (Ratio) GPU Time (Ratio) Min. Max. Ave. Ratio.
thread0::fetch 6 234.245 219.624291 (0.937585) 14.620334 (0.062415) 0.032856 219.047 39.0408 0.989253
GpuMemcpySync:GPU->CPU 6 30.2892 15.668869 (0.517309) 14.620334 (0.482691) 0.026736 15.1487 5.0482 0.127916
thread0::layer_norm_grad 1 1.69437 0.517288 (0.305298) 1.177085 (0.694702) 1.69437 1.69437 1.69437 0.00715561
thread0::layer_norm 1 0.781905 0.580090 (0.741893) 0.201815 (0.258107) 0.781905 0.781905 0.781905 0.00330211
thread0::feed 4 0.0685 0.068500 (1.000000) 0.000000 (0.000000) 0.002199 0.057143 0.017125 0.000289287
- with shape set to [92, 512, 1024], the profile is:
------------------------- Event Summary -------------------------
Event Calls Total CPU Time (Ratio) GPU Time (Ratio) Min. Max. Ave. Ratio.
thread0::fetch 6 354.742 276.745855 (0.780132) 77.996640 (0.219868) 0.031346 276.298 59.1237 0.929612
GpuMemcpySync:GPU->CPU 6 156.962 78.965679 (0.503087) 77.996640 (0.496913) 0.025372 78.5781 26.1604 0.411324
thread0::layer_norm_grad 1 25.0046 0.611513 (0.024456) 24.393101 (0.975544) 25.0046 25.0046 25.0046 0.0655253
thread0::layer_norm 1 1.81516 0.693955 (0.382311) 1.121205 (0.617689) 1.81516 1.81516 1.81516 0.00475668
thread0::feed 4 0.040369 0.040369 (1.000000) 0.000000 (0.000000) 0.002411 0.03102 0.0100923 0.000105788
GPU time of layer_norm_grad: 1.177085 ms -> 24.393101 ms; throughput: 16/1.177085 -> 92/24.393101 samples per ms, i.e. it drops to about 28% of the original.
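The 28% figure is just the ratio of per-sample throughput computed from the layer_norm_grad GPU times in the two profiles above:

small_batch = 16 / 1.177085       # ~13.6 samples per ms of layer_norm_grad GPU time
large_batch = 92 / 24.393101      # ~3.8 samples per ms
print(large_batch / small_batch)  # ~0.28, i.e. throughput drops to ~28% of the original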
Impact
This hurts the throughput of the BERT-large model at large batch sizes (about 40k tokens per card).
Reproduction
After installing Paddle, run the following script with Python:
from __future__ import print_function
import numpy as np
from operator import mul
import paddle.fluid.core as core
import paddle.fluid as fluid
from functools import reduce
import time
import paddle.fluid.profiler as profiler
np.random.seed(123)
def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
    # NumPy reference implementation: normalize over the trailing dims
    # starting at begin_norm_axis, then apply scale and bias.
    x_shape = x.shape
    N = reduce(mul, x_shape[0:begin_norm_axis], 1)
    D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
    x.shape = [N, D]
    mean = np.mean(x, axis=1)
    var = np.var(x, axis=1) + epsilon
    output = scale.reshape([1, D]) * np.divide(
        (x - mean.reshape([N, 1])),
        (np.sqrt(var)).reshape([N, 1])) + beta.reshape([1, D])
    x.shape, output.shape = x_shape, x_shape
    return output, mean, var
place = core.CUDAPlace(0)
#shape=[16, 512, 1024]
shape=[92, 512, 1024]
begin_norm_axis=2
epsilon = 0.00001
x_shape = shape
D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
scale_shape = [D]
print("initializing")
np.random.seed(123)
x = np.random.random_sample(x_shape).astype(np.float32)
scale = np.random.random_sample(scale_shape).astype(np.float32)
bias = np.random.random_sample(scale_shape).astype(np.float32)
y_grad = np.random.random_sample(x_shape).astype(np.float32)
# reference forward & backward
print("running baseline")
y, mean, variance = _reference_layer_norm_naive(
x, scale, bias, epsilon, begin_norm_axis)
var_dict = locals()
var_dict['y@GRAD'] = y_grad
var_names = [
'x', 'scale', 'bias', 'y', 'mean', 'variance', 'y@GRAD'
]
ground_truth = {name: var_dict[name] for name in var_names}
print("run paddle model")
program = fluid.Program()
with fluid.program_guard(program):
    block = program.global_block()
    # declare all inputs/outputs as variables of the global block
    for name in ground_truth:
        block.create_var(
            name=name,
            dtype='float32',
            shape=ground_truth[name].shape)
    # forward op
    layer_norm_op = block.append_op(
        type="layer_norm",
        inputs={
            "X": block.var('x'),
            "Scale": block.var('scale'),
            "Bias": block.var('bias'),
        },
        outputs={
            "Y": block.var('y'),
            "Mean": block.var('mean'),  # share the same memory
            "Variance": block.var('variance'),  # share the same memory
        },
        attrs={
            "epsilon": epsilon,
            "begin_norm_axis": begin_norm_axis
        })
    # generate backward op_desc
    grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
        layer_norm_op.desc, set(), [])
    grad_op_desc = grad_op_desc_list[0]
    new_op_desc = block.desc.append_op()
    new_op_desc.copy_from(grad_op_desc)
    for var_name in grad_op_desc.output_arg_names():
        block.desc.var(var_name.encode("ascii"))
    grad_op_desc.infer_var_type(block.desc)
    grad_op_desc.infer_shape(block.desc)
    for arg in grad_op_desc.output_arg_names():
        grad_var = block.desc.find_var(arg.encode("ascii"))
        grad_var.set_dtype(core.VarDesc.VarType.FP32)
program._sync_with_cpp()
exe = fluid.Executor(place)
print("begin profiler")
profiler.start_profiler("All")
out = exe.run(program,
              feed={
                  name: var_dict[name]
                  for name in ['x', 'scale', 'bias', 'y@GRAD']
              },
              fetch_list=[
                  'y', 'mean', 'variance', 'x@GRAD',
                  'scale@GRAD', 'bias@GRAD'
              ])
profiler.stop_profiler("total", "./profile_file")
print("end profiler break!")