elementwise_mul_grad is much slower in 1.5 than in 1.4
Created by: sneaxiy
With the following code, performance differs dramatically depending on whether `use_constant` is True or False.
```python
import paddle.fluid as fluid
from paddle.fluid.backward import append_backward
import numpy as np
import time

C = 3
batch_size = 32
scale = 224
use_constant = True
print('use_constant {}'.format(use_constant))

shape = [batch_size, C, scale, scale]
place = fluid.CUDAPlace(0)

image = fluid.layers.data(
    name='image', shape=shape, dtype='float32', append_batch_size=False)
image.stop_gradient = False

# With use_constant = True, w stays a Python float; otherwise it is a
# persistable [scale, scale] tensor that broadcasts against image.
w = float(3.0)
if not use_constant:
    w = fluid.layers.create_global_var(
        shape=[scale, scale],
        value=w,
        dtype='float32',
        persistable=True)

out = image * w
loss = fluid.layers.reduce_mean(out)
append_backward(loss)

# Print the ops in the program to see which grad ops are generated.
for op in fluid.default_main_program().global_block().ops:
    print(op.type)

exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
compiled_prog = fluid.CompiledProgram(fluid.default_main_program())

in_tensor = fluid.LoDTensor()
in_tensor.set(np.random.random(size=shape).astype('float32'), place)

# Time 1000 forward + backward runs.
start_t = time.time()
for _ in range(1000):
    exe.run(compiled_prog, feed={image.name: in_tensor}, fetch_list=[out])
end_t = time.time()
print('time cost: {}'.format(end_t - start_t))
```
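To see where the time actually goes, the run loop can also be wrapped in Paddle's built-in profiler. A minimal sketch, assuming the `fluid.profiler` context manager from the 1.x API (the variables reuse those from the script above):

```python
import paddle.fluid.profiler as profiler

# Profile the same 1000-iteration loop on CPU and GPU ('All'), and
# sort the per-op summary by total time so the dominant kernel
# (e.g. elementwise_mul_grad) surfaces at the top of the report.
with profiler.profiler('All', 'total'):
    for _ in range(1000):
        exe.run(compiled_prog, feed={image.name: in_tensor}, fetch_list=[out])
```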
Time cost in seconds for 1000 iterations:

| | Paddle 1.4 | Paddle 1.5 |
|---|---|---|
| use_constant = False | 10.821 | 15.4534 |
| use_constant = True | 12.2862 | 67.80492 |
Related to #19023 (closed).
We suspect the problem is caused by FastElemwiseGradBroadcast1CUDAKernel.
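For context on why that kernel would be on the hot path: with `use_constant = True`, the Python float is presumably lowered to a shape-`[1]` tensor, so the backward of the broadcasted multiply must reduce the incoming gradient over every broadcast axis to recover the gradient of `w`. Below is a minimal NumPy sketch of that math (the reference semantics, not the CUDA kernel itself); `elementwise_mul_grad_ref` is a hypothetical helper name:

```python
import numpy as np

def elementwise_mul_grad_ref(x, y, dout):
    """Reference gradients of out = x * y with NumPy-style
    broadcasting of y against x (for illustration only)."""
    dx = dout * y        # same shape as x; elementwise, no reduction
    dy_full = dout * x   # must be reduced back down to y's shape
    # Sum over every axis along which y was broadcast, i.e. where
    # y's left-padded shape is 1 but x's is not.
    y_shape = (1,) * (x.ndim - y.ndim) + y.shape
    axes = tuple(i for i, n in enumerate(y_shape)
                 if n == 1 and x.shape[i] != 1)
    dy = dy_full.sum(axis=axes).reshape(y.shape)
    return dx, dy

x = np.random.rand(32, 3, 224, 224).astype('float32')
y = np.array([3.0], dtype='float32')   # scalar lowered to shape [1]
dout = np.ones_like(x) / x.size        # gradient of reduce_mean
dx, dy = elementwise_mul_grad_ref(x, y, dout)
print(dx.shape, dy.shape)              # (32, 3, 224, 224) (1,)
```

When `y` degenerates to a single element, `dy` becomes a full reduction over `dout * x`, which is consistent with `use_constant = True` being the case that regressed most (roughly 5.5x, 12.29s to 67.80s, versus about 1.4x for the False case).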