1000x slower than TensorFlow when reducing a large array
Created by: cjld
Here is the Paddle code:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import numpy as np
import paddle
import paddle.fluid as fluid

sp = fluid.Program()
tp = fluid.Program()
image_shape = [50 * 1000 * 2000]

def foo(input):
    # Subtract the mean of the whole array from every element.
    mean = fluid.layers.reduce_mean(input, dim=0, keep_dim=True)
    return input - mean

with fluid.program_guard(tp, sp):
    x = fluid.layers.data(name='img', shape=image_shape, dtype='float32', append_batch_size=False)
    for i in range(10):
        x = foo(x)

ttp = tp.clone(True)
fluid.release_memory(ttp, skip_opt_set=[x.name])

exe = fluid.Executor(fluid.CUDAPlace(0))
exe.run(sp)
imgs = np.zeros(image_shape).astype('float32')
# imgs * 1 feeds a fresh copy of the zero array.
result = exe.run(tp, feed={'img': imgs * 1}, fetch_list=[x])
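For reference, the per-run number can be reproduced with a minimal timing harness like the following (a sketch, not from the original report; it reuses exe, tp, imgs, and x built above):

import time

# Warm up once so one-time allocation and compilation are not timed.
exe.run(tp, feed={'img': imgs * 1}, fetch_list=[x])

for _ in range(5):
    t0 = time.time()
    exe.run(tp, feed={'img': imgs * 1}, fetch_list=[x])
    print('paddle: %.4f s per run' % (time.time() - t0))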
Here is the equivalent TensorFlow code:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import tensorflow as tf
import numpy as np

image_shape = [50 * 1000 * 2000]
x = tf.placeholder("float", image_shape)
y = x
for i in range(1000):
    # Same subtract-the-mean step as in the Paddle version.
    mean = tf.reduce_mean(y, axis=0)
    y = y - mean

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    data = np.zeros(image_shape)
    for i in range(5):
        sess.run(y, feed_dict={x: data})
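The per-run TensorFlow time can be measured the same way by wrapping each sess.run call in a timer; a minimal sketch (not from the original report), reusing x, y, init, and image_shape from above:

import time
with tf.Session() as sess:
    sess.run(init)
    data = np.zeros(image_shape)
    sess.run(y, feed_dict={x: data})  # warm-up run, excludes setup cost
    for i in range(5):
        t0 = time.time()
        sess.run(y, feed_dict={x: data})
        print('tensorflow: %.4f s per run' % (time.time() - t0))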
Tested on a TITAN X: about 3 s per loop in Paddle versus about 3 ms per loop in TensorFlow.
After some digging: Paddle implements reduce_mean with Eigen, while TensorFlow implements its reduction with CUB. I think this difference is what causes the issue.
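To isolate the reduction itself (rather than the whole subtract-mean graph), a single-op microbenchmark along these lines could be used; this is a sketch, not code from the original report, and it only reuses the fluid APIs shown above:

import time
import numpy as np
import paddle.fluid as fluid

n = 50 * 1000 * 2000
sp, tp = fluid.Program(), fluid.Program()
with fluid.program_guard(tp, sp):
    inp = fluid.layers.data(name='img', shape=[n], dtype='float32', append_batch_size=False)
    out = fluid.layers.reduce_mean(inp, dim=0, keep_dim=True)

exe = fluid.Executor(fluid.CUDAPlace(0))
exe.run(sp)
data = np.zeros([n], dtype='float32')
exe.run(tp, feed={'img': data}, fetch_list=[out])  # warm-up
t0 = time.time()
exe.run(tp, feed={'img': data}, fetch_list=[out])
print('single reduce_mean: %.4f s' % (time.time() - t0))

Note that each run also copies the ~400 MB input to the GPU, so the absolute number includes transfer time; what matters here is how the single reduce op compares across the two frameworks.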