Created by: wangchaochaohu
import paddle.fluid as fluid
import paddle.fluid.compiler as compiler
import numpy
import os
from paddle.fluid import profiler
# Minimal profiling example: build a one-layer network, run a single
# training step through a data-parallel compiled program, and record that
# step with the Paddle profiler.
use_cuda = True
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)

train_program = fluid.Program()
startup_program = fluid.Program()
# The network and optimizer are defined under the program guard so they are
# attached to the two programs created above instead of the defaults.
with fluid.program_guard(train_program, startup_program):
    data = fluid.layers.data(name='X', shape=[1], dtype='float32')
    hidden = fluid.layers.fc(input=data, size=10)
    loss = fluid.layers.mean(hidden)
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

startup_program.random_seed = 1
exe.run(startup_program)

# One random batch of 10 samples with a single float32 feature each.
x = numpy.random.random(size=(10, 1)).astype('float32')
compiled_prog = compiler.CompiledProgram(
    train_program).with_data_parallel(
        loss_name=loss.name)

# Create the parent directory of the profile output up front.
# NOTE(review): stop_profiler is not known to create missing directories;
# creating it here is harmless either way -- confirm against the Paddle docs.
profile_path = "./profile/test"
profile_dir = os.path.dirname(profile_path)
if profile_dir and not os.path.isdir(profile_dir):
    os.makedirs(profile_dir)

# "All" / "total" match the "Place: All" and "Sorted by total time" headers
# of the report this script prints.
profiler.start_profiler("All", "Default")
try:
    loss_data, = exe.run(compiled_prog,
                         feed={"X": x},
                         fetch_list=[loss.name])
finally:
    # Always stop the profiler, even if the training step raised, so the
    # profiler is not left enabled for whatever runs next.
    profiler.stop_profiler("total", profile_path)
result:
-------------------------> Profiling Report <-------------------------
Note! This Report merge all thread info into one.
Place: All
Time unit: ms
Sorted by total time in descending order in the same thread
Event Calls Total CPU Time (Ratio) GPU Time (Ratio) Min. Max. Ave. Ratio.
FastThreadedSSAGraphExecutorPrepare 1 16.5899 16.576763 (0.999205) 0.013184 (0.000795) 16.5899 16.5899 16.5899 0.869572
GpuMemcpyAsync:CPU->GPU 1 0.82097 0.818186 (0.996609) 0.002784 (0.003391) 0.82097 0.82097 0.82097 0.0430316
mul 1 0.307604 0.299764 (0.974513) 0.007840 (0.025487) 0.307604 0.307604 0.307604 0.0161232
mean_grad 1 0.307265 0.305441 (0.994064) 0.001824 (0.005936) 0.307265 0.307265 0.307265 0.0161055
mean 1 0.210628 0.207812 (0.986630) 0.002816 (0.013370) 0.210628 0.210628 0.210628 0.0110402
Scale LossGrad 1 0.181104 0.179248 (0.989752) 0.001856 (0.010248) 0.181104 0.181104 0.181104 0.00949267
Scale LossGrad/GpuMemcpyAsync:CPU->GPU 1 0.043618 0.041762 (0.957449) 0.001856 (0.042551) 0.043618 0.043618 0.043618 0.00228626
Fetch 1 0.154283 0.151531 (0.982163) 0.002752 (0.017837) 0.154283 0.154283 0.154283 0.00808683
Fetch/GpuMemcpyAsync:GPU->CPU 1 0.054635 0.051883 (0.949629) 0.002752 (0.050371) 0.054635 0.054635 0.054635 0.00286372
sgd 2 0.149299 0.146227 (0.979424) 0.003072 (0.020576) 0.046721 0.102578 0.0746495 0.00782559
mul_grad 1 0.146942 0.139166 (0.947081) 0.007776 (0.052919) 0.146942 0.146942 0.146942 0.00770205
elementwise_add 1 0.087953 0.085713 (0.974532) 0.002240 (0.025468) 0.087953 0.087953 0.087953 0.00461011
elementwise_add_grad 1 0.076498 0.072978 (0.953986) 0.003520 (0.046014) 0.076498 0.076498 0.076498 0.00400969
eager_deletion 5 0.026352 0.026352 (1.000000) 0.000000 (0.000000) 0.00237 0.008957 0.0052704 0.00138126
ScopeBufferedMonitor::post_local_exec_scopes_process 1 0.010649 0.010649 (1.000000) 0.000000 (0.000000) 0.010649 0.010649 0.010649 0.000558173
InitLocalVars 1 0.005806 0.005806 (1.000000) 0.000000 (0.000000) 0.005806 0.005806 0.005806 0.000304325
ScopeBufferedMonitor::pre_local_exec_scopes_process 1 0.002998 0.002998 (1.000000) 0.000000 (0.000000) 0.002998 0.002998 0.002998 0.000157142
Event Calls Total CPU Time (Ratio) GPU Time (Ratio) Min. Max. Ave. Ratio.
FastThreadedSSAGraphExecutorPrepare 1 5.63753 5.624824 (0.997747) 0.012704 (0.002253) 5.63753 5.63753 5.63753 0.722117
GpuMemcpyAsync:CPU->GPU 1 0.893649 0.891025 (0.997064) 0.002624 (0.002936) 0.893649 0.893649 0.893649 0.114469
mul 1 0.252361 0.243945 (0.966651) 0.008416 (0.033349) 0.252361 0.252361 0.252361 0.0323252
mul/prepare_data 1 0.004929 0.004929 (1.000000) 0.000000 (0.000000) 0.004929 0.004929 0.004929 0.000631361
mul/infer_shape 1 0.014168 0.014168 (1.000000) 0.000000 (0.000000) 0.014168 0.014168 0.014168 0.0018148
mul/compute 1 0.192943 0.184527 (0.956381) 0.008416 (0.043619) 0.192943 0.192943 0.192943 0.0247143
mean 1 0.17345 0.170698 (0.984134) 0.002752 (0.015866) 0.17345 0.17345 0.17345 0.0222174
mean/prepare_data 1 0.003962 0.003962 (1.000000) 0.000000 (0.000000) 0.003962 0.003962 0.003962 0.000507497
mean/infer_shape 1 0.003433 0.003433 (1.000000) 0.000000 (0.000000) 0.003433 0.003433 0.003433 0.000439737
mean/compute 1 0.135527 0.132775 (0.979694) 0.002752 (0.020306) 0.135527 0.135527 0.135527 0.0173598
sgd 2 0.168491 0.165387 (0.981578) 0.003104 (0.018422) 0.04685 0.121641 0.0842455 0.0215822
sgd/prepare_data 2 0.007545 0.007545 (1.000000) 0.000000 (0.000000) 0.003226 0.004319 0.0037725 0.000966448
sgd/infer_shape 2 0.013376 0.013376 (1.000000) 0.000000 (0.000000) 0.003874 0.009502 0.006688 0.00171335
sgd/compute 2 0.111892 0.108788 (0.972259) 0.003104 (0.027741) 0.029766 0.082126 0.055946 0.0143324
Fetch 1 0.145729 0.143233 (0.982872) 0.002496 (0.017128) 0.145729 0.145729 0.145729 0.0186666
Fetch/GpuMemcpyAsync:GPU->CPU 1 0.057451 0.054955 (0.956554) 0.002496 (0.043446) 0.057451 0.057451 0.057451 0.00735896
Scale LossGrad 1 0.137517 0.135661 (0.986503) 0.001856 (0.013497) 0.137517 0.137517 0.137517 0.0176147
Scale LossGrad/GpuMemcpyAsync:CPU->GPU 1 0.034418 0.032562 (0.946075) 0.001856 (0.053925) 0.034418 0.034418 0.034418 0.00440864
mul_grad 1 0.106383 0.098575 (0.926605) 0.007808 (0.073395) 0.106383 0.106383 0.106383 0.0136267
mul_grad/prepare_data 1 0.001949 0.001949 (1.000000) 0.000000 (0.000000) 0.001949 0.001949 0.001949 0.00024965
mul_grad/infer_shape 1 0.002696 0.002696 (1.000000) 0.000000 (0.000000) 0.002696 0.002696 0.002696 0.000345334
mul_grad/compute 1 0.091502 0.083694 (0.914669) 0.007808 (0.085331) 0.091502 0.091502 0.091502 0.0117206
elementwise_add 1 0.084146 0.081970 (0.974140) 0.002176 (0.025860) 0.084146 0.084146 0.084146 0.0107784
elementwise_add/prepare_data 1 0.002281 0.002281 (1.000000) 0.000000 (0.000000) 0.002281 0.002281 0.002281 0.000292176
elementwise_add/infer_shape 1 0.011783 0.011783 (1.000000) 0.000000 (0.000000) 0.011783 0.011783 0.011783 0.0015093
elementwise_add/compute 1 0.058798 0.056622 (0.962992) 0.002176 (0.037008) 0.058798 0.058798 0.058798 0.0075315
elementwise_add_grad 1 0.079198 0.075646 (0.955150) 0.003552 (0.044850) 0.079198 0.079198 0.079198 0.0101446
elementwise_add_grad/prepare_data 1 0.003329 0.003329 (1.000000) 0.000000 (0.000000) 0.003329 0.003329 0.003329 0.000426415
elementwise_add_grad/infer_shape 1 0.003818 0.003818 (1.000000) 0.000000 (0.000000) 0.003818 0.003818 0.003818 0.000489052
elementwise_add_grad/compute 1 0.059754 0.056202 (0.940556) 0.003552 (0.059444) 0.059754 0.059754 0.059754 0.00765396
mean_grad 1 0.076115 0.074387 (0.977298) 0.001728 (0.022702) 0.076115 0.076115 0.076115 0.00974966
mean_grad/prepare_data 1 0.003691 0.003691 (1.000000) 0.000000 (0.000000) 0.003691 0.003691 0.003691 0.000472784
mean_grad/infer_shape 1 0.002212 0.002212 (1.000000) 0.000000 (0.000000) 0.002212 0.002212 0.002212 0.000283338
mean_grad/compute 1 0.061153 0.059425 (0.971743) 0.001728 (0.028257) 0.061153 0.061153 0.061153 0.00783316
eager_deletion 5 0.036481 0.036481 (1.000000) 0.000000 (0.000000) 0.002893 0.022223 0.0072962 0.00467289
ScopeBufferedMonitor::post_local_exec_scopes_process 1 0.008679 0.008679 (1.000000) 0.000000 (0.000000) 0.008679 0.008679 0.008679 0.0011117
InitLocalVars 1 0.005406 0.005406 (1.000000) 0.000000 (0.000000) 0.005406 0.005406 0.005406 0.000692461
ScopeBufferedMonitor::pre_local_exec_scopes_process 1 0.001809 0.001809 (1.000000) 0.000000 (0.000000) 0.001809 0.001809 0.001809 0.000231717