Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
87b97776
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
87b97776
编写于
11月 29, 2021
作者:
Z
Zhanlue Yang
提交者:
GitHub
11月 29, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Added performance benchmarks for Eager Dygraph (#37643)
上级
51804e4d
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
842 addition
and
0 deletion
+842
-0
paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
...luid/eager/tests/performance_tests/benchmark_eager_cpu.cc
+180
-0
paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
...uid/eager/tests/performance_tests/benchmark_eager_cuda.cc
+187
-0
paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc
...luid/eager/tests/performance_tests/benchmark_fluid_cpu.cc
+221
-0
paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
...uid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
+254
-0
未找到文件。
paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
0 → 100644
浏览文件 @
87b97776
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Eager Dygraph
#include <chrono>
#include "gtest/gtest.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/backward.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h"
#include "paddle/fluid/eager/tests/test_utils.h"
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
// TODO(jiabin): remove nolint here!!!
using namespace egr;  // NOLINT

// Disable pten path
DECLARE_bool(run_pten_kernel);

// Clears FLAGS_run_pten_kernel.
// NOTE(review): the benchmarks defined after this test presumably rely on it
// having run first -- confirm the suite is never executed shuffled/filtered.
TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }
// Eager-mode scale benchmark on CPU.
//
// Runs benchmark_eager_scale twice on a freshly created {2, 4, 4, 4} FLOAT32
// tensor (created with value 5.0): once with accuracy_check enabled, and once
// timed with a wall clock. The timed run prints "Duration: <ms> ms" and, when
// built with WITH_GPERFTOOLS, is bracketed by ProfilerStart/ProfilerStop.
TEST(Benchmark, EagerScaleCPU) {
  using Clock = std::chrono::high_resolution_clock;
  using Millis = std::chrono::duration<double, std::milli>;

  // Prepare Device Contexts
  egr::InitEnv(paddle::platform::CPUPlace());

  for (const std::string& mode : {"Accuracy", "Performance"}) {
    // A new input tensor is built for every mode.
    paddle::framework::DDim shape = paddle::framework::make_ddim({2, 4, 4, 4});
    egr::EagerTensor input = EagerUtils::CreateTensorWithValue(
        shape, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
        pten::DataLayout::NCHW, 5.0, true);
    RetainGradForTensor(input);

    if (mode == "Accuracy") {
      benchmark_eager_scale(input, true /* accuracy_check */);
      continue;
    }
    if (mode != "Performance") {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }

    // Performance mode: time a single call of the benchmark driver.
    const auto begin = Clock::now();
#ifdef WITH_GPERFTOOLS
    ProfilerStart("eager_scale_cpu.out");
#endif
    benchmark_eager_scale(input);
#ifdef WITH_GPERFTOOLS
    ProfilerStop();
#endif
    const auto finish = Clock::now();

    const double elapsed_ms = Millis(finish - begin).count();
    std::cout << "Duration: " << elapsed_ms << " ms" << std::endl;
  }
}
// Eager-mode matmul benchmark on CPU.
//
// Installs a fresh imperative Tracer as the current tracer, then runs
// benchmark_eager_intermediate_matmul on two {2, 2} FLOAT32 tensors (created
// with values 1.0 and 2.0): once with accuracy_check enabled and once timed.
// The timed run prints "Duration: <ms> ms".
TEST(Benchmark, EagerIntermediateMatmulCPU) {
  using Clock = std::chrono::high_resolution_clock;
  using Millis = std::chrono::duration<double, std::milli>;

  // Prepare Device Contexts
  InitEnv(paddle::platform::CPUPlace());

  auto tracer = std::make_shared<paddle::imperative::Tracer>();
  paddle::imperative::SetCurrentTracer(tracer);

  for (const std::string& mode : {"Accuracy", "Performance"}) {
    // Left operand.
    paddle::framework::DDim lhs_shape = paddle::framework::make_ddim({2, 2});
    egr::EagerTensor lhs = EagerUtils::CreateTensorWithValue(
        lhs_shape, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
        pten::DataLayout::NCHW, 1.0, true);
    RetainGradForTensor(lhs);

    // Right operand.
    paddle::framework::DDim rhs_shape = paddle::framework::make_ddim({2, 2});
    egr::EagerTensor rhs = EagerUtils::CreateTensorWithValue(
        rhs_shape, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
        pten::DataLayout::NCHW, 2.0, true);
    RetainGradForTensor(rhs);

    if (mode == "Accuracy") {
      benchmark_eager_intermediate_matmul(lhs, rhs, true /* accuracy_check */);
      continue;
    }
    if (mode != "Performance") {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }

    // Performance mode: time a single call of the benchmark driver.
    const auto begin = Clock::now();
#ifdef WITH_GPERFTOOLS
    ProfilerStart("eager_intermediate_matmul_cpu.out");
#endif
    benchmark_eager_intermediate_matmul(lhs, rhs);
#ifdef WITH_GPERFTOOLS
    ProfilerStop();
#endif
    const auto finish = Clock::now();

    const double elapsed_ms = Millis(finish - begin).count();
    std::cout << "Duration: " << elapsed_ms << " ms" << std::endl;
  }
}
// Eager-mode MLP benchmark on CPU.
//
// Installs a fresh imperative Tracer, then for each mode builds an
// {MLP_M, MLP_N} input plus MLP_NUM_LINEAR pairs of {MLP_N, MLP_K} weights and
// {MLP_K} biases (all FLOAT32, created with the MLP_*_VAL constants) and hands
// them to benchmark_eager_intermediate_mlp: once with accuracy_check enabled
// and once timed. The timed run prints "Duration: <ms> ms".
TEST(Benchmark, EagerIntermediateMLPCPU) {
  using Clock = std::chrono::high_resolution_clock;
  using Millis = std::chrono::duration<double, std::milli>;

  // Prepare Device Contexts
  InitEnv(paddle::platform::CPUPlace());

  auto tracer = std::make_shared<paddle::imperative::Tracer>();
  paddle::imperative::SetCurrentTracer(tracer);

  for (const std::string& mode : {"Accuracy", "Performance"}) {
    // Network input.
    paddle::framework::DDim input_shape =
        paddle::framework::make_ddim({MLP_M, MLP_N});
    egr::EagerTensor input = EagerUtils::CreateTensorWithValue(
        input_shape, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
        pten::DataLayout::NCHW, MLP_X_VAL, true);
    RetainGradForTensor(input);

    // One weight/bias pair per linear layer.
    std::vector<EagerTensor> weights;
    std::vector<EagerTensor> biases;
    for (size_t layer = 0; layer < MLP_NUM_LINEAR; ++layer) {
      paddle::framework::DDim weight_shape =
          paddle::framework::make_ddim({MLP_N, MLP_K});
      egr::EagerTensor weight = EagerUtils::CreateTensorWithValue(
          weight_shape, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
          pten::DataLayout::NCHW, MLP_W_VAL, true);
      RetainGradForTensor(weight);

      paddle::framework::DDim bias_shape =
          paddle::framework::make_ddim({MLP_K});
      egr::EagerTensor bias = EagerUtils::CreateTensorWithValue(
          bias_shape, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
          pten::DataLayout::NCHW, MLP_B_VAL, true);
      RetainGradForTensor(bias);

      weights.emplace_back(std::move(weight));
      biases.emplace_back(std::move(bias));
    }

    if (mode == "Accuracy") {
      benchmark_eager_intermediate_mlp(input, weights, biases,
                                       true /* accuracy_check */);
      continue;
    }
    if (mode != "Performance") {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }

    // Performance mode: time a single call of the benchmark driver.
    const auto begin = Clock::now();
#ifdef WITH_GPERFTOOLS
    ProfilerStart("eager_intermediate_mlp_cpu.out");
#endif
    benchmark_eager_intermediate_mlp(input, weights, biases);
#ifdef WITH_GPERFTOOLS
    ProfilerStop();
#endif
    const auto finish = Clock::now();

    const double elapsed_ms = Millis(finish - begin).count();
    std::cout << "Duration: " << elapsed_ms << " ms" << std::endl;
  }
}
paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
0 → 100644
浏览文件 @
87b97776
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Eager Dygraph
#include <chrono>
#include "gtest/gtest.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/backward.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h"
#include "paddle/fluid/eager/tests/test_utils.h"
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
// TODO(jiabin): remove nolint here!!!
using namespace egr;  // NOLINT

DECLARE_bool(run_pten_kernel);

// Clears FLAGS_run_pten_kernel.
// NOTE(review): the benchmarks defined after this test presumably rely on it
// having run first -- confirm the suite is never executed shuffled/filtered.
TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }
// Eager-mode scale benchmark on CUDA.
//
// Runs benchmark_eager_scale three times on a freshly created {2, 4, 4, 4}
// FLOAT32 tensor placed on CUDAPlace(): an accuracy-checked run, an untimed
// warm-up run, and a timed run that prints "Duration: <ms> ms".
// NOTE(review): no device synchronization is visible here before the timer
// stops; whether GPU execution is included in the measurement depends on
// benchmark_eager_scale -- confirm.
TEST(Benchmark, EagerScaleCUDA) {
  using Clock = std::chrono::high_resolution_clock;
  using Millis = std::chrono::duration<double, std::milli>;

  egr::InitEnv(paddle::platform::CUDAPlace());

  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
    // A new input tensor is built for every mode.
    paddle::framework::DDim shape = paddle::framework::make_ddim({2, 4, 4, 4});
    egr::EagerTensor input = EagerUtils::CreateTensorWithValue(
        shape, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
        pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/);
    RetainGradForTensor(input);

    if (mode == "Accuracy") {
      benchmark_eager_scale(input, true /* accuracy_check */);
      continue;
    }
    if (mode == "WarmUp") {
      benchmark_eager_scale(input);
      continue;
    }
    if (mode != "Performance") {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }

    // Performance mode: time a single call of the benchmark driver.
    const auto begin = Clock::now();
#ifdef WITH_GPERFTOOLS
    ProfilerStart("eager_scale_cuda.out");
#endif
    benchmark_eager_scale(input);
#ifdef WITH_GPERFTOOLS
    ProfilerStop();
#endif
    const auto finish = Clock::now();

    const double elapsed_ms = Millis(finish - begin).count();
    std::cout << "Duration: " << elapsed_ms << " ms" << std::endl;
  }
}
// Eager-mode matmul benchmark on CUDA.
//
// Installs a fresh imperative Tracer whose expected place is the CUDA place,
// then runs benchmark_eager_intermediate_matmul on two {2, 2} FLOAT32 tensors
// (created with values 1.0 and 2.0) in three modes: accuracy-checked, untimed
// warm-up, and timed. The timed run prints "Duration: <ms> ms".
TEST(Benchmark, EagerIntermediateMatmulCUDA) {
  using Clock = std::chrono::high_resolution_clock;
  using Millis = std::chrono::duration<double, std::milli>;

  paddle::platform::CUDAPlace place;
  egr::InitEnv(place);

  auto tracer = std::make_shared<paddle::imperative::Tracer>();
  tracer->SetExpectedPlace(place);
  paddle::imperative::SetCurrentTracer(tracer);

  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
    // Left operand.
    paddle::framework::DDim lhs_shape = paddle::framework::make_ddim({2, 2});
    egr::EagerTensor lhs = EagerUtils::CreateTensorWithValue(
        lhs_shape, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
        pten::DataLayout::NCHW, 1.0, true);
    RetainGradForTensor(lhs);

    // Right operand.
    paddle::framework::DDim rhs_shape = paddle::framework::make_ddim({2, 2});
    egr::EagerTensor rhs = EagerUtils::CreateTensorWithValue(
        rhs_shape, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
        pten::DataLayout::NCHW, 2.0, true);
    RetainGradForTensor(rhs);

    if (mode == "Accuracy") {
      benchmark_eager_intermediate_matmul(lhs, rhs, true /* accuracy_check */);
      continue;
    }
    if (mode == "WarmUp") {
      benchmark_eager_intermediate_matmul(lhs, rhs);
      continue;
    }
    if (mode != "Performance") {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }

    // Performance mode: time a single call of the benchmark driver.
    const auto begin = Clock::now();
#ifdef WITH_GPERFTOOLS
    ProfilerStart("eager_intermediate_matmul_cuda.out");
#endif
    benchmark_eager_intermediate_matmul(lhs, rhs);
#ifdef WITH_GPERFTOOLS
    ProfilerStop();
#endif
    const auto finish = Clock::now();

    const double elapsed_ms = Millis(finish - begin).count();
    std::cout << "Duration: " << elapsed_ms << " ms" << std::endl;
  }
}
// Eager-mode MLP benchmark on CUDA.
//
// Installs a fresh imperative Tracer whose expected place is the CUDA place,
// then for each mode builds an {MLP_M, MLP_N} input plus MLP_NUM_LINEAR pairs
// of {MLP_N, MLP_K} weights and {MLP_K} biases (all FLOAT32 on CUDAPlace())
// and hands them to benchmark_eager_intermediate_mlp. Modes: accuracy-checked,
// untimed warm-up, and timed (prints "Duration: <ms> ms").
TEST(Benchmark, EagerIntermediateMLPCUDA) {
  using Clock = std::chrono::high_resolution_clock;
  using Millis = std::chrono::duration<double, std::milli>;

  paddle::platform::CUDAPlace place;
  egr::InitEnv(place);

  auto tracer = std::make_shared<paddle::imperative::Tracer>();
  tracer->SetExpectedPlace(place);
  paddle::imperative::SetCurrentTracer(tracer);

  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
    // Network input.
    paddle::framework::DDim input_shape =
        paddle::framework::make_ddim({MLP_M, MLP_N});
    egr::EagerTensor input = EagerUtils::CreateTensorWithValue(
        input_shape, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
        pten::DataLayout::NCHW, MLP_X_VAL, true);
    RetainGradForTensor(input);

    // One weight/bias pair per linear layer.
    std::vector<EagerTensor> weights;
    std::vector<EagerTensor> biases;
    for (size_t layer = 0; layer < MLP_NUM_LINEAR; ++layer) {
      paddle::framework::DDim weight_shape =
          paddle::framework::make_ddim({MLP_N, MLP_K});
      egr::EagerTensor weight = EagerUtils::CreateTensorWithValue(
          weight_shape, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
          pten::DataLayout::NCHW, MLP_W_VAL, true);
      RetainGradForTensor(weight);

      paddle::framework::DDim bias_shape =
          paddle::framework::make_ddim({MLP_K});
      egr::EagerTensor bias = EagerUtils::CreateTensorWithValue(
          bias_shape, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
          pten::DataLayout::NCHW, MLP_B_VAL, true);
      RetainGradForTensor(bias);

      weights.emplace_back(std::move(weight));
      biases.emplace_back(std::move(bias));
    }

    if (mode == "Accuracy") {
      benchmark_eager_intermediate_mlp(input, weights, biases,
                                       true /* accuracy_check */);
      continue;
    }
    if (mode == "WarmUp") {
      benchmark_eager_intermediate_mlp(input, weights, biases);
      continue;
    }
    if (mode != "Performance") {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }

    // Performance mode: time a single call of the benchmark driver.
    const auto begin = Clock::now();
#ifdef WITH_GPERFTOOLS
    ProfilerStart("eager_intermediate_mlp_cuda.out");
#endif
    benchmark_eager_intermediate_mlp(input, weights, biases);
#ifdef WITH_GPERFTOOLS
    ProfilerStop();
#endif
    const auto finish = Clock::now();

    const double elapsed_ms = Millis(finish - begin).count();
    std::cout << "Duration: " << elapsed_ms << " ms" << std::endl;
  }
}
paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc
0 → 100644
浏览文件 @
87b97776
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <paddle/fluid/framework/op_registry.h>
#include <chrono>
#include <iostream>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h"
#include "paddle/fluid/eager/tests/test_utils.h"
#include "paddle/fluid/imperative/basic_engine.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/memory/memcpy.h"
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
// Disable pten path
DECLARE_bool(run_pten_kernel);

// Clears FLAGS_run_pten_kernel.
// NOTE(review): the benchmarks defined after this test presumably rely on it
// having run first -- confirm the suite is never executed shuffled/filtered.
TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }
namespace
paddle
{
namespace
imperative
{
// Fluid (VarBase) scale benchmark on CPU.
//
// For each mode a VarBase named "X" is created with stop-gradient overridden
// to false, its LoDTensor is resized to {2, 4, 4, 4} and filled from a host
// buffer of 128 floats (value 5.0), and the variable is passed to
// benchmark_fluid_scale: once with accuracy_check enabled and once timed.
// The timed run prints "Duration: <ms> ms".
TEST(Benchmark, FluidScaleCPU) {
  using Clock = std::chrono::high_resolution_clock;
  using Millis = std::chrono::duration<double, std::milli>;

  // Prepare Device Contexts
  platform::CPUPlace place;
  egr::InitEnv(place);

  for (const std::string& mode : {"Accuracy", "Performance"}) {
    std::shared_ptr<imperative::VarBase> input(
        new imperative::VarBase(true, "X"));
    input->SetOverridedStopGradient(false);

    // 128 == 2 * 4 * 4 * 4: one host value per element of `shape`.
    std::vector<float> host_values(128, 5.0);
    std::vector<int64_t> shape = {2, 4, 4, 4};

    // Allocate the tensor storage on `place` and copy the host values in.
    auto* tensor = input->MutableVar()->GetMutable<framework::LoDTensor>();
    tensor->Resize(framework::make_ddim(shape));
    auto* dst = tensor->mutable_data<float>(place);
    paddle::memory::Copy(place, dst, place, host_values.data(),
                         sizeof(float) * host_values.size());

    if (mode == "Accuracy") {
      benchmark_fluid_scale(input, platform::Place(place),
                            true /* accuracy_check */);
      continue;
    }
    if (mode != "Performance") {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }

    // Performance mode: time a single call of the benchmark driver.
    const auto begin = Clock::now();
#ifdef WITH_GPERFTOOLS
    ProfilerStart("fluid_scale_cpu.out");
#endif
    benchmark_fluid_scale(input, platform::Place(place));
#ifdef WITH_GPERFTOOLS
    ProfilerStop();
#endif
    const auto finish = Clock::now();

    const double elapsed_ms = Millis(finish - begin).count();
    std::cout << "Duration: " << elapsed_ms << " ms" << std::endl;
  }
}
// Fluid (VarBase) matmul benchmark on CPU.
//
// For each mode two VarBases ("X" and "Y") are created with stop-gradient
// overridden to false; their LoDTensors are resized to {2, 2} and filled from
// host buffers of four floats (1.0 and 2.0 respectively). They are then passed
// to benchmark_fluid_matmul: once with accuracy_check enabled and once timed.
// The timed run prints "Duration: <ms> ms".
TEST(Benchmark, FluidMatmulCPU) {
  using Clock = std::chrono::high_resolution_clock;
  using Millis = std::chrono::duration<double, std::milli>;

  // Prepare Device Contexts
  platform::CPUPlace place;
  egr::InitEnv(place);

  for (const std::string& mode : {"Accuracy", "Performance"}) {
    std::shared_ptr<imperative::VarBase> lhs(
        new imperative::VarBase(true, "X"));
    lhs->SetOverridedStopGradient(false);
    std::shared_ptr<imperative::VarBase> rhs(
        new imperative::VarBase(true, "Y"));
    rhs->SetOverridedStopGradient(false);

    std::vector<float> lhs_host(4, 1.0);
    std::vector<float> rhs_host(4, 2.0);
    std::vector<int64_t> shape = {2, 2};

    // Fill the left operand.
    auto* lhs_tensor = lhs->MutableVar()->GetMutable<framework::LoDTensor>();
    lhs_tensor->Resize(framework::make_ddim(shape));
    auto* lhs_dst = lhs_tensor->mutable_data<float>(place);
    paddle::memory::Copy(place, lhs_dst, place, lhs_host.data(),
                         sizeof(float) * lhs_host.size());

    // Fill the right operand.
    auto* rhs_tensor = rhs->MutableVar()->GetMutable<framework::LoDTensor>();
    rhs_tensor->Resize(framework::make_ddim(shape));
    auto* rhs_dst = rhs_tensor->mutable_data<float>(place);
    paddle::memory::Copy(place, rhs_dst, place, rhs_host.data(),
                         sizeof(float) * rhs_host.size());

    if (mode == "Accuracy") {
      benchmark_fluid_matmul(lhs, rhs, platform::Place(place),
                             true /* accuracy_check */);
      continue;
    }
    if (mode != "Performance") {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }

    // Performance mode: time a single call of the benchmark driver.
    const auto begin = Clock::now();
#ifdef WITH_GPERFTOOLS
    ProfilerStart("fluid_matmul_cpu.out");
#endif
    benchmark_fluid_matmul(lhs, rhs, platform::Place(place));
#ifdef WITH_GPERFTOOLS
    ProfilerStop();
#endif
    const auto finish = Clock::now();

    const double elapsed_ms = Millis(finish - begin).count();
    std::cout << "Duration: " << elapsed_ms << " ms" << std::endl;
  }
}
// Fluid (VarBase) MLP benchmark on CPU.
//
// For each mode an input VarBase "X" of shape {MLP_M, MLP_N} and
// MLP_NUM_LINEAR pairs of weight "W" ({MLP_N, MLP_K}) and bias "B" ({MLP_K})
// VarBases are created with stop-gradient overridden to false and filled from
// host buffers holding the MLP_*_VAL constants. They are then passed to
// benchmark_fluid_mlp: once with accuracy_check enabled and once timed.
// The timed run prints "Duration: <ms> ms".
TEST(Benchmark, FluidMLPCPU) {
  using Clock = std::chrono::high_resolution_clock;
  using Millis = std::chrono::duration<double, std::milli>;

  // Prepare Device Contexts
  platform::CPUPlace place;
  egr::InitEnv(place);

  for (const std::string& mode : {"Accuracy", "Performance"}) {
    // Host-side source buffers, one element per tensor entry.
    std::vector<float> x_host(MLP_M * MLP_N, MLP_X_VAL);
    std::vector<float> w_host(MLP_N * MLP_K, MLP_W_VAL);
    std::vector<float> b_host(MLP_K, MLP_B_VAL);

    std::vector<int64_t> x_shape = {MLP_M, MLP_N};
    std::vector<int64_t> w_shape = {MLP_N, MLP_K};
    std::vector<int64_t> b_shape = {MLP_K};

    // Network input.
    std::shared_ptr<imperative::VarBase> input(
        new imperative::VarBase(true, "X"));
    input->SetOverridedStopGradient(false);

    auto* x_tensor = input->MutableVar()->GetMutable<framework::LoDTensor>();
    x_tensor->Resize(framework::make_ddim(x_shape));
    auto* x_dst = x_tensor->mutable_data<float>(place);
    paddle::memory::Copy(place, x_dst, place, x_host.data(),
                         sizeof(float) * x_host.size());

    // One weight/bias pair per linear layer.
    std::vector<std::shared_ptr<imperative::VarBase>> weights;
    std::vector<std::shared_ptr<imperative::VarBase>> biases;
    for (size_t layer = 0; layer < MLP_NUM_LINEAR; ++layer) {
      std::shared_ptr<imperative::VarBase> weight(
          new imperative::VarBase(true, "W"));
      weight->SetOverridedStopGradient(false);
      std::shared_ptr<imperative::VarBase> bias(
          new imperative::VarBase(true, "B"));
      bias->SetOverridedStopGradient(false);

      auto* w_tensor =
          weight->MutableVar()->GetMutable<framework::LoDTensor>();
      w_tensor->Resize(framework::make_ddim(w_shape));
      auto* w_dst = w_tensor->mutable_data<float>(place);
      paddle::memory::Copy(place, w_dst, place, w_host.data(),
                           sizeof(float) * w_host.size());

      auto* b_tensor = bias->MutableVar()->GetMutable<framework::LoDTensor>();
      b_tensor->Resize(framework::make_ddim(b_shape));
      auto* b_dst = b_tensor->mutable_data<float>(place);
      paddle::memory::Copy(place, b_dst, place, b_host.data(),
                           sizeof(float) * b_host.size());

      weights.emplace_back(std::move(weight));
      biases.emplace_back(std::move(bias));
    }

    if (mode == "Accuracy") {
      benchmark_fluid_mlp(input, weights, biases, platform::Place(place),
                          true /* accuracy_check */);
      continue;
    }
    if (mode != "Performance") {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }

    // Performance mode: time a single call of the benchmark driver.
    const auto begin = Clock::now();
#ifdef WITH_GPERFTOOLS
    ProfilerStart("fluid_mlp_cpu.out");
#endif
    benchmark_fluid_mlp(input, weights, biases, platform::Place(place));
#ifdef WITH_GPERFTOOLS
    ProfilerStop();
#endif
    const auto finish = Clock::now();

    const double elapsed_ms = Millis(finish - begin).count();
    std::cout << "Duration: " << elapsed_ms << " ms" << std::endl;
  }
}
}
// namespace imperative
}
// namespace paddle
// Operators referenced by this test binary.
// NOTE(review): presumably these pull the operator registrations into the
// link -- confirm against op_registry.h.
USE_OP(scale);
USE_OP(matmul_v2);
USE_OP(reduce_sum);
paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
0 → 100644
浏览文件 @
87b97776
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <paddle/fluid/framework/op_registry.h>
#include <chrono>
#include <iostream>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h"
#include "paddle/fluid/eager/tests/test_utils.h"
#include "paddle/fluid/imperative/basic_engine.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/memory/memcpy.h"
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
// Disable pten path
DECLARE_bool(run_pten_kernel);

// Clears FLAGS_run_pten_kernel.
// NOTE(review): the benchmarks defined after this test presumably rely on it
// having run first -- confirm the suite is never executed shuffled/filtered.
TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }
namespace
paddle
{
namespace
imperative
{
// Fluid (VarBase) scale benchmark on CUDA.
//
// For each mode a VarBase named "X" is created with stop-gradient overridden
// to false, its LoDTensor is resized to {2, 4, 4, 4} on the CUDA place and
// filled from a host buffer of 128 floats (value 5.0) via memory::Copy on the
// CUDA device context's stream. The variable is then passed to
// benchmark_fluid_scale in three modes: accuracy-checked, untimed warm-up, and
// timed (prints "Duration: <ms> ms").
// NOTE(review): no explicit stream/device synchronization is visible in this
// test, neither after the copy nor before the timer stops -- confirm whether
// benchmark_fluid_scale handles that.
TEST(Benchmark, FluidScaleCUDA) {
  using Clock = std::chrono::high_resolution_clock;
  using Millis = std::chrono::duration<double, std::milli>;

  // Prepare Device Contexts
  platform::CUDAPlace place;
  egr::InitEnv(place);

  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
    std::shared_ptr<imperative::VarBase> input(
        new imperative::VarBase(true, "X"));
    input->SetOverridedStopGradient(false);

    // 128 == 2 * 4 * 4 * 4: one host value per element of `shape`.
    std::vector<float> host_values(128, 5.0);
    std::vector<int64_t> shape = {2, 4, 4, 4};

    auto* tensor = input->MutableVar()->GetMutable<framework::LoDTensor>();
    tensor->Resize(framework::make_ddim(shape));
    auto* dst = tensor->mutable_data<float>(place);

    // Look up the CUDA context for `place` to obtain the stream for the copy.
    paddle::platform::DeviceContextPool& ctx_pool =
        paddle::platform::DeviceContextPool::Instance();
    auto* cuda_ctx = dynamic_cast<paddle::platform::CUDADeviceContext*>(
        ctx_pool.Get(place));
    auto copy_stream = cuda_ctx->stream();

    paddle::memory::Copy(place, dst, platform::CPUPlace(), host_values.data(),
                         sizeof(float) * host_values.size(), copy_stream);

    if (mode == "Accuracy") {
      benchmark_fluid_scale(input, platform::Place(place),
                            true /* accuracy_check */);
      continue;
    }
    if (mode == "WarmUp") {
      benchmark_fluid_scale(input, platform::Place(place));
      continue;
    }
    if (mode != "Performance") {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }

    // Performance mode: time a single call of the benchmark driver.
    const auto begin = Clock::now();
#ifdef WITH_GPERFTOOLS
    ProfilerStart("fluid_scale_cuda.out");
#endif
    benchmark_fluid_scale(input, platform::Place(place));
#ifdef WITH_GPERFTOOLS
    ProfilerStop();
#endif
    const auto finish = Clock::now();

    const double elapsed_ms = Millis(finish - begin).count();
    std::cout << "Duration: " << elapsed_ms << " ms" << std::endl;
  }
}
// Fluid (VarBase) matmul benchmark on CUDA.
//
// For each mode two VarBases ("X" and "Y") are created with stop-gradient
// overridden to false; their LoDTensors are resized to {2, 2} on the CUDA
// place and filled from host buffers of four floats (1.0 and 2.0) via
// memory::Copy on the CUDA device context's stream. They are then passed to
// benchmark_fluid_matmul in three modes: accuracy-checked, untimed warm-up,
// and timed (prints "Duration: <ms> ms").
// NOTE(review): no explicit stream/device synchronization is visible in this
// test -- confirm whether benchmark_fluid_matmul handles that.
TEST(Benchmark, FluidMatmulCUDA) {
  using Clock = std::chrono::high_resolution_clock;
  using Millis = std::chrono::duration<double, std::milli>;

  // Prepare Device Contexts
  platform::CUDAPlace place;
  egr::InitEnv(place);

  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
    std::shared_ptr<imperative::VarBase> lhs(
        new imperative::VarBase(true, "X"));
    lhs->SetOverridedStopGradient(false);
    std::shared_ptr<imperative::VarBase> rhs(
        new imperative::VarBase(true, "Y"));
    rhs->SetOverridedStopGradient(false);

    std::vector<float> lhs_host(4, 1.0);
    std::vector<float> rhs_host(4, 2.0);
    std::vector<int64_t> shape = {2, 2};

    // Look up the CUDA context for `place` to obtain the stream for copies.
    paddle::platform::DeviceContextPool& ctx_pool =
        paddle::platform::DeviceContextPool::Instance();
    auto* cuda_ctx = dynamic_cast<paddle::platform::CUDADeviceContext*>(
        ctx_pool.Get(place));
    auto copy_stream = cuda_ctx->stream();

    // Fill the left operand.
    auto* lhs_tensor = lhs->MutableVar()->GetMutable<framework::LoDTensor>();
    lhs_tensor->Resize(framework::make_ddim(shape));
    auto* lhs_dst = lhs_tensor->mutable_data<float>(place);
    paddle::memory::Copy(place, lhs_dst, platform::CPUPlace(),
                         lhs_host.data(), sizeof(float) * lhs_host.size(),
                         copy_stream);

    // Fill the right operand.
    auto* rhs_tensor = rhs->MutableVar()->GetMutable<framework::LoDTensor>();
    rhs_tensor->Resize(framework::make_ddim(shape));
    auto* rhs_dst = rhs_tensor->mutable_data<float>(place);
    paddle::memory::Copy(place, rhs_dst, platform::CPUPlace(),
                         rhs_host.data(), sizeof(float) * rhs_host.size(),
                         copy_stream);

    if (mode == "Accuracy") {
      benchmark_fluid_matmul(lhs, rhs, platform::Place(place),
                             true /* accuracy_check */);
      continue;
    }
    if (mode == "WarmUp") {
      benchmark_fluid_matmul(lhs, rhs, platform::Place(place));
      continue;
    }
    if (mode != "Performance") {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }

    // Performance mode: time a single call of the benchmark driver.
    const auto begin = Clock::now();
#ifdef WITH_GPERFTOOLS
    ProfilerStart("fluid_matmul_cuda.out");
#endif
    benchmark_fluid_matmul(lhs, rhs, platform::Place(place));
#ifdef WITH_GPERFTOOLS
    ProfilerStop();
#endif
    const auto finish = Clock::now();

    const double elapsed_ms = Millis(finish - begin).count();
    std::cout << "Duration: " << elapsed_ms << " ms" << std::endl;
  }
}
// Fluid (VarBase) MLP benchmark on CUDA.
//
// For each mode an input VarBase "X" of shape {MLP_M, MLP_N} and
// MLP_NUM_LINEAR pairs of weight "W" ({MLP_N, MLP_K}) and bias "B" ({MLP_K})
// VarBases are created with stop-gradient overridden to false, allocated on
// the CUDA place, and filled from host buffers holding the MLP_*_VAL constants
// via memory::Copy on the CUDA device context's stream. They are then passed
// to benchmark_fluid_mlp in three modes: accuracy-checked, untimed warm-up,
// and timed (prints "Duration: <ms> ms").
// NOTE(review): no explicit stream/device synchronization is visible in this
// test -- confirm whether benchmark_fluid_mlp handles that.
TEST(Benchmark, FluidMLPCUDA) {
  using Clock = std::chrono::high_resolution_clock;
  using Millis = std::chrono::duration<double, std::milli>;

  // Prepare Device Contexts
  platform::CUDAPlace place;
  egr::InitEnv(place);

  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
    // Look up the CUDA context for `place` to obtain the stream for copies.
    paddle::platform::DeviceContextPool& ctx_pool =
        paddle::platform::DeviceContextPool::Instance();
    auto* cuda_ctx = dynamic_cast<paddle::platform::CUDADeviceContext*>(
        ctx_pool.Get(place));
    auto copy_stream = cuda_ctx->stream();

    // Host-side source buffers, one element per tensor entry.
    std::vector<float> x_host(MLP_M * MLP_N, MLP_X_VAL);
    std::vector<float> w_host(MLP_N * MLP_K, MLP_W_VAL);
    std::vector<float> b_host(MLP_K, MLP_B_VAL);

    std::vector<int64_t> x_shape = {MLP_M, MLP_N};
    std::vector<int64_t> w_shape = {MLP_N, MLP_K};
    std::vector<int64_t> b_shape = {MLP_K};

    // Network input.
    std::shared_ptr<imperative::VarBase> input(
        new imperative::VarBase(true, "X"));
    input->SetOverridedStopGradient(false);

    auto* x_tensor = input->MutableVar()->GetMutable<framework::LoDTensor>();
    x_tensor->Resize(framework::make_ddim(x_shape));
    auto* x_dst = x_tensor->mutable_data<float>(place);
    paddle::memory::Copy(place, x_dst, platform::CPUPlace(), x_host.data(),
                         sizeof(float) * x_host.size(), copy_stream);

    // One weight/bias pair per linear layer.
    std::vector<std::shared_ptr<imperative::VarBase>> weights;
    std::vector<std::shared_ptr<imperative::VarBase>> biases;
    for (size_t layer = 0; layer < MLP_NUM_LINEAR; ++layer) {
      std::shared_ptr<imperative::VarBase> weight(
          new imperative::VarBase(true, "W"));
      weight->SetOverridedStopGradient(false);
      std::shared_ptr<imperative::VarBase> bias(
          new imperative::VarBase(true, "B"));
      bias->SetOverridedStopGradient(false);

      auto* w_tensor =
          weight->MutableVar()->GetMutable<framework::LoDTensor>();
      w_tensor->Resize(framework::make_ddim(w_shape));
      auto* w_dst = w_tensor->mutable_data<float>(place);
      paddle::memory::Copy(place, w_dst, platform::CPUPlace(), w_host.data(),
                           sizeof(float) * w_host.size(), copy_stream);

      auto* b_tensor = bias->MutableVar()->GetMutable<framework::LoDTensor>();
      b_tensor->Resize(framework::make_ddim(b_shape));
      auto* b_dst = b_tensor->mutable_data<float>(place);
      paddle::memory::Copy(place, b_dst, platform::CPUPlace(), b_host.data(),
                           sizeof(float) * b_host.size(), copy_stream);

      weights.emplace_back(std::move(weight));
      biases.emplace_back(std::move(bias));
    }

    if (mode == "Accuracy") {
      benchmark_fluid_mlp(input, weights, biases, platform::Place(place),
                          true /* accuracy_check */);
      continue;
    }
    if (mode == "WarmUp") {
      benchmark_fluid_mlp(input, weights, biases, platform::Place(place));
      continue;
    }
    if (mode != "Performance") {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }

    // Performance mode: time a single call of the benchmark driver.
    const auto begin = Clock::now();
#ifdef WITH_GPERFTOOLS
    ProfilerStart("fluid_mlp_cuda.out");
#endif
    benchmark_fluid_mlp(input, weights, biases, platform::Place(place));
#ifdef WITH_GPERFTOOLS
    ProfilerStop();
#endif
    const auto finish = Clock::now();

    const double elapsed_ms = Millis(finish - begin).count();
    std::cout << "Duration: " << elapsed_ms << " ms" << std::endl;
  }
}
}
// namespace imperative
}
// namespace paddle
// Operators referenced by this test binary.
// NOTE(review): presumably these pull the operator registrations into the
// link -- confirm against op_registry.h.
USE_OP(scale);
USE_OP(matmul_v2);
USE_OP(reduce_sum);
USE_OP(reduce_sum_grad);
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录