未验证 提交 eb9e3305 编写于 作者: Z Zhanlue Yang 提交者: GitHub

Enabled performance benchmark tests for Eager Dygraph (#37653)

* Enabled performance benchmark tests for Eager Dygraph

* Protected CUDA tests with macro

* Fixed dependency issues for windows-ci
上级 1514eec6
...@@ -2,7 +2,7 @@ set(eager_deps pten pten_api hook_utils tensor_utils utils global_utils backward ...@@ -2,7 +2,7 @@ set(eager_deps pten pten_api hook_utils tensor_utils utils global_utils backward
set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy)
set(generated_deps dygraph_function dygraph_node) set(generated_deps dygraph_function dygraph_node)
if(NOT DEFINED ON_INFER) if(NOT ON_INFER)
message("Performing Eager Dygraph Auto Code Generation") message("Performing Eager Dygraph Auto Code Generation")
add_subdirectory(auto_code_generator) add_subdirectory(auto_code_generator)
endif() endif()
......
add_subdirectory(eager_generated) add_subdirectory(eager_generated)
if(NOT DEFINED ON_INFER) if(NOT ON_INFER)
add_subdirectory(fluid_generated) add_subdirectory(fluid_generated)
endif() endif()
...@@ -17,9 +17,38 @@ execute_process( ...@@ -17,9 +17,38 @@ execute_process(
) )
if(WIN32) if(WIN32)
# Everything the eager_codegen target must wait for: the generator executable
# itself plus each runtime DLL that gets copied into eager_generator_path.
set(EAGER_CODEGEN_DEPS eager_generator)

# eager_generator_path is both the copy destination for the DLLs below and the
# directory eager_generator.exe is launched from. Non-Ninja generators get a
# per-build-type subdirectory appended.
if("${CMAKE_GENERATOR}" STREQUAL "Ninja")
  set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}")
else()
  set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}")
endif()

# BLAS runtime. The operands are quoted so an empty/undefined CBLAS_PROVIDER
# does not turn the condition into a syntax error, and the second branch is a
# real elseif(): `else(<condition>)` ignores its argument, which would register
# the OpenBLAS copy rule (and its extern_openblas dependency) for every
# provider other than MKLML.
if("${CBLAS_PROVIDER}" STREQUAL "MKLML")
  message("Copied libiomp5md.dll for Eager AutoCodeGen")
  add_custom_command(OUTPUT ${eager_generator_path}/libiomp5md.dll
    COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${eager_generator_path}
    DEPENDS mklml)
  list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/libiomp5md.dll)
elseif("${CBLAS_PROVIDER}" STREQUAL "EXTERN_OPENBLAS")
  message("Copied openblas.dll for Eager AutoCodeGen")
  add_custom_command(OUTPUT ${eager_generator_path}/openblas.dll
    COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_SHARED_LIB} ${eager_generator_path}
    DEPENDS extern_openblas)
  list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/openblas.dll)
endif()

# MKL-DNN runtime, only when the build enables it.
if(WITH_MKLDNN)
  message("Copied mkldnn.dll for Eager AutoCodeGen")
  add_custom_command(OUTPUT ${eager_generator_path}/mkldnn.dll
    COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${eager_generator_path}
    DEPENDS mkldnn)
  list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/mkldnn.dll)
endif()
add_custom_target(eager_codegen add_custom_target(eager_codegen
COMMAND "${CMAKE_CURRENT_BINARY_DIR}/eager_generator.exe" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" COMMAND "${eager_generator_path}/eager_generator.exe" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated"
DEPENDS eager_generator DEPENDS ${EAGER_CODEGEN_DEPS}
VERBATIM) VERBATIM)
else() else()
add_custom_target(eager_codegen add_custom_target(eager_codegen
......
add_subdirectory(data_structure_tests) add_subdirectory(data_structure_tests)
add_subdirectory(task_tests) add_subdirectory(task_tests)
# The performance benchmarks are only added when ON_INFER is off; their helper
# library links against ${generated_deps}, and the auto code generation that
# produces those targets is guarded by the same condition.
if(NOT ON_INFER)
add_subdirectory(performance_tests)
endif()
# Helper library shared by every benchmark test binary declared below.
cc_library(performance_benchmark_utils
  SRCS benchmark_utils.cc
  DEPS ${eager_deps} ${fluid_deps} ${generated_deps}
       eager_scale scale_node scale_op matmul_v2_op)

# One test binary per variant: benchmark_<variant>.cc is built into
# test_egr_performance_benchmark_<variant>, all with the same dependencies.
foreach(benchmark_variant eager_cpu fluid_cpu eager_cuda fluid_cuda)
  cc_test(test_egr_performance_benchmark_${benchmark_variant}
    SRCS benchmark_${benchmark_variant}.cc
    DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps})
endforeach()
...@@ -25,7 +25,7 @@ ...@@ -25,7 +25,7 @@
#include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" #include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h"
#include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/eager/tests/test_utils.h"
#ifdef WITH_GPERFTOOLS #ifdef WITH_GPERFTOOLS
...@@ -42,11 +42,11 @@ TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } ...@@ -42,11 +42,11 @@ TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }
TEST(Benchmark, EagerScaleCPU) { TEST(Benchmark, EagerScaleCPU) {
// Prepare Device Contexts // Prepare Device Contexts
egr::InitEnv(paddle::platform::CPUPlace()); eager_test::InitEnv(paddle::platform::CPUPlace());
for (const std::string& mode : {"Accuracy", "Performance"}) { for (const std::string& mode : {"Accuracy", "Performance"}) {
paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4});
egr::EagerTensor tensor = EagerUtils::CreateTensorWithValue( egr::EagerTensor tensor = CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, 5.0, true); pten::DataLayout::NCHW, 5.0, true);
RetainGradForTensor(tensor); RetainGradForTensor(tensor);
...@@ -78,20 +78,20 @@ TEST(Benchmark, EagerScaleCPU) { ...@@ -78,20 +78,20 @@ TEST(Benchmark, EagerScaleCPU) {
TEST(Benchmark, EagerIntermediateMatmulCPU) { TEST(Benchmark, EagerIntermediateMatmulCPU) {
// Prepare Device Contexts // Prepare Device Contexts
InitEnv(paddle::platform::CPUPlace()); eager_test::InitEnv(paddle::platform::CPUPlace());
auto tracer = std::make_shared<paddle::imperative::Tracer>(); auto tracer = std::make_shared<paddle::imperative::Tracer>();
paddle::imperative::SetCurrentTracer(tracer); paddle::imperative::SetCurrentTracer(tracer);
for (const std::string& mode : {"Accuracy", "Performance"}) { for (const std::string& mode : {"Accuracy", "Performance"}) {
paddle::framework::DDim ddimX = paddle::framework::make_ddim({2, 2}); paddle::framework::DDim ddimX = paddle::framework::make_ddim({2, 2});
egr::EagerTensor X = EagerUtils::CreateTensorWithValue( egr::EagerTensor X = CreateTensorWithValue(
ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, 1.0, true); pten::DataLayout::NCHW, 1.0, true);
RetainGradForTensor(X); RetainGradForTensor(X);
paddle::framework::DDim ddimY = paddle::framework::make_ddim({2, 2}); paddle::framework::DDim ddimY = paddle::framework::make_ddim({2, 2});
egr::EagerTensor Y = EagerUtils::CreateTensorWithValue( egr::EagerTensor Y = CreateTensorWithValue(
ddimY, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, ddimY, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, 2.0, true); pten::DataLayout::NCHW, 2.0, true);
RetainGradForTensor(Y); RetainGradForTensor(Y);
...@@ -122,7 +122,7 @@ TEST(Benchmark, EagerIntermediateMatmulCPU) { ...@@ -122,7 +122,7 @@ TEST(Benchmark, EagerIntermediateMatmulCPU) {
TEST(Benchmark, EagerIntermediateMLPCPU) { TEST(Benchmark, EagerIntermediateMLPCPU) {
// Prepare Device Contexts // Prepare Device Contexts
InitEnv(paddle::platform::CPUPlace()); eager_test::InitEnv(paddle::platform::CPUPlace());
auto tracer = std::make_shared<paddle::imperative::Tracer>(); auto tracer = std::make_shared<paddle::imperative::Tracer>();
paddle::imperative::SetCurrentTracer(tracer); paddle::imperative::SetCurrentTracer(tracer);
...@@ -130,7 +130,7 @@ TEST(Benchmark, EagerIntermediateMLPCPU) { ...@@ -130,7 +130,7 @@ TEST(Benchmark, EagerIntermediateMLPCPU) {
for (const std::string& mode : {"Accuracy", "Performance"}) { for (const std::string& mode : {"Accuracy", "Performance"}) {
paddle::framework::DDim ddimX = paddle::framework::DDim ddimX =
paddle::framework::make_ddim({MLP_M, MLP_N}); paddle::framework::make_ddim({MLP_M, MLP_N});
egr::EagerTensor X = EagerUtils::CreateTensorWithValue( egr::EagerTensor X = CreateTensorWithValue(
ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, MLP_X_VAL, true); pten::DataLayout::NCHW, MLP_X_VAL, true);
RetainGradForTensor(X); RetainGradForTensor(X);
...@@ -140,13 +140,13 @@ TEST(Benchmark, EagerIntermediateMLPCPU) { ...@@ -140,13 +140,13 @@ TEST(Benchmark, EagerIntermediateMLPCPU) {
for (size_t i = 0; i < MLP_NUM_LINEAR; i++) { for (size_t i = 0; i < MLP_NUM_LINEAR; i++) {
paddle::framework::DDim ddimW = paddle::framework::DDim ddimW =
paddle::framework::make_ddim({MLP_N, MLP_K}); paddle::framework::make_ddim({MLP_N, MLP_K});
egr::EagerTensor W = EagerUtils::CreateTensorWithValue( egr::EagerTensor W = CreateTensorWithValue(
ddimW, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, ddimW, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, MLP_W_VAL, true); pten::DataLayout::NCHW, MLP_W_VAL, true);
RetainGradForTensor(W); RetainGradForTensor(W);
paddle::framework::DDim ddimB = paddle::framework::make_ddim({MLP_K}); paddle::framework::DDim ddimB = paddle::framework::make_ddim({MLP_K});
egr::EagerTensor B = EagerUtils::CreateTensorWithValue( egr::EagerTensor B = CreateTensorWithValue(
ddimB, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, ddimB, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, MLP_B_VAL, true); pten::DataLayout::NCHW, MLP_B_VAL, true);
RetainGradForTensor(B); RetainGradForTensor(B);
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
#include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" #include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h"
#include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/eager/tests/test_utils.h"
#ifdef WITH_GPERFTOOLS #ifdef WITH_GPERFTOOLS
...@@ -38,12 +38,14 @@ DECLARE_bool(run_pten_kernel); ...@@ -38,12 +38,14 @@ DECLARE_bool(run_pten_kernel);
TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(Benchmark, EagerScaleCUDA) { TEST(Benchmark, EagerScaleCUDA) {
egr::InitEnv(paddle::platform::CUDAPlace()); eager_test::InitEnv(paddle::platform::CUDAPlace());
for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4});
egr::EagerTensor tensor = EagerUtils::CreateTensorWithValue( egr::EagerTensor tensor = CreateTensorWithValue(
ddim, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, ddim, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/);
RetainGradForTensor(tensor); RetainGradForTensor(tensor);
...@@ -77,7 +79,7 @@ TEST(Benchmark, EagerScaleCUDA) { ...@@ -77,7 +79,7 @@ TEST(Benchmark, EagerScaleCUDA) {
TEST(Benchmark, EagerIntermediateMatmulCUDA) { TEST(Benchmark, EagerIntermediateMatmulCUDA) {
paddle::platform::CUDAPlace place; paddle::platform::CUDAPlace place;
egr::InitEnv(place); eager_test::InitEnv(place);
auto tracer = std::make_shared<paddle::imperative::Tracer>(); auto tracer = std::make_shared<paddle::imperative::Tracer>();
tracer->SetExpectedPlace(place); tracer->SetExpectedPlace(place);
...@@ -85,13 +87,13 @@ TEST(Benchmark, EagerIntermediateMatmulCUDA) { ...@@ -85,13 +87,13 @@ TEST(Benchmark, EagerIntermediateMatmulCUDA) {
for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
paddle::framework::DDim ddimX = paddle::framework::make_ddim({2, 2}); paddle::framework::DDim ddimX = paddle::framework::make_ddim({2, 2});
egr::EagerTensor X = EagerUtils::CreateTensorWithValue( egr::EagerTensor X = CreateTensorWithValue(
ddimX, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, ddimX, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, 1.0, true); pten::DataLayout::NCHW, 1.0, true);
RetainGradForTensor(X); RetainGradForTensor(X);
paddle::framework::DDim ddimY = paddle::framework::make_ddim({2, 2}); paddle::framework::DDim ddimY = paddle::framework::make_ddim({2, 2});
egr::EagerTensor Y = EagerUtils::CreateTensorWithValue( egr::EagerTensor Y = CreateTensorWithValue(
ddimY, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, ddimY, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, 2.0, true); pten::DataLayout::NCHW, 2.0, true);
RetainGradForTensor(Y); RetainGradForTensor(Y);
...@@ -125,7 +127,7 @@ TEST(Benchmark, EagerIntermediateMatmulCUDA) { ...@@ -125,7 +127,7 @@ TEST(Benchmark, EagerIntermediateMatmulCUDA) {
TEST(Benchmark, EagerIntermediateMLPCUDA) { TEST(Benchmark, EagerIntermediateMLPCUDA) {
paddle::platform::CUDAPlace place; paddle::platform::CUDAPlace place;
egr::InitEnv(place); eager_test::InitEnv(place);
auto tracer = std::make_shared<paddle::imperative::Tracer>(); auto tracer = std::make_shared<paddle::imperative::Tracer>();
tracer->SetExpectedPlace(place); tracer->SetExpectedPlace(place);
...@@ -134,7 +136,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { ...@@ -134,7 +136,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) {
for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
paddle::framework::DDim ddimX = paddle::framework::DDim ddimX =
paddle::framework::make_ddim({MLP_M, MLP_N}); paddle::framework::make_ddim({MLP_M, MLP_N});
egr::EagerTensor X = EagerUtils::CreateTensorWithValue( egr::EagerTensor X = CreateTensorWithValue(
ddimX, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, ddimX, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, MLP_X_VAL, true); pten::DataLayout::NCHW, MLP_X_VAL, true);
RetainGradForTensor(X); RetainGradForTensor(X);
...@@ -144,13 +146,13 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { ...@@ -144,13 +146,13 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) {
for (size_t i = 0; i < MLP_NUM_LINEAR; i++) { for (size_t i = 0; i < MLP_NUM_LINEAR; i++) {
paddle::framework::DDim ddimW = paddle::framework::DDim ddimW =
paddle::framework::make_ddim({MLP_N, MLP_K}); paddle::framework::make_ddim({MLP_N, MLP_K});
egr::EagerTensor W = EagerUtils::CreateTensorWithValue( egr::EagerTensor W = CreateTensorWithValue(
ddimW, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, ddimW, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, MLP_W_VAL, true); pten::DataLayout::NCHW, MLP_W_VAL, true);
RetainGradForTensor(W); RetainGradForTensor(W);
paddle::framework::DDim ddimB = paddle::framework::make_ddim({MLP_K}); paddle::framework::DDim ddimB = paddle::framework::make_ddim({MLP_K});
egr::EagerTensor B = EagerUtils::CreateTensorWithValue( egr::EagerTensor B = CreateTensorWithValue(
ddimB, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, ddimB, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
pten::DataLayout::NCHW, MLP_B_VAL, true); pten::DataLayout::NCHW, MLP_B_VAL, true);
RetainGradForTensor(B); RetainGradForTensor(B);
...@@ -185,3 +187,5 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { ...@@ -185,3 +187,5 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) {
} }
} }
} }
#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
#include "glog/logging.h" #include "glog/logging.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" #include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h"
#include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/eager/tests/test_utils.h"
#include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/basic_engine.h"
#include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/tracer.h"
...@@ -45,7 +45,7 @@ namespace imperative { ...@@ -45,7 +45,7 @@ namespace imperative {
TEST(Benchmark, FluidScaleCPU) { TEST(Benchmark, FluidScaleCPU) {
// Prepare Device Contexts // Prepare Device Contexts
platform::CPUPlace place; platform::CPUPlace place;
egr::InitEnv(place); eager_test::InitEnv(place);
for (const std::string& mode : {"Accuracy", "Performance"}) { for (const std::string& mode : {"Accuracy", "Performance"}) {
std::shared_ptr<imperative::VarBase> X(new imperative::VarBase(true, "X")); std::shared_ptr<imperative::VarBase> X(new imperative::VarBase(true, "X"));
...@@ -88,7 +88,7 @@ TEST(Benchmark, FluidScaleCPU) { ...@@ -88,7 +88,7 @@ TEST(Benchmark, FluidScaleCPU) {
TEST(Benchmark, FluidMatmulCPU) { TEST(Benchmark, FluidMatmulCPU) {
// Prepare Device Contexts // Prepare Device Contexts
platform::CPUPlace place; platform::CPUPlace place;
egr::InitEnv(place); eager_test::InitEnv(place);
for (const std::string& mode : {"Accuracy", "Performance"}) { for (const std::string& mode : {"Accuracy", "Performance"}) {
std::shared_ptr<imperative::VarBase> X(new imperative::VarBase(true, "X")); std::shared_ptr<imperative::VarBase> X(new imperative::VarBase(true, "X"));
...@@ -141,7 +141,7 @@ TEST(Benchmark, FluidMatmulCPU) { ...@@ -141,7 +141,7 @@ TEST(Benchmark, FluidMatmulCPU) {
TEST(Benchmark, FluidMLPCPU) { TEST(Benchmark, FluidMLPCPU) {
// Prepare Device Contexts // Prepare Device Contexts
platform::CPUPlace place; platform::CPUPlace place;
egr::InitEnv(place); eager_test::InitEnv(place);
for (const std::string& mode : {"Accuracy", "Performance"}) { for (const std::string& mode : {"Accuracy", "Performance"}) {
std::vector<float> x_src_data(MLP_M * MLP_N, MLP_X_VAL); std::vector<float> x_src_data(MLP_M * MLP_N, MLP_X_VAL);
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
#include "glog/logging.h" #include "glog/logging.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" #include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h"
#include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/eager/tests/test_utils.h"
#include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/basic_engine.h"
#include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/tracer.h"
...@@ -39,13 +39,15 @@ DECLARE_bool(run_pten_kernel); ...@@ -39,13 +39,15 @@ DECLARE_bool(run_pten_kernel);
TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
namespace paddle { namespace paddle {
namespace imperative { namespace imperative {
TEST(Benchmark, FluidScaleCUDA) { TEST(Benchmark, FluidScaleCUDA) {
// Prepare Device Contexts // Prepare Device Contexts
platform::CUDAPlace place; platform::CUDAPlace place;
egr::InitEnv(place); eager_test::InitEnv(place);
for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
std::shared_ptr<imperative::VarBase> X(new imperative::VarBase(true, "X")); std::shared_ptr<imperative::VarBase> X(new imperative::VarBase(true, "X"));
...@@ -98,7 +100,7 @@ TEST(Benchmark, FluidScaleCUDA) { ...@@ -98,7 +100,7 @@ TEST(Benchmark, FluidScaleCUDA) {
TEST(Benchmark, FluidMatmulCUDA) { TEST(Benchmark, FluidMatmulCUDA) {
// Prepare Device Contexts // Prepare Device Contexts
platform::CUDAPlace place; platform::CUDAPlace place;
egr::InitEnv(place); eager_test::InitEnv(place);
for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
std::shared_ptr<imperative::VarBase> X(new imperative::VarBase(true, "X")); std::shared_ptr<imperative::VarBase> X(new imperative::VarBase(true, "X"));
...@@ -161,7 +163,7 @@ TEST(Benchmark, FluidMatmulCUDA) { ...@@ -161,7 +163,7 @@ TEST(Benchmark, FluidMatmulCUDA) {
TEST(Benchmark, FluidMLPCUDA) { TEST(Benchmark, FluidMLPCUDA) {
// Prepare Device Contexts // Prepare Device Contexts
platform::CUDAPlace place; platform::CUDAPlace place;
egr::InitEnv(place); eager_test::InitEnv(place);
for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool& pool =
...@@ -252,3 +254,5 @@ USE_OP(scale); ...@@ -252,3 +254,5 @@ USE_OP(scale);
USE_OP(matmul_v2); USE_OP(matmul_v2);
USE_OP(reduce_sum); USE_OP(reduce_sum);
USE_OP(reduce_sum_grad); USE_OP(reduce_sum_grad);
#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
...@@ -36,10 +36,6 @@ ...@@ -36,10 +36,6 @@
#include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h"
#include "paddle/pten/core/kernel_registry.h"
static size_t max_num_benchmark_runs = 5000; static size_t max_num_benchmark_runs = 5000;
namespace egr { namespace egr {
...@@ -64,9 +60,9 @@ void benchmark_eager_scale(const EagerTensor& tensor, bool accuracy_check) { ...@@ -64,9 +60,9 @@ void benchmark_eager_scale(const EagerTensor& tensor, bool accuracy_check) {
if (accuracy_check) { if (accuracy_check) {
// Examine Forward Grad (w.r.t max_num_runs = 10) // Examine Forward Grad (w.r.t max_num_runs = 10)
CompareTensorWithValue<float>(input_tensor, 8189.0); eager_test::CompareTensorWithValue<float>(input_tensor, 8189.0);
// Examine Backward Grad (w.r.t max_num_runs = 10) // Examine Backward Grad (w.r.t max_num_runs = 10)
CompareGradTensorWithValue<float>(tensor, 1024.0); eager_test::CompareGradTensorWithValue<float>(tensor, 1024.0);
} }
} }
...@@ -89,10 +85,10 @@ void benchmark_eager_intermediate_matmul(const EagerTensor& X, ...@@ -89,10 +85,10 @@ void benchmark_eager_intermediate_matmul(const EagerTensor& X,
if (accuracy_check) { if (accuracy_check) {
// Examine Forward Grad (w.r.t max_num_runs = 2) // Examine Forward Grad (w.r.t max_num_runs = 2)
CompareVariableWithValue<float>(input_tensor0, 16); eager_test::CompareVariableWithValue<float>(input_tensor0, 16);
// Examine Backward Grad (w.r.t max_num_runs = 2) // Examine Backward Grad (w.r.t max_num_runs = 2)
CompareGradVariableWithValue<float>(X, 16); eager_test::CompareGradVariableWithValue<float>(X, 16);
CompareGradVariableWithValue<float>(Y, 16); eager_test::CompareGradVariableWithValue<float>(Y, 16);
} }
} }
...@@ -122,11 +118,11 @@ void benchmark_eager_intermediate_mlp(const EagerTensor& X, ...@@ -122,11 +118,11 @@ void benchmark_eager_intermediate_mlp(const EagerTensor& X,
compute_mlp_expected_results(); compute_mlp_expected_results();
// Examine Forward Grad (w.r.t max_num_runs = 2) // Examine Forward Grad (w.r.t max_num_runs = 2)
CompareVariableWithValue<float>(Out, result["Out"]); eager_test::CompareVariableWithValue<float>(Out, result["Out"]);
// Examine Backward Grad (w.r.t max_num_runs = 2) // Examine Backward Grad (w.r.t max_num_runs = 2)
CompareGradVariableWithValue<float>(X, result["GradX"]); eager_test::CompareGradVariableWithValue<float>(X, result["GradX"]);
CompareGradVariableWithValue<float>(Ws[0], result["GradW"]); eager_test::CompareGradVariableWithValue<float>(Ws[0], result["GradW"]);
} }
} }
...@@ -141,6 +137,8 @@ static void FluidCheckTensorValue(const std::shared_ptr<imperative::VarBase>& X, ...@@ -141,6 +137,8 @@ static void FluidCheckTensorValue(const std::shared_ptr<imperative::VarBase>& X,
auto* tensor = X->MutableVar()->GetMutable<framework::LoDTensor>(); auto* tensor = X->MutableVar()->GetMutable<framework::LoDTensor>();
float* t_ptr = tensor->mutable_data<float>(place); float* t_ptr = tensor->mutable_data<float>(place);
std::vector<float> host_data(tensor->numel()); std::vector<float> host_data(tensor->numel());
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (place == paddle::platform::CUDAPlace()) { if (place == paddle::platform::CUDAPlace()) {
paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance(); paddle::platform::DeviceContextPool::Instance();
...@@ -153,6 +151,8 @@ static void FluidCheckTensorValue(const std::shared_ptr<imperative::VarBase>& X, ...@@ -153,6 +151,8 @@ static void FluidCheckTensorValue(const std::shared_ptr<imperative::VarBase>& X,
sizeof(float) * tensor->numel(), stream); sizeof(float) * tensor->numel(), stream);
t_ptr = host_data.data(); t_ptr = host_data.data();
} }
#endif
VLOG(6) << "Tensor Value: " << t_ptr[0] << ", Expected Value: " << value; VLOG(6) << "Tensor Value: " << t_ptr[0] << ", Expected Value: " << value;
PADDLE_ENFORCE( PADDLE_ENFORCE(
t_ptr[0] == value, t_ptr[0] == value,
...@@ -166,6 +166,8 @@ static void FluidCheckGradTensorValue( ...@@ -166,6 +166,8 @@ static void FluidCheckGradTensorValue(
auto* grad_tensor = X->MutableGradVar()->GetMutable<framework::LoDTensor>(); auto* grad_tensor = X->MutableGradVar()->GetMutable<framework::LoDTensor>();
float* g_ptr = grad_tensor->mutable_data<float>(place); float* g_ptr = grad_tensor->mutable_data<float>(place);
std::vector<float> g_host_data(grad_tensor->numel()); std::vector<float> g_host_data(grad_tensor->numel());
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (place == paddle::platform::CUDAPlace()) { if (place == paddle::platform::CUDAPlace()) {
paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance(); paddle::platform::DeviceContextPool::Instance();
...@@ -178,6 +180,8 @@ static void FluidCheckGradTensorValue( ...@@ -178,6 +180,8 @@ static void FluidCheckGradTensorValue(
sizeof(float) * grad_tensor->numel(), stream); sizeof(float) * grad_tensor->numel(), stream);
g_ptr = g_host_data.data(); g_ptr = g_host_data.data();
} }
#endif
VLOG(6) << "Tensor Value: " << g_ptr[0] << ", Expected Value: " << value; VLOG(6) << "Tensor Value: " << g_ptr[0] << ", Expected Value: " << value;
PADDLE_ENFORCE( PADDLE_ENFORCE(
g_ptr[0] == value, g_ptr[0] == value,
......
...@@ -6,6 +6,6 @@ cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} ea ...@@ -6,6 +6,6 @@ cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} ea
cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
if(NOT DEFINED ON_INFER) if(NOT ON_INFER)
cc_test(test_egr_task_autocodegen SRCS generated_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps}) cc_test(test_egr_task_autocodegen SRCS generated_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps})
endif() endif()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册