diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 698a698fc6d18492faac771e6e0e079a35953504..f9d1b705390cb1c22bf9336292af30363c0010cf 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,6 +1,6 @@ set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) -set(generated_deps dygraph_function dygraph_node) +set(generated_deps final_dygraph_function final_dygraph_node dygraph_function dygraph_node) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) message("Performing Eager Dygraph Auto Code Generation") diff --git a/paddle/fluid/eager/autograd_meta.h b/paddle/fluid/eager/autograd_meta.h index 9e1dc4f2c8c6ba5c1c7d0c49e5d141d1a6c4c6d3..dca76d3b8a0db8c4284960005bfbad33ce23e20d 100644 --- a/paddle/fluid/eager/autograd_meta.h +++ b/paddle/fluid/eager/autograd_meta.h @@ -145,8 +145,7 @@ class AutogradMeta : public AbstractAutogradMeta { private: // TODO(jiabin) :Should we use pointer instead of object? std::shared_ptr grad_{ - std::make_shared( - egr::Controller::Instance().GenerateUniqueName("@grad"))}; + std::make_shared()}; // GradNodeBase is base class of all grad op which is a // wrapper for grad op. 
This class will make grad op easy
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
index af365322e606ebfaecb7233751cacc6aa1aac423..adb3246ee8c808c9f62fde0228f40cccb2f9ac88 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
@@ -80,6 +80,54 @@ TEST(Benchmark, EagerScaleCPU) {
   }
 }
 
+// Runs benchmark_eager_matmul on CPU in two passes: "Accuracy" (the helper
+// is called with accuracy_check = true) and "Performance" (the call is timed
+// and, when WITH_GPERFTOOLS is defined, profiled). Inputs are rebuilt per pass.
+TEST(Benchmark, EagerMatmulCPU) {
+  // Prepare Device Contexts
+  eager_test::InitEnv(paddle::platform::CPUPlace());
+
+  for (const std::string& mode : {"Accuracy", "Performance"}) {
+    // Left operand: shape {2, 2}, FLOAT32, created with value 1.0
+    paddle::framework::DDim ddimX = phi::make_ddim({2, 2});
+    paddle::experimental::Tensor X = CreateTensorWithValue(
+        ddimX, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
+        phi::DataLayout::NCHW, 1.0, true);
+    RetainGradForTensor(X);
+
+    // Right operand: shape {2, 2}, FLOAT32, created with value 2.0
+    paddle::framework::DDim ddimY = phi::make_ddim({2, 2});
+    paddle::experimental::Tensor Y = CreateTensorWithValue(
+        ddimY, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
+        phi::DataLayout::NCHW, 2.0, true);
+    RetainGradForTensor(Y);
+
+    if (mode == "Accuracy") {
+      benchmark_eager_matmul(X, Y, true /* accuracy_check */);
+
+    } else if (mode == "Performance") {
+      auto t_start = std::chrono::high_resolution_clock::now();
+#ifdef WITH_GPERFTOOLS
+      ProfilerStart("eager_matmul_cpu.out");
+#endif
+      benchmark_eager_matmul(X, Y);
+
+#ifdef WITH_GPERFTOOLS
+      ProfilerStop();
+#endif
+      auto t_end = std::chrono::high_resolution_clock::now();
+      // Convert explicitly to floating-point milliseconds so the value
+      // matches the "ms" unit printed below.
+      double elapsed_time_ms =
+          std::chrono::duration<double, std::milli>(t_end - t_start).count();
+      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;
+
+    } else {
+      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
+    }
+  }
+}
+
 TEST(Benchmark, EagerIntermediateMatmulCPU) {
   // Prepare Device Contexts
   eager_test::InitEnv(paddle::platform::CPUPlace());
diff --git
a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
index 5b75f1242e69bc5b37dd97467b7c55bfc6bc3871..bd70e84d9b461490f53ac6692d55860da1bfc9d8 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
@@ -82,6 +82,60 @@ TEST(Benchmark, EagerScaleCUDA) {
   }
 }
 
+// Runs benchmark_eager_matmul on CUDA in three passes: "Accuracy" (helper is
+// called with accuracy_check = true), "WarmUp" (untimed) and "Performance"
+// (timed; profiled when WITH_GPERFTOOLS is defined). Inputs are rebuilt per pass.
+TEST(Benchmark, EagerMatmulCUDA) {
+  paddle::platform::CUDAPlace place;
+  eager_test::InitEnv(place);
+
+  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
+    // Left operand: shape {2, 2}, FLOAT32, created with value 1.0
+    paddle::framework::DDim ddimX = phi::make_ddim({2, 2});
+    paddle::experimental::Tensor X = CreateTensorWithValue(
+        ddimX, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32,
+        phi::DataLayout::NCHW, 1.0, true);
+    RetainGradForTensor(X);
+
+    // Right operand: shape {2, 2}, FLOAT32, created with value 2.0
+    paddle::framework::DDim ddimY = phi::make_ddim({2, 2});
+    paddle::experimental::Tensor Y = CreateTensorWithValue(
+        ddimY, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32,
+        phi::DataLayout::NCHW, 2.0, true);
+    RetainGradForTensor(Y);
+
+    if (mode == "Accuracy") {
+      benchmark_eager_matmul(X, Y, true /* accuracy_check */);
+
+    } else if (mode == "WarmUp") {
+      // Untimed run that precedes the measured "Performance" pass.
+      benchmark_eager_matmul(X, Y);
+
+    } else if (mode == "Performance") {
+      auto t_start = std::chrono::high_resolution_clock::now();
+#ifdef WITH_GPERFTOOLS
+      ProfilerStart("eager_matmul_cuda.out");
+#endif
+      benchmark_eager_matmul(X, Y);
+
+#ifdef WITH_GPERFTOOLS
+      ProfilerStop();
+#endif
+      // NOTE(review): no device synchronization is visible before t_end;
+      // confirm benchmark_eager_matmul blocks until the GPU work is done.
+      auto t_end = std::chrono::high_resolution_clock::now();
+      // Convert explicitly to floating-point milliseconds so the value
+      // matches the "ms" unit printed below.
+      double elapsed_time_ms =
+          std::chrono::duration<double, std::milli>(t_end - t_start).count();
+      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;
+
+    } else {
+      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
+    }
+  }
+}
+
 TEST(Benchmark, EagerIntermediateMatmulCUDA) {
   paddle::platform::CUDAPlace place;
   eager_test::InitEnv(place);
diff --git
a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc
index 96126fa5466aace442dfb742f9902539916b853e..769bd7f687f4584d44bbfa30b73611a3128289bf 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc
@@ -28,6 +28,7 @@
 #include "paddle/fluid/eager/utils.h"
 
 // Eager Generated
+#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
 #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
 
 // Fluid
@@ -67,6 +68,38 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor,
   }
 }
 
+// Chains matmul_final_state_dygraph_function over X and Y, then calls
+// RunBackward on the final result.
+//   X, Y:           operands; X seeds the chain and Y is the right-hand
+//                   operand of every step.
+//   accuracy_check: when true, only 2 steps are run and the output and the
+//                   gradients of X and Y are compared against 16; otherwise
+//                   max_num_benchmark_runs steps are run with no checks.
+void benchmark_eager_matmul(const paddle::experimental::Tensor& X,
+                            const paddle::experimental::Tensor& Y,
+                            bool accuracy_check) {
+  paddle::experimental::Tensor input_tensor0 = X;
+
+  size_t max_num_runs = accuracy_check ? 2 : max_num_benchmark_runs;
+  for (size_t i = 0; i < max_num_runs; i++) {
+    input_tensor0 =
+        matmul_final_state_dygraph_function(input_tensor0, Y, false, false);
+  }
+
+  // Spell out the element type rather than relying on C++17 class template
+  // argument deduction.
+  std::vector<paddle::experimental::Tensor> target_tensors = {input_tensor0};
+  RunBackward(target_tensors, {});
+
+  if (accuracy_check) {
+    // Examine Forward Output (w.r.t max_num_runs = 2)
+    eager_test::CompareTensorWithValue(input_tensor0, 16);
+    // Examine Backward Grad (w.r.t max_num_runs = 2)
+    eager_test::CompareGradTensorWithValue(X, 16);
+    eager_test::CompareGradTensorWithValue(Y, 16);
+  }
+}
+
 /* ----------------------------------- */
 /* ---- Eager Intermediate Matmul ---- */
 /* ----------------------------------- */
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h
index 0086b51b57e152c6da935eacba8d93c0d6ab1a71..86bf13707ed40b0c37ccb54695cca3d165768cb6 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h
@@ -51,15 +51,12 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor,
                            bool accuracy_check = false);
 
 /* ---- Eager MatMul ---- */
-/*
-void benchmark_eager_matmul(const paddle::experimental::Tensor& X, const
-paddle::experimental::Tensor& Y,
+// Repeated-matmul forward/backward benchmark over X and Y; when
+// accuracy_check is true the results are also compared to expected values.
+void benchmark_eager_matmul(const paddle::experimental::Tensor& X,
+                            const paddle::experimental::Tensor& Y,
                             bool accuracy_check = false);
-void benchmark_eager_mlp(const paddle::experimental::Tensor& X,
-                         const std::vector& Ws,
-                         const std::vector& Bs,
-                         bool accuracy_check = false);
-*/
+
 void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X,
                                          const paddle::experimental::Tensor& Y,
                                          bool accuracy_check = false);