未验证 提交 6d830f6c 编写于 作者: Z Zhanlue Yang 提交者: GitHub

Added Final State Matmul_v2 to C++ performance test (#40391)

上级 47459e98
set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node)
set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy)
set(generated_deps dygraph_function dygraph_node) set(generated_deps final_dygraph_function final_dygraph_node dygraph_function dygraph_node)
if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
message("Performing Eager Dygraph Auto Code Generation") message("Performing Eager Dygraph Auto Code Generation")
......
...@@ -145,8 +145,7 @@ class AutogradMeta : public AbstractAutogradMeta { ...@@ -145,8 +145,7 @@ class AutogradMeta : public AbstractAutogradMeta {
private: private:
// TODO(jiabin) :Should we use pointer instead of object? // TODO(jiabin) :Should we use pointer instead of object?
std::shared_ptr<paddle::experimental::Tensor> grad_{ std::shared_ptr<paddle::experimental::Tensor> grad_{
std::make_shared<paddle::experimental::Tensor>( std::make_shared<paddle::experimental::Tensor>()};
egr::Controller::Instance().GenerateUniqueName("@grad"))};
// GradNodeBase is base class of all grad op which is a // GradNodeBase is base class of all grad op which is a
// wrapper for grad op. This class will make grad op easy // wrapper for grad op. This class will make grad op easy
......
...@@ -80,6 +80,47 @@ TEST(Benchmark, EagerScaleCPU) { ...@@ -80,6 +80,47 @@ TEST(Benchmark, EagerScaleCPU) {
} }
} }
// Benchmark of the eager matmul path on CPU. The loop runs once per mode:
// "Accuracy" validates results, "Performance" reports wall-clock duration.
TEST(Benchmark, EagerMatmulCPU) {
  using BenchClock = std::chrono::high_resolution_clock;

  // Prepare Device Contexts
  eager_test::InitEnv(paddle::platform::CPUPlace());

  for (const std::string& mode : {"Accuracy", "Performance"}) {
    // Operands are rebuilt for every mode so each pass starts from fresh
    // tensors: X is 2x2 created with value 1.0, Y is 2x2 created with 2.0.
    paddle::framework::DDim x_dims = phi::make_ddim({2, 2});
    paddle::experimental::Tensor X = CreateTensorWithValue(
        x_dims, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
        phi::DataLayout::NCHW, 1.0, true);
    RetainGradForTensor(X);

    paddle::framework::DDim y_dims = phi::make_ddim({2, 2});
    paddle::experimental::Tensor Y = CreateTensorWithValue(
        y_dims, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
        phi::DataLayout::NCHW, 2.0, true);
    RetainGradForTensor(Y);

    const bool is_accuracy_pass = (mode == "Accuracy");
    const bool is_timed_pass = (mode == "Performance");

    if (is_accuracy_pass) {
      benchmark_eager_matmul(X, Y, true /* accuracy_check */);
    } else if (is_timed_pass) {
      // When gperftools is enabled, starting and stopping the profiler falls
      // inside the timed region.
      const auto started_at = BenchClock::now();
#ifdef WITH_GPERFTOOLS
      ProfilerStart("eager_matmul_cpu.out");
#endif
      benchmark_eager_matmul(X, Y);
#ifdef WITH_GPERFTOOLS
      ProfilerStop();
#endif
      const auto finished_at = BenchClock::now();

      const std::chrono::duration<double, std::milli> elapsed =
          finished_at - started_at;
      double elapsed_time_ms = elapsed.count();
      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;
    } else {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }
  }
}
TEST(Benchmark, EagerIntermediateMatmulCPU) { TEST(Benchmark, EagerIntermediateMatmulCPU) {
// Prepare Device Contexts // Prepare Device Contexts
eager_test::InitEnv(paddle::platform::CPUPlace()); eager_test::InitEnv(paddle::platform::CPUPlace());
......
...@@ -82,6 +82,50 @@ TEST(Benchmark, EagerScaleCUDA) { ...@@ -82,6 +82,50 @@ TEST(Benchmark, EagerScaleCUDA) {
} }
} }
// Benchmark of the eager matmul path on CUDA. The loop runs once per mode:
// "Accuracy" validates results, "WarmUp" runs the benchmark untimed, and
// "Performance" reports wall-clock duration.
TEST(Benchmark, EagerMatmulCUDA) {
  using BenchClock = std::chrono::high_resolution_clock;

  paddle::platform::CUDAPlace place;
  eager_test::InitEnv(place);

  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
    // Operands are rebuilt for every mode so each pass starts from fresh
    // tensors: X is 2x2 created with value 1.0, Y is 2x2 created with 2.0.
    paddle::framework::DDim x_dims = phi::make_ddim({2, 2});
    paddle::experimental::Tensor X = CreateTensorWithValue(
        x_dims, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32,
        phi::DataLayout::NCHW, 1.0, true);
    RetainGradForTensor(X);

    paddle::framework::DDim y_dims = phi::make_ddim({2, 2});
    paddle::experimental::Tensor Y = CreateTensorWithValue(
        y_dims, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32,
        phi::DataLayout::NCHW, 2.0, true);
    RetainGradForTensor(Y);

    const bool is_accuracy_pass = (mode == "Accuracy");
    const bool is_warmup_pass = (mode == "WarmUp");
    const bool is_timed_pass = (mode == "Performance");

    if (is_accuracy_pass) {
      benchmark_eager_matmul(X, Y, true /* accuracy_check */);
    } else if (is_warmup_pass) {
      // Untimed run executed before the measured pass.
      benchmark_eager_matmul(X, Y);
    } else if (is_timed_pass) {
      // When gperftools is enabled, starting and stopping the profiler falls
      // inside the timed region.
      const auto started_at = BenchClock::now();
#ifdef WITH_GPERFTOOLS
      ProfilerStart("eager_matmul_cuda.out");
#endif
      benchmark_eager_matmul(X, Y);
#ifdef WITH_GPERFTOOLS
      ProfilerStop();
#endif
      const auto finished_at = BenchClock::now();

      const std::chrono::duration<double, std::milli> elapsed =
          finished_at - started_at;
      double elapsed_time_ms = elapsed.count();
      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;
    } else {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }
  }
}
TEST(Benchmark, EagerIntermediateMatmulCUDA) { TEST(Benchmark, EagerIntermediateMatmulCUDA) {
paddle::platform::CUDAPlace place; paddle::platform::CUDAPlace place;
eager_test::InitEnv(place); eager_test::InitEnv(place);
......
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#include "paddle/fluid/eager/utils.h" #include "paddle/fluid/eager/utils.h"
// Eager Generated // Eager Generated
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
// Fluid // Fluid
...@@ -67,6 +68,29 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, ...@@ -67,6 +68,29 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor,
} }
} }
// Chains matmul_final_state_dygraph_function calls, feeding each result back
// in as the left operand with Y as the right operand, then runs backward from
// the final result.
//
// X              : initial left operand.
// Y              : right operand reused on every iteration.
// accuracy_check : when true, only two iterations run and the output and the
//                  grads of X and Y are compared against fixed expected values.
void benchmark_eager_matmul(const paddle::experimental::Tensor& X,
                            const paddle::experimental::Tensor& Y,
                            bool accuracy_check) {
  // The expected values checked below are only valid for exactly two chained
  // matmuls, so accuracy mode pins the iteration count.
  size_t num_iterations;
  if (accuracy_check) {
    num_iterations = 2;
  } else {
    num_iterations = max_num_benchmark_runs;
  }

  paddle::experimental::Tensor running_product = X;
  for (size_t iter = 0; iter != num_iterations; ++iter) {
    // NOTE(review): the two `false` arguments are presumably the transpose
    // flags for each operand; confirm against the generated signature.
    running_product =
        matmul_final_state_dygraph_function(running_product, Y, false, false);
  }

  // Backward starts from the last product; the second argument is left empty.
  std::vector<paddle::experimental::Tensor> backward_roots = {running_product};
  RunBackward(backward_roots, {});

  if (!accuracy_check) {
    return;
  }

  // Forward result after the two accuracy-mode iterations.
  eager_test::CompareTensorWithValue<float>(running_product, 16);
  // Gradients of both inputs after the two accuracy-mode iterations.
  eager_test::CompareGradTensorWithValue<float>(X, 16);
  eager_test::CompareGradTensorWithValue<float>(Y, 16);
}
/* ----------------------------------- */ /* ----------------------------------- */
/* ---- Eager Intermediate Matmul ---- */ /* ---- Eager Intermediate Matmul ---- */
/* ----------------------------------- */ /* ----------------------------------- */
......
...@@ -51,15 +51,10 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, ...@@ -51,15 +51,10 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor,
bool accuracy_check = false); bool accuracy_check = false);
/* ---- Eager MatMul ---- */ /* ---- Eager MatMul ---- */
/* void benchmark_eager_matmul(const paddle::experimental::Tensor& X,
void benchmark_eager_matmul(const paddle::experimental::Tensor& X, const const paddle::experimental::Tensor& Y,
paddle::experimental::Tensor& Y,
bool accuracy_check = false); bool accuracy_check = false);
void benchmark_eager_mlp(const paddle::experimental::Tensor& X,
const std::vector<paddle::experimental::Tensor>& Ws,
const std::vector<paddle::experimental::Tensor>& Bs,
bool accuracy_check = false);
*/
void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X, void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X,
const paddle::experimental::Tensor& Y, const paddle::experimental::Tensor& Y,
bool accuracy_check = false); bool accuracy_check = false);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册