未验证 提交 87b97776 编写于 作者: Z Zhanlue Yang 提交者: GitHub

Added performance benchmarks for Eager Dygraph (#37643)

上级 51804e4d
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Eager Dygraph
#include <chrono>
#include "gtest/gtest.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/backward.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h"
#include "paddle/fluid/eager/tests/test_utils.h"
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
// TODO(jiabin): remove nolint here!!!
using namespace egr; // NOLINT
// Disable pten path
DECLARE_bool(run_pten_kernel);
// Disable the experimental pten kernel path so every benchmark in this file
// exercises the legacy fluid kernels. Runs first as its own TEST case.
TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }
TEST(Benchmark, EagerScaleCPU) {
  // Initialize the CPU device context before running any eager ops.
  egr::InitEnv(paddle::platform::CPUPlace());

  for (const std::string& mode : {"Accuracy", "Performance"}) {
    // A fresh leaf tensor ([2, 4, 4, 4], all elements 5.0) is built per mode
    // so the two runs do not share gradient state.
    paddle::framework::DDim dims = paddle::framework::make_ddim({2, 4, 4, 4});
    egr::EagerTensor input = EagerUtils::CreateTensorWithValue(
        dims, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
        pten::DataLayout::NCHW, 5.0, true);
    RetainGradForTensor(input);

    if (mode == "Performance") {
      // Time the benchmark body; optionally wrap it in a gperftools profile.
      const auto start = std::chrono::high_resolution_clock::now();
#ifdef WITH_GPERFTOOLS
      ProfilerStart("eager_scale_cpu.out");
#endif
      benchmark_eager_scale(input);
#ifdef WITH_GPERFTOOLS
      ProfilerStop();
#endif
      const auto finish = std::chrono::high_resolution_clock::now();
      const double duration_ms =
          std::chrono::duration<double, std::milli>(finish - start).count();
      std::cout << "Duration: " << duration_ms << " ms" << std::endl;
    } else if (mode == "Accuracy") {
      // Verify numeric results rather than timing.
      benchmark_eager_scale(input, true /* accuracy_check */);
    } else {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }
  }
}
// Benchmarks the eager intermediate matmul path on CPU in two modes:
// "Accuracy" (checks results) and "Performance" (wall-clock timing).
TEST(Benchmark, EagerIntermediateMatmulCPU) {
  // Prepare Device Contexts
  InitEnv(paddle::platform::CPUPlace());

  // The intermediate path dispatches through the imperative tracer, so a
  // current tracer must be installed before running the benchmark.
  auto tracer = std::make_shared<paddle::imperative::Tracer>();
  paddle::imperative::SetCurrentTracer(tracer);

  for (const std::string& mode : {"Accuracy", "Performance"}) {
    // X: [2, 2] filled with 1.0; Y: [2, 2] filled with 2.0. Both are leaf
    // tensors whose gradients are retained for the backward pass.
    paddle::framework::DDim ddimX = paddle::framework::make_ddim({2, 2});
    egr::EagerTensor X = EagerUtils::CreateTensorWithValue(
        ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
        pten::DataLayout::NCHW, 1.0, true);
    RetainGradForTensor(X);

    paddle::framework::DDim ddimY = paddle::framework::make_ddim({2, 2});
    egr::EagerTensor Y = EagerUtils::CreateTensorWithValue(
        ddimY, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
        pten::DataLayout::NCHW, 2.0, true);
    RetainGradForTensor(Y);

    if (mode == "Accuracy") {
      // Check numerical results instead of timing.
      benchmark_eager_intermediate_matmul(X, Y, true /* accuracy_check */);
    } else if (mode == "Performance") {
      auto t_start = std::chrono::high_resolution_clock::now();
#ifdef WITH_GPERFTOOLS
      // Dump a CPU profile of the timed region when gperftools is enabled.
      ProfilerStart("eager_intermediate_matmul_cpu.out");
#endif
      benchmark_eager_intermediate_matmul(X, Y);
#ifdef WITH_GPERFTOOLS
      ProfilerStop();
#endif
      auto t_end = std::chrono::high_resolution_clock::now();
      double elapsed_time_ms =
          std::chrono::duration<double, std::milli>(t_end - t_start).count();
      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;
    } else {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }
  }
}
// Benchmarks an MLP built from MLP_NUM_LINEAR linear layers on the eager
// intermediate path (CPU). MLP_* constants are presumably defined in
// benchmark_utils.h — confirm when touching the shapes.
TEST(Benchmark, EagerIntermediateMLPCPU) {
  // Prepare Device Contexts
  InitEnv(paddle::platform::CPUPlace());

  // The intermediate path dispatches through the imperative tracer.
  auto tracer = std::make_shared<paddle::imperative::Tracer>();
  paddle::imperative::SetCurrentTracer(tracer);

  for (const std::string& mode : {"Accuracy", "Performance"}) {
    // Input X: [MLP_M, MLP_N] filled with MLP_X_VAL; a trainable leaf tensor.
    paddle::framework::DDim ddimX =
        paddle::framework::make_ddim({MLP_M, MLP_N});
    egr::EagerTensor X = EagerUtils::CreateTensorWithValue(
        ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
        pten::DataLayout::NCHW, MLP_X_VAL, true);
    RetainGradForTensor(X);

    // One weight [MLP_N, MLP_K] and one bias [MLP_K] per linear layer.
    std::vector<EagerTensor> Ws;
    std::vector<EagerTensor> Bs;
    for (size_t i = 0; i < MLP_NUM_LINEAR; i++) {
      paddle::framework::DDim ddimW =
          paddle::framework::make_ddim({MLP_N, MLP_K});
      egr::EagerTensor W = EagerUtils::CreateTensorWithValue(
          ddimW, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
          pten::DataLayout::NCHW, MLP_W_VAL, true);
      RetainGradForTensor(W);

      paddle::framework::DDim ddimB = paddle::framework::make_ddim({MLP_K});
      egr::EagerTensor B = EagerUtils::CreateTensorWithValue(
          ddimB, paddle::platform::CPUPlace(), pten::DataType::FLOAT32,
          pten::DataLayout::NCHW, MLP_B_VAL, true);
      RetainGradForTensor(B);

      Ws.emplace_back(std::move(W));
      Bs.emplace_back(std::move(B));
    }

    if (mode == "Accuracy") {
      benchmark_eager_intermediate_mlp(X, Ws, Bs, true /* accuracy_check */);
    } else if (mode == "Performance") {
      auto t_start = std::chrono::high_resolution_clock::now();
#ifdef WITH_GPERFTOOLS
      ProfilerStart("eager_intermediate_mlp_cpu.out");
#endif
      benchmark_eager_intermediate_mlp(X, Ws, Bs);
#ifdef WITH_GPERFTOOLS
      ProfilerStop();
#endif
      auto t_end = std::chrono::high_resolution_clock::now();
      double elapsed_time_ms =
          std::chrono::duration<double, std::milli>(t_end - t_start).count();
      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;
    } else {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }
  }
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Eager Dygraph
#include <chrono>
#include "gtest/gtest.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/fluid/eager/api/all.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/backward.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h"
#include "paddle/fluid/eager/tests/test_utils.h"
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
// TODO(jiabin): remove nolint here!!!
using namespace egr; // NOLINT
DECLARE_bool(run_pten_kernel);
// Disable the experimental pten kernel path so every benchmark in this file
// exercises the legacy fluid kernels. Runs first as its own TEST case.
TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }
// Benchmarks eager-mode scale on CUDA. Unlike the CPU variant, a "WarmUp"
// mode runs the workload once untimed so one-time CUDA setup cost does not
// pollute the "Performance" measurement.
TEST(Benchmark, EagerScaleCUDA) {
  egr::InitEnv(paddle::platform::CUDAPlace());

  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
    // Fresh leaf tensor ([2, 4, 4, 4], all 5.0) on the GPU for each mode.
    paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4});
    egr::EagerTensor tensor = EagerUtils::CreateTensorWithValue(
        ddim, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
        pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/);
    RetainGradForTensor(tensor);

    if (mode == "Accuracy") {
      benchmark_eager_scale(tensor, true /* accuracy_check */);
    } else if (mode == "WarmUp") {
      // Untimed run to absorb one-time initialization.
      benchmark_eager_scale(tensor);
    } else if (mode == "Performance") {
      auto t_start = std::chrono::high_resolution_clock::now();
#ifdef WITH_GPERFTOOLS
      ProfilerStart("eager_scale_cuda.out");
#endif
      benchmark_eager_scale(tensor);
#ifdef WITH_GPERFTOOLS
      ProfilerStop();
#endif
      // NOTE(review): the timer is read without an explicit device
      // synchronize — presumably the helper blocks until GPU work completes;
      // confirm, otherwise queued kernels are excluded from the duration.
      auto t_end = std::chrono::high_resolution_clock::now();
      double elapsed_time_ms =
          std::chrono::duration<double, std::milli>(t_end - t_start).count();
      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;
    } else {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }
  }
}
// Benchmarks the eager intermediate matmul path on CUDA with an extra
// untimed "WarmUp" run before the timed "Performance" run.
TEST(Benchmark, EagerIntermediateMatmulCUDA) {
  paddle::platform::CUDAPlace place;
  egr::InitEnv(place);

  // The tracer must target the CUDA place so traced ops run on the GPU.
  auto tracer = std::make_shared<paddle::imperative::Tracer>();
  tracer->SetExpectedPlace(place);
  paddle::imperative::SetCurrentTracer(tracer);

  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
    // X: [2, 2] of 1.0 and Y: [2, 2] of 2.0; both trainable leaf tensors.
    paddle::framework::DDim ddimX = paddle::framework::make_ddim({2, 2});
    egr::EagerTensor X = EagerUtils::CreateTensorWithValue(
        ddimX, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
        pten::DataLayout::NCHW, 1.0, true);
    RetainGradForTensor(X);

    paddle::framework::DDim ddimY = paddle::framework::make_ddim({2, 2});
    egr::EagerTensor Y = EagerUtils::CreateTensorWithValue(
        ddimY, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
        pten::DataLayout::NCHW, 2.0, true);
    RetainGradForTensor(Y);

    if (mode == "Accuracy") {
      benchmark_eager_intermediate_matmul(X, Y, true /* accuracy_check */);
    } else if (mode == "WarmUp") {
      // Untimed run to absorb one-time CUDA initialization.
      benchmark_eager_intermediate_matmul(X, Y);
    } else if (mode == "Performance") {
      auto t_start = std::chrono::high_resolution_clock::now();
#ifdef WITH_GPERFTOOLS
      ProfilerStart("eager_intermediate_matmul_cuda.out");
#endif
      benchmark_eager_intermediate_matmul(X, Y);
#ifdef WITH_GPERFTOOLS
      ProfilerStop();
#endif
      // NOTE(review): timer is read with no explicit device sync — confirm
      // the helper blocks on GPU completion.
      auto t_end = std::chrono::high_resolution_clock::now();
      double elapsed_time_ms =
          std::chrono::duration<double, std::milli>(t_end - t_start).count();
      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;
    } else {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }
  }
}
// Benchmarks an MLP of MLP_NUM_LINEAR linear layers on the eager
// intermediate path (CUDA), with an untimed "WarmUp" run. MLP_* constants
// are presumably defined in benchmark_utils.h.
TEST(Benchmark, EagerIntermediateMLPCUDA) {
  paddle::platform::CUDAPlace place;
  egr::InitEnv(place);

  // The tracer must target the CUDA place so traced ops run on the GPU.
  auto tracer = std::make_shared<paddle::imperative::Tracer>();
  tracer->SetExpectedPlace(place);
  paddle::imperative::SetCurrentTracer(tracer);

  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
    // Input X: [MLP_M, MLP_N] filled with MLP_X_VAL; a trainable leaf tensor.
    paddle::framework::DDim ddimX =
        paddle::framework::make_ddim({MLP_M, MLP_N});
    egr::EagerTensor X = EagerUtils::CreateTensorWithValue(
        ddimX, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
        pten::DataLayout::NCHW, MLP_X_VAL, true);
    RetainGradForTensor(X);

    // One weight [MLP_N, MLP_K] and one bias [MLP_K] per linear layer.
    std::vector<EagerTensor> Ws;
    std::vector<EagerTensor> Bs;
    for (size_t i = 0; i < MLP_NUM_LINEAR; i++) {
      paddle::framework::DDim ddimW =
          paddle::framework::make_ddim({MLP_N, MLP_K});
      egr::EagerTensor W = EagerUtils::CreateTensorWithValue(
          ddimW, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
          pten::DataLayout::NCHW, MLP_W_VAL, true);
      RetainGradForTensor(W);

      paddle::framework::DDim ddimB = paddle::framework::make_ddim({MLP_K});
      egr::EagerTensor B = EagerUtils::CreateTensorWithValue(
          ddimB, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32,
          pten::DataLayout::NCHW, MLP_B_VAL, true);
      RetainGradForTensor(B);

      Ws.emplace_back(std::move(W));
      Bs.emplace_back(std::move(B));
    }

    if (mode == "Accuracy") {
      benchmark_eager_intermediate_mlp(X, Ws, Bs, true /* accuracy_check */);
    } else if (mode == "WarmUp") {
      // Untimed run to absorb one-time CUDA initialization.
      benchmark_eager_intermediate_mlp(X, Ws, Bs);
    } else if (mode == "Performance") {
      auto t_start = std::chrono::high_resolution_clock::now();
#ifdef WITH_GPERFTOOLS
      ProfilerStart("eager_intermediate_mlp_cuda.out");
#endif
      benchmark_eager_intermediate_mlp(X, Ws, Bs);
#ifdef WITH_GPERFTOOLS
      ProfilerStop();
#endif
      // NOTE(review): timer is read with no explicit device sync — confirm
      // the helper blocks on GPU completion.
      auto t_end = std::chrono::high_resolution_clock::now();
      double elapsed_time_ms =
          std::chrono::duration<double, std::milli>(t_end - t_start).count();
      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;
    } else {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }
  }
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <paddle/fluid/framework/op_registry.h>
#include <chrono>
#include <iostream>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h"
#include "paddle/fluid/eager/tests/test_utils.h"
#include "paddle/fluid/imperative/basic_engine.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/memory/memcpy.h"
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
// Disable pten path
DECLARE_bool(run_pten_kernel);
// Disable the experimental pten kernel path so every benchmark in this file
// exercises the legacy fluid kernels. Runs first as its own TEST case.
TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }
namespace paddle {
namespace imperative {
TEST(Benchmark, FluidScaleCPU) {
  // Initialize the CPU device context for the fluid (old dygraph) benchmark.
  platform::CPUPlace place;
  egr::InitEnv(place);

  for (const std::string& mode : {"Accuracy", "Performance"}) {
    // Leaf variable X holding 128 floats ([2, 4, 4, 4]), all set to 5.0.
    auto X = std::make_shared<imperative::VarBase>(true, "X");
    X->SetOverridedStopGradient(false);

    const std::vector<int64_t> dims = {2, 4, 4, 4};
    const std::vector<float> src_data(128, 5.0);
    auto* x_tensor = X->MutableVar()->GetMutable<framework::LoDTensor>();
    x_tensor->Resize(framework::make_ddim(dims));
    auto* mutable_x = x_tensor->mutable_data<float>(place);
    paddle::memory::Copy(place, mutable_x, place, src_data.data(),
                         sizeof(float) * src_data.size());

    if (mode == "Performance") {
      // Time the benchmark body; optionally wrap it in a gperftools profile.
      const auto tick = std::chrono::high_resolution_clock::now();
#ifdef WITH_GPERFTOOLS
      ProfilerStart("fluid_scale_cpu.out");
#endif
      benchmark_fluid_scale(X, platform::Place(place));
#ifdef WITH_GPERFTOOLS
      ProfilerStop();
#endif
      const auto tock = std::chrono::high_resolution_clock::now();
      const double duration_ms =
          std::chrono::duration<double, std::milli>(tock - tick).count();
      std::cout << "Duration: " << duration_ms << " ms" << std::endl;
    } else if (mode == "Accuracy") {
      // Verify numeric results rather than timing.
      benchmark_fluid_scale(X, platform::Place(place),
                            true /* accuracy_check */);
    } else {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }
  }
}
// Benchmarks fluid (old dygraph) matmul on CPU in "Accuracy" and
// "Performance" modes.
TEST(Benchmark, FluidMatmulCPU) {
  // Prepare Device Contexts
  platform::CPUPlace place;
  egr::InitEnv(place);

  for (const std::string& mode : {"Accuracy", "Performance"}) {
    // X: [2, 2] of 1.0 and Y: [2, 2] of 2.0; both trainable leaf variables.
    std::shared_ptr<imperative::VarBase> X(new imperative::VarBase(true, "X"));
    X->SetOverridedStopGradient(false);
    std::shared_ptr<imperative::VarBase> Y(new imperative::VarBase(true, "Y"));
    Y->SetOverridedStopGradient(false);

    std::vector<float> x_src_data(4, 1.0);
    std::vector<float> y_src_data(4, 2.0);
    std::vector<int64_t> dims = {2, 2};

    // Resize each variable's tensor and copy the host data into it.
    auto* x_tensor = X->MutableVar()->GetMutable<framework::LoDTensor>();
    x_tensor->Resize(framework::make_ddim(dims));
    auto* mutable_x = x_tensor->mutable_data<float>(place);
    paddle::memory::Copy(place, mutable_x, place, x_src_data.data(),
                         sizeof(float) * x_src_data.size());

    auto* y_tensor = Y->MutableVar()->GetMutable<framework::LoDTensor>();
    y_tensor->Resize(framework::make_ddim(dims));
    auto* mutable_y = y_tensor->mutable_data<float>(place);
    paddle::memory::Copy(place, mutable_y, place, y_src_data.data(),
                         sizeof(float) * y_src_data.size());

    if (mode == "Accuracy") {
      benchmark_fluid_matmul(X, Y, platform::Place(place),
                             true /* accuracy_check */);
    } else if (mode == "Performance") {
      auto t_start = std::chrono::high_resolution_clock::now();
#ifdef WITH_GPERFTOOLS
      ProfilerStart("fluid_matmul_cpu.out");
#endif
      benchmark_fluid_matmul(X, Y, platform::Place(place));
#ifdef WITH_GPERFTOOLS
      ProfilerStop();
#endif
      auto t_end = std::chrono::high_resolution_clock::now();
      double elapsed_time_ms =
          std::chrono::duration<double, std::milli>(t_end - t_start).count();
      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;
    } else {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }
  }
}
// Benchmarks a fluid-mode MLP of MLP_NUM_LINEAR linear layers on CPU.
// MLP_* constants are presumably defined in benchmark_utils.h.
TEST(Benchmark, FluidMLPCPU) {
  // Prepare Device Contexts
  platform::CPUPlace place;
  egr::InitEnv(place);

  for (const std::string& mode : {"Accuracy", "Performance"}) {
    std::vector<float> x_src_data(MLP_M * MLP_N, MLP_X_VAL);
    std::vector<float> w_src_data(MLP_N * MLP_K, MLP_W_VAL);
    std::vector<float> b_src_data(MLP_K, MLP_B_VAL);
    std::vector<int64_t> x_dims = {MLP_M, MLP_N};
    std::vector<int64_t> w_dims = {MLP_N, MLP_K};
    std::vector<int64_t> b_dims = {MLP_K};

    // Input X: [MLP_M, MLP_N] filled with MLP_X_VAL; a trainable variable.
    std::shared_ptr<imperative::VarBase> X(new imperative::VarBase(true, "X"));
    X->SetOverridedStopGradient(false);
    auto* x_tensor = X->MutableVar()->GetMutable<framework::LoDTensor>();
    x_tensor->Resize(framework::make_ddim(x_dims));
    auto* mutable_x = x_tensor->mutable_data<float>(place);
    paddle::memory::Copy(place, mutable_x, place, x_src_data.data(),
                         sizeof(float) * x_src_data.size());

    // One weight [MLP_N, MLP_K] and one bias [MLP_K] per linear layer.
    std::vector<std::shared_ptr<imperative::VarBase>> Ws;
    std::vector<std::shared_ptr<imperative::VarBase>> Bs;
    for (size_t i = 0; i < MLP_NUM_LINEAR; i++) {
      std::shared_ptr<imperative::VarBase> W(
          new imperative::VarBase(true, "W"));
      W->SetOverridedStopGradient(false);
      std::shared_ptr<imperative::VarBase> B(
          new imperative::VarBase(true, "B"));
      B->SetOverridedStopGradient(false);

      auto* w_tensor = W->MutableVar()->GetMutable<framework::LoDTensor>();
      w_tensor->Resize(framework::make_ddim(w_dims));
      auto* mutable_w = w_tensor->mutable_data<float>(place);
      paddle::memory::Copy(place, mutable_w, place, w_src_data.data(),
                           sizeof(float) * w_src_data.size());

      auto* b_tensor = B->MutableVar()->GetMutable<framework::LoDTensor>();
      b_tensor->Resize(framework::make_ddim(b_dims));
      auto* mutable_b = b_tensor->mutable_data<float>(place);
      paddle::memory::Copy(place, mutable_b, place, b_src_data.data(),
                           sizeof(float) * b_src_data.size());

      Ws.emplace_back(std::move(W));
      Bs.emplace_back(std::move(B));
    }

    if (mode == "Accuracy") {
      benchmark_fluid_mlp(X, Ws, Bs, platform::Place(place),
                          true /* accuracy_check */);
    } else if (mode == "Performance") {
      auto t_start = std::chrono::high_resolution_clock::now();
#ifdef WITH_GPERFTOOLS
      ProfilerStart("fluid_mlp_cpu.out");
#endif
      benchmark_fluid_mlp(X, Ws, Bs, platform::Place(place));
#ifdef WITH_GPERFTOOLS
      ProfilerStop();
#endif
      auto t_end = std::chrono::high_resolution_clock::now();
      double elapsed_time_ms =
          std::chrono::duration<double, std::milli>(t_end - t_start).count();
      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;
    } else {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }
  }
}
} // namespace imperative
} // namespace paddle
USE_OP(scale);
USE_OP(matmul_v2);
USE_OP(reduce_sum);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <paddle/fluid/framework/op_registry.h>
#include <chrono>
#include <iostream>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h"
#include "paddle/fluid/eager/tests/test_utils.h"
#include "paddle/fluid/imperative/basic_engine.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/memory/memcpy.h"
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
// Disable pten path
DECLARE_bool(run_pten_kernel);
// Disable the experimental pten kernel path so every benchmark in this file
// exercises the legacy fluid kernels. Runs first as its own TEST case.
TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; }
namespace paddle {
namespace imperative {
// Benchmarks fluid-mode scale on CUDA, with an untimed "WarmUp" run to
// absorb one-time CUDA initialization before the timed "Performance" run.
TEST(Benchmark, FluidScaleCUDA) {
  // Prepare Device Contexts
  platform::CUDAPlace place;
  egr::InitEnv(place);

  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
    // Leaf variable X holding 128 floats ([2, 4, 4, 4]), all set to 5.0.
    std::shared_ptr<imperative::VarBase> X(new imperative::VarBase(true, "X"));
    X->SetOverridedStopGradient(false);
    std::vector<float> src_data(128, 5.0);
    std::vector<int64_t> dims = {2, 4, 4, 4};
    auto* x_tensor = X->MutableVar()->GetMutable<framework::LoDTensor>();
    x_tensor->Resize(framework::make_ddim(dims));
    auto* mutable_x = x_tensor->mutable_data<float>(place);

    // Host-to-device copy on the device context's stream. NOTE(review): the
    // copy is issued asynchronously on this stream with no explicit wait
    // before the benchmark runs — presumably the benchmark work is queued on
    // the same stream so ordering holds; confirm.
    paddle::platform::DeviceContextPool& pool =
        paddle::platform::DeviceContextPool::Instance();
    auto* dev_ctx =
        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
    auto stream = dev_ctx->stream();
    paddle::memory::Copy(place, mutable_x, platform::CPUPlace(),
                         src_data.data(), sizeof(float) * src_data.size(),
                         stream);

    if (mode == "Accuracy") {
      benchmark_fluid_scale(X, platform::Place(place),
                            true /* accuracy_check */);
    } else if (mode == "WarmUp") {
      // Untimed run to absorb one-time CUDA initialization.
      benchmark_fluid_scale(X, platform::Place(place));
    } else if (mode == "Performance") {
      auto t_start = std::chrono::high_resolution_clock::now();
#ifdef WITH_GPERFTOOLS
      ProfilerStart("fluid_scale_cuda.out");
#endif
      benchmark_fluid_scale(X, platform::Place(place));
#ifdef WITH_GPERFTOOLS
      ProfilerStop();
#endif
      // NOTE(review): timer is read with no explicit device sync — confirm
      // the helper blocks on GPU completion.
      auto t_end = std::chrono::high_resolution_clock::now();
      double elapsed_time_ms =
          std::chrono::duration<double, std::milli>(t_end - t_start).count();
      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;
    } else {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }
  }
}
// Benchmarks fluid-mode matmul on CUDA with an untimed "WarmUp" run.
TEST(Benchmark, FluidMatmulCUDA) {
  // Prepare Device Contexts
  platform::CUDAPlace place;
  egr::InitEnv(place);

  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
    // X: [2, 2] of 1.0 and Y: [2, 2] of 2.0; both trainable leaf variables.
    std::shared_ptr<imperative::VarBase> X(new imperative::VarBase(true, "X"));
    X->SetOverridedStopGradient(false);
    std::shared_ptr<imperative::VarBase> Y(new imperative::VarBase(true, "Y"));
    Y->SetOverridedStopGradient(false);
    std::vector<float> x_src_data(4, 1.0);
    std::vector<float> y_src_data(4, 2.0);
    std::vector<int64_t> dims = {2, 2};

    // Host-to-device copies are issued on the device context's stream.
    paddle::platform::DeviceContextPool& pool =
        paddle::platform::DeviceContextPool::Instance();
    auto* dev_ctx =
        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
    auto stream = dev_ctx->stream();

    auto* x_tensor = X->MutableVar()->GetMutable<framework::LoDTensor>();
    x_tensor->Resize(framework::make_ddim(dims));
    auto* mutable_x = x_tensor->mutable_data<float>(place);
    paddle::memory::Copy(place, mutable_x, platform::CPUPlace(),
                         x_src_data.data(), sizeof(float) * x_src_data.size(),
                         stream);

    auto* y_tensor = Y->MutableVar()->GetMutable<framework::LoDTensor>();
    y_tensor->Resize(framework::make_ddim(dims));
    auto* mutable_y = y_tensor->mutable_data<float>(place);
    paddle::memory::Copy(place, mutable_y, platform::CPUPlace(),
                         y_src_data.data(), sizeof(float) * y_src_data.size(),
                         stream);

    if (mode == "Accuracy") {
      benchmark_fluid_matmul(X, Y, platform::Place(place),
                             true /* accuracy_check */);
    } else if (mode == "WarmUp") {
      // Untimed run to absorb one-time CUDA initialization.
      benchmark_fluid_matmul(X, Y, platform::Place(place));
    } else if (mode == "Performance") {
      auto t_start = std::chrono::high_resolution_clock::now();
#ifdef WITH_GPERFTOOLS
      ProfilerStart("fluid_matmul_cuda.out");
#endif
      benchmark_fluid_matmul(X, Y, platform::Place(place));
#ifdef WITH_GPERFTOOLS
      ProfilerStop();
#endif
      // NOTE(review): timer is read with no explicit device sync — confirm
      // the helper blocks on GPU completion.
      auto t_end = std::chrono::high_resolution_clock::now();
      double elapsed_time_ms =
          std::chrono::duration<double, std::milli>(t_end - t_start).count();
      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;
    } else {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }
  }
}
// Benchmarks a fluid-mode MLP of MLP_NUM_LINEAR linear layers on CUDA, with
// an untimed "WarmUp" run. MLP_* constants are presumably defined in
// benchmark_utils.h.
TEST(Benchmark, FluidMLPCUDA) {
  // Prepare Device Contexts
  platform::CUDAPlace place;
  egr::InitEnv(place);

  for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
    // All host-to-device copies below are issued on this stream.
    paddle::platform::DeviceContextPool& pool =
        paddle::platform::DeviceContextPool::Instance();
    auto* dev_ctx =
        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
    auto stream = dev_ctx->stream();

    std::vector<float> x_src_data(MLP_M * MLP_N, MLP_X_VAL);
    std::vector<float> w_src_data(MLP_N * MLP_K, MLP_W_VAL);
    std::vector<float> b_src_data(MLP_K, MLP_B_VAL);
    std::vector<int64_t> x_dims = {MLP_M, MLP_N};
    std::vector<int64_t> w_dims = {MLP_N, MLP_K};
    std::vector<int64_t> b_dims = {MLP_K};

    // Input X: [MLP_M, MLP_N] filled with MLP_X_VAL; a trainable variable.
    std::shared_ptr<imperative::VarBase> X(new imperative::VarBase(true, "X"));
    X->SetOverridedStopGradient(false);
    auto* x_tensor = X->MutableVar()->GetMutable<framework::LoDTensor>();
    x_tensor->Resize(framework::make_ddim(x_dims));
    auto* mutable_x = x_tensor->mutable_data<float>(place);
    paddle::memory::Copy(place, mutable_x, platform::CPUPlace(),
                         x_src_data.data(), sizeof(float) * x_src_data.size(),
                         stream);

    // One weight [MLP_N, MLP_K] and one bias [MLP_K] per linear layer.
    std::vector<std::shared_ptr<imperative::VarBase>> Ws;
    std::vector<std::shared_ptr<imperative::VarBase>> Bs;
    for (size_t i = 0; i < MLP_NUM_LINEAR; i++) {
      std::shared_ptr<imperative::VarBase> W(
          new imperative::VarBase(true, "W"));
      W->SetOverridedStopGradient(false);
      std::shared_ptr<imperative::VarBase> B(
          new imperative::VarBase(true, "B"));
      B->SetOverridedStopGradient(false);

      auto* w_tensor = W->MutableVar()->GetMutable<framework::LoDTensor>();
      w_tensor->Resize(framework::make_ddim(w_dims));
      auto* mutable_w = w_tensor->mutable_data<float>(place);
      paddle::memory::Copy(place, mutable_w, platform::CPUPlace(),
                           w_src_data.data(), sizeof(float) * w_src_data.size(),
                           stream);

      auto* b_tensor = B->MutableVar()->GetMutable<framework::LoDTensor>();
      b_tensor->Resize(framework::make_ddim(b_dims));
      auto* mutable_b = b_tensor->mutable_data<float>(place);
      paddle::memory::Copy(place, mutable_b, platform::CPUPlace(),
                           b_src_data.data(), sizeof(float) * b_src_data.size(),
                           stream);

      Ws.emplace_back(std::move(W));
      Bs.emplace_back(std::move(B));
    }

    if (mode == "Accuracy") {
      benchmark_fluid_mlp(X, Ws, Bs, platform::Place(place),
                          true /* accuracy_check */);
    } else if (mode == "WarmUp") {
      // Untimed run to absorb one-time CUDA initialization.
      benchmark_fluid_mlp(X, Ws, Bs, platform::Place(place));
    } else if (mode == "Performance") {
      auto t_start = std::chrono::high_resolution_clock::now();
#ifdef WITH_GPERFTOOLS
      ProfilerStart("fluid_mlp_cuda.out");
#endif
      benchmark_fluid_mlp(X, Ws, Bs, platform::Place(place));
#ifdef WITH_GPERFTOOLS
      ProfilerStop();
#endif
      // NOTE(review): timer is read with no explicit device sync — confirm
      // the helper blocks on GPU completion.
      auto t_end = std::chrono::high_resolution_clock::now();
      double elapsed_time_ms =
          std::chrono::duration<double, std::milli>(t_end - t_start).count();
      std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl;
    } else {
      PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode"));
    }
  }
}
} // namespace imperative
} // namespace paddle
USE_OP(scale);
USE_OP(matmul_v2);
USE_OP(reduce_sum);
USE_OP(reduce_sum_grad);
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册