Unverified commit f9591bb1, authored by Zeng Jinle, committed by GitHub

Support CUDA Graph on ParallelExecutor (#36250)

* support CUDA Graph on PE

* add ut, fix CI compile

* reduce memory consumption

* fix CUDA 10 CI

* improve coverage

* improve python coverage
Parent: ca16e8fd
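For orientation, the sketch below condenses how the feature added in this commit is meant to be used from Python. It is distilled from the new unit test at the end of the diff; main_program, startup_program, and loss are placeholders for the user's own static-graph model and are not defined here.

import paddle
from paddle.device.cuda.graphs import CUDAGraph

paddle.enable_static()
place = paddle.CUDAPlace(0)

# allow_cuda_graph_capture is the new BuildStrategy flag added in this commit.
build_strategy = paddle.static.BuildStrategy()
build_strategy.allow_cuda_graph_capture = True
build_strategy.fix_op_run_order = True
build_strategy.fuse_all_optimizer_ops = True

compiled_program = paddle.static.CompiledProgram(main_program).with_data_parallel(
    loss_name=loss.name, build_strategy=build_strategy, places=place)

exe = paddle.static.Executor(place)
exe.run(startup_program)
exe.run(compiled_program)  # warm-up batch before capturing

cuda_graph = CUDAGraph(place, mode="global")
cuda_graph.capture_begin()
exe.run(compiled_program)  # feeding and fetching are not allowed while capturing
cuda_graph.capture_end()

for _ in range(10):
    cuda_graph.replay()  # inputs are updated in place through persistable tensors
cuda_graph.reset()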
......@@ -143,6 +143,8 @@ struct BuildStrategy {
// Turn off inplace addto by default.
bool enable_addto_{false};
bool allow_cuda_graph_capture_{false};
// FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
// num_trainers is 1, so the current fields of build_strategy doesn't tell if
// it's distributed model.
......
......@@ -86,19 +86,28 @@ struct ScaleLossGradFunctor {
}
};
std::string ScaleLossGradOpHandle::LossGradName() const {
return static_cast<VarHandle *>(this->outputs_[0])->name();
}
void ScaleLossGradOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
// Doesn't wait any event
-  std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name();
-  auto *tensor =
-      local_exec_scopes_[0]->FindVar(var_name)->GetMutable<LoDTensor>();
+  RunOnVar(local_exec_scopes_[0]->FindVar(LossGradName()), true);
+}
void ScaleLossGradOpHandle::RunOnVar(Variable *var, bool record_event) {
auto *tensor = var->GetMutable<LoDTensor>();
tensor->Resize(make_ddim({1}));
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_,
this->dev_ctxes_.at(place_));
-  this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
+  if (record_event) {
+    this->RunAndRecordEvent(
+        [&] { framework::VisitDataType(out_dtype_, func); });
+  } else {
+    framework::VisitDataType(out_dtype_, func);
+  }
#else
ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_, nullptr);
framework::VisitDataType(out_dtype_, func);
......
......@@ -46,6 +46,12 @@ struct ScaleLossGradOpHandle : public OpHandleBase {
std::string Name() const override;
platform::Place GetPlace() const { return place_; }
void RunOnVar(Variable *var, bool record_event = false);
std::string LossGradName() const;
protected:
void RunImpl() override;
......
......@@ -22,7 +22,9 @@
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
namespace details {
......@@ -49,8 +51,29 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
PrepareLocalExeScopes();
}
static void RunProgramDescs(const ProgramDescs &programs,
const std::vector<Scope *> &local_exec_scopes,
const std::vector<platform::Place> &places) {
for (auto &program : programs) {
for (auto &op_desc : program.Block(0).AllOps()) {
for (size_t i = 0; i < local_exec_scopes.size(); ++i) {
auto op = OpRegistry::CreateOp(*op_desc);
op->Run(*local_exec_scopes[i], places[i]);
}
}
}
}
FetchResultType ScopeBufferedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors, bool return_merged) {
#ifdef PADDLE_WITH_CUDA
if (platform::IsCUDAGraphCapturing()) {
strategy_.num_iteration_per_drop_scope_ =
std::numeric_limits<size_t>::max();
DropLocalExeScopes(/*need_wait=*/false);
}
#endif
if (drop_scope_counter_ == 0) {
platform::RecordEvent e("InitLocalVars");
InitVariables();
......@@ -84,7 +107,7 @@ FetchResultType ScopeBufferedSSAGraphExecutor::Run(
++drop_scope_counter_;
if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_ ||
DropScopeOrNot()) {
-    DropLocalExeScopes();
+    DropLocalExeScopes(!platform::IsCUDAGraphCapturing());
}
if (VLOG_IS_ON(5)) {
......@@ -128,15 +151,7 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() {
if (graph.Has(details::kStartupProgramDescs)) {
auto &program_descs =
graph.Get<details::ProgramDescs>(details::kStartupProgramDescs);
-    for (auto &program_desc : program_descs) {
-      for (auto &op_desc : program_desc.Block(0).AllOps()) {
-        for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
-          auto op = OpRegistry::CreateOp(*op_desc);
-          op->Run(*local_exec_scopes_[i], places_[i]);
-        }
-      }
-    }
+    RunProgramDescs(program_descs, local_exec_scopes_, places_);
}
is_initialized_ = true;
}
......@@ -144,23 +159,17 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() {
if (graph.Has(details::kProgramDescs)) {
auto &program_descs =
graph.Get<details::ProgramDescs>(details::kProgramDescs);
-    for (auto &program_desc : program_descs) {
-      for (auto &op_desc : program_desc.Block(0).AllOps()) {
-        for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
-          auto op = OpRegistry::CreateOp(*op_desc);
-          op->Run(*local_exec_scopes_[i], places_[i]);
-        }
-      }
-    }
+    RunProgramDescs(program_descs, local_exec_scopes_, places_);
}
}
-void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes() {
+void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes(bool need_wait) {
platform::RecordEvent drop_scope_event("DropLocalExeScopes");
drop_scope_counter_ = 0;
-  for (auto &p : places_) {
-    platform::DeviceContextPool::Instance().Get(p)->Wait();
-  }
+  if (need_wait) {
+    for (auto &p : places_) {
+      platform::DeviceContextPool::Instance().Get(p)->Wait();
+    }
+  }
scope_monitor_.ClearHistoryLocalExecScopes();
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
......
......@@ -53,7 +53,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
FetchResultType Run(const std::vector<std::string>& fetch_tensors,
bool return_merged) override;
-  void DropLocalExeScopes();
+  void DropLocalExeScopes(bool need_wait = true);
bool NeedCreateLocalExeScope();
......
......@@ -115,6 +115,7 @@ message BuildStrategy {
optional bool enable_auto_fusion = 11 [ default = false ];
optional bool enable_addto = 12 [ default = false ];
optional bool fix_op_run_order = 13 [ default = false ];
optional bool allow_cuda_graph_capture = 14 [ default = false ];
}
message ExecutionStrategy {
......
-cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
+cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle op_graph_view multi_devices_helper)
cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h"
......@@ -21,14 +22,23 @@ namespace paddle {
namespace framework {
namespace ir {
template <typename T>
static bool IsMatchedPlaceSingleDeviceOp(details::OpHandleBase *op_base,
const platform::Place &place) {
auto *op = dynamic_cast<T *>(op_base);
return op && op->GetPlace() == place;
}
static bool IsLockAndRecordEventFreeComputationOpHandle(
details::ComputationOpHandle *op, const OpGraphView &graph_view) {
if (!platform::is_gpu_place(op->GetPlace()) &&
!platform::is_xpu_place(op->GetPlace()))
return false;
for (auto &pending_op : graph_view.PendingOps(op)) {
-    auto *tmp = dynamic_cast<details::ComputationOpHandle *>(pending_op);
-    if (tmp == nullptr || !(tmp->GetPlace() == op->GetPlace())) {
+    if (!IsMatchedPlaceSingleDeviceOp<details::ComputationOpHandle>(
+            pending_op, op->GetPlace()) &&
+        !IsMatchedPlaceSingleDeviceOp<details::ScaleLossGradOpHandle>(
+            pending_op, op->GetPlace())) {
return false;
}
}
......
......@@ -27,6 +27,7 @@ limitations under the License. */
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
......@@ -34,6 +35,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -43,6 +45,10 @@ limitations under the License. */
DECLARE_double(eager_delete_tensor_gb);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_bool(sync_nccl_allreduce);
#endif
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
......@@ -669,6 +675,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
// ncclOp
std::vector<ir::Graph *> async_graphs =
CompileGraphWithBuildStrategy(graph, &graphs, loss_var_name);
PrepareForCUDAGraphCapture(graph);
graph = member_->ApplyMemoryOptimizePass(graph);
async_graphs[0] = graph;
......@@ -882,6 +889,23 @@ void ParallelExecutor::BCastParamsToDevices(
FetchResultType ParallelExecutor::Run(
const std::vector<std::string> &fetch_tensors, bool return_merged) {
VLOG(3) << "enter ParallelExecutor Run";
#ifdef PADDLE_WITH_CUDA
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(fetch_tensors.empty(), true,
platform::errors::InvalidArgument(
"Cannot fetch data when using CUDA Graph."));
PADDLE_ENFORCE_EQ(
member_->build_strategy_.allow_cuda_graph_capture_, true,
platform::errors::InvalidArgument(
"You must turn on build_strategy.allow_cuda_graph_capture = True "
"to enable CUDA Graph capturing."));
PADDLE_ENFORCE_EQ(
member_->places_[0], platform::CUDAGraphCapturingPlace(),
platform::errors::InvalidArgument("The place to capture CUDAGraph is "
"not the same as the place to run."));
}
#endif
#ifdef WITH_GPERFTOOLS
if (gProfileStarted) {
ProfilerFlush();
......@@ -932,6 +956,16 @@ void ParallelExecutor::SkipMemoryReuse(
void ParallelExecutor::FeedTensorsIntoLocalScopes(
const std::vector<std::unordered_map<std::string, LoDTensor>> &tensors) {
if (platform::IsCUDAGraphCapturing()) {
for (auto &tensor : tensors) {
PADDLE_ENFORCE_EQ(
tensor.empty(), true,
platform::errors::PermissionDenied(
"Feeding data is not permitted when capturing CUDA Graph."));
}
return;
}
if (!member_->AllowPartialFeed()) {
PADDLE_ENFORCE_EQ(tensors.size(), member_->local_scopes_.size(),
platform::errors::Unimplemented(
......@@ -987,6 +1021,14 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes(
void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
const std::unordered_map<std::string, LoDTensor> &tensors) {
if (platform::IsCUDAGraphCapturing()) {
PADDLE_ENFORCE_EQ(
tensors.empty(), true,
platform::errors::PermissionDenied(
"Feeding data is not permitted when capturing CUDA Graph."));
return;
}
size_t num_places = member_->places_.size();
bool allow_partial_feed = member_->AllowPartialFeed();
......@@ -1568,6 +1610,107 @@ const ir::Graph &ParallelExecutor::Graph() const {
return member_->executor_->Graph();
}
void ParallelExecutor::PrepareForCUDAGraphCapture(ir::Graph *graph) {
const auto &build_strategy = member_->build_strategy_;
if (!build_strategy.allow_cuda_graph_capture_) return;
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_EQ(
build_strategy.async_mode_, false,
platform::errors::InvalidArgument(
"Async Executor does not support CUDA Graph capturing."));
PADDLE_ENFORCE_EQ(
platform::IsCUDAGraphCapturing(), false,
platform::errors::PermissionDenied("CUDA Graph is not allowed to capture "
"when running the first batch."));
PADDLE_ENFORCE_EQ(
member_->places_.size(), 1,
platform::errors::InvalidArgument(
"CUDA Graph is only supported when one GPU device is running."));
PADDLE_ENFORCE_EQ(platform::is_gpu_place(member_->places_[0]), true,
platform::errors::InvalidArgument(
"CUDA Graph is only supported on NVIDIA GPU device."));
PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce, false,
platform::errors::InvalidArgument(
"FLAGS_sync_nccl_allreduce must be False to support "
"CUDA Graph capturing."));
std::unordered_map<std::string, std::vector<VarDesc *>> all_vars;
for (auto &node : graph->Nodes()) {
if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
auto *var_desc = node->Var();
all_vars[var_desc->Name()].emplace_back(var_desc);
}
}
auto mark_var_as_persistable = [&all_vars](const std::string &name) {
auto iter = all_vars.find(name);
if (iter != all_vars.end()) {
for (auto *var_desc : iter->second) {
var_desc->SetPersistable(true);
}
}
};
// Step 1: All fused vars must be persistable.
if (graph->Has(details::kFusedVars)) {
auto &fused_vars = graph->Get<details::FusedVars>(details::kFusedVars);
for (auto &fused_var : fused_vars) {
fused_var.second.persistable_ = true;
mark_var_as_persistable(fused_var.first);
}
}
// Step 2: All pinned vars must be persistable.
if (graph->Has(details::kPinnedVars)) {
auto &pinned_vars = graph->Get<details::PinnedVars>(details::kPinnedVars);
for (auto &pinned_var : pinned_vars) {
mark_var_as_persistable(pinned_var);
}
}
// Step 3: Move all main programs to startup programs to make sure that
// the main programs would only be run once.
if (graph->Has(details::kProgramDescs)) {
auto &startup_programs =
graph->GetOrInit<details::ProgramDescs>(details::kStartupProgramDescs);
auto &main_programs =
graph->Get<details::ProgramDescs>(details::kProgramDescs);
for (auto &main_program : main_programs) {
startup_programs.emplace_back(main_program);
}
graph->Erase(details::kProgramDescs);
}
// Step 4: Mark all vars in startup programs to be persistable.
if (graph->Has(details::kStartupProgramDescs)) {
auto &startup_programs =
graph->GetOrInit<details::ProgramDescs>(details::kStartupProgramDescs);
for (auto &startup_program : startup_programs) {
for (auto &op_desc : startup_program.Block(0).AllOps()) {
for (auto &output : op_desc->OutputArgumentNames()) {
mark_var_as_persistable(output);
}
}
}
}
// Step 5: ScaleLossGrad must be run beforehand to avoid H2D copy.
auto ops = ir::FilterByNodeWrapper<details::OpHandleBase>(*graph);
auto *scope = member_->local_scopes_[0];
for (auto *op : ops) {
auto *loss_grad_op = dynamic_cast<details::ScaleLossGradOpHandle *>(op);
if (loss_grad_op == nullptr) continue;
auto loss_grad_name = loss_grad_op->LossGradName();
mark_var_as_persistable(loss_grad_name);
loss_grad_op->RunOnVar(scope->Var(loss_grad_name));
loss_grad_op->SetSkipRunning(true);
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDA Graph is only supported on NVIDIA GPU device."));
#endif
}
} // namespace framework
} // namespace paddle
......
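The checks in PrepareForCUDAGraphCapture and Run above translate into a few user-visible preconditions on the Python side. A minimal sketch of the corresponding setup (the flag and strategy names come from this diff; the rest is illustrative):

import paddle
import paddle.fluid as fluid

# FLAGS_sync_nccl_allreduce must be False before capturing.
fluid.set_flags({'FLAGS_sync_nccl_allreduce': False})

# Capture must be enabled explicitly through the build strategy.
build_strategy = paddle.static.BuildStrategy()
build_strategy.allow_cuda_graph_capture = True

places = [paddle.CUDAPlace(0)]  # exactly one CUDA place is supported
fetch_list = []                 # fetch_tensors must be empty while capturing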
......@@ -144,6 +144,8 @@ class ParallelExecutor {
void SetReaderOpDeviceInfoOfGraphs(
const std::vector<ir::Graph *> &final_graphs);
void PrepareForCUDAGraphCapture(ir::Graph *graph);
ParallelExecutorPrivate *member_;
std::vector<std::unique_ptr<ir::Graph>> async_graphs_;
std::vector<VariableInfo> var_infos_;
......
......@@ -24,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/cudnn_desc.h"
namespace paddle {
namespace operators {
......@@ -480,6 +481,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
static algo_t Find(const ConvArgs& args, bool exhaustive_search,
bool deterministic,
const framework::ExecutionContext& ctx) {
platform::CUDAGraphCaptureModeGuard guard;
auto dtype = platform::CudnnDataType<T>::type;
size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
size_t workspace_size = 0;
......@@ -601,6 +603,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
}
static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
platform::CUDAGraphCaptureModeGuard guard;
size_t workspace_size = 0;
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
......
......@@ -70,6 +70,9 @@ void CUDAGraph::BeginCapture(platform::CUDAPlace place, cudaStream_t stream,
cudaStreamCaptureStatus status;
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamGetCaptureInfo(
capturing_graph_->stream_, &status, &(capturing_graph_->id_)));
PADDLE_ENFORCE_EQ(IsValidCapturing(), true,
platform::errors::PermissionDenied(
"CUDA Graph should not be invalidated."));
VLOG(10) << "Begin to capture CUDA Graph with ID " << capturing_graph_->id_;
}
......@@ -88,5 +91,14 @@ std::unique_ptr<CUDAGraph> CUDAGraph::EndCapture() {
#endif
}
bool CUDAGraph::IsValidCapturing() {
if (!IsCapturing()) return false;
cudaStreamCaptureStatus status;
CUDAGraphID id;
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamGetCaptureInfo(capturing_graph_->stream_, &status, &id));
return status == cudaStreamCaptureStatusActive;
}
} // namespace platform
} // namespace paddle
......@@ -84,6 +84,10 @@ class CUDAGraph {
return capturing_graph_->place_;
}
// This API can be used to debug which GPU operation is not
// supported during capturing CUDA Graph.
static bool IsValidCapturing();
private:
#if CUDA_VERSION >= 10010
cudaGraph_t graph_{nullptr};
......@@ -104,7 +108,8 @@ class CUDAGraphCaptureModeGuard {
DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard);
public:
-  explicit CUDAGraphCaptureModeGuard(cudaStreamCaptureMode mode) {
+  explicit CUDAGraphCaptureModeGuard(
+      cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {
if (UNLIKELY(CUDAGraph::IsCapturing())) {
PADDLE_ENFORCE_CUDA_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode));
// After cudaThreadExchangeStreamCaptureMode is called,
......@@ -128,7 +133,8 @@ class CUDAGraphCaptureModeGuard {
DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard);
public:
-  explicit CUDAGraphCaptureModeGuard(cudaStreamCaptureMode) {}
+  explicit CUDAGraphCaptureModeGuard(
+      cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {}
};
#endif
......
......@@ -22,8 +22,10 @@ namespace platform {
#ifdef PADDLE_WITH_CUDA
void BeginCUDAGraphCapture(platform::CUDAPlace place,
cudaStreamCaptureMode mode) {
-  auto stream =
-      platform::DeviceContextPool::Instance().GetByPlace(place)->stream();
+  auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
+  dev_ctx->cudnn_workspace_handle().ResetWorkspace();
+  auto stream = dev_ctx->stream();
CUDAGraph::BeginCapture(place, stream, mode);
auto id = CUDAGraph::CapturingID();
memory::allocation::AllocatorFacade::Instance().PrepareMemoryPoolForCUDAGraph(
......@@ -35,6 +37,9 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place,
}
std::unique_ptr<CUDAGraph> EndCUDAGraphCapture() {
auto place = CUDAGraph::CapturingPlace();
auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
dev_ctx->cudnn_workspace_handle().ResetWorkspace();
return CUDAGraph::EndCapture();
}
#endif
......
......@@ -558,7 +558,7 @@ class RecordedCudaMallocHelper {
#ifdef PADDLE_WITH_HIP
auto result = hipMalloc(ptr, size);
#else
-  CUDAGraphCaptureModeGuard capture_mode_guard{cudaStreamCaptureModeRelaxed};
+  CUDAGraphCaptureModeGuard capture_mode_guard;
auto result = cudaMalloc(ptr, size);
#endif
if (result == gpuSuccess) {
......
......@@ -736,6 +736,17 @@ PYBIND11_MODULE(core_noavx, m) {
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
})
.def("_copy_from",
[](framework::Tensor &self, const framework::Tensor &other,
const platform::Place &place, int64_t batch_size) {
if (batch_size < 0) {
framework::TensorCopy(other, place, &self);
} else {
auto sliced = other.Slice(0, batch_size);
framework::TensorCopy(sliced, place, &self);
}
},
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
.def("set", SetTensorFromPyArray<paddle::platform::CPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::XPUPlace>,
......@@ -2299,7 +2310,14 @@ All parameter, weight, gradient are variables in Paddle.
m.def("op_support_gpu", OpSupportGPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
m.def("cuda_empty_cache", platform::EmptyCache);
m.def("cuda_empty_cache", [] {
for (int dev_id : platform::GetSelectedDevices()) {
auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(
platform::CUDAPlace(dev_id));
dev_ctx->cudnn_workspace_handle().ResetWorkspace();
}
platform::EmptyCache();
});
m.def("get_device_properties",
[](int id) -> const gpuDeviceProp & {
return platform::GetDeviceProperties(id);
......@@ -3211,6 +3229,13 @@ All parameter, weight, gradient are variables in Paddle.
[](BuildStrategy &self, bool fix_op_run_order) {
self.fix_op_run_order_ = fix_op_run_order;
})
.def_property("allow_cuda_graph_capture",
[](const BuildStrategy &self) {
return self.allow_cuda_graph_capture_;
},
[](BuildStrategy &self, bool allow_cuda_graph_capture) {
self.allow_cuda_graph_capture_ = allow_cuda_graph_capture;
})
.def("_copy",
[](const BuildStrategy &self) {
auto new_bs = self;
......
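The pybind changes above surface two small hooks on the Python side: Tensor._copy_from and the extended cuda_empty_cache. A hedged sketch of how they could be exercised; the use of core.LoDTensor here is illustrative and not part of this diff:

import numpy as np
from paddle.fluid import core

place = core.CPUPlace()

src = core.LoDTensor()
src.set(np.random.rand(8, 4).astype('float32'), place)

dst = core.LoDTensor()
dst._copy_from(src, place)     # copies the whole tensor (batch_size defaults to -1)
dst._copy_from(src, place, 2)  # copies only the first 2 rows via Slice(0, 2)

if core.is_compiled_with_cuda():
    core.cuda_empty_cache()  # now also resets the cuDNN workspace of every selected device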
......@@ -1044,9 +1044,15 @@ class Executor(object):
lr_value = lr_sheduler()
lr_var = program._program.global_block().vars[lr_sheduler._var_name]
lr_tensor = _as_lodtensor(lr_value, core.CPUPlace(), lr_var.dtype)
-                exe.feed_and_split_tensor_into_local_scopes({
-                    lr_sheduler._var_name: lr_tensor
-                })
+                if core.is_cuda_graph_capturing():
+                    warnings.warn(
+                        "Caution!!! When capturing CUDA Graph, the learning rate scheduler would not "
+                        "take any effect! Please set the learning rate manually before each batch!"
+                    )
+                else:
+                    exe.feed_and_split_tensor_into_local_scopes({
+                        lr_sheduler._var_name: lr_tensor
+                    })
fetch_var_names = list(map(_to_name_str, fetch_list))
tensors = exe.run(fetch_var_names, return_merged)._move_to_list()
......
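Because of the warning above, a captured graph reads the learning rate from a persistable tensor that has to be updated by hand before every replay. A short sketch of that pattern, mirroring the unit test below (the surrounding main_program, scope, lr scheduler, place, and cuda_graph objects follow the test and are not redefined here):

import numpy as np

lr_var = main_program.global_block().var(lr._var_name)
lr_t = scope.var(lr_var.name).get_tensor()

lr_t.set(np.array([lr()], dtype='float32'), place)  # set the current learning rate manually
cuda_graph.replay()
lr.step()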
......@@ -17,18 +17,105 @@ import paddle.fluid as fluid
from paddle.device.cuda.graphs import CUDAGraph
import unittest
import numpy as np
from paddle.fluid.dygraph.base import switch_to_static_graph
from simple_nets import simple_fc_net_with_inputs
class TestCUDAGraph(unittest.TestCase):
def setUp(self):
-        fluid.set_flags({'FLAGS_allocator_strategy': 'auto_growth'})
+        if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(
+        ):
+            fluid.set_flags({
+                'FLAGS_allocator_strategy': 'auto_growth',
+                'FLAGS_sync_nccl_allreduce': False,
+                'FLAGS_cudnn_deterministic': True
+            })
def random_tensor(self, shape):
return paddle.to_tensor(
np.random.randint(
low=0, high=10, size=shape).astype("float32"))
-    def test_cuda_graph(self):
+    @switch_to_static_graph
+    def test_cuda_graph_static_graph(self):
if not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm():
return
seed = 100
loss_cuda_graph = self.cuda_graph_static_graph_main(
seed, use_cuda_graph=True)
loss_no_cuda_graph = self.cuda_graph_static_graph_main(
seed, use_cuda_graph=False)
self.assertEqual(loss_cuda_graph, loss_no_cuda_graph)
def cuda_graph_static_graph_main(self, seed, use_cuda_graph):
batch_size = 1
class_num = 10
image_shape = [batch_size, 784]
label_shape = [batch_size, 1]
paddle.seed(seed)
np.random.seed(seed)
startup = paddle.static.Program()
main = paddle.static.Program()
with paddle.static.program_guard(main, startup):
image = paddle.static.data(
name="image", shape=image_shape, dtype='float32')
label = paddle.static.data(
name="label", shape=label_shape, dtype='int64')
image.persistable = True
label.persistable = True
loss = simple_fc_net_with_inputs(image, label, class_num)
loss.persistable = True
lr = paddle.optimizer.lr.PiecewiseDecay(
boundaries=[2, 3, 4], values=[0.01, 0.02, 0.03, 0.04])
optimizer = paddle.optimizer.SGD(learning_rate=lr)
optimizer.minimize(loss)
place = paddle.CUDAPlace(0)
exe = paddle.static.Executor(place)
scope = paddle.static.Scope()
with paddle.static.scope_guard(scope):
exe.run(startup)
build_strategy = paddle.static.BuildStrategy()
build_strategy.allow_cuda_graph_capture = True
build_strategy.fix_op_run_order = True
build_strategy.fuse_all_optimizer_ops = True
compiled_program = paddle.static.CompiledProgram(
main).with_data_parallel(
loss_name=loss.name,
build_strategy=build_strategy,
places=place)
image_t = scope.var(image.name).get_tensor()
label_t = scope.var(label.name).get_tensor()
loss_t = scope.var(loss.name).get_tensor()
lr_var = main.global_block().var(lr._var_name)
self.assertTrue(lr_var.persistable)
lr_t = scope.var(lr_var.name).get_tensor()
cuda_graph = None
for batch_id in range(20):
image_t.set(
np.random.rand(*image_shape).astype('float32'), place)
label_t.set(np.random.randint(
low=0, high=class_num, size=label_shape, dtype='int64'),
place)
if batch_id == 1 and use_cuda_graph:
cuda_graph = CUDAGraph(place, mode="global")
cuda_graph.capture_begin()
exe.run(compiled_program)
cuda_graph.capture_end()
if cuda_graph:
lr_t.set(np.array([lr()], dtype='float32'), place)
cuda_graph.replay()
else:
exe.run(compiled_program)
lr.step()
if cuda_graph:
cuda_graph.reset()
return np.array(loss_t)
def test_cuda_graph_dynamic_graph(self):
if not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm():
return
......