Unverified Commit 579784e2 authored by H huangjiyi, committed by GitHub

[PHI decouple] move dropout_impl and cuda_graph_with_memory_pool from fluid to phi (#49139)

* move dropout_impl from fluid to phi

* move cuda_graph_with_memory_pool from fluid to phi

* update namespace

* remove cuda_graph in fluid

* fix mac-build

* fix bugs

* correct CodeStyle

* fix mac-build

* fix mutable_data

* fix stl include

* fix copy param
Parent 44973c65
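For downstream callers, the substance of this change is an include-path and namespace migration. The sketch below is illustrative only and is not part of the commit; the `example` namespace and helper names are hypothetical, while the include paths and phi symbols are the ones introduced in the hunks that follow.

// Illustrative migration sketch -- not part of the diff.
#ifdef PADDLE_WITH_CUDA
#include <cstdint>

#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"     // was paddle/fluid/platform/device/gpu/cuda/cuda_graph.h
#include "paddle/phi/kernels/funcs/dropout_impl_util.h"  // was paddle/fluid/operators/dropout_impl_util.h

namespace example {  // hypothetical namespace, used only for this sketch

// Old spelling: paddle::platform::CUDAGraph::IsThisThreadCapturing()
inline bool IsCapturingNow() {
  return phi::backends::gpu::CUDAGraph::IsThisThreadCapturing();
}

// Old spelling: paddle::operators::GetSeedDataAndIncrement(...)
inline uint64_t DropoutSeed(const phi::GPUContext &dev_ctx, int offset) {
  uint64_t seed = 0;
  uint64_t increment = 0;
  phi::funcs::GetSeedDataAndIncrement(dev_ctx,
                                      /*seed=*/nullptr,
                                      /*is_fix_seed=*/false,
                                      /*seed_val=*/0,
                                      offset,
                                      &seed,
                                      &increment);
  return seed;
}

}  // namespace example
#endif  // PADDLE_WITH_CUDA

The dropout kernel drivers follow the same pattern: paddle::operators::DropoutFwGPUKernelDriver and DropoutGradGPUKernelDriver become phi::funcs::DropoutFwGPUKernelDriver and phi::funcs::DropoutGradGPUKernelDriver, as shown in the fmha_ref.h and dropout kernel hunks below.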
......@@ -39,7 +39,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
#endif
#if CUDA_VERSION >= 10020
......@@ -157,7 +157,7 @@ class CUDAGraphAllocator
static bool IsCUDAGraphCapturing() {
#ifdef PADDLE_WITH_CUDA
return UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing());
return UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing());
#else
return false;
#endif
......@@ -1007,7 +1007,7 @@ AllocatorFacade& AllocatorFacade::Instance() {
AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(IsCUDAGraphCapturing())) {
auto id = platform::CUDAGraph::CapturingPoolID();
auto id = phi::backends::gpu::CUDAGraph::CapturingPoolID();
auto iter = cuda_graph_map_.find(id);
PADDLE_ENFORCE_NE(
iter,
......
......@@ -19,7 +19,7 @@
#include "paddle/phi/backends/gpu/gpu_info.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
#endif
namespace paddle {
......@@ -49,7 +49,7 @@ void StreamSafeCUDAAllocation::RecordStream(gpuStream_t stream) {
std::lock_guard<SpinLock> lock_guard(outstanding_event_map_lock_);
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
if (UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing())) {
graph_capturing_stream_set_.insert(stream);
return;
}
......@@ -61,7 +61,7 @@ void StreamSafeCUDAAllocation::RecordStream(gpuStream_t stream) {
bool StreamSafeCUDAAllocation::CanBeFreed() {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
if (UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing())) {
return graph_capturing_stream_set_.empty() &&
outstanding_event_map_.empty();
}
......
......@@ -319,7 +319,7 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
data, result, data_num_);
RecordStream(data_allocation, other_stream);
std::unique_ptr<platform::CUDAGraph> cuda_graph =
std::unique_ptr<phi::backends::gpu::CUDAGraph> cuda_graph =
platform::EndCUDAGraphCapture();
int replay_times = 10;
......
......@@ -89,7 +89,7 @@ class CUDAGraphWithInOuts {
int64_t PoolID() const { return graph_->PoolID(); }
private:
std::unique_ptr<platform::CUDAGraph> graph_;
std::unique_ptr<phi::backends::gpu::CUDAGraph> graph_;
std::vector<phi::DenseTensor> ins_;
std::vector<phi::DenseTensor> outs_;
std::vector<int64_t> in_indices_;
......
......@@ -14,10 +14,10 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/operators/dropout_impl.cu.h"
#include "paddle/fluid/operators/fused/fused_softmax_mask.cu.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/kernels/funcs/dropout_impl.cu.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/funcs/functors.h"
......@@ -206,7 +206,7 @@ class FMHARef {
stride_b = gemm_k * gemm_n;
if (dropout_param_.dropout_prob_) {
DropoutFwGPUKernelDriver<T>(
phi::funcs::DropoutFwGPUKernelDriver<T>(
static_cast<const phi::GPUContext&>(dev_ctx_),
dropout_param_.is_test_,
dropout_param_.dropout_prob_,
......@@ -381,7 +381,7 @@ class FMHARef {
stride_b = gemm_k * gemm_n;
if (dropout_param_.dropout_prob_) {
DropoutFwGPUKernelDriver<T>(
phi::funcs::DropoutFwGPUKernelDriver<T>(
static_cast<const phi::GPUContext&>(dev_ctx_),
dropout_param_.is_test_,
dropout_param_.dropout_prob_,
......@@ -552,7 +552,7 @@ class FMHARef {
}
// dropout bw
if (dropout_param_.dropout_prob_) {
DropoutGradGPUKernelDriver<T>(
phi::funcs::DropoutGradGPUKernelDriver<T>(
static_cast<const phi::GPUContext&>(dev_ctx_),
false,
dropout_param_.dropout_prob_,
......
......@@ -15,10 +15,10 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/operators/dropout_impl_util.h"
#include "paddle/fluid/operators/fused/fused_dropout_act_bias.h"
#include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h"
#include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h"
#include "paddle/phi/kernels/funcs/dropout_impl_util.h"
#include "paddle/phi/kernels/funcs/functors.h"
#include "paddle/phi/kernels/layer_norm_kernel.h"
......@@ -106,7 +106,7 @@ struct DropoutParam {
int UpdateSeedAndIncrement(const phi::GPUContext& ctx, const int offset) {
uint64_t tmp_increment;
GetSeedDataAndIncrement(
phi::funcs::GetSeedDataAndIncrement(
ctx, tensor_seed, fix_seed, seed_val, offset, &seed, &tmp_increment);
increment = static_cast<int>(tmp_increment);
return increment;
......
......@@ -15,7 +15,7 @@
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
DECLARE_bool(use_stream_safe_cuda_allocator);
......@@ -23,10 +23,10 @@ namespace paddle {
namespace platform {
#ifdef PADDLE_WITH_CUDA
void BeginCUDAGraphCapture(platform::CUDAPlace place,
void BeginCUDAGraphCapture(phi::GPUPlace place,
cudaStreamCaptureMode mode,
int64_t pool_id) {
auto* mutable_dev_ctx = platform::DeviceContextPool::Instance().Get(place);
auto* mutable_dev_ctx = phi::DeviceContextPool::Instance().Get(place);
auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(mutable_dev_ctx);
dev_ctx->cudnn_workspace_handle().ResetWorkspace();
......@@ -64,7 +64,7 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place,
std::unique_ptr<CUDAGraph> EndCUDAGraphCapture() {
auto place = CUDAGraph::CapturingPlace();
auto* mutable_dev_ctx = platform::DeviceContextPool::Instance().Get(place);
auto* mutable_dev_ctx = phi::DeviceContextPool::Instance().Get(place);
auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(mutable_dev_ctx);
dev_ctx->cudnn_workspace_handle().ResetWorkspace();
dev_ctx->SetCUDAGraphAllocator(nullptr);
......
......@@ -14,123 +14,38 @@
#pragma once
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
#endif
#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/macros.h"
namespace paddle {
namespace platform {
#ifdef PADDLE_WITH_CUDA
#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL(__cond, \
__kernel_func, \
__grid, \
__block, \
__sm_size, \
__stream, \
__seed_inc, \
__seed_expr, \
__offset_expr, \
...) \
do { \
if (::paddle::platform::CUDAGraph::IsThisThreadCapturing() && (__cond)) { \
using __Helper = \
::paddle::platform::IsSameKernelHelper<decltype(&__kernel_func), \
&__kernel_func>; \
auto *dev_ctx = \
::paddle::platform::DeviceContextPool::Instance().GetByPlace( \
::paddle::platform::CUDAGraph::CapturingPlace()); \
auto __set_seed_func = \
[=](::paddle::platform::CUDAKernelParams *__params, \
bool __check_only) -> bool { \
if (__check_only) { \
return __params->func() == &__kernel_func && \
__Helper::Compare(*__params, __VA_ARGS__); \
} \
auto &KERNEL_PARAMS = *__params; \
uint64_t __seed, __offset; \
::paddle::operators::GetSeedDataAndIncrement( \
*dev_ctx, nullptr, false, 0, __seed_inc, &__seed, &__offset); \
__seed_expr = static_cast<decltype(__seed_expr)>(__seed); \
__offset_expr = static_cast<decltype(__offset_expr)>(__offset); \
return true; \
}; \
::paddle::platform::CUDAGraph::RecordRandomKernelInfo(__set_seed_func); \
} \
__kernel_func<<<__grid, __block, __sm_size, __stream>>>(__VA_ARGS__); \
} while (0)
#else
#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL(__cond, \
__kernel_func, \
__grid, \
__block, \
__sm_size, \
__stream, \
__seed_inc, \
__seed_expr, \
__offset_expr, \
...) \
do { \
__kernel_func<<<__grid, __block, __sm_size, __stream>>>(__VA_ARGS__); \
} while (0)
#endif
// NOTE: These APIs are not thread-safe.
#ifdef PADDLE_WITH_CUDA
void BeginCUDAGraphCapture(platform::CUDAPlace place,
using CUDAGraph = phi::backends::gpu::CUDAGraph;
void BeginCUDAGraphCapture(phi::GPUPlace place,
cudaStreamCaptureMode mode,
int64_t pool_id = CUDAGraph::kInvalidPoolID);
std::unique_ptr<CUDAGraph> EndCUDAGraphCapture();
#endif
inline bool IsCUDAGraphCapturing() {
#ifdef PADDLE_WITH_CUDA
return CUDAGraph::IsCapturing();
#else
return false;
#endif
}
inline platform::CUDAPlace CUDAGraphCapturingPlace() {
inline phi::GPUPlace CUDAGraphCapturingPlace() {
#ifdef PADDLE_WITH_CUDA
return CUDAGraph::CapturingPlace();
#else
PADDLE_THROW(platform::errors::Unimplemented(
PADDLE_THROW(phi::errors::Unimplemented(
"CUDA Graph is only supported on NVIDIA GPU device."));
#endif
}
// Add reset callback if CUDA Graph is capturing.
// Otherwise, invoke callback directly.
template <typename Callback>
inline void AddResetCallbackIfCapturingCUDAGraph(Callback &&callback) {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(IsCUDAGraphCapturing())) {
return CUDAGraph::AddResetCallbackDuringCapturing(
std::forward<Callback>(callback));
}
#endif
callback();
}
using phi::backends::gpu::IsCUDAGraphCapturing;
template <typename T>
inline T *RestoreHostMemIfCapturingCUDAGraph(T *host_mem, size_t size) {
static_assert(std::is_trivial<T>::value, "T must be trivial type");
static_assert(!std::is_same<T, void>::value, "T cannot be void");
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(IsCUDAGraphCapturing())) {
size_t nbytes = size * sizeof(T);
void *new_host_mem = new uint8_t[nbytes];
std::memcpy(new_host_mem, host_mem, nbytes);
AddResetCallbackIfCapturingCUDAGraph(
[new_host_mem] { delete[] reinterpret_cast<uint8_t *>(new_host_mem); });
return reinterpret_cast<T *>(new_host_mem);
}
#endif
return host_mem;
}
using phi::backends::gpu::AddResetCallbackIfCapturingCUDAGraph;
using phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph;
class SkipCUDAGraphCaptureGuard {
DISABLE_COPY_AND_ASSIGN(SkipCUDAGraphCaptureGuard);
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
namespace paddle {
namespace platform {
using CUDAKernelParams = phi::backends::gpu::CUDAKernelParams;
#if CUDA_VERSION < 10010
using cudaStreamCaptureMode = phi::backends::gpu::cudaStreamCaptureMode;
#endif
using CUDAGraph = phi::backends::gpu::CUDAGraph;
using CUDAGraphCaptureModeGuard = phi::backends::gpu::CUDAGraphCaptureModeGuard;
template <typename T>
static bool IsBitwiseEqual(const T &x, const T &y) {
return std::memcmp(&x, &y, sizeof(T)) == 0;
}
template <typename F, F f>
struct IsSameKernelHelper;
template <typename Return,
typename... FuncArgs,
Return (*kernel_fn)(FuncArgs...)>
struct IsSameKernelHelper<Return (*)(FuncArgs...), kernel_fn> {
private:
using FuncArgsTuple = decltype(std::make_tuple(std::declval<FuncArgs>()...));
template <typename TupleT, size_t IDX, bool IsEnd /*=false*/>
struct Impl {
static bool Compare(const CUDAKernelParams &params, const TupleT &args) {
using CompareT = typename std::tuple_element<IDX, FuncArgsTuple>::type;
if (!IsBitwiseEqual<CompareT>(params.As<CompareT>(IDX),
std::get<IDX>(args))) {
return false;
}
constexpr auto NewIsEnd = (IDX + 1 == std::tuple_size<TupleT>::value);
return Impl<TupleT, IDX + 1, NewIsEnd>::Compare(params, args);
}
};
template <typename TupleT, size_t IDX>
struct Impl<TupleT, IDX, true> {
static bool Compare(const CUDAKernelParams &params, const TupleT &args) {
return true;
}
};
public:
template <typename... Args>
static bool Compare(const CUDAKernelParams &params, Args... args) {
constexpr auto kNumArgs = sizeof...(FuncArgs);
static_assert(kNumArgs == sizeof...(Args), "Argument number not match");
auto args_tuple = std::make_tuple(args...);
using TupleT = typename std::decay<decltype(args_tuple)>::type;
return Impl<TupleT, 0, kNumArgs == 0>::Compare(params, args_tuple);
}
};
} // namespace platform
} // namespace paddle
......@@ -36,8 +36,8 @@ limitations under the License. */
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/miopen.h"
#else
#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
#endif
#ifdef PADDLE_WITH_CUDA
......@@ -230,7 +230,7 @@ class RecordedGpuMallocHelper {
result = hipMalloc(ptr, size);
}
#else
CUDAGraphCaptureModeGuard capture_mode_guard;
phi::backends::gpu::CUDAGraphCaptureModeGuard capture_mode_guard;
if (UNLIKELY(malloc_managed_memory)) {
result = cudaMallocManaged(ptr, size);
} else {
......
......@@ -673,7 +673,7 @@ PYBIND11_MODULE(libpaddle, m) {
m.def("is_cuda_graph_capturing", &platform::IsCUDAGraphCapturing);
#ifdef PADDLE_WITH_CUDA
py::class_<platform::CUDAGraph>(m, "CUDAGraph")
py::class_<phi::backends::gpu::CUDAGraph>(m, "CUDAGraph")
.def_static("begin_capture",
[](platform::CUDAPlace place, int mode) {
platform::BeginCUDAGraphCapture(
......@@ -681,10 +681,11 @@ PYBIND11_MODULE(libpaddle, m) {
})
.def_static("end_capture", &platform::EndCUDAGraphCapture)
.def_static("gen_new_memory_pool_id",
&platform::CUDAGraph::UniqueMemoryPoolID)
.def("replay", &platform::CUDAGraph::Replay)
.def("reset", &platform::CUDAGraph::Reset)
.def("print_to_dot_files", &platform::CUDAGraph::PrintToDotFiles);
&phi::backends::gpu::CUDAGraph::UniqueMemoryPoolID)
.def("replay", &phi::backends::gpu::CUDAGraph::Replay)
.def("reset", &phi::backends::gpu::CUDAGraph::Reset)
.def("print_to_dot_files",
&phi::backends::gpu::CUDAGraph::PrintToDotFiles);
#endif
m.def("wait_device", [](const platform::Place &place) {
......
......@@ -236,6 +236,54 @@ class CUDAGraphCaptureModeGuard {
};
#endif
template <typename T>
static bool IsBitwiseEqual(const T &x, const T &y) {
return std::memcmp(&x, &y, sizeof(T)) == 0;
}
template <typename F, F f>
struct IsSameKernelHelper;
template <typename Return,
typename... FuncArgs,
Return (*kernel_fn)(FuncArgs...)>
struct IsSameKernelHelper<Return (*)(FuncArgs...), kernel_fn> {
private:
using FuncArgsTuple = decltype(std::make_tuple(std::declval<FuncArgs>()...));
template <typename TupleT, size_t IDX, bool IsEnd /*=false*/>
struct Impl {
static bool Compare(const CUDAKernelParams &params, const TupleT &args) {
using CompareT = typename std::tuple_element<IDX, FuncArgsTuple>::type;
if (!IsBitwiseEqual<CompareT>(params.As<CompareT>(IDX),
std::get<IDX>(args))) {
return false;
}
constexpr auto NewIsEnd = (IDX + 1 == std::tuple_size<TupleT>::value);
return Impl<TupleT, IDX + 1, NewIsEnd>::Compare(params, args);
}
};
template <typename TupleT, size_t IDX>
struct Impl<TupleT, IDX, true> {
static bool Compare(const CUDAKernelParams &params, const TupleT &args) {
return true;
}
};
public:
template <typename... Args>
static bool Compare(const CUDAKernelParams &params, Args... args) {
constexpr auto kNumArgs = sizeof...(FuncArgs);
static_assert(kNumArgs == sizeof...(Args), "Argument number not match");
auto args_tuple = std::make_tuple(args...);
using TupleT = typename std::decay<decltype(args_tuple)>::type;
return Impl<TupleT, 0, kNumArgs == 0>::Compare(params, args_tuple);
}
};
} // namespace gpu
} // namespace backends
} // namespace phi
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstddef>
#include <utility>
#ifdef PADDLE_WITH_CUDA
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
#include "paddle/phi/kernels/funcs/dropout_impl_util.h"
#endif
namespace phi {
namespace backends {
namespace gpu {
#ifdef PADDLE_WITH_CUDA
#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL(__cond, \
__kernel_func, \
__grid, \
__block, \
__sm_size, \
__stream, \
__seed_inc, \
__seed_expr, \
__offset_expr, \
...) \
do { \
if (::phi::backends::gpu::CUDAGraph::IsThisThreadCapturing() && \
(__cond)) { \
using __Helper = \
::phi::backends::gpu::IsSameKernelHelper<decltype(&__kernel_func), \
&__kernel_func>; \
auto *dev_ctx = ::phi::DeviceContextPool::Instance().GetByPlace( \
::phi::backends::gpu::CUDAGraph::CapturingPlace()); \
auto __set_seed_func = \
[=](::phi::backends::gpu::CUDAKernelParams *__params, \
bool __check_only) -> bool { \
if (__check_only) { \
return __params->func() == &__kernel_func && \
__Helper::Compare(*__params, __VA_ARGS__); \
} \
auto &KERNEL_PARAMS = *__params; \
uint64_t __seed, __offset; \
::phi::funcs::GetSeedDataAndIncrement( \
*dev_ctx, nullptr, false, 0, __seed_inc, &__seed, &__offset); \
__seed_expr = static_cast<decltype(__seed_expr)>(__seed); \
__offset_expr = static_cast<decltype(__offset_expr)>(__offset); \
return true; \
}; \
::phi::backends::gpu::CUDAGraph::RecordRandomKernelInfo( \
__set_seed_func); \
} \
__kernel_func<<<__grid, __block, __sm_size, __stream>>>(__VA_ARGS__); \
} while (0)
#else
#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL(__cond, \
__kernel_func, \
__grid, \
__block, \
__sm_size, \
__stream, \
__seed_inc, \
__seed_expr, \
__offset_expr, \
...) \
do { \
__kernel_func<<<__grid, __block, __sm_size, __stream>>>(__VA_ARGS__); \
} while (0)
#endif
inline bool IsCUDAGraphCapturing() {
#ifdef PADDLE_WITH_CUDA
return CUDAGraph::IsCapturing();
#else
return false;
#endif
}
// Add reset callback if CUDA Graph is capturing.
// Otherwise, invoke callback directly.
template <typename Callback>
inline void AddResetCallbackIfCapturingCUDAGraph(Callback &&callback) {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(IsCUDAGraphCapturing())) {
return CUDAGraph::AddResetCallbackDuringCapturing(
std::forward<Callback>(callback));
}
#endif
callback();
}
template <typename T>
inline T *RestoreHostMemIfCapturingCUDAGraph(T *host_mem, size_t size) {
static_assert(std::is_trivial<T>::value, "T must be trivial type");
static_assert(!std::is_same<T, void>::value, "T cannot be void");
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(IsCUDAGraphCapturing())) {
size_t nbytes = size * sizeof(T);
void *new_host_mem = new uint8_t[nbytes];
std::memcpy(new_host_mem, host_mem, nbytes);
AddResetCallbackIfCapturingCUDAGraph(
[new_host_mem] { delete[] reinterpret_cast<uint8_t *>(new_host_mem); });
return reinterpret_cast<T *>(new_host_mem);
}
#endif
return host_mem;
}
} // namespace gpu
} // namespace backends
} // namespace phi
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
namespace phi {
namespace funcs {
......@@ -319,7 +319,7 @@ struct ConcatFunctor<phi::GPUContext, T> {
context.GetPlace(),
in_num * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
inputs_data, in_num);
paddle::memory::Copy(context.GetPlace(),
tmp_dev_ins_data->ptr(),
......@@ -368,7 +368,7 @@ struct ConcatFunctor<phi::GPUContext, T> {
inputs_col_num * sizeof(int64_t),
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
inputs_col, inputs_col_num);
paddle::memory::Copy(context.GetPlace(),
tmp_dev_ins_col_data->ptr(),
......@@ -484,7 +484,7 @@ class SplitFunctor<phi::GPUContext, T> {
context.GetPlace(),
o_num * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
outputs_data, o_num);
paddle::memory::Copy(context.GetPlace(),
tmp_dev_outs_data->ptr(),
......@@ -535,7 +535,7 @@ class SplitFunctor<phi::GPUContext, T> {
context.GetPlace(),
outputs_cols_num * sizeof(int64_t),
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
outputs_cols, outputs_cols_num);
paddle::memory::Copy(context.GetPlace(),
tmp_dev_ins_col_data->ptr(),
......
......@@ -19,35 +19,29 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <curand_kernel.h>
#include "paddle/fluid/platform/dynload/curand.h"
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#include <hiprand_kernel.h>
#include "paddle/fluid/platform/dynload/hiprand.h"
#endif
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/dropout_impl_util.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/phi/kernels/funcs/dropout_impl_util.h"
#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/funcs/functors.h"
#include "paddle/phi/kernels/primitive/compute_primitives.h"
namespace paddle {
namespace operators {
namespace phi {
namespace funcs {
template <typename T1, typename T2 = T1, typename OutT = T1>
struct DstMaskFunctor {
const float retain_prob_;
const bool is_upscale_in_train_;
using MT = typename details::MPTypeTrait<T1>::Type;
using MT = typename phi::kps::details::MPTypeTrait<T1>::Type;
MT factor;
HOSTDEVICE inline DstMaskFunctor(const float retain_prob,
const bool is_upscale_in_train)
......@@ -149,7 +143,7 @@ __global__ void VectorizedRandomGenerator(const size_t n,
template <typename T1, typename T2 = T1, typename OutT = T1>
struct MaskFunctor {
const float retain_prob_;
using MT = typename details::MPTypeTrait<T1>::Type;
using MT = typename phi::kps::details::MPTypeTrait<T1>::Type;
MT factor;
HOSTDEVICE inline MaskFunctor(const float retain_prob)
: retain_prob_(retain_prob) {
......@@ -173,7 +167,7 @@ struct MaskFunctor {
template <typename T, typename MaskType>
struct DstFunctor {
using MT = typename details::MPTypeTrait<T>::Type;
using MT = typename phi::kps::details::MPTypeTrait<T>::Type;
MT factor;
HOSTDEVICE inline DstFunctor(const float retain_prob,
const bool is_upscale_in_train,
......@@ -271,7 +265,7 @@ inline void CalcBroadcastedMask(const phi::GPUContext& dev_ctx,
phi::DenseTensor* broadcasted_mask) {
// The broadcast of mask can be combined to the following ElementwiseKernel
// when the BroadcastKernel supports different input types.
broadcasted_mask->mutable_data<uint8_t>(dev_ctx.GetPlace());
dev_ctx.template Alloc<uint8_t>(broadcasted_mask);
std::vector<const phi::DenseTensor*> ins = {&mask};
std::vector<phi::DenseTensor*> outs = {broadcasted_mask};
......@@ -337,7 +331,7 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx,
size_t block_size = gpu_config.GetBlockSize();
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
const auto& prop = platform::GetDeviceProperties(device_id);
const auto& prop = phi::backends::gpu::GetDeviceProperties(device_id);
size_t max_grid_size = prop.maxThreadsPerMultiProcessor *
prop.multiProcessorCount / block_size;
grid_size = std::min(grid_size, max_grid_size);
......@@ -393,9 +387,9 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx,
} else {
if (upscale_in_train) {
// y = x
framework::TensorCopy(x, dev_ctx.GetPlace(), dev_ctx, y);
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, y);
} else {
using MT = typename details::MPTypeTrait<T>::Type;
using MT = typename phi::kps::details::MPTypeTrait<T>::Type;
MT factor = static_cast<MT>(1.0f - dropout_prob);
// y = factor * x
ScaleByDropoutFactor<T, MT>(dev_ctx, x, y, factor);
......@@ -405,7 +399,7 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx,
template <typename T, typename MaskType>
struct CudaDropoutGradFunctor {
using MT = typename details::MPTypeTrait<T>::Type;
using MT = typename phi::kps::details::MPTypeTrait<T>::Type;
explicit CudaDropoutGradFunctor(const MT factor) : factor_(factor) {}
......@@ -428,7 +422,7 @@ void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx,
const phi::DenseTensor& mask,
phi::DenseTensor* grad_x,
bool is_dropout_nd = false) {
using MT = typename details::MPTypeTrait<T>::Type;
using MT = typename phi::kps::details::MPTypeTrait<T>::Type;
auto stream = dev_ctx.stream();
if (is_test) {
......@@ -465,5 +459,5 @@ void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx,
}
}
} // namespace operators
} // namespace paddle
} // namespace funcs
} // namespace phi
......@@ -14,11 +14,13 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/generator.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
namespace operators {
namespace phi {
namespace funcs {
inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx,
const phi::DenseTensor* seed,
......@@ -27,13 +29,11 @@ inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx,
const int offset,
uint64_t* seed_data,
uint64_t* increment) {
int device_id = dev_ctx.GetPlace().GetDeviceId();
auto gen_cuda = framework::DefaultCUDAGenerator(device_id);
auto gen_cuda = dev_ctx.GetGenerator();
if (seed) {
phi::DenseTensor seed_cpu_tensor;
paddle::framework::TensorCopySync(
*seed, platform::CPUPlace(), &seed_cpu_tensor);
phi::Copy(dev_ctx, *seed, phi::CPUPlace(), true, &seed_cpu_tensor);
*seed_data = static_cast<uint64_t>(seed_cpu_tensor.data<int>()[0]);
*increment = offset;
} else if (!is_fix_seed) {
......@@ -46,5 +46,5 @@ inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx,
}
}
} // namespace operators
} // namespace paddle
} // namespace funcs
} // namespace phi
......@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/dropout_grad_kernel.h"
#include "paddle/fluid/operators/dropout_impl.cu.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/dropout_impl.cu.h"
namespace phi {
......@@ -30,14 +30,14 @@ void DropoutGradRawKernel(const Context& dev_ctx,
DenseTensor* x_grad) {
bool upscale_in_train = (mode == "upscale_in_train");
dev_ctx.template Alloc<T>(x_grad);
paddle::operators::DropoutGradGPUKernelDriver<T>(dev_ctx,
is_test,
p.to<float>(),
upscale_in_train,
out_grad,
mask,
x_grad,
false);
phi::funcs::DropoutGradGPUKernelDriver<T>(dev_ctx,
is_test,
p.to<float>(),
upscale_in_train,
out_grad,
mask,
x_grad,
false);
}
template <typename T, typename Context>
......@@ -51,14 +51,14 @@ void DropoutNdGradKernel(const Context& dev_ctx,
DenseTensor* x_grad) {
bool upscale_in_train = (mode == "upscale_in_train");
dev_ctx.template Alloc<T>(x_grad);
paddle::operators::DropoutGradGPUKernelDriver<T>(dev_ctx,
is_test,
p.to<float>(),
upscale_in_train,
out_grad,
mask,
x_grad,
true);
phi::funcs::DropoutGradGPUKernelDriver<T>(dev_ctx,
is_test,
p.to<float>(),
upscale_in_train,
out_grad,
mask,
x_grad,
true);
}
} // namespace phi
......
......@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/dropout_kernel.h"
#include "paddle/fluid/operators/dropout_impl.cu.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/dropout_impl.cu.h"
namespace phi {
......@@ -36,17 +36,17 @@ void DropoutRawKernel(const Context& dev_ctx,
if (mask) {
dev_ctx.template Alloc<uint8_t>(mask);
}
paddle::operators::DropoutFwGPUKernelDriver<T>(dev_ctx,
is_test,
p.to<float>(),
upscale_in_train,
fix_seed,
seed,
x,
seed_tensor.get_ptr(),
mask,
out,
false);
phi::funcs::DropoutFwGPUKernelDriver<T>(dev_ctx,
is_test,
p.to<float>(),
upscale_in_train,
fix_seed,
seed,
x,
seed_tensor.get_ptr(),
mask,
out,
false);
}
template <typename T, typename Context>
......@@ -66,17 +66,17 @@ void DropoutNdKernel(const Context& dev_ctx,
if (mask) {
dev_ctx.template Alloc<uint8_t>(mask);
}
paddle::operators::DropoutFwGPUKernelDriver<T>(dev_ctx,
is_test,
p.to<float>(),
upscale_in_train,
fix_seed,
seed,
x,
seed_tensor.get_ptr(),
mask,
out,
true);
phi::funcs::DropoutFwGPUKernelDriver<T>(dev_ctx,
is_test,
p.to<float>(),
upscale_in_train,
fix_seed,
seed,
x,
seed_tensor.get_ptr(),
mask,
out,
true);
}
} // namespace phi
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"
......