Unverified commit b9672a1e authored by Leo Chen, committed by GitHub

clean distribution_helper, index_impl, aligned_vector code in fluid (#40071)

* clean distribution_helper, index_impl, aligned_vector code in fluid

* fix conflicts
Parent e2e2d531
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef __NVCC__
#include <curand_kernel.h>
#endif
#ifdef __HIPCC__
#include <hiprand_kernel.h>
#endif
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/core/hostdevice.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
#endif
#if !defined(_WIN32)
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#else
// there are no equivalent intrinsics in MSVC.
#define UNLIKELY(condition) (condition)
#endif
namespace paddle {
namespace distribution {
using Tensor = framework::Tensor;
/********************* Transformation Function **********************/
template <typename T>
struct exponential_transform {
  explicit exponential_transform(T lambda) : lambda_(lambda) {}

  HOSTDEVICE inline T operator()(T val) const {
#if defined(__NVCC__) || defined(__HIPCC__)
    if (std::is_same<T, double>::value) {
      return static_cast<T>(-1.0) / lambda_ * log(val);
    } else {
      return static_cast<T>(-1.0) / lambda_ * __logf(val);
    }
#else
    return static_cast<T>(-1.0) / lambda_ * std::log(static_cast<T>(1.0) - val);
#endif
  }

 private:
  T lambda_;
};

template <typename T>
struct uniform_transform {
  explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {}

  HOSTDEVICE inline T operator()(T val) const {
    if (UNLIKELY(val == static_cast<T>(1.0))) {
      return min_;
    } else {
      return val * range_ + min_;
    }
  }

 private:
  T range_;
  T min_;
};

template <typename T>
struct normal_transform {
  explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {}

  HOSTDEVICE inline T operator()(T val) const { return val * std_ + mean_; }

 private:
  T mean_;
  T std_;
};
#if defined(__NVCC__) || defined(__HIPCC__)
namespace kps = phi::kps;
/*********************** Distribution Function *************************/
template <typename T>
struct uniform_distribution;
template <typename T>
struct normal_distribution;
#if defined(__NVCC__)
template <>
struct uniform_distribution<float> {
  __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const {
    return curand_uniform4(state);
  }
  static constexpr int kReturnsCount = 4;
};

template <>
struct uniform_distribution<double> {
  __device__ inline double2 operator()(
      curandStatePhilox4_32_10_t *state) const {
    return curand_uniform2_double(state);
  }
  static constexpr int kReturnsCount = 2;
};

template <>
struct normal_distribution<float> {
  __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const {
    return curand_normal4(state);
  }
  static constexpr int kReturnsCount = 4;
};

template <>
struct normal_distribution<double> {
  __device__ inline double2 operator()(
      curandStatePhilox4_32_10_t *state) const {
    return curand_normal2_double(state);
  }
  static constexpr int kReturnsCount = 2;
};
#else
template <>
struct uniform_distribution<float> {
  __device__ inline float4 operator()(
      hiprandStatePhilox4_32_10_t *state) const {
    return hiprand_uniform4(state);
  }
  static constexpr int kReturnsCount = 4;
};

template <>
struct uniform_distribution<double> {
  __device__ inline double2 operator()(
      hiprandStatePhilox4_32_10_t *state) const {
    return hiprand_uniform2_double(state);
  }
  static constexpr int kReturnsCount = 2;
};

template <>
struct normal_distribution<float> {
  __device__ inline float4 operator()(
      hiprandStatePhilox4_32_10_t *state) const {
    return hiprand_normal4(state);
  }
  static constexpr int kReturnsCount = 4;
};

template <>
struct normal_distribution<double> {
  __device__ inline double2 operator()(
      hiprandStatePhilox4_32_10_t *state) const {
    return hiprand_normal2_double(state);
  }
  static constexpr int kReturnsCount = 2;
};
#endif
/******** Launch GPU function of distribution and transformation *********/
template <typename T, typename DistOp, typename TransformOp>
__global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset,
                                   DistOp dist, TransformOp trans, T *out_data,
                                   size_t stride) {
  size_t idx = static_cast<size_t>(BLOCK_ID_X * BLOCK_NUM_X);
  static constexpr int kCount = DistOp::kReturnsCount;
#if defined(__NVCC__)
  curandStatePhilox4_32_10_t state;
  curand_init(seed, idx + THREAD_ID_X, offset, &state);
  using SType = curandStatePhilox4_32_10_t;
#else
  hiprandStatePhilox4_32_10_t state;
  hiprand_init(seed, idx + THREAD_ID_X, offset, &state);
  using SType = hiprandStatePhilox4_32_10_t;
#endif
  size_t total_thread = GRID_NUM_X * BLOCK_NUM_X;
  T args[kCount];
  T result[kCount];
  for (size_t i = idx; i < size; i += total_thread * kCount) {
    kps::ElementwiseRandom<SType, T, kCount, 1, DistOp>(&args[0], dist, &state);
    kps::ElementwiseUnary<T, T, kCount, 1, 1, TransformOp>(&result[0], &args[0],
                                                           trans);
    kps::WriteData<T, T, kCount, 1, 1, true>(out_data + i, &result[0], size - i,
                                             1, stride, 1);
    __syncthreads();
  }
}
template <typename T, typename DistOp, typename TransformOp>
void distribution_and_transform(const platform::CUDADeviceContext &dev_ctx,
                                Tensor *out, DistOp dist, TransformOp trans) {
  T *out_data = out->mutable_data<T>(dev_ctx.GetPlace());
  auto size = out->numel();

  int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
  auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);

  size_t block_size = 256;
  size_t expect_grid_size = (size + block_size - 1) / block_size;
  const auto &prop = platform::GetDeviceProperties(device_id);
  size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) *
                         prop.multiProcessorCount;
  size_t grid_size =
      expect_grid_size > max_grid_size ? max_grid_size : expect_grid_size;

  size_t total_thread = block_size * grid_size;
  size_t curand4_loop_times =
      (size + 4 * total_thread - 1) / (4 * total_thread);
  // 'increment' should be a multiple of 4
  uint64_t increment = curand4_loop_times * 4;

  auto seed_offset = gen_cuda->IncrementOffset(increment);
  uint64_t seed = seed_offset.first;
  uint64_t offset = seed_offset.second;

  DistributionKernel<
      T, DistOp, TransformOp><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
      size, seed, offset, dist, trans, out_data, total_thread);
}
#endif
} // namespace distribution
} // namespace paddle
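
For context, a minimal sketch of how the helpers in this header are invoked from an operator kernel. It mirrors the pre-patch call sites shown in the exponential-kernel hunk further down (after this patch the same entry points live under phi::funcs); the wrapper function name and the way the tensor and lambda reach it are illustrative assumptions, not part of the patch.

// Hypothetical wrapper, modeled on the exponential-kernel usage below.
template <typename T>
void FillExponential(const platform::CUDADeviceContext &dev_ctx,
                     framework::Tensor *out, T lambda) {
  // Raw Philox uniforms: 4 values per call for float, 2 for double.
  distribution::uniform_distribution<T> dist;
  // Maps each uniform sample to Exp(lambda) via the inverse CDF.
  distribution::exponential_transform<T> trans(lambda);
  // Launches DistributionKernel on dev_ctx.stream() and fills `out`.
  distribution::distribution_and_transform<T>(dev_ctx, out, dist, trans);
}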
......@@ -34,8 +34,8 @@ limitations under the License. */
#include "paddle/fluid/operators/dropout_impl_util.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/funcs/functors.h"
namespace paddle {
......@@ -86,8 +86,8 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
bool is_upscale_in_train,
uint64_t increment) {
using MT = typename details::MPTypeTrait<T>::Type;
using LoadT = platform::AlignedVector<T, VecSize>;
using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
using LoadT = phi::AlignedVector<T, VecSize>;
using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
#ifdef PADDLE_WITH_HIP
int64_t idx = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
......@@ -102,7 +102,7 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
MT factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
for (int i = idx * VecSize; i < n; i += blockDim.x * gridDim.x * VecSize) {
LoadT src_val;
platform::Load<T, VecSize>(&src[i], &src_val);
phi::Load<T, VecSize>(&src[i], &src_val);
#ifdef PADDLE_WITH_HIP
float4 rand = hiprand_uniform4(&state);
......@@ -126,8 +126,8 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
}
}
platform::Store<T, VecSize>(dst_val, &dst[i]);
platform::Store<MaskType, VecSize>(mask_val, &mask[i]);
phi::Store<T, VecSize>(dst_val, &dst[i]);
phi::Store<MaskType, VecSize>(mask_val, &mask[i]);
}
}
......@@ -153,16 +153,16 @@ __global__ void DropoutGradCUDAKernel(
const typename details::MPTypeTrait<T>::Type factor, const int64_t size,
T* dx) {
using MT = typename details::MPTypeTrait<T>::Type;
using LoadT = platform::AlignedVector<T, VecSize>;
using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
using LoadT = phi::AlignedVector<T, VecSize>;
using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) {
LoadT dout_val;
platform::Load<T, VecSize>(&dout[i], &dout_val);
phi::Load<T, VecSize>(&dout[i], &dout_val);
MaskLoadT mask_val;
platform::Load<MaskType, VecSize>(&mask[i], &mask_val);
phi::Load<MaskType, VecSize>(&mask[i], &mask_val);
LoadT dx_val;
......@@ -172,7 +172,7 @@ __global__ void DropoutGradCUDAKernel(
static_cast<MT>(mask_val[j]) * factor);
}
platform::Store<T, VecSize>(dx_val, &dx[i]);
phi::Store<T, VecSize>(dx_val, &dx[i]);
}
}
......@@ -219,7 +219,7 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
uint64_t increment;
// VectorizedRandomGenerator uses curand_uniform4, so we only support
// vec_size == 4;
int vec_size = (platform::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;
int vec_size = (phi::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;
auto gpu_config = GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size);
auto offset =
((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size;
......
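
The dropout hunks above all follow the same vectorized load/compute/store idiom; the patch only moves it from platform:: to phi::. Below is a minimal standalone sketch of that idiom using a hypothetical kernel, shown purely to make the pattern explicit; it is not code from the patch.

// Hypothetical scaling kernel illustrating the phi::AlignedVector idiom.
// Assumes n % VecSize == 0, as the real callers guarantee by how they
// choose vec_size.
template <typename T, int VecSize>
__global__ void VectorizedScale(const T *src, T *dst, int64_t n, T factor) {
  using LoadT = phi::AlignedVector<T, VecSize>;
  int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
  for (int64_t i = idx * VecSize; i < n;
       i += blockDim.x * gridDim.x * VecSize) {
    LoadT vec;
    phi::Load<T, VecSize>(&src[i], &vec);  // one vectorized global load
#pragma unroll
    for (int j = 0; j < VecSize; ++j) {
      vec[j] = vec[j] * factor;
    }
    phi::Store<T, VecSize>(vec, &dst[i]);  // one vectorized global store
  }
}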
......@@ -76,7 +76,7 @@ class ExponentialKernel<platform::CPUDeviceContext, T>
auto engine = gen->GetCPUEngine();
std::uniform_real_distribution<T> uniform(0.0, 1.0);
distribution::exponential_transform<T> trans(lambda);
phi::funcs::exponential_transform<T> trans(lambda);
for (int64_t i = 0; i < size; ++i) {
out_data[i] = trans(uniform(*engine));
}
......
......@@ -26,9 +26,9 @@ class ExponentialKernel<platform::CUDADeviceContext, T>
auto& dev_cxt = ctx.template device_context<platform::CUDADeviceContext>();
T lambda = static_cast<T>(ctx.Attr<float>("lambda"));
distribution::uniform_distribution<T> dist;
distribution::exponential_transform<T> trans(lambda);
distribution::distribution_and_transform<T>(dev_cxt, out, dist, trans);
phi::funcs::uniform_distribution<T> dist;
phi::funcs::exponential_transform<T> trans(lambda);
phi::funcs::distribution_and_transform<T>(dev_cxt, out, dist, trans);
}
};
......
......@@ -17,7 +17,7 @@
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/distribution_helper.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
......
......@@ -89,9 +89,9 @@ __global__ void BroadcastKernelBinary(
template <typename T>
void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, int m, int n,
const T* in0, const T* in1, T* out) {
int in_vec_size = std::min(platform::GetVectorizedSize<T>(in0),
platform::GetVectorizedSize<T>(in1));
int out_vec_size = std::min(4, platform::GetVectorizedSize<T>(out));
int in_vec_size =
std::min(phi::GetVectorizedSize<T>(in0), phi::GetVectorizedSize<T>(in1));
int out_vec_size = std::min(4, phi::GetVectorizedSize<T>(out));
int vec_size = std::min(out_vec_size, in_vec_size);
int numel = m * n;
......
......@@ -130,17 +130,17 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout,
const T factor, const int64_t size, T *dx) {
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
using LoadT = platform::AlignedVector<T, VecSize>;
using StoreT = platform::AlignedVector<T, VecSize>;
using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) {
LoadT dout_vec;
LoadT src_vec;
MaskLoadT mask_vec;
platform::Load<T, VecSize>(&dout[i], &dout_vec);
platform::Load<MaskType, VecSize>(&mask[i], &mask_vec);
platform::Load<T, VecSize>(&src[i], &src_vec);
phi::Load<T, VecSize>(&dout[i], &dout_vec);
phi::Load<MaskType, VecSize>(&mask[i], &mask_vec);
phi::Load<T, VecSize>(&src[i], &src_vec);
StoreT dx_vec;
#pragma unroll
......@@ -148,7 +148,7 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout,
T tmp = dout_vec[ii] * static_cast<T>(mask_vec[ii]) * factor;
dx_vec[ii] = tmp * act_grad.UseOut(src_vec[ii]);
}
platform::Store<T, VecSize>(dx_vec, &dx[i]);
phi::Store<T, VecSize>(dx_vec, &dx[i]);
}
}
......@@ -167,9 +167,9 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout,
T *dx, T *dbias) {
int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x;
using LoadT = platform::AlignedVector<T, VecSize>;
using StoreT = platform::AlignedVector<T, VecSize>;
using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
T tmp_sum[VecSize] = {static_cast<T>(0)};
// calculate the dx and temporary sum
if (col_id * VecSize < cols) {
......@@ -180,10 +180,10 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout,
LoadT bias_vec;
MaskLoadT mask_vec;
platform::Load<T, VecSize>(&dout[index], &dout_vec);
platform::Load<T, VecSize>(&src[index], &src_vec);
platform::Load<MaskType, VecSize>(&mask[index], &mask_vec);
platform::Load<T, VecSize>(&bias[col_id * VecSize], &bias_vec);
phi::Load<T, VecSize>(&dout[index], &dout_vec);
phi::Load<T, VecSize>(&src[index], &src_vec);
phi::Load<MaskType, VecSize>(&mask[index], &mask_vec);
phi::Load<T, VecSize>(&bias[col_id * VecSize], &bias_vec);
StoreT dx_vec;
#pragma unroll
......@@ -194,7 +194,7 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout,
dx_vec[i] = val;
tmp_sum[i] += val;
}
platform::Store<T, VecSize>(dx_vec, &dx[index]);
phi::Store<T, VecSize>(dx_vec, &dx[index]);
}
}
......
......@@ -21,11 +21,11 @@ limitations under the License. */
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/layer_norm_kernel.cu.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/funcs/functors.h"
namespace paddle {
......
......@@ -42,12 +42,12 @@ __device__ void CalcLayernormY(
const LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *bias, const T *x,
T *y, const int row_id, const int col_id, const int cols,
const LayerNormParamType<T> mean_val, const LayerNormParamType<T> invvar) {
using LoadT = platform::AlignedVector<T, VecSize>;
using StoreT = platform::AlignedVector<T, VecSize>;
using LoadU = platform::AlignedVector<U, VecSize>;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
using LoadU = phi::AlignedVector<U, VecSize>;
using LoadScaleOrBias =
platform::AlignedVector<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>,
VecSize>;
phi::AlignedVector<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>,
VecSize>;
for (int i = col_id * VecSize; i < cols; i += blockDim.x * VecSize) {
LoadScaleOrBias scale_vec;
LoadScaleOrBias bias_vec;
......@@ -60,15 +60,15 @@ __device__ void CalcLayernormY(
static_cast<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>>(0);
}
// vectorize load data from global
platform::Load<T, VecSize>(&x[row_id * cols + i], &x_vec);
phi::Load<T, VecSize>(&x[row_id * cols + i], &x_vec);
if (scale != nullptr) {
platform::Load<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>,
VecSize>(&scale[i], &scale_vec);
phi::Load<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>, VecSize>(
&scale[i], &scale_vec);
}
if (bias != nullptr) {
platform::Load<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>,
VecSize>(&bias[i], &bias_vec);
phi::Load<LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX>, VecSize>(
&bias[i], &bias_vec);
}
StoreT y_vec;
......@@ -78,7 +78,7 @@ __device__ void CalcLayernormY(
(static_cast<U>(x_vec[ii]) - mean_val) * invvar +
static_cast<U>(bias_vec[ii]));
}
platform::Store<T, VecSize>(y_vec, &y[row_id * cols + i]);
phi::Store<T, VecSize>(y_vec, &y[row_id * cols + i]);
}
}
......@@ -190,9 +190,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel(
const ScaleT *__restrict__ beta_ptr, MaskType *__restrict__ mask_out_ptr,
U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr,
T *__restrict__ residual_out_ptr, T *__restrict__ y_ptr) {
using Vec = platform::AlignedVector<T, VecSize>;
using Vec_scale = platform::AlignedVector<ScaleT, VecSize>;
using MaskStoreT = platform::AlignedVector<MaskType, VecSize>;
using Vec = phi::AlignedVector<T, VecSize>;
using Vec_scale = phi::AlignedVector<ScaleT, VecSize>;
using MaskStoreT = phi::AlignedVector<MaskType, VecSize>;
const int tidx = threadIdx.x;
const int bidx = blockIdx.x;
......@@ -214,8 +214,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel(
Vec_scale beta[LDGS];
#pragma unroll
for (int it = 0, col = c; it < LDGS; it++) {
platform::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
platform::Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
phi::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
phi::Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
col += THREADS_PER_ROW;
}
......@@ -225,10 +225,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel(
Vec residual[LDGS];
#pragma unroll
for (int it = 0, col = c; it < LDGS; it++) {
platform::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize,
&x[it]);
platform::Load<T, VecSize>(
residual_ptr + row * LN_NUM_COLS + col * VecSize, &residual[it]);
phi::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]);
phi::Load<T, VecSize>(residual_ptr + row * LN_NUM_COLS + col * VecSize,
&residual[it]);
col += THREADS_PER_ROW;
}
......@@ -270,9 +269,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel(
// store dropout_residual_out and mask_out
#pragma unroll
for (int it = 0, col = c; it < LDGS; it++) {
platform::Store<T, VecSize>(
phi::Store<T, VecSize>(
x[it], residual_out_ptr + row * LN_NUM_COLS + col * VecSize);
platform::Store<MaskType, VecSize>(
phi::Store<MaskType, VecSize>(
mask_vec[it], mask_out_ptr + row * LN_NUM_COLS + col * VecSize);
col += THREADS_PER_ROW;
}
......@@ -333,8 +332,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel(
#pragma unroll
for (int it = 0, col = c; it < LDGS; it++) {
platform::Store<T, VecSize>(x[it],
y_ptr + row * LN_NUM_COLS + col * VecSize);
phi::Store<T, VecSize>(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize);
col += THREADS_PER_ROW;
}
}
......
......@@ -32,9 +32,9 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
const T *__restrict__ bias, T *dst, MaskType *mask, const bool is_test,
typename details::MPTypeTrait<T>::Type *mean_val,
typename details::MPTypeTrait<T>::Type *var_val, Functor act_func) {
using LoadT = platform::AlignedVector<T, VecSize>;
using StoreT = platform::AlignedVector<T, VecSize>;
using MaskStoreT = platform::AlignedVector<MaskType, VecSize>;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
using MaskStoreT = phi::AlignedVector<MaskType, VecSize>;
using U = typename details::MPTypeTrait<T>::Type;
LoadT src_vec;
......@@ -46,14 +46,13 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
residual_vec[ii] = static_cast<T>(0);
}
// vectorize load data from global
platform::Load<T, VecSize>(&src[row_id * cols + col_id], &src_vec);
phi::Load<T, VecSize>(&src[row_id * cols + col_id], &src_vec);
if (residual) {
platform::Load<T, VecSize>(&residual[row_id * cols + col_id],
&residual_vec);
phi::Load<T, VecSize>(&residual[row_id * cols + col_id], &residual_vec);
}
if (bias) {
platform::Load<T, VecSize>(&bias[col_id], &bias_vec);
phi::Load<T, VecSize>(&bias[col_id], &bias_vec);
}
MaskStoreT mask_vec;
......@@ -89,9 +88,9 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread(
}
// store result to global
platform::Store<T, VecSize>(dest_vec, &dst[row_id * cols + col_id]);
phi::Store<T, VecSize>(dest_vec, &dst[row_id * cols + col_id]);
if (!is_test) {
platform::Store<MaskType, VecSize>(mask_vec, &mask[row_id * cols + col_id]);
phi::Store<MaskType, VecSize>(mask_vec, &mask[row_id * cols + col_id]);
}
}
......@@ -176,21 +175,21 @@ __global__ void FusedResidualDropoutGrad(const T *dout, const MaskType *mask,
T *dx) {
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
using LoadT = platform::AlignedVector<T, VecSize>;
using StoreT = platform::AlignedVector<T, VecSize>;
using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) {
LoadT dout_vec;
MaskLoadT mask_vec;
platform::Load<T, VecSize>(&dout[i], &dout_vec);
platform::Load<MaskType, VecSize>(&mask[i], &mask_vec);
phi::Load<T, VecSize>(&dout[i], &dout_vec);
phi::Load<MaskType, VecSize>(&mask[i], &mask_vec);
StoreT dx_vec;
#pragma unroll
for (int ii = 0; ii < VecSize; ii++) {
dx_vec[ii] = dout_vec[ii] * static_cast<T>(mask_vec[ii]) * factor;
}
platform::Store<T, VecSize>(dx_vec, &dx[i]);
phi::Store<T, VecSize>(dx_vec, &dx[i]);
}
}
......@@ -209,9 +208,9 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout,
T *dbias) {
int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x;
using LoadT = platform::AlignedVector<T, VecSize>;
using StoreT = platform::AlignedVector<T, VecSize>;
using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
using LoadT = phi::AlignedVector<T, VecSize>;
using StoreT = phi::AlignedVector<T, VecSize>;
using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
T tmp_sum[VecSize] = {static_cast<T>(0)};
// calculate the dx and temporary sum
......@@ -221,8 +220,8 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout,
LoadT out_vec;
MaskLoadT mask_vec;
StoreT dx_vec;
platform::Load<T, VecSize>(&dout[index], &out_vec);
platform::Load<MaskType, VecSize>(&mask[index], &mask_vec);
phi::Load<T, VecSize>(&dout[index], &out_vec);
phi::Load<MaskType, VecSize>(&mask[index], &mask_vec);
#pragma unroll
for (int i = 0; i < VecSize; i++) {
......@@ -230,7 +229,7 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout,
tmp_sum[i] += out_vec[i];
}
platform::Store<T, VecSize>(dx_vec, &dx[index]);
phi::Store<T, VecSize>(dx_vec, &dx[index]);
}
}
......
......@@ -19,9 +19,10 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/distribution_helper.h"
#include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/operators/index_impl.cu.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
DECLARE_bool(use_curand);
......@@ -79,10 +80,10 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
int64_t gen_offset = size * seed_offset.second;
auto func = GaussianGenerator<T>(mean, std, seed_offset.first,
seed_offset.second);
IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
phi::IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
} else {
auto func = GaussianGenerator<T>(mean, std, seed);
IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
phi::IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
}
}
};
......
......@@ -58,7 +58,7 @@ static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, __half* y,
static_cast<size_t>(threadIdx.x + blockIdx.x * blockDim.x) * VecSize;
size_t stride = static_cast<size_t>(blockDim.x * gridDim.x) * VecSize;
for (; offset < n; offset += stride) {
using ArrT = platform::AlignedVector<__half, VecSize>;
using ArrT = phi::AlignedVector<__half, VecSize>;
ArrT in_arr = *reinterpret_cast<const ArrT*>(x + offset);
#pragma unroll
for (int i = 0; i < VecSize; ++i) {
......@@ -77,7 +77,7 @@ static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x,
static_cast<size_t>(threadIdx.x + blockIdx.x * blockDim.x) * VecSize;
size_t stride = static_cast<size_t>(blockDim.x * gridDim.x) * VecSize;
for (; offset < n; offset += stride) {
using ArrT = platform::AlignedVector<__half, VecSize>;
using ArrT = phi::AlignedVector<__half, VecSize>;
ArrT x_in_arr = *reinterpret_cast<const ArrT*>(x + offset);
ArrT y_g_in_arr = *reinterpret_cast<const ArrT*>(y_g + offset);
#pragma unroll
......@@ -103,7 +103,7 @@ static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(
#define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \
do { \
constexpr auto kAlignment = \
alignof(platform::AlignedVector<__half, __vec_size>); \
alignof(phi::AlignedVector<__half, __vec_size>); \
if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \
is_aligned(y, kAlignment)) { \
size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
......@@ -138,7 +138,7 @@ static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(
#define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \
do { \
constexpr auto kAlignment = \
alignof(platform::AlignedVector<__half, __vec_size>); \
alignof(phi::AlignedVector<__half, __vec_size>); \
if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \
is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) && \
is_aligned(x_g, kAlignment)) { \
......
......@@ -19,11 +19,11 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/distribution_helper.h"
#include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
namespace paddle {
......@@ -58,7 +58,7 @@ void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) {
int numel = out->numel();
T *out_data = out->mutable_data<T>(dev_ctx.GetPlace());
if (numel <= 0) return;
int vec_size = paddle::platform::GetVectorizedSize(out_data);
int vec_size = phi::GetVectorizedSize(out_data);
#ifdef PADDLE_WITH_XPU_KP
int block = 64;
int grid = 8;
......
......@@ -22,10 +22,10 @@ limitations under the License. */
namespace cub = hipcub;
#endif
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
namespace paddle {
namespace operators {
......@@ -186,8 +186,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel(
const ScaleT *__restrict__ gamma_ptr, const ScaleT *__restrict__ beta_ptr,
U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr,
T *__restrict__ y_ptr) {
using Vec = platform::AlignedVector<T, VecSize>;
using Vec_scale = platform::AlignedVector<ScaleT, VecSize>;
using Vec = phi::AlignedVector<T, VecSize>;
using Vec_scale = phi::AlignedVector<ScaleT, VecSize>;
const int tidx = threadIdx.x;
const int bidx = blockIdx.x;
......@@ -203,8 +203,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel(
Vec_scale beta[LDGS];
#pragma unroll
for (int it = 0, col = c; it < LDGS; it++) {
platform::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
platform::Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
phi::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
phi::Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
col += THREADS_PER_ROW;
}
......@@ -213,8 +213,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel(
Vec x[LDGS];
#pragma unroll
for (int it = 0, col = c; it < LDGS; it++) {
platform::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize,
&x[it]);
phi::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]);
col += THREADS_PER_ROW;
}
U xf[LDGS * VecSize];
......@@ -276,8 +275,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel(
#pragma unroll
for (int it = 0, col = c; it < LDGS; it++) {
platform::Store<T, VecSize>(x[it],
y_ptr + row * LN_NUM_COLS + col * VecSize);
phi::Store<T, VecSize>(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize);
col += THREADS_PER_ROW;
}
}
......@@ -401,9 +399,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
U *__restrict__ dgamma_temp_ptr, U *__restrict__ dbeta_temp_ptr,
T *__restrict__ dx_ptr, const MaskType *mask_ptr = nullptr,
T factor = static_cast<T>(0), T *d_dropout_src_ptr = nullptr) {
using Vec = platform::AlignedVector<T, VecSize>;
using Vec_scale = platform::AlignedVector<ScaleT, VecSize>;
using MaskLoadT = platform::AlignedVector<MaskType, VecSize>;
using Vec = phi::AlignedVector<T, VecSize>;
using Vec_scale = phi::AlignedVector<ScaleT, VecSize>;
using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
const int tidx = threadIdx.x;
const int bidx = blockIdx.x;
......@@ -439,7 +437,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
int col = c;
#pragma unroll
for (int it = 0; it < LDGS; it++) {
platform::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
phi::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
col += THREADS_PER_ROW;
}
......@@ -452,12 +450,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
int col = c;
#pragma unroll
for (int it = 0; it < LDGS; it++) {
platform::Load<T, VecSize>(dout_ptr + row * LN_NUM_COLS + col * VecSize,
&dout[it]);
platform::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize,
&x[it]);
phi::Load<T, VecSize>(dout_ptr + row * LN_NUM_COLS + col * VecSize,
&dout[it]);
phi::Load<T, VecSize>(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]);
if (isFusedDropoutResidualLn) {
platform::Load<MaskType, VecSize>(
phi::Load<MaskType, VecSize>(
mask_ptr + row * LN_NUM_COLS + col * VecSize, &mask_vec[it]);
}
......@@ -552,10 +549,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel(
col = c;
#pragma unroll
for (int it = 0; it < LDGS; it++) {
platform::Store<T, VecSize>(x[it],
dx_ptr + row * LN_NUM_COLS + col * VecSize);
phi::Store<T, VecSize>(x[it], dx_ptr + row * LN_NUM_COLS + col * VecSize);
if (isFusedDropoutResidualLn) {
platform::Store<T, VecSize>(
phi::Store<T, VecSize>(
dout[it], d_dropout_src_ptr + row * LN_NUM_COLS + col * VecSize);
}
col += THREADS_PER_ROW;
......@@ -641,7 +637,7 @@ template <
__global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel(
const int rows, U *__restrict__ dg_part_, U *__restrict__ db_part_,
ScaleT *__restrict__ dg_, ScaleT *__restrict__ db_) {
using Vec = platform::AlignedVector<U, VecSize>;
using Vec = phi::AlignedVector<U, VecSize>;
static_assert(VEC_COLS == LN_NUM_COLS / VecSize, "");
const int tidx = threadIdx.x;
......@@ -669,8 +665,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel(
for (int row = r; row < rows; row += ROWS_PER_CTA) {
Vec dg;
Vec db;
platform::Load<U, VecSize>(dg_part_ptr, &dg);
platform::Load<U, VecSize>(db_part_ptr, &db);
phi::Load<U, VecSize>(dg_part_ptr, &dg);
phi::Load<U, VecSize>(db_part_ptr, &db);
dg_part_ptr += ROWS_PER_CTA * LN_NUM_COLS;
db_part_ptr += ROWS_PER_CTA * LN_NUM_COLS;
......
......@@ -57,8 +57,7 @@ static void LaunchCastKernel(const platform::CUDADeviceContext &ctx,
PADDLE_ENFORCE_NE(
static_cast<const void *>(x), static_cast<void *>(y),
platform::errors::InvalidArgument("Inplace cast is not supported yet."));
int vec_size =
std::min(platform::GetVectorizedSize(x), platform::GetVectorizedSize(y));
int vec_size = std::min(phi::GetVectorizedSize(x), phi::GetVectorizedSize(y));
switch (vec_size) {
case 4:
return details::VecCastKernel<InT, OutT, 4>(ctx, x, y, n);
......
......@@ -19,11 +19,11 @@
#include "paddle/fluid/operators/optimizers/distributed_fused_lamb_op.h"
#include "paddle/fluid/operators/optimizers/multi_tensor_apply.h"
#include "paddle/fluid/operators/tensor_to_string.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/phi/core/utils/data_type.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#ifdef __NVCC__
#include "cub/cub.cuh"
......@@ -66,8 +66,8 @@ struct L2NormFunctor {
int i;
for (i = threadIdx.x * VecSize; i + VecSize <= size;
i += (BlockDim * VecSize)) {
platform::AlignedVector<T, VecSize> tmp_vec;
platform::Load(ptr + i, &tmp_vec);
phi::AlignedVector<T, VecSize> tmp_vec;
phi::Load(ptr + i, &tmp_vec);
#pragma unroll
for (int j = 0; j < VecSize; ++j) {
auto tmp = static_cast<MT>(tmp_vec[j]);
......@@ -111,9 +111,9 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) {
constexpr int max_load_bits = 128;
int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T);
auto address = reinterpret_cast<uintptr_t>(ptr);
constexpr int vec8 = alignof(platform::AlignedVector<T, 8>);
constexpr int vec4 = alignof(platform::AlignedVector<T, 4>);
constexpr int vec2 = alignof(platform::AlignedVector<T, 2>);
constexpr int vec8 = alignof(phi::AlignedVector<T, 8>);
constexpr int vec4 = alignof(phi::AlignedVector<T, 4>);
constexpr int vec2 = alignof(phi::AlignedVector<T, 2>);
chunk_size *= sizeof(T);
if (address % vec8 == 0 && chunk_size % vec8 == 0) {
return std::min(8, valid_vec_size);
......@@ -316,15 +316,15 @@ static __global__ void ScaleCUDAKernel(const T1 *__restrict__ x,
int stride = blockDim.x * gridDim.x * VecSize;
for (; i + VecSize <= num; i += stride) {
platform::AlignedVector<T1, VecSize> x_vec;
platform::AlignedVector<T1, VecSize> y_vec;
phi::AlignedVector<T1, VecSize> x_vec;
phi::AlignedVector<T1, VecSize> y_vec;
platform::Load(x + i, &x_vec);
phi::Load(x + i, &x_vec);
#pragma unroll
for (int j = 0; j < VecSize; ++j) {
y_vec[j] = static_cast<T1>(static_cast<T2>(x_vec[j]) * s);
}
platform::Store(y_vec, y + i);
phi::Store(y_vec, y + i);
}
for (; i < num; ++i) {
......@@ -410,24 +410,24 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel(
int stride = blockDim.x * gridDim.x * VecSize;
for (; i + VecSize <= num; i += stride) {
platform::AlignedVector<T, VecSize> param_vec;
platform::AlignedVector<GradT, VecSize> grad_vec;
platform::AlignedVector<T, VecSize> mom1_vec;
platform::AlignedVector<T, VecSize> mom2_vec;
platform::AlignedVector<T, VecSize> trust_ratio_div_vec;
phi::AlignedVector<T, VecSize> param_vec;
phi::AlignedVector<GradT, VecSize> grad_vec;
phi::AlignedVector<T, VecSize> mom1_vec;
phi::AlignedVector<T, VecSize> mom2_vec;
phi::AlignedVector<T, VecSize> trust_ratio_div_vec;
T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay;
if (cur_weight_decay != static_cast<T>(0.0)) {
platform::Load(param_p + i, &param_vec);
phi::Load(param_p + i, &param_vec);
} else {
#pragma unroll
for (int j = 0; j < VecSize; ++j) {
param_vec[j] = static_cast<T>(0);
}
}
platform::Load(grad_p + i, &grad_vec);
platform::Load(mom1_p + i, &mom1_vec);
platform::Load(mom2_p + i, &mom2_vec);
phi::Load(grad_p + i, &grad_vec);
phi::Load(mom1_p + i, &mom1_vec);
phi::Load(mom2_p + i, &mom2_vec);
#define PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(__param, __grad, __mom1, __mom2, \
__trust_ratio_div, __idx) \
......@@ -450,9 +450,9 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel(
mom2_vec, trust_ratio_div_vec, j);
}
platform::Store(mom1_vec, mom1_p + i);
platform::Store(mom2_vec, mom2_p + i);
platform::Store(trust_ratio_div_vec, trust_ratio_div_p + i);
phi::Store(mom1_vec, mom1_p + i);
phi::Store(mom2_vec, mom2_p + i);
phi::Store(trust_ratio_div_vec, trust_ratio_div_p + i);
}
for (; i < num; ++i) {
......@@ -632,29 +632,29 @@ struct LambUpdateParamAndBetaPowsFunctor {
trust_ratio_div += offset;
for (i = threadIdx.x * VecSize; i + VecSize <= size; i += stride) {
platform::AlignedVector<MT, VecSize> trust_ratio_div_vec;
platform::Load(trust_ratio_div + i, &trust_ratio_div_vec);
phi::AlignedVector<MT, VecSize> trust_ratio_div_vec;
phi::Load(trust_ratio_div + i, &trust_ratio_div_vec);
if (HasMasterParam) {
platform::AlignedVector<MT, VecSize> master_param_vec;
platform::Load(master_param + i, &master_param_vec);
platform::AlignedVector<ParamT, VecSize> param_vec;
phi::AlignedVector<MT, VecSize> master_param_vec;
phi::Load(master_param + i, &master_param_vec);
phi::AlignedVector<ParamT, VecSize> param_vec;
#pragma unroll
for (int j = 0; j < VecSize; ++j) {
MT p = master_param_vec[j] - ratio * trust_ratio_div_vec[j];
master_param_vec[j] = p;
param_vec[j] = static_cast<ParamT>(p);
}
platform::Store(master_param_vec, master_param + i);
platform::Store(param_vec, param + i);
phi::Store(master_param_vec, master_param + i);
phi::Store(param_vec, param + i);
} else {
platform::AlignedVector<ParamT, VecSize> param_vec;
platform::Load(param + i, &param_vec);
phi::AlignedVector<ParamT, VecSize> param_vec;
phi::Load(param + i, &param_vec);
#pragma unroll
for (int j = 0; j < VecSize; ++j) {
MT p = static_cast<MT>(param_vec[j]) - ratio * trust_ratio_div_vec[j];
param_vec[j] = static_cast<ParamT>(p);
}
platform::Store(param_vec, param + i);
phi::Store(param_vec, param + i);
}
}
......
......@@ -88,8 +88,8 @@ __device__ inline void VectorizeLarsUpdate(
T* param_out, MT* velocity_out, const MT mu, MT local_lr,
const MT lars_weight_decay, const MT rescale_grad, const int tid,
const int grid_stride, const int numel, MT* master_param_out = nullptr) {
using VecType = paddle::platform::AlignedVector<T, VecSize>;
using VecMType = paddle::platform::AlignedVector<MT, VecSize>;
using VecType = phi::AlignedVector<T, VecSize>;
using VecMType = phi::AlignedVector<MT, VecSize>;
int main = numel >> (VecSize >> 1);
int tail_offset = main * VecSize;
......
......@@ -25,8 +25,9 @@ DECLARE_bool(use_curand);
#include <thrust/random.h>
#include <thrust/transform.h>
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/operators/index_impl.cu.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
#endif
namespace paddle {
......@@ -206,21 +207,21 @@ void UniformRandom(const framework::ExecutionContext& context,
if (gen_cuda->GetIsInitPy() && seed_flag) {
if (FLAGS_use_curand) {
using MT = typename details::MPTypeTrait<T>::Type;
distribution::uniform_distribution<MT> dist;
distribution::uniform_transform<MT> trans(min, max);
distribution::distribution_and_transform<T>(dev_cxt, tensor, dist, trans);
phi::funcs::uniform_distribution<MT> dist;
phi::funcs::uniform_real_transform<MT> trans(min, max);
phi::funcs::distribution_and_transform<T>(dev_cxt, tensor, dist, trans);
} else {
auto seed_offset = gen_cuda->IncrementOffset(1);
int64_t gen_offset = size * seed_offset.second;
auto func =
UniformGeneratorOffset<T>(min, max, seed_offset.first, diag_num,
diag_step, diag_val, gen_offset);
IndexKernel<T, UniformGeneratorOffset<T>>(dev_cxt, tensor, func);
phi::IndexKernel<T, UniformGeneratorOffset<T>>(dev_cxt, tensor, func);
}
} else {
auto func =
UniformGenerator<T>(min, max, seed, diag_num, diag_step, diag_val);
IndexKernel<T, UniformGenerator<T>>(dev_cxt, tensor, func);
phi::IndexKernel<T, UniformGenerator<T>>(dev_cxt, tensor, func);
}
}
#endif
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include <cstdint>
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#define INT_BITS 32
......@@ -25,7 +25,7 @@ namespace platform {
struct FastDivMod {
// 1st value represents the result of the input number divided by the recorded divisor
// 2nd value represents the result of the input number modulo the recorded divisor
using DivModT = AlignedVector<uint32_t, 2>;
using DivModT = phi::AlignedVector<uint32_t, 2>;
FastDivMod() {}
HOSTDEVICE FastDivMod(uint32_t d) : divisor(d) {
......
......@@ -493,16 +493,14 @@ void BroadcastKernelForDifferentVecSize(
"%d-th output tensor`s shape is not.",
i));
out_vec_size = std::min(
paddle::platform::GetVectorizedSize<OutT>((*outs)[i]->data<OutT>()),
out_vec_size);
phi::GetVectorizedSize<OutT>((*outs)[i]->data<OutT>()), out_vec_size);
}
} else {
out_vec_size =
paddle::platform::GetVectorizedSize<OutT>((*outs)[0]->data<OutT>());
out_vec_size = phi::GetVectorizedSize<OutT>((*outs)[0]->data<OutT>());
}
for (auto *in : ins) {
auto temp_size = paddle::platform::GetVectorizedSize<InT>(in->data<InT>());
auto temp_size = phi::GetVectorizedSize<InT>(in->data<InT>());
in_vec_size = in->dims() == (*outs)[0]->dims()
? std::min(temp_size, in_vec_size)
: in_vec_size;
......
......@@ -28,6 +28,7 @@ limitations under the License. */
#include "paddle/phi/core/hostdevice.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/phi/kernels/funcs/index_impl.cu.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
#endif
......
......@@ -23,9 +23,9 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/function_traits.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
#define HOSTDEVICE __host__ __device__
......@@ -546,9 +546,8 @@ struct VecSizeGetter {
const ArgsT &args,
int *vec_size) {
using Type = std::tuple_element_t<Index, ArgsT>;
*vec_size = std::min<int>(
*vec_size,
paddle::platform::GetVectorizedSize(ins[Index]->data<Type>()));
*vec_size = std::min<int>(*vec_size,
phi::GetVectorizedSize(ins[Index]->data<Type>()));
}
};
......@@ -563,8 +562,8 @@ int GetVectorizedSizeForTensors(const std::vector<const DenseTensor *> &ins,
// The Arg VecSize=1 is to match the Unroller template.
Unroller<VecSizeGetter, 1, Arity>::step(ins, arg, &vec_size);
for (auto iter = outs.begin(); iter != outs.end(); ++iter) {
vec_size = std::min<int>(
vec_size, paddle::platform::GetVectorizedSize((*iter)->data<OutT>()));
vec_size =
std::min<int>(vec_size, phi::GetVectorizedSize((*iter)->data<OutT>()));
}
return vec_size;
}
......
......@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/bernoulli_kernel.h"
#include <thrust/random.h>
#include <thrust/transform.h>
#ifdef __NVCC__
......@@ -28,7 +30,6 @@
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/bernoulli_kernel.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
// See Note [ Why still include the fluid headers? ]
......
......@@ -20,11 +20,11 @@
#include "paddle/phi/kernels/funcs/elementwise_base.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_helper.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
namespace phi {
......
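
Taken together, the hunks in this commit apply one mechanical substitution. A representative before/after, condensed from the changes above as an editorial recap (the surrounding kernels are elided):

// before (fluid)
#include "paddle/fluid/platform/aligned_vector.h"
using LoadT = platform::AlignedVector<T, VecSize>;
platform::Load<T, VecSize>(&src[i], &src_val);
platform::Store<T, VecSize>(dst_val, &dst[i]);
int vec_size = (platform::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;

// after (phi)
#include "paddle/phi/kernels/funcs/aligned_vector.h"
using LoadT = phi::AlignedVector<T, VecSize>;
phi::Load<T, VecSize>(&src[i], &src_val);
phi::Store<T, VecSize>(dst_val, &dst[i]);
int vec_size = (phi::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;

// Likewise in the hunks above: distribution::* -> phi::funcs::*
// (distribution_helper.h), IndexKernel -> phi::IndexKernel
// (index_impl.cu.h), and distribution::uniform_transform ->
// phi::funcs::uniform_real_transform.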