未验证 提交 038883fd 编写于 作者: 李季 提交者: GitHub

Fix cast op failing to cast arrays whose size exceeds the int32 range (#34209)

* fix cast
上级 c8fb6fc4
...@@ -40,7 +40,8 @@ __global__ void VecCastCUDAKernel(const InT* in, const int64_t N, OutT* out) { ...@@ -40,7 +40,8 @@ __global__ void VecCastCUDAKernel(const InT* in, const int64_t N, OutT* out) {
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
using LoadT = AlignedVector<InT, VecSize>; using LoadT = AlignedVector<InT, VecSize>;
using StoreT = AlignedVector<OutT, VecSize>; using StoreT = AlignedVector<OutT, VecSize>;
for (int i = idx * VecSize; i < N; i += blockDim.x * gridDim.x * VecSize) { for (int64_t i = idx * VecSize; i < N;
i += blockDim.x * gridDim.x * VecSize) {
InT in_vec[VecSize]; InT in_vec[VecSize];
LoadT* in_value = reinterpret_cast<LoadT*>(&in_vec); LoadT* in_value = reinterpret_cast<LoadT*>(&in_vec);
*in_value = *reinterpret_cast<const LoadT*>(&in[i]); *in_value = *reinterpret_cast<const LoadT*>(&in[i]);
......
...@@ -41,7 +41,7 @@ struct GpuLaunchConfig { ...@@ -41,7 +41,7 @@ struct GpuLaunchConfig {
}; };
inline GpuLaunchConfig GetGpuLaunchConfig1D( inline GpuLaunchConfig GetGpuLaunchConfig1D(
const platform::CUDADeviceContext& context, int element_count, const platform::CUDADeviceContext& context, int64_t element_count,
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
// HIP will throw GPU memory access fault if threads > 256 // HIP will throw GPU memory access fault if threads > 256
int max_threads = 256) { int max_threads = 256) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册