diff --git a/paddle/phi/kernels/cpu/size_kernel.cc b/paddle/phi/kernels/cpu/size_kernel.cc
index ff34ef26f6bd3aea13815cb347719f054fd0a058..71ebf9cdc09f79fc214f1e7790c502513daf1b11 100644
--- a/paddle/phi/kernels/cpu/size_kernel.cc
+++ b/paddle/phi/kernels/cpu/size_kernel.cc
@@ -22,6 +22,7 @@ PD_REGISTER_KERNEL(size,
                    CPU,
                    ALL_LAYOUT,
                    phi::SizeKernel,
+                   int16_t,
                    int,
                    int64_t,
                    phi::dtype::float16,
diff --git a/paddle/phi/kernels/gpu/cumsum_kernel.cu b/paddle/phi/kernels/gpu/cumsum_kernel.cu
index a253e6f4ad290bc45dee8f57afee06363042a8c5..e04f2b5f8765810703ff18b7d914bc024eb97318 100644
--- a/paddle/phi/kernels/gpu/cumsum_kernel.cu
+++ b/paddle/phi/kernels/gpu/cumsum_kernel.cu
@@ -222,25 +222,28 @@ void CumsumKernel(const Context& dev_ctx,
   // Use thrust for parallel acceleration when the input size is equal to the
   // length of the ‘axis’ dimension.
   if (size == out_dims[axis]) {
+#ifdef __HIPCC__
+    const auto& policy = thrust::hip::par.on(dev_ctx.stream());
+#else
+    const auto& policy = thrust::cuda::par.on(dev_ctx.stream());
+#endif
     if (reverse) {
-      thrust::device_ptr<const T> dev_ptr =
-          thrust::device_pointer_cast(in_data);
-      thrust::device_vector<T> vec(dev_ptr, dev_ptr + size);
+      thrust::reverse_iterator<thrust::device_ptr<const T>> reversed_in(
+          thrust::device_pointer_cast(in_data) + size);
+      thrust::reverse_iterator<thrust::device_ptr<T>> reversed_out(
+          thrust::device_pointer_cast(out_data) + size);
       if (exclusive) {
         thrust::exclusive_scan(
-            thrust::device, vec.rbegin(), vec.rend(), out_data);
+            policy, reversed_in, reversed_in + size, reversed_out);
       } else {
         thrust::inclusive_scan(
-            thrust::device, vec.rbegin(), vec.rend(), out_data);
+            policy, reversed_in, reversed_in + size, reversed_out);
       }
-      thrust::reverse(thrust::device, out_data, out_data + size);
     } else {
       if (exclusive) {
-        thrust::exclusive_scan(
-            thrust::device, in_data, in_data + size, out_data);
+        thrust::exclusive_scan(policy, in_data, in_data + size, out_data);
       } else {
-        thrust::inclusive_scan(
-            thrust::device, in_data, in_data + size, out_data);
+        thrust::inclusive_scan(policy, in_data, in_data + size, out_data);
       }
     }
     return;
diff --git a/paddle/phi/kernels/gpu/size_kernel.cu b/paddle/phi/kernels/gpu/size_kernel.cu
index 17a39944eb04f5cecd941b07e82fb6bb97363977..7051fb78c7587f6993c4a58c40c6be2a95acc29a 100644
--- a/paddle/phi/kernels/gpu/size_kernel.cu
+++ b/paddle/phi/kernels/gpu/size_kernel.cu
@@ -22,6 +22,7 @@ PD_REGISTER_KERNEL(size,
                    GPU,
                    ALL_LAYOUT,
                    phi::SizeKernel,
+                   int16_t,
                    int,
                    int64_t,
                    phi::dtype::float16,
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index fb9c22edc65edfac4036443094e709f7cec6f9ac..62f034c7b41498123499d2d0f73af4b914ffcae4 100755
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -1795,7 +1795,7 @@ def cross_entropy(input,
             # 2. else
             #    numerator: loss's weighted sum
             #    denominator: cal the sum of weight where the sample's class_index!=ignore_index
-            if ignore_index != -100:
+            if ignore_index >= 0:
                 out_sum = _C_ops.reduce_sum(out, 'reduce_all', True)
                 # for each label[i],set 1 or 0, according to ignore_index
                 # mask[i]=0, if label[i]==ignore_index