From 6f7aca9e51ac1b9b545a58824091e3c2076da18f Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Fri, 10 Sep 2021 10:38:40 +0800 Subject: [PATCH] Fix scatter and gather bug (#35595) * fix scatter gather bug: * fix windows ci --- paddle/fluid/operators/gather.cu.h | 16 +++--- paddle/fluid/operators/scatter.cu.h | 52 +++++++++---------- paddle/fluid/operators/scatter.h | 4 +- .../fluid/tests/unittests/test_gather_op.py | 12 +++++ .../fluid/tests/unittests/test_scatter_op.py | 40 ++++++++++++++ 5 files changed, 86 insertions(+), 38 deletions(-) diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h index 9e6dbf125e..05af4ff150 100644 --- a/paddle/fluid/operators/gather.cu.h +++ b/paddle/fluid/operators/gather.cu.h @@ -36,7 +36,7 @@ __global__ void GatherCUDAKernel(const T* params, const IndexT* indices, int64_t indices_i = i / slice_size; int64_t slice_i = i - indices_i * slice_size; // offset inside the slice IndexT gather_i = indices[indices_i]; - IndexT params_i = gather_i * slice_size + slice_i; + int64_t params_i = gather_i * slice_size + slice_i; *(output + i) = *(params + params_i); } } @@ -49,7 +49,7 @@ __global__ void GatherNdCUDAKernel(const T* input, const int64_t* input_dims, CUDA_KERNEL_LOOP_TYPE(i, remain_size * slice_size, int64_t) { int64_t indices_i = i / slice_size; int64_t slice_i = i - indices_i * slice_size; // offset inside the slice - IndexT gather_i = 0; + int64_t gather_i = 0; int64_t temp = slice_size; for (int64_t j = end_size - 1; j >= 0; --j) { auto index_value = indices[indices_i * end_size + j]; @@ -63,7 +63,7 @@ __global__ void GatherNdCUDAKernel(const T* input, const int64_t* input_dims, gather_i += (index_value * temp); temp *= input_dims[j]; } - IndexT input_i = gather_i + slice_i; + int64_t input_i = gather_i + slice_i; *(output + i) = *(input + input_i); } } @@ -78,13 +78,7 @@ __global__ void GatherNdCUDAKernel(const T* input, const int64_t* input_dims, template void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { - // check index of shape 1-D - if (index.dims().size() == 1) { - PADDLE_ENFORCE_GT(index.dims()[0], 0, - platform::errors::InvalidArgument( - "The index of gather_op should not be empty" - "when the index's rank is 1.")); - } else if (index.dims().size() == 2) { + if (index.dims().size() == 2) { PADDLE_ENFORCE_EQ(index.dims()[1], 1, platform::errors::InvalidArgument( "If the index's rank of gather_op is 2," @@ -93,6 +87,7 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, // index size int64_t index_size = index.dims()[0]; + if (index_size == 0) return; auto src_dims = src.dims(); framework::DDim output_dims(src_dims); @@ -248,6 +243,7 @@ void GatherV2CUDAFunction(const Tensor* input, const Tensor* index, out->Resize(out_dim); auto* out_data = out->mutable_data(place); int64_t out_size = out->numel(); + if (out_size == 0) return; platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), out_size); diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h index 61e95c2b50..e3791351ce 100644 --- a/paddle/fluid/operators/scatter.cu.h +++ b/paddle/fluid/operators/scatter.cu.h @@ -29,9 +29,9 @@ using Tensor = framework::Tensor; template __global__ void ScatterInitCUDAKernel(const IndexT* indices, T* output, size_t index_size, size_t slice_size) { - CUDA_KERNEL_LOOP(i, index_size * slice_size) { - int indices_i = i / slice_size; - int slice_i = i - indices_i * slice_size; // offset inside the slice + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { + int64_t indices_i = i / slice_size; + int64_t slice_i = i - indices_i * slice_size; // offset inside the slice IndexT scatter_i = indices[indices_i]; PADDLE_ENFORCE(scatter_i >= 0, @@ -41,7 +41,7 @@ __global__ void ScatterInitCUDAKernel(const IndexT* indices, T* output, "be greater than or equal to 0, but received [%d]", scatter_i); - IndexT out_i = scatter_i * slice_size + slice_i; + int64_t out_i = scatter_i * slice_size + slice_i; *(output + out_i) = static_cast(0); } } @@ -50,9 +50,9 @@ template __global__ void ScatterCUDAKernel(const T* params, const IndexT* indices, T* output, size_t index_size, size_t slice_size, bool overwrite) { - CUDA_KERNEL_LOOP(i, index_size * slice_size) { - int indices_i = i / slice_size; - int slice_i = i - indices_i * slice_size; // offset inside the slice + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { + int64_t indices_i = i / slice_size; + int64_t slice_i = i - indices_i * slice_size; // offset inside the slice IndexT scatter_i = indices[indices_i]; PADDLE_ENFORCE(scatter_i >= 0, @@ -62,7 +62,7 @@ __global__ void ScatterCUDAKernel(const T* params, const IndexT* indices, "be greater than or equal to 0, but received [%d]", scatter_i); - IndexT out_i = scatter_i * slice_size + slice_i; + int64_t out_i = scatter_i * slice_size + slice_i; if (overwrite) { *(output + out_i) = *(params + i); } else { @@ -73,13 +73,13 @@ __global__ void ScatterCUDAKernel(const T* params, const IndexT* indices, template __global__ void ScatterNdCUDAKernel(const T* update, const IndexT* indices, - T* output, const int* output_dims, + T* output, const int64_t* output_dims, size_t remain_size, size_t slice_size, size_t end_size) { - CUDA_KERNEL_LOOP(i, remain_size * slice_size) { - int indices_i = i / slice_size; - int slice_i = i - indices_i * slice_size; // offset inside the slice - IndexT gather_i = 0; + CUDA_KERNEL_LOOP_TYPE(i, remain_size * slice_size, int64_t) { + int64_t indices_i = i / slice_size; + int64_t slice_i = i - indices_i * slice_size; // offset inside the slice + int64_t gather_i = 0; int64_t temp = slice_size; for (int64_t j = end_size - 1; j >= 0; --j) { IndexT index_value = indices[indices_i * end_size + j]; @@ -95,7 +95,7 @@ __global__ void ScatterNdCUDAKernel(const T* update, const IndexT* indices, gather_i += (index_value * temp); temp *= output_dims[j]; } - IndexT output_i = gather_i + slice_i; + int64_t output_i = gather_i + slice_i; paddle::platform::CudaAtomicAdd(output + output_i, *(update + i)); } } @@ -128,14 +128,14 @@ void GPUScatterAssign(const framework::ExecutionContext& context, "But received value is [%d]", index.dims().size())); } - int index_size = index.dims()[0]; + int64_t index_size = index.dims()[0]; auto src_dims = src.dims(); framework::DDim output_dims(src_dims); output_dims[0] = index_size; // slice size - int slice_size = 1; + int64_t slice_size = 1; for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; const T* p_src = src.data(); @@ -145,8 +145,8 @@ void GPUScatterAssign(const framework::ExecutionContext& context, // set block and grid num int block = 512; - int n = slice_size * index_size; - int grid = (n + block - 1) / block; + int64_t n = slice_size * index_size; + int64_t grid = (n + block - 1) / block; // if not overwrite mode, init data if (!overwrite) { @@ -167,10 +167,10 @@ void GPUScatterAssign(const framework::ExecutionContext& context, template void GPUScatterGradForX(const platform::DeviceContext& ctx, const Tensor& index, Tensor* output) { - IndexT index_size = index.dims()[0]; + int64_t index_size = index.dims()[0]; auto dst_dims = output->dims(); // slice size - IndexT slice_size = 1; + int64_t slice_size = 1; for (int i = 1; i < dst_dims.size(); ++i) slice_size *= dst_dims[i]; const IndexT* p_index = index.data(); T* p_output = output->data(); @@ -224,20 +224,20 @@ void GPUScatterNdAdd(const framework::ExecutionContext& context, const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); auto cplace = platform::CPUPlace(); - std::vector v_output_dims(output_dims_size); + std::vector v_output_dims(output_dims_size); for (int i = 0; i < output_dims_size; ++i) { - v_output_dims[i] = static_cast(output_dims[i]); + v_output_dims[i] = output_dims[i]; } auto& dev_ctx = context.cuda_device_context(); - int bytes = output_dims_size * sizeof(int); + int64_t bytes = output_dims_size * sizeof(int64_t); auto output_dims_ptr = memory::Alloc(dev_ctx, bytes); - int* g_output_dims = reinterpret_cast(output_dims_ptr->ptr()); + int64_t* g_output_dims = reinterpret_cast(output_dims_ptr->ptr()); memory::Copy(gplace, g_output_dims, cplace, v_output_dims.data(), bytes, ctx.stream()); int block = 512; - int n = slice_size * remain_numel; - int grid = (n + block - 1) / block; + int64_t n = slice_size * remain_numel; + int64_t grid = (n + block - 1) / block; ScatterNdCUDAKernel<<< grid, block, 0, diff --git a/paddle/fluid/operators/scatter.h b/paddle/fluid/operators/scatter.h index 1fb943f1ad..802d9ae76d 100644 --- a/paddle/fluid/operators/scatter.h +++ b/paddle/fluid/operators/scatter.h @@ -112,7 +112,7 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, const size_t slice_bytes = slice_size * sizeof(T); - for (int i = 0; i < index_size; ++i) { + for (int64_t i = 0; i < index_size; ++i) { IndexT index_ = p_index[i]; PADDLE_ENFORCE_GE(index_, 0, @@ -175,7 +175,7 @@ void ScatterAssignAdd(const framework::ExecutionContext& ctx, const Tensor& src, } // if not in overwrite mode, need to init output data - for (int i = 0; i < index_size; ++i) { + for (int64_t i = 0; i < index_size; ++i) { const IndexT& index_val = p_index[i]; PADDLE_ENFORCE_GE(index_val, 0, diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py index b54c60933e..83b39a62f1 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_op.py @@ -248,6 +248,17 @@ class API_TestDygraphGather(unittest.TestCase): self.assertTrue(np.allclose(output_np, expected_output)) paddle.enable_static() + def test_zero_index(self): + paddle.disable_static() + x = paddle.to_tensor([[1, 2], [3, 4]]) + index = paddle.to_tensor(np.array([]).astype('int64')) + for axis in range(len(x.shape)): + out = paddle.gather(x, index, axis) + expected_shape = list(x.shape) + expected_shape[axis] = 0 + self.assertEqual(list(out.shape), expected_shape) + paddle.enable_static() + def test_large_data(self): if not paddle.is_compiled_with_cuda(): return @@ -340,4 +351,5 @@ class TestCheckOutType(unittest.TestCase): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_scatter_op.py b/python/paddle/fluid/tests/unittests/test_scatter_op.py index f9a34b3f36..e58b2279e5 100644 --- a/python/paddle/fluid/tests/unittests/test_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/test_scatter_op.py @@ -16,10 +16,12 @@ from __future__ import print_function import unittest import numpy as np +import os import paddle import paddle.fluid as fluid from op_test import OpTest import paddle.fluid.core as core +from paddle.fluid.dygraph.base import switch_to_static_graph class TestScatterOp(OpTest): @@ -228,6 +230,44 @@ class TestScatterAPI(unittest.TestCase): self.assertEqual((output1.numpy() == \ np.array([[3., 3.],[6., 6.],[1., 1.]])).all(), True) + def test_large_data(self): + if os.name == "nt" or not paddle.is_compiled_with_cuda(): + return + + x = np.random.rand(183826, 256).astype("float32") + index = np.ones(10759233, dtype="int64") + updates = np.ones(shape=[10759233, 256], dtype="float32") + + def test_dygraph(): + with fluid.dygraph.guard(): + gpu_out = paddle.scatter( + paddle.to_tensor(x), + paddle.to_tensor(index), paddle.to_tensor(updates)) + return gpu_out.numpy() + + @switch_to_static_graph + def test_static_graph(): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + x_t = paddle.static.data(name="x", dtype=x.dtype, shape=x.shape) + index_t = paddle.static.data( + name="index", dtype=index.dtype, shape=index.shape) + updates_t = paddle.static.data( + name="updates", dtype=updates.dtype, shape=updates.shape) + out_t = paddle.scatter(x_t, index_t, updates_t) + feed = { + x_t.name: x, + index_t.name: index, + updates_t.name: updates + } + fetch = [out_t] + + gpu_exe = paddle.static.Executor(paddle.CUDAPlace(0)) + gpu_value = gpu_exe.run(feed=feed, fetch_list=fetch)[0] + return gpu_value + + self.assertTrue(np.array_equal(test_dygraph(), test_static_graph())) + class TestScatterInplaceAPI(TestScatterAPI): def executed_api(self): -- GitLab