From 03ef0bdcd15756991e85f5c281d8c5924b398745 Mon Sep 17 00:00:00 2001
From: Siming Dai <908660116@qq.com>
Date: Tue, 23 Aug 2022 11:41:26 +0800
Subject: [PATCH] [Geometric] Fix cuda configuration error for message_passing
 api (#45315)

---
 .../gpu/graph_sample_neighbors_kernel.cu      |  2 +-
 .../kernels/gpu/graph_send_ue_recv_funcs.h    | 38 ++++++++++++++++++-
 .../gpu/graph_send_ue_recv_grad_kernel.cu     |  8 ++--
 .../kernels/gpu/graph_send_ue_recv_kernel.cu  |  3 +-
 .../kernels/gpu/graph_send_uv_grad_kernel.cu  |  6 +--
 .../phi/kernels/gpu/graph_send_uv_kernel.cu   |  2 +-
 6 files changed, 48 insertions(+), 11 deletions(-)
diff --git a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu
index 8457936eb65..6632d3f8b2e 100644
--- a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu
@@ -54,7 +54,7 @@ struct MaxFunctor {
     if (x > cap) {
       return cap;
     }
-    return x;
+    return x >= 0 ? x : 0;
   }
 };
 
diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h
index 5ae9393dba0..1bc841a6d8b 100644
--- a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h
+++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h
@@ -1,5 +1,6 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
+// Copyright 2022 The DGL team for some useful functions.
+
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -23,6 +24,10 @@
 
 namespace phi {
 
+#define CUDA_MAX_NUM_BLOCKS_X 0x7FFFFFFF
+#define CUDA_MAX_NUM_BLOCKS_Y 0xFFFF
+#define CUDA_MAX_NUM_BLOCKS_Z 0xFFFF
+
 inline void CopyBCastOff(const BroadCastInfo& bcast_info,
                          thrust::device_vector<int64_t>& l_bcastoff,
                          thrust::device_vector<int64_t>& r_bcastoff) {
@@ -63,6 +68,37 @@ inline int FindNumThreads(int dim, int max_num_threads) {
   return res;
 }
 
+inline int FindNumBlocks(char axis, int nblocks, int max_num_blocks = -1) {
+  int default_max_num_blocks = -1;
+  switch (axis) {
+    case 'x':
+      default_max_num_blocks = CUDA_MAX_NUM_BLOCKS_X;
+      break;
+    case 'y':
+      default_max_num_blocks = CUDA_MAX_NUM_BLOCKS_Y;
+      break;
+    case 'z':
+      default_max_num_blocks = CUDA_MAX_NUM_BLOCKS_Z;
+      break;
+    default:
+      PADDLE_THROW(
+          phi::errors::InvalidArgument("%c axis is not recognized", axis));
+  }
+  if (max_num_blocks == -1) {
+    max_num_blocks = default_max_num_blocks;
+  }
+  PADDLE_ENFORCE_GT(
+      max_num_blocks,
+      0,
+      phi::errors::InvalidArgument("max_num_blocks should be larger than 0, "
+                                   "but received %d",
+                                   max_num_blocks));
+  if (nblocks < max_num_blocks) {
+    return nblocks;
+  }
+  return max_num_blocks;
+}
+
 template <typename T>
 struct GraphSendUERecvSumCUDAFunctor {
   DEVICE inline void operator()(T* output, T val) {
diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu
index c5d5fb7196f..a1d522cc3d4 100644
--- a/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu
@@ -52,7 +52,7 @@ void CalculateXEGradForMinMax(const Context& ctx,
   const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock());
   const int nty = ctx.GetMaxThreadsPerBlock() / ntx;
   const int nbx = (out_len + ntx - 1) / ntx;
-  const int nby = (index_size + nty - 1) / nty;
+  const int nby = FindNumBlocks('y', (index_size + nty - 1) / nty);
   const dim3 grid(nbx, nby);
   const dim3 block(ntx, nty);
 
@@ -183,7 +183,7 @@ void CalculateXGrad(const Context& ctx,
       const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock());
       const int nty = ctx.GetMaxThreadsPerBlock() / ntx;
       const int nbx = (out_len + ntx - 1) / ntx;
-      const int nby = (index_size + nty - 1) / nty;
+      const int nby = FindNumBlocks('y', (index_size + nty - 1) / nty);
       const dim3 grid_(nbx, nby);
       const dim3 block_(ntx, nty);
       funcs::MultiplyFunctor<T> mul_functor;
@@ -306,7 +306,7 @@ void CalculateXGrad(const Context& ctx,
       const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock());
       const int nty = ctx.GetMaxThreadsPerBlock() / ntx;
       const int nbx = (out_len + ntx - 1) / ntx;
-      const int nby = (index_size + nty - 1) / nty;
+      const int nby = FindNumBlocks('y', (index_size + nty - 1) / nty);
       const dim3 grid_(nbx, nby);
       const dim3 block_(ntx, nty);
       if (!reduce) {
@@ -392,7 +392,7 @@ void CalculateEGrad(const Context& ctx,
   const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock());
   const int nty = ctx.GetMaxThreadsPerBlock() / ntx;
   const int nbx = (out_len + ntx - 1) / ntx;
-  const int nby = (index_size + nty - 1) / nty;
+  const int nby = FindNumBlocks('y', (index_size + nty - 1) / nty);
   const dim3 grid(nbx, nby);
   const dim3 block(ntx, nty);
   if (reduce_op == "SUM") {
diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu
index 7351c562dff..8a5897316ca 100644
--- a/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu
@@ -81,6 +81,7 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx,
   if (index_size == 0) return;
 
   const auto& bcast_info = phi::CalcBCastInfo(x.dims(), e.dims());
+
   const T* x_data = x.data<T>();
   const T* e_data = e.data<T>();
   const IndexT* s_index = src_index.data<IndexT>();
@@ -95,7 +96,7 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx,
   const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock());
   const int nty = ctx.GetMaxThreadsPerBlock() / ntx;
   const int nbx = (out_len + ntx - 1) / ntx;
-  const int nby = (index_size + nty - 1) / nty;
+  const int nby = FindNumBlocks('y', (index_size + nty - 1) / nty);
   const dim3 grid(nbx, nby);
   const dim3 block(ntx, nty);
   int64_t input_size = x.dims()[0];
diff --git a/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu
index 5b8d7b28dcc..d845e9cc437 100644
--- a/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu
@@ -73,7 +73,7 @@ void CalculateGrad(const Context& ctx,
       const int ntx = FindNumThreads(slice_size, ctx.GetMaxThreadsPerBlock());
       const int nty = ctx.GetMaxThreadsPerBlock() / ntx;
       const int nbx = (slice_size + ntx - 1) / ntx;
-      const int nby = (index_size + nty - 1) / nty;
+      const int nby = FindNumBlocks('y', (index_size + nty - 1) / nty);
       const dim3 grid_tmp(nbx, nby);
       const dim3 block_tmp(ntx, nty);
       GraphSendUVGradCUDAKernel<T, IndexT>
@@ -93,7 +93,7 @@ void CalculateGrad(const Context& ctx,
           FindNumThreads(bcast_info.out_len, ctx.GetMaxThreadsPerBlock());
       const int nty = ctx.GetMaxThreadsPerBlock() / ntx;
       const int nbx = (bcast_info.out_len + ntx - 1) / ntx;
-      const int nby = (index_size + nty - 1) / nty;
+      const int nby = FindNumBlocks('y', (index_size + nty - 1) / nty);
       const dim3 grid_tmp(nbx, nby);
       const dim3 block_tmp(ntx, nty);
       GraphSendUVGradCUDAKernel<T, IndexT>
@@ -133,7 +133,7 @@ void CalculateGrad(const Context& ctx,
     const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock());
     const int nty = ctx.GetMaxThreadsPerBlock() / ntx;
     const int nbx = (out_len + ntx - 1) / ntx;
-    const int nby = (index_size + nty - 1) / nty;
+    const int nby = FindNumBlocks('y', (index_size + nty - 1) / nty);
     const dim3 grid_(nbx, nby);
     const dim3 block_(ntx, nty);
     funcs::MultiplyFunctor<T> mul_functor;
diff --git a/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu
index f1e4581773f..32b8b014d0c 100644
--- a/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu
@@ -101,7 +101,7 @@ void GraphSendUVOpCUDAKernelLaunchHelper(const Context& ctx,
   const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock());
   const int nty = ctx.GetMaxThreadsPerBlock() / ntx;
   const int nbx = (out_len + ntx - 1) / ntx;
-  const int nby = (index_size + nty - 1) / nty;
+  const int nby = FindNumBlocks('y', (index_size + nty - 1) / nty);
   const dim3 grid(nbx, nby);
   const dim3 block(ntx, nty);
   if (message_op == "ADD") {
-- 
GitLab