Unverified commit 006bd959, authored by FormlessUnit, committed by GitHub

Fix llm int8 build error (#55338)

* add macro to avoid llm.int8 build error (a sketch of the guard pattern follows below)

* fix ci

---------
Co-authored-by: wufeisheng <wfs1997@163.com>
Parent: d65209b6
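The fix compiles the fp16-dependent kernel bodies only when the target architecture supports the required half-precision intrinsics, so device passes for older architectures emit empty kernel bodies instead of failing. Below is a minimal, self-contained sketch of that pattern, not code from this diff: the definition of `CUDA_ARCH_FP16_SUPPORTED` is an assumption (Paddle provides it in its fp16 headers), and `HalfAddKernel` is a hypothetical kernel used only to illustrate why the guard wraps the body rather than the whole function.

```cpp
// Sketch of the __CUDA_ARCH__ guard pattern used in this commit.
// CUDA_ARCH_FP16_SUPPORTED's definition below is an assumption for this
// sketch; the kernel and launch code are illustrative only.
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cstdio>

// Assumed definition: enable fp16 math only on compute capability >= 6.0.
#ifndef CUDA_ARCH_FP16_SUPPORTED
#define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) (CUDA_ARCH >= 600)
#endif

// The guard wraps the kernel *body*, not the declaration: during the host
// compilation pass __CUDA_ARCH__ is undefined (treated as 0 by #if), yet the
// symbol must still exist for the <<<...>>> launch to compile and link.
// Device passes for unsupported architectures get an empty body instead of
// errors on fp16 intrinsics such as __hadd.
__global__ void HalfAddKernel(const __half* x,
                              const __half* y,
                              __half* out,
                              int n) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = __hadd(x[i], y[i]);  // fp16 intrinsic, guarded by the macro
  }
#endif
}

int main() {
  const int n = 8;
  __half *x, *y, *out;
  cudaMallocManaged(&x, n * sizeof(__half));
  cudaMallocManaged(&y, n * sizeof(__half));
  cudaMallocManaged(&out, n * sizeof(__half));
  for (int i = 0; i < n; ++i) {
    x[i] = __float2half(1.5f);
    y[i] = __float2half(2.5f);
  }
  HalfAddKernel<<<1, n>>>(x, y, out, n);
  cudaDeviceSynchronize();
  printf("out[0] = %f\n", __half2float(out[0]));  // expected 4.0 on sm_60+
  cudaFree(x);
  cudaFree(y);
  cudaFree(out);
  return 0;
}
```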
@@ -225,6 +225,7 @@ __global__ void ReduceAbsMaxKernel(const T* x,
                                    const int32_t cols,
                                    float* row_ranges,
                                    int32_t* outlier_idx) {
+#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
   using InVec = phi::AlignedVector<T, VecSize>;
   using ComputeVec = phi::AlignedVector<ComputeType, VecSize>;
@@ -263,6 +264,7 @@ __global__ void ReduceAbsMaxKernel(const T* x,
       row_ranges[row_idx] = tmp_max_val;
     }
   }
+#endif
 }
 
 template <typename T, int VecSize>
@@ -300,21 +302,6 @@ __global__ void QuantActKernel(const T* x,
   }
 }
 
-template <typename T, int VecSize>
-__global__ void Fill(T* input, T value, int64_t num) {
-  phi::AlignedVector<T, VecSize> in_vec;
-  int stride = blockDim.x * gridDim.x * VecSize;
-  int base_idx = (blockIdx.x * blockDim.x + threadIdx.x) * VecSize;
-  for (int idx = base_idx; idx < num; idx += stride) {
-#pragma unroll
-    for (int j = 0; j < VecSize; ++j) {
-      in_vec[j] = value;
-    }
-    phi::Store(in_vec, input + idx);
-  }
-}
-
 template <typename T>
 __global__ void SplitKernel(const T* x,
                             const int8_t* weight,
@@ -433,6 +420,7 @@ __global__ void DequantMergeKernel(const int32_t* x,
                                    T* y,
                                    int m,
                                    int n) {
+#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
  using FpVec = phi::AlignedVector<T, VecSize>;
  using IntVec = phi::AlignedVector<int32_t, VecSize>;
@@ -455,18 +443,7 @@ __global__ void DequantMergeKernel(const int32_t* x,
     phi::Store(out_vec, y + linear_idx);
   }
 }
+#endif
 }
 
-template <typename T>
-void LaunchFillKernel(T* input,
-                      T value,
-                      int64_t num,
-                      backends::gpu::GpuLaunchConfig* gpu_config,
-                      gpuStream_t stream) {
-  constexpr int VecSize = 16 / sizeof(T);
-  Fill<T, VecSize>
-      <<<gpu_config->block_per_grid, gpu_config->thread_per_block, 0, stream>>>(
-          input, value, num);
-}
-
 template <typename T>
...