Unverified commit f4290a92, authored by FormlessUnit, committed by GitHub

Linear compress (#55128)

* rename weight_only/llm.int8
Parent ab46b14c
......@@ -1367,14 +1367,14 @@
data_transform :
skip_transform : out_size, size_tensor, scale_tensor
- op : llm_int8_mat_mul
- op : llm_int8_matmul
args : (Tensor x, Tensor weight, Tensor weight_scale, float threshold=6.0)
output : Tensor(out)
infer_meta :
func : LLMInt8MatMulInferMeta
func : LLMInt8MatmulInferMeta
param : [x, weight]
kernel :
func : llm_int8_mat_mul
func : llm_int8_matmul
data_type : x
- op : log
......@@ -2602,14 +2602,13 @@
intermediate: warprnntgrad
backward : warprnnt_grad
- op : weight_only_mat_mul
- op : weight_only_matmul
args : (Tensor x, Tensor weight, Tensor weight_scale)
output : Tensor(out)
infer_meta :
func : WeightOnlyMatMulInferMeta
param : [x, weight]
func : WeightOnlyMatmulInferMeta
kernel :
func : weight_only_mat_mul
func : weight_only_matmul
data_type : x
- op : weighted_sample_neighbors
......
......@@ -3572,7 +3572,7 @@ void WeightedSampleNeighborsInferMeta(const MetaTensor& row,
out_count->set_dtype(DataType::INT32);
}
void LLMInt8MatMulInferMeta(const MetaTensor& x,
void LLMInt8MatmulInferMeta(const MetaTensor& x,
const MetaTensor& weight,
MetaTensor* out) {
auto x_dims = x.dims();
......@@ -3595,25 +3595,31 @@ void LLMInt8MatMulInferMeta(const MetaTensor& x,
out->set_dtype(x.dtype());
}
void WeightOnlyMatMulInferMeta(const MetaTensor& x,
void WeightOnlyMatmulInferMeta(const MetaTensor& x,
const MetaTensor& weight,
const MetaTensor& weight_scale,
MetaTensor* out) {
auto x_dims = x.dims();
auto w_dims = weight.dims();
auto n = weight_scale.dims()[0];
PADDLE_ENFORCE_EQ(
w_dims.size(),
2UL,
errors::InvalidArgument("The input(weight) must be a 2D Tensor."));
PADDLE_ENFORCE_EQ(
weight_scale.dims().size(),
1UL,
errors::InvalidArgument("The input(weight_scale) must be a 1D Tensor."));
PADDLE_ENFORCE_EQ(
x_dims[x_dims.size() - 1],
w_dims[0],
w_dims[1],
errors::InvalidArgument(
"Input(X) dim[-1] and Input(Weight) dim[0] should be euqal."
"But received Input(X) dim[-1](%s) != Input(Weight) dim[0](%s)",
"Input(X) dim[-1] and Input(Weight) dim[1] should be euqal."
"But received Input(X) dim[-1](%s) != Input(Weight) dim[1](%s)",
x_dims[x_dims.size() - 1],
w_dims[0]));
w_dims[1]));
auto out_dims = x_dims;
out_dims[out_dims.size() - 1] = w_dims[1];
out_dims[out_dims.size() - 1] = n;
out->set_dims(out_dims);
out->set_dtype(x.dtype());
}
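For intuition, here is a minimal standalone sketch of the shape rule the renamed WeightOnlyMatmulInferMeta enforces: weight must be 2-D, weight_scale 1-D, x's last dimension must equal weight's dim[1], and the output keeps x's shape except that the last dimension becomes weight_scale.dims()[0]. The helper name and sample shapes below are illustrative, not taken from the patch.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Mirrors the checks above: weight is 2-D, weight_scale is 1-D, x's last dim
// equals weight.dims()[1], and out keeps x's shape except the last dim
// becomes n = weight_scale.dims()[0].
std::vector<int64_t> InferWeightOnlyMatmulShape(
    const std::vector<int64_t>& x_dims,
    const std::vector<int64_t>& w_dims,
    const std::vector<int64_t>& scale_dims) {
  assert(w_dims.size() == 2);
  assert(scale_dims.size() == 1);
  assert(x_dims.back() == w_dims[1]);
  std::vector<int64_t> out_dims = x_dims;
  out_dims.back() = scale_dims[0];
  return out_dims;
}

int main() {
  // Illustrative int4 case: 64 output channels packed two per byte -> weight [32, 128].
  auto out = InferWeightOnlyMatmulShape({2, 16, 128}, {32, 128}, {64});
  assert((out == std::vector<int64_t>{2, 16, 64}));
  return 0;
}
```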
......
......@@ -690,12 +690,13 @@ void FusedMultiHeadAttentionVariableInferMeta(const MetaTensor& query,
bool causal,
MetaTensor* out);
void LLMInt8MatMulInferMeta(const MetaTensor& x,
void LLMInt8MatmulInferMeta(const MetaTensor& x,
const MetaTensor& weight,
MetaTensor* out);
void WeightOnlyMatMulInferMeta(const MetaTensor& x,
void WeightOnlyMatmulInferMeta(const MetaTensor& x,
const MetaTensor& weight,
const MetaTensor& weight_scale,
MetaTensor* out);
void FusedRopeInferMeta(const MetaTensor& q,
......
......@@ -5086,22 +5086,17 @@ void QuantForCompressInferMeta(const MetaTensor& x,
x_dims[0]));
std::vector<int64_t> dim_scale({x_dims[1]});
std::vector<int64_t> dim_out;
if (layout == "weight_only") {
dim_out = std::vector<int64_t>({x_dims[0], x_dims[1]});
} else if (layout == "llm.int8") {
if (bits == 8) {
dim_out = std::vector<int64_t>({x_dims[1], x_dims[0]});
} else if (bits == 4) {
dim_out = std::vector<int64_t>({x_dims[1] / 2, x_dims[0]});
} else {
phi::errors::InvalidArgument(
"The layout must be weight_only or llm.int8, but got %s", layout);
phi::errors::InvalidArgument("The bit must be 8 or 4, but got %d", bits);
}
out->set_dims(phi::make_ddim(dim_out));
// TODO(lizhenyun) support weight_only int4
if (bits == 8) {
out->set_dtype(DataType::INT8);
} else {
phi::errors::Fatal("The bits only support 8, but got[%d]", bits);
}
out->set_dtype(DataType::INT8);
scale->set_dims(phi::make_ddim(dim_scale));
scale->set_dtype(DataType::FLOAT32);
}
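As a worked example of the new branch on bits above (sample shapes only): a float weight x of shape [x0, x1] = [128, 64] yields an int8 buffer of [x1, x0] = [64, 128] when bits == 8, [x1 / 2, x0] = [32, 128] when bits == 4 (two int4 values per byte), and a float32 scale of shape [x1] = [64]. A minimal sketch of that arithmetic:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Illustrative weight shape [x0, x1]; the dims derived by QuantForCompressInferMeta:
  std::vector<int64_t> x_dims = {128, 64};
  std::vector<int64_t> out_int8 = {x_dims[1], x_dims[0]};      // bits == 8
  std::vector<int64_t> out_int4 = {x_dims[1] / 2, x_dims[0]};  // bits == 4
  std::vector<int64_t> scale = {x_dims[1]};                    // per-channel scale
  assert((out_int8 == std::vector<int64_t>{64, 128}));
  assert((out_int4 == std::vector<int64_t>{32, 128}));
  assert((scale == std::vector<int64_t>{64}));
  return 0;
}
```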
......
......@@ -22,7 +22,7 @@
namespace phi {
template <typename DeviceContext, typename T, typename D>
template <typename DeviceContext, typename T, typename D, int bits>
void quant_compute(const DeviceContext& dev_ctx,
const DenseTensor& x,
DenseTensor* out,
......@@ -59,15 +59,15 @@ void quant_compute(const DeviceContext& dev_ctx,
per_channel_scale(scale_data, x_data, m, n);
per_channel_quant(x_int_data, x_data, scale_data, m, n);
per_channel_quant<T, bits>(x_int_data, x_data, scale_data, m, n);
if (layout == "weight_only") {
permute_B_rows_for_mixed_gemm(
permute_B_rows_for_mixed_gemm<bits>(
int_processed_data, x_int_data, std::vector<size_t>{m, n}, (int64_t)80);
row_major_to_column_major(
subbyte_transpose_impl<bits>(
int_processed_2_data, int_processed_data, std::vector<size_t>{m, n});
interleave_column_major_tensor(
interleave_column_major_tensor<bits>(
out_data, int_processed_2_data, std::vector<size_t>{m, n});
add_bias_and_interleave_int8s_inplace(out_data, num);
add_bias_and_interleave_inplace<bits>(out_data, num);
} else if (layout == "llm.int8") {
std::vector<int> axis = {1, 0};
funcs::Transpose<DeviceContext, int8_t, 2> trans;
......@@ -88,9 +88,16 @@ void QuantForCompressKernel(const Context& dev_ctx,
if (bits == 8) {
dev_ctx.template Alloc<int8_t>(out);
dev_ctx.template Alloc<float>(scale);
quant_compute<Context, T, int8_t>(dev_ctx, x, out, scale, layout);
quant_compute<Context, T, int8_t, 8>(dev_ctx, x, out, scale, layout);
} else if (bits == 4 && layout == "weight_only") {
dev_ctx.template Alloc<int8_t>(out);
dev_ctx.template Alloc<float>(scale);
quant_compute<Context, T, int8_t, 4>(dev_ctx, x, out, scale, layout);
} else {
phi::errors::Unimplemented("The bits only support 8, but got[%d]", bits);
phi::errors::Unimplemented(
"The bits only support 8 or weight_only 4, but got[%s] [%d]",
layout,
bits);
}
// VLOG(0) << "x: " << x.dtype() << x;
// VLOG(0) << "out: " << out->dtype() << *out;
......@@ -102,5 +109,4 @@ PD_REGISTER_KERNEL(quant_for_compress,
CPU,
ALL_LAYOUT,
phi::QuantForCompressKernel,
float,
phi::dtype::float16) {}
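The quant_compute path above calls per_channel_scale and then the templated per_channel_quant; the scale computation itself is not shown in this diff. Assuming the conventional symmetric max-abs scale (max |value| of a column divided by 127), a rough standalone sketch of the per-column round-and-clip step:

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <vector>

int main() {
  // One column of an fp weight; the max-abs/127 scale is an assumption,
  // only the round-and-clip step below is shown in the patch.
  std::vector<float> col = {0.5f, -1.27f, 0.03f};
  float max_abs = 0.f;
  for (float v : col) max_abs = std::max(max_abs, std::fabs(v));
  const float scale = max_abs / 127.f;

  std::vector<int8_t> q;
  for (float v : col) {
    const float scaled = std::round(v / scale);
    q.push_back(
        static_cast<int8_t>(std::max(-127.f, std::min(127.f, scaled))));
  }
  assert(q[1] == -127);  // the largest-magnitude element hits the int8 bound
  assert(std::fabs(q[0] * scale - 0.5f) < scale);  // dequantization is close
  return 0;
}
```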
......@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/llm_int8_mat_mul_kernel.h"
#include "paddle/phi/kernels/llm_int8_matmul_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/core/kernel_registry.h"
#ifndef PADDLE_WITH_HIP
#include "paddle/phi/kernels/impl/llm_int8_mat_mul_kernel_impl.h"
#include "paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h"
#endif
namespace phi {
......@@ -56,7 +56,7 @@ void llm_int8_compute(const Context& dev_ctx,
}
template <typename T, typename Context>
void LLMInt8MatMulKernel(const Context& dev_ctx,
void LLMInt8MatmulKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& weight,
const DenseTensor& weight_scale,
......@@ -68,8 +68,8 @@ void LLMInt8MatMulKernel(const Context& dev_ctx,
}
} // namespace phi
PD_REGISTER_KERNEL(llm_int8_mat_mul,
PD_REGISTER_KERNEL(llm_int8_matmul,
GPU,
ALL_LAYOUT,
phi::LLMInt8MatMulKernel,
phi::LLMInt8MatmulKernel,
phi::dtype::float16) {}
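The body of llm_int8_compute is elided above. As background only: the threshold attribute (default 6.0 in the op definition earlier in this diff) follows the LLM.int8() idea of routing activation columns whose maximum absolute value reaches the threshold through a higher-precision path while the remaining columns use the int8 GEMM. The sketch below illustrates only that column split; it is not the Paddle kernel implementation.

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const float threshold = 6.0f;
  // 2 rows x 4 columns of activations (row-major), illustrative values.
  const std::vector<float> x = {0.5f, 7.2f, -0.3f, 1.1f,
                                0.2f, -6.5f, 0.9f, -2.0f};
  const int rows = 2, cols = 4;

  // Columns whose max |value| reaches the threshold would be treated as
  // outliers and kept in floating point.
  std::vector<int> outlier_cols;
  for (int j = 0; j < cols; ++j) {
    float max_abs = 0.f;
    for (int i = 0; i < rows; ++i)
      max_abs = std::fmax(max_abs, std::fabs(x[i * cols + j]));
    if (max_abs >= threshold) outlier_cols.push_back(j);
  }
  for (int j : outlier_cols) std::printf("outlier column: %d\n", j);  // prints 1
  return 0;
}
```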
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/weight_only_mat_mul_kernel.h"
#include "paddle/phi/kernels/weight_only_matmul_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/datatype_traits.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -23,7 +23,7 @@
namespace phi {
template <typename T, typename Context>
void WeightOnlyMatMulKernel(const Context& dev_ctx,
void WeightOnlyMatmulKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& weight,
const DenseTensor& weight_scale,
......@@ -32,42 +32,86 @@ void WeightOnlyMatMulKernel(const Context& dev_ctx,
dev_ctx.template Alloc<T>(out);
const auto x_dims = x.dims();
const auto w_dims = weight.dims();
int n = weight_scale.dims()[0];
int quant_bit = 0;
if (n % w_dims[0] == 0) {
quant_bit = w_dims[0] * 8 / n;
} else {
errors::InvalidArgument(
"w_dims[0] must be divisible by weight_scale.dims()[0]");
}
int k = w_dims[0];
int n = w_dims[1];
int k = w_dims[1];
int m = x.numel() / k;
auto mixed_gemm_runner =
CutlassFpAIntBGemmRunner<typename PDDataTypeTraits<T>::DataType,
uint8_t>();
int mixgemm_max_size = std::max(n, k);
DenseTensor mixgemm_workspace;
int64_t mixgemm_workspace_size_bytes =
mixed_gemm_runner.getWorkspaceSize(m, mixgemm_max_size, mixgemm_max_size);
switch (quant_bit) {
case 8: {
auto mixed_gemm_runner =
CutlassFpAIntBGemmRunner<typename PDDataTypeTraits<T>::DataType,
uint8_t>();
int mixgemm_max_size = std::max(n, k);
DenseTensor mixgemm_workspace;
int64_t mixgemm_workspace_size_bytes = mixed_gemm_runner.getWorkspaceSize(
m, mixgemm_max_size, mixgemm_max_size);
mixgemm_workspace.Resize({mixgemm_workspace_size_bytes});
dev_ctx.template Alloc<uint8_t>(&mixgemm_workspace);
char* mixgemm_workspace_data =
reinterpret_cast<char*>(mixgemm_workspace.data<uint8_t>());
mixed_gemm_runner.gemm(
reinterpret_cast<const typename PDDataTypeTraits<T>::DataType*>(
x.data<T>()),
reinterpret_cast<const uint8_t*>(weight.data<int8_t>()),
reinterpret_cast<const float*>(weight_scale.data<float>()),
reinterpret_cast<typename PDDataTypeTraits<T>::DataType*>(
out->data<T>()),
m,
n,
k,
mixgemm_workspace_data,
mixgemm_workspace_size_bytes,
dev_ctx.stream());
} break;
case 4: {
auto mixed_gemm_runner =
CutlassFpAIntBGemmRunner<typename PDDataTypeTraits<T>::DataType,
cutlass::uint4b_t>();
int mixgemm_max_size = std::max(n, k);
DenseTensor mixgemm_workspace;
int64_t mixgemm_workspace_size_bytes = mixed_gemm_runner.getWorkspaceSize(
m, mixgemm_max_size, mixgemm_max_size);
mixgemm_workspace.Resize({mixgemm_workspace_size_bytes});
dev_ctx.template Alloc<uint8_t>(&mixgemm_workspace);
char* mixgemm_workspace_data =
reinterpret_cast<char*>(mixgemm_workspace.data<uint8_t>());
mixed_gemm_runner.gemm(
reinterpret_cast<const typename PDDataTypeTraits<T>::DataType*>(
x.data<T>()),
reinterpret_cast<const cutlass::uint4b_t*>(weight.data<int8_t>()),
reinterpret_cast<const float*>(weight_scale.data<float>()),
reinterpret_cast<typename PDDataTypeTraits<T>::DataType*>(
out->data<T>()),
m,
n,
k,
mixgemm_workspace_data,
mixgemm_workspace_size_bytes,
dev_ctx.stream());
} break;
default:
PADDLE_THROW(errors::Unimplemented(
"Quant_bits (%d) is not supported when gemm ", quant_bit));
break;
}
mixgemm_workspace.Resize({mixgemm_workspace_size_bytes});
dev_ctx.template Alloc<uint8_t>(&mixgemm_workspace);
char* mixgemm_workspace_data =
reinterpret_cast<char*>(mixgemm_workspace.data<uint8_t>());
mixed_gemm_runner.gemm(
reinterpret_cast<const typename PDDataTypeTraits<T>::DataType*>(
x.data<T>()),
reinterpret_cast<const uint8_t*>(weight.data<int8_t>()),
reinterpret_cast<const float*>(weight_scale.data<float>()),
reinterpret_cast<typename PDDataTypeTraits<T>::DataType*>(out->data<T>()),
m,
n,
k,
mixgemm_workspace_data,
mixgemm_workspace_size_bytes,
dev_ctx.stream());
#else
LOG(ERROR) << "Please compile with cutlass to EnableUseCutlass()";
#endif
}
} // namespace phi
PD_REGISTER_KERNEL(weight_only_mat_mul,
PD_REGISTER_KERNEL(weight_only_matmul,
GPU,
ALL_LAYOUT,
phi::WeightOnlyMatMulKernel,
phi::WeightOnlyMatmulKernel,
phi::dtype::float16) {}
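A quick worked check of the quant_bit inference at the top of WeightOnlyMatmulKernel (the helper name and sample sizes are illustrative): with 64 output channels, the int8 layout stores weight as [64, k] and the int4 layout as [32, k], so weight.dims()[0] * 8 / weight_scale.dims()[0] evaluates to 8 and 4 respectively.

```cpp
#include <cassert>
#include <cstdint>

// Mirrors the quant_bit inference above:
// n = weight_scale.dims()[0], w0 = weight.dims()[0].
int infer_quant_bit(int64_t w0, int64_t n) {
  assert(n % w0 == 0);  // the kernel reports InvalidArgument otherwise
  return static_cast<int>(w0 * 8 / n);
}

int main() {
  assert(infer_quant_bit(/*w0=*/64, /*n=*/64) == 8);  // int8 weight [64, k]
  assert(infer_quant_bit(/*w0=*/32, /*n=*/64) == 4);  // int4 weight [32, k]
  return 0;
}
```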
......@@ -39,51 +39,109 @@ void per_channel_scale(float* scale, const T* input, size_t m, size_t n) {
}
}
template <typename T, typename D>
void per_channel_quant(
D* output, const T* input, const float* scale, size_t m, size_t n) {
for (size_t i = 0; i < m; i++) {
for (size_t j = 0; j < n; j++) {
output[i * n + j] = static_cast<D>(
round(static_cast<float>(input[i * n + j]) / scale[j]));
template <typename T, int quant_bit = 8>
void per_channel_quant(int8_t* output,
const T* input,
const float* scale,
size_t num_rows,
size_t num_cols) {
size_t bytes_per_out_col = num_cols * quant_bit / 8;
for (size_t ii = 0; ii < num_rows; ++ii) {
int8_t* current_quantized_weight_row = output + ii * bytes_per_out_col;
const T* current_weight_row = input + ii * num_cols;
for (size_t jj = 0; jj < bytes_per_out_col; ++jj) {
if (quant_bit == 8) {
const float col_scale = scale[jj];
const float weight_elt = static_cast<float>(current_weight_row[jj]);
const float scaled_weight = round(weight_elt / col_scale);
const int8_t clipped_weight = static_cast<int8_t>(
std::max(-127.f, std::min(127.f, scaled_weight)));
current_quantized_weight_row[jj] = clipped_weight;
} else if (quant_bit == 4) {
// We will pack two int4 elements per iteration of the inner loop.
int8_t packed_int4s = 0;
for (int packed_idx = 0; packed_idx < 2; ++packed_idx) {
const size_t input_idx = 2 * jj + packed_idx;
if (input_idx < num_cols) {
const float col_scale = scale[input_idx];
const float weight_elt =
static_cast<float>(current_weight_row[input_idx]);
const float scaled_weight = round(weight_elt / col_scale);
int int_weight = static_cast<int>(scaled_weight);
const int8_t clipped_weight = std::max(-7, std::min(7, int_weight));
// Kill the sign extension bits (hence 0x0F mask) then shift to
// upper bits if packing the second int4 and or the bits into the
// final result.
packed_int4s |= ((clipped_weight & 0x0F) << (4 * packed_idx));
}
}
current_quantized_weight_row[jj] = packed_int4s;
} else {
phi::errors::Unimplemented("Unsupported quantization bits: %d",
quant_bit);
}
}
}
}
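To make the int4 branch of per_channel_quant concrete, here is a self-contained round trip of the packing scheme it uses: element 2*jj lands in the low nibble of output byte jj and element 2*jj+1 in the high nibble, each clipped to [-7, 7]. The unpacking side uses the same shift-up-then-arithmetic-shift-down trick that add_bias_and_interleave_inplace relies on for sign extension. The helper names and values are illustrative, not Paddle APIs.

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Pack signed int4 values (clipped to [-7, 7]) two per byte: even index in
// the low nibble, odd index in the high nibble.
std::vector<int8_t> pack_int4(const std::vector<int>& vals) {
  std::vector<int8_t> out((vals.size() + 1) / 2, 0);
  for (size_t i = 0; i < vals.size(); ++i) {
    const int clipped = std::max(-7, std::min(7, vals[i]));
    out[i / 2] |= static_cast<int8_t>((clipped & 0x0F) << (4 * (i % 2)));
  }
  return out;
}

// Unpack: shift the nibble to the top of the byte, then arithmetic-shift back
// down so the sign bit is re-extended.
int unpack_int4(const std::vector<int8_t>& packed, size_t i) {
  const int8_t byte = packed[i / 2];
  const int8_t nibble = (i % 2 == 0) ? static_cast<int8_t>(byte << 4) : byte;
  return nibble >> 4;
}

int main() {
  const std::vector<int> vals = {-7, 3, 0, 7, -1, 5};
  const auto packed = pack_int4(vals);
  for (size_t i = 0; i < vals.size(); ++i) {
    assert(unpack_int4(packed, i) == vals[i]);
  }
  return 0;
}
```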
void row_major_to_column_major(int8_t* col_major_tensor,
const int8_t* row_major_tensor,
const std::vector<size_t>& shape) {
size_t m = shape[0];
size_t n = shape[1];
for (size_t i = 0; i < m * n; i++) {
size_t im = i / n;
size_t in = i % n;
col_major_tensor[in * m + im] = row_major_tensor[im * n + in];
}
}
template <int quant_bit = 8>
void add_bias_and_interleave_inplace(int8_t* tensor_ptr, size_t num_elts) {
const size_t num_bytes = num_elts * quant_bit / 8;
for (size_t ii = 0; ii < num_bytes; ++ii) {
if (quant_bit == 8) {
tensor_ptr[ii] =
static_cast<int8_t>(static_cast<int>(tensor_ptr[ii]) + 128);
} else {
int8_t transformed_packed_int4s = 0;
int8_t transformed_first_elt =
(int8_t(tensor_ptr[ii] << 4) >> 4) +
8; // The double shift here is to ensure sign extension
int8_t transformed_second_elt = (tensor_ptr[ii] >> 4) + 8;
void add_bias_and_interleave_int8s_inplace(int8_t* int8_tensor_ptr,
size_t num_elts) {
int8_t* int8_tensor = reinterpret_cast<int8_t*>(int8_tensor_ptr);
for (size_t ii = 0; ii < num_elts; ++ii) {
int8_tensor[ii] =
static_cast<int8_t>(static_cast<int>(int8_tensor[ii]) + 128);
if (!(transformed_first_elt >= 0 && transformed_first_elt <= 15)) {
phi::errors::InvalidArgument(
"Illegal result for int4 transform (first elt)");
}
if (!(transformed_second_elt >= 0 && transformed_second_elt <= 15)) {
phi::errors::InvalidArgument(
"Illegal result for int4 transform (second elt)");
}
// We don't need to mask in these ops since everything should be in the
// range 0-15
transformed_packed_int4s |= transformed_first_elt;
transformed_packed_int4s |= (transformed_second_elt << 4);
tensor_ptr[ii] = transformed_packed_int4s;
}
}
// Step 2 will transform the layout of a 32-bit register in CUDA in order to
// match the int4 layout. This has no performance benefit and is purely so
// that int4 and int8 have the same layout. Pictorially, this does the
// following: bit 32 0
// [elt_3 elt_2 elt_1 elt_0] (each elt occupies 8 bits)
//
// And it will rearrange the output 32 bit register to be the following:
// bit 32 0
// [elt_3 elt_1 elt_2 elt_0] (each elt occupies 8 bits)
for (size_t base = 0; base < num_elts; base += 4) {
std::swap(int8_tensor[base + 1], int8_tensor[base + 2]);
if (quant_bit == 8) {
for (size_t base = 0; base < num_elts; base += 4) {
std::swap(tensor_ptr[base + 1], tensor_ptr[base + 2]);
}
} else {
const size_t num_registers = num_bytes / 4;
uint32_t* register_ptr = reinterpret_cast<uint32_t*>(tensor_ptr);
for (size_t ii = 0; ii < num_registers; ++ii) {
const uint32_t current_register = register_ptr[ii];
uint32_t transformed_register = 0;
for (int dest_idx = 0; dest_idx < 8; ++dest_idx) {
const int src_idx =
dest_idx < 4 ? 2 * dest_idx : 2 * (dest_idx - 4) + 1;
const int src_shift = 4 * src_idx;
const int dest_shift = 4 * dest_idx;
const uint32_t src_bits = (current_register >> src_shift) & 0xF;
transformed_register |= (src_bits << dest_shift);
}
register_ptr[ii] = transformed_register;
}
}
}
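For the int8 case, add_bias_and_interleave_inplace adds 128 and then swaps elements 1 and 2 inside every group of four; for int4 it remaps nibbles within each 32-bit register so that destination nibble d takes source nibble 2*d (d < 4) or 2*(d-4)+1 (d >= 4). The sketch below reproduces that int4 mapping with an illustrative register: nibbles [7 6 5 4 3 2 1 0] (high to low) come out as [7 5 3 1 6 4 2 0].

```cpp
#include <cassert>
#include <cstdint>

// Reproduces the int4 register interleave above: destination nibble d takes
// source nibble 2*d for d < 4 and 2*(d-4)+1 for d >= 4.
uint32_t interleave_int4_register(uint32_t reg) {
  uint32_t out = 0;
  for (int dest = 0; dest < 8; ++dest) {
    const int src = dest < 4 ? 2 * dest : 2 * (dest - 4) + 1;
    out |= ((reg >> (4 * src)) & 0xFu) << (4 * dest);
  }
  return out;
}

int main() {
  // Nibble i initially holds the value i: 0x76543210 reading high to low.
  assert(interleave_int4_register(0x76543210u) == 0x75316420u);
  return 0;
}
```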
template <int quant_bit>
void permute_B_rows_for_mixed_gemm(int8_t* permuted_quantized_tensor,
const int8_t* quantized_tensor,
const std::vector<size_t>& shape,
......@@ -92,9 +150,8 @@ void permute_B_rows_for_mixed_gemm(int8_t* permuted_quantized_tensor,
const size_t num_rows = shape.size() == 2 ? shape[0] : shape[1];
const size_t num_cols = shape.size() == 2 ? shape[1] : shape[2];
const int BITS_PER_ELT = 8;
const int BITS_PER_ELT = quant_bit;
const int K = 16 / BITS_PER_ELT;
// const int ELTS_PER_BYTE = 8 / BITS_PER_ELT;
const int ELTS_PER_REG = 32 / BITS_PER_ELT;
const uint32_t* input_byte_ptr =
......@@ -102,7 +159,6 @@ void permute_B_rows_for_mixed_gemm(int8_t* permuted_quantized_tensor,
uint32_t* output_byte_ptr =
reinterpret_cast<uint32_t*>(permuted_quantized_tensor);
// int MMA_SHAPE_N = 8;
int B_ROWS_PER_MMA = 8 * K;
const int elts_in_int32 = 32 / BITS_PER_ELT;
......@@ -118,15 +174,134 @@ void permute_B_rows_for_mixed_gemm(int8_t* permuted_quantized_tensor,
const int read_row = base_row + tile_read_row;
const int read_col = write_col;
const int64_t read_offset = int64_t(read_row) * num_vec_cols + read_col;
const int64_t read_offset =
static_cast<int64_t>(read_row) * num_vec_cols + read_col;
const int64_t write_offset =
int64_t(write_row) * num_vec_cols + write_col;
static_cast<int64_t>(write_row) * num_vec_cols + write_col;
output_byte_ptr[write_offset] = input_byte_ptr[read_offset];
}
}
}
}
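The tiling constants in permute_B_rows_for_mixed_gemm scale with the element width; evaluating the expressions shown above gives K = 2, ELTS_PER_REG = 4, B_ROWS_PER_MMA = 16 for 8-bit weights and K = 4, ELTS_PER_REG = 8, B_ROWS_PER_MMA = 32 for 4-bit weights. A trivial check of that arithmetic:

```cpp
#include <cassert>
#include <initializer_list>

int main() {
  for (int bits : {8, 4}) {
    const int K = 16 / bits;             // K = 16 / BITS_PER_ELT, as above
    const int elts_per_reg = 32 / bits;  // elements per 32-bit register
    const int b_rows_per_mma = 8 * K;
    if (bits == 8) {
      assert(K == 2 && elts_per_reg == 4 && b_rows_per_mma == 16);
    } else {
      assert(K == 4 && elts_per_reg == 8 && b_rows_per_mma == 32);
    }
  }
  return 0;
}
```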
template <int quant_bit>
void subbyte_transpose_impl(int8_t* transposed_quantized_tensor,
const int8_t* quantized_tensor,
const std::vector<size_t>& shape) {
const int bits_per_elt = quant_bit;
// FT_CHECK_WITH_INFO(shape.size() == 2 || shape.size() == 3, "Shape must be
// 2-D or 3-D");
// const size_t num_experts = 1;
const size_t num_rows = shape.size() == 2 ? shape[0] : shape[1];
const size_t num_cols = shape.size() == 2 ? shape[1] : shape[2];
const size_t col_bytes = num_cols * bits_per_elt / 8;
const size_t col_bytes_trans = num_rows * bits_per_elt / 8;
// const size_t num_bytes = size_t(num_experts) * num_rows * col_bytes;
const uint8_t* input_byte_ptr =
reinterpret_cast<const uint8_t*>(quantized_tensor);
uint8_t* output_byte_ptr =
reinterpret_cast<uint8_t*>(transposed_quantized_tensor);
static constexpr int ELTS_PER_BYTE = 8 / quant_bit;
static constexpr int M_TILE_L1 = 64;
static constexpr int N_TILE_L1 = M_TILE_L1 / ELTS_PER_BYTE;
uint8_t cache_buf[M_TILE_L1][N_TILE_L1];
static constexpr int VECTOR_WIDTH = std::min(32, N_TILE_L1);
// const int num_m_tiles = (num_rows + M_TILE_L1 - 1) / M_TILE_L1;
// const int num_n_tiles = (col_bytes + N_TILE_L1 - 1) / N_TILE_L1;
for (size_t row_tile_start = 0; row_tile_start < num_rows;
row_tile_start += M_TILE_L1) {
for (size_t col_tile_start_byte = 0; col_tile_start_byte < col_bytes;
col_tile_start_byte += N_TILE_L1) {
const int row_limit = std::min(row_tile_start + M_TILE_L1, num_rows);
const int col_limit =
std::min(col_tile_start_byte + N_TILE_L1, col_bytes);
for (int ii = 0; ii < M_TILE_L1; ++ii) {
const int row = row_tile_start + ii;
for (int jj = 0; jj < N_TILE_L1; jj += VECTOR_WIDTH) {
const int col = col_tile_start_byte + jj;
const size_t logical_src_offset = row * col_bytes + col;
if (row < row_limit && col < col_limit) {
for (int v = 0; v < VECTOR_WIDTH; ++v) {
cache_buf[ii][jj + v] = input_byte_ptr[logical_src_offset + v];
}
}
}
}
if (quant_bit == 8) {
for (int ii = 0; ii < M_TILE_L1; ++ii) {
for (int jj = ii + 1; jj < N_TILE_L1; ++jj) {
std::swap(cache_buf[ii][jj], cache_buf[jj][ii]);
}
}
} else if (quant_bit == 4) {
for (int ii = 0; ii < M_TILE_L1; ++ii) {
// Using M_TILE_L1 here is deliberate since we assume that the cache
// tile is square in the number of elements (not necessarily the
// number of bytes).
for (int jj = ii + 1; jj < M_TILE_L1; ++jj) {
const int ii_byte = ii / ELTS_PER_BYTE;
const int ii_bit_offset = ii % ELTS_PER_BYTE;
const int jj_byte = jj / ELTS_PER_BYTE;
const int jj_bit_offset = jj % ELTS_PER_BYTE;
uint8_t src_elt =
0xF & (cache_buf[ii][jj_byte] >> (4 * jj_bit_offset));
uint8_t tgt_elt =
0xF & (cache_buf[jj][ii_byte] >> (4 * ii_bit_offset));
cache_buf[ii][jj_byte] &= (0xF0 >> (4 * jj_bit_offset));
cache_buf[jj][ii_byte] &= (0xF0 >> (4 * ii_bit_offset));
cache_buf[ii][jj_byte] |= (tgt_elt << (4 * jj_bit_offset));
cache_buf[jj][ii_byte] |= (src_elt << (4 * ii_bit_offset));
}
}
} else {
phi::errors::Unimplemented("Unsupported quantization bits: %d",
quant_bit);
}
const size_t row_tile_start_trans = col_tile_start_byte * ELTS_PER_BYTE;
const size_t col_tile_start_byte_trans = row_tile_start / ELTS_PER_BYTE;
const int row_limit_trans =
std::min(row_tile_start_trans + M_TILE_L1, num_cols);
const int col_limit_trans =
std::min(col_tile_start_byte_trans + N_TILE_L1, col_bytes_trans);
for (int ii = 0; ii < M_TILE_L1; ++ii) {
const int row = row_tile_start_trans + ii;
for (int jj = 0; jj < N_TILE_L1; jj += VECTOR_WIDTH) {
const int col = col_tile_start_byte_trans + jj;
const size_t logical_tgt_offset = row * col_bytes_trans + col;
if (row < row_limit_trans && col < col_limit_trans) {
for (int v = 0; v < VECTOR_WIDTH; ++v) {
output_byte_ptr[logical_tgt_offset + v] = cache_buf[ii][jj + v];
}
}
}
}
}
}
}
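The 4-bit branch of subbyte_transpose_impl swaps logical elements through byte/nibble arithmetic. The helpers below (names and values are illustrative) isolate that addressing, reading or writing logical element idx inside a packed row with the same 0xF / 0xF0 masks and 4 * offset shifts used by the tile transpose above.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Two int4 elements per byte: element idx lives in byte idx / 2 at nibble
// offset idx % 2 (shift by 4 * offset).
uint8_t get_int4(const std::vector<uint8_t>& row, int idx) {
  return 0xF & (row[idx / 2] >> (4 * (idx % 2)));
}

void set_int4(std::vector<uint8_t>& row, int idx, uint8_t val) {
  row[idx / 2] &= 0xF0 >> (4 * (idx % 2));        // clear the target nibble
  row[idx / 2] |= (val & 0xF) << (4 * (idx % 2));  // write the new nibble
}

int main() {
  std::vector<uint8_t> row = {0x21, 0x43};  // logical elements 1, 2, 3, 4
  assert(get_int4(row, 0) == 1 && get_int4(row, 3) == 4);
  // Swap elements 0 and 3 the same way the tile transpose swaps (ii, jj).
  const uint8_t a = get_int4(row, 0), b = get_int4(row, 3);
  set_int4(row, 0, b);
  set_int4(row, 3, a);
  assert(get_int4(row, 0) == 4 && get_int4(row, 3) == 1);
  return 0;
}
```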
template <int quant_bit>
void interleave_column_major_tensor(int8_t* interleaved_quantized_tensor,
const int8_t* quantized_tensor,
const std::vector<size_t>& shape) {
......@@ -134,7 +309,7 @@ void interleave_column_major_tensor(int8_t* interleaved_quantized_tensor,
const size_t num_rows = shape.size() == 2 ? shape[0] : shape[1];
const size_t num_cols = shape.size() == 2 ? shape[1] : shape[2];
const size_t BITS_PER_ELT = 8;
const size_t BITS_PER_ELT = quant_bit;
const size_t elts_in_int32 = 32 / BITS_PER_ELT;
const size_t rows_per_tile = 64;
......@@ -169,6 +344,5 @@ void interleave_column_major_tensor(int8_t* interleaved_quantized_tensor,
}
}
}
} // namespace phi
#endif // PADDLE_PHI_KERNELS_IMPL_QUANT_FOR_COMPRESS_KERNEL_IMPL_H_
......@@ -16,7 +16,7 @@ limitations under the License. */
namespace phi {
template <typename T, typename Context>
void LLMInt8MatMulKernel(const Context& dev_ctx,
void LLMInt8MatmulKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& weight,
const DenseTensor& weight_scale,
......
......@@ -16,7 +16,7 @@ limitations under the License. */
namespace phi {
template <typename T, typename Context>
void WeightOnlyMatMulKernel(const Context& dev_ctx,
void WeightOnlyMatmulKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& weight,
const DenseTensor& weight_scale,
......
......@@ -1892,11 +1892,11 @@ def linear_compress(
):
if in_dynamic_mode():
if algo == "llm.int8":
y = _C_ops.llm_int8_mat_mul(
y = _C_ops.llm_int8_matmul(
x, weight, weight_scale, config['threshold']
)
elif algo == "weight_only":
y = _C_ops.weight_only_mat_mul(x, weight, weight_scale)
y = _C_ops.weight_only_matmul(x, weight, weight_scale)
else:
raise ValueError(
"Unknown algo: '{}'. It can only be 'llm.int8' or 'weight_only'.".format(
......@@ -1915,11 +1915,19 @@ def linear_compress(
if algo == "llm.int8":
type = "llm_int8_matmul"
inputs = {'X': [x], 'Y': [weight], 'weight_scale': [weight_scale]}
inputs = {
'x': [x],
'weight': [weight],
'weight_scale': [weight_scale],
}
attrs = {'algo': algo, 'threshold': config['threshold']}
elif algo == "weight_only":
type = "weight_only_matmul"
inputs = {'X': [x], 'Y': [weight], 'weight_scale': [weight_scale]}
inputs = {
'x': [x],
'weight': [weight],
'weight_scale': [weight_scale],
}
attrs = {}
else:
raise ValueError(
......
......@@ -301,10 +301,13 @@ class LinearCompress(Layer):
weight_attr = paddle.framework.ParamAttr(
initializer=paddle.nn.initializer.Assign(weight_tensor)
)
weight_shape = (
[self.weight.shape[1], self.weight.shape[0]]
if self.bits == 8
else [self.weight.shape[1] / 2, self.weight.shape[0]]
)
self.weight = self.create_parameter(
shape=self.weight.shape
if self.layout == 0
else [self.weight.shape[1], self.weight.shape[0]],
shape=weight_shape,
attr=weight_attr,
dtype="int8",
is_bias=False,
......
......@@ -36,6 +36,7 @@ class LinearTestCase(unittest.TestCase):
self.in_features = 64
self.out_features = 64
self.algo = "weight_only"
self.bits = 8
def setUp(self):
self.config()
......@@ -62,6 +63,7 @@ class LinearTestCase(unittest.TestCase):
self.in_features,
self.out_features,
bias_attr=bias_attr,
bits=8,
algo=self.algo,
config=self.config,
)
......@@ -112,5 +114,15 @@ class LinearTestCase3(LinearTestCase):
self.atol = 1e-1
class LinearTestCase4(LinearTestCase):
def config(self):
super().config()
self.dtype = 'float16'
self.bias = True
self.in_features = 128
self.out_features = 64
self.bits = 4
if __name__ == '__main__':
unittest.main()