Unverified commit 57f6a469, authored by Yiqun Liu, committed by GitHub

Enable matmul + bias fusion in fused_gat_attention. (#50755)

* Enable matmul + bias fusion in fused_gat_attention.

* Add a variable to control whether using fused matmul + bias.
Parent 7c73910e
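
In short: this change adds an optional fused argument to AttnMatMul::ComputeForward and AttnMatMul::ComputeBackward. When the flag is set and the build has CUDA 11.6+, the matmul and the bias add (and, in backward, all three gradients) run in a single fused GEMM-epilogue call (ComputeFusedGemmEpilogueForward/Backward, backed by cuBLASLt) instead of a separate blas GEMM plus an elementwise bias kernel. The fused_gate_attention kernels opt in through a local use_fused_matmul_bias flag. A minimal sketch of the new call pattern, assuming dev_ctx, m, n, k and the input/weight/bias/out tensors are already set up (that setup is not part of this diff):

    // Sketch only: input is [m, k], weight is [k, n], bias is [n], out is [m, n].
    AttnMatMul<T> linear(dev_ctx, /*transA=*/false, /*transB=*/false,
                         m, n, k, /*compute_bias=*/true);
    // fused = true takes the cuBLASLt epilogue path (CUDA >= 11.6);
    // fused = false keeps the original blas GEMM + elementwise bias add.
    linear.ComputeForward(&weight, &input, &bias, /*output=*/&out,
                          /*bias_out=*/&out, /*fused=*/true);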
@@ -14,12 +14,13 @@ limitations under the License. */
 #pragma once
 
-#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
+#include "paddle/fluid/operators/fused/fused_gemm_epilogue_op.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
+#include "paddle/phi/kernels/primitive/kernel_primitives.h"
 
 namespace paddle {
 namespace operators {
@@ -44,13 +45,43 @@ class AttnMatMul {
         input_size_(input_size),
         compute_bias_(compute_bias) {}
 
-  ~AttnMatMul() {}
-
   void ComputeForward(const phi::DenseTensor* weight,
                       const phi::DenseTensor* input,
                       const phi::DenseTensor* bias,
                       phi::DenseTensor* output,
-                      phi::DenseTensor* bias_out) {
+                      phi::DenseTensor* bias_out,
+                      bool fused = false) {
+    VLOG(6) << "input.shape={" << input->dims() << "}, weight.shape={"
+            << weight->dims() << "}, output.shape={" << output->dims()
+            << "}, batch_size=" << bsz_seq_ << ", output_size=" << output_size_
+            << ", input_size=" << input_size_ << ", transA=" << transA_
+            << ", transB=" << transB_ << ", compute_bias=" << compute_bias_
+            << ", fused=" << fused;
+
+#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060
+    if (compute_bias_ && fused) {
+      PADDLE_ENFORCE_EQ(
+          !output || output == bias_out,
+          true,
+          phi::errors::InvalidArgument(
+              "The output (= input * weight) is expected to be nullptr or the "
+              "same as bias_out when fused is true."));
+      ComputeFusedGemmEpilogueForward<T>(dev_ctx_,
+                                         input,
+                                         weight,
+                                         bias,
+                                         bsz_seq_,      // M
+                                         output_size_,  // N
+                                         input_size_,   // K
+                                         transA_,
+                                         transB_,
+                                         "none",
+                                         bias_out,
+                                         nullptr);
+      return;
+    }
+#endif
+
     // Note: for blas.GEMM API in Paddle, it treats all inputs as row-major.
     // here: (transa, transb): nt, input * weight.
     CBLAS_TRANSPOSE transA = transA_ ? CblasTrans : CblasNoTrans;
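
For reference, with transA_ = transB_ = false the fused branch above computes bias_out = input * weight + bias in one call: input is treated as a row-major M x K matrix (M = bsz_seq_, K = input_size_), weight as K x N (N = output_size_), and bias as a length-N vector broadcast over the M rows; "none" means no activation is applied in the epilogue. A naive CPU sketch of the same semantics (illustration only, not the GPU code path):

    // bias_out = input * weight + bias, all row-major; M, N, K as above.
    for (int i = 0; i < M; ++i) {
      for (int j = 0; j < N; ++j) {
        float acc = bias[j];  // the "epilogue": bias added to every output row
        for (int p = 0; p < K; ++p) {
          acc += input[i * K + p] * weight[p * N + j];
        }
        bias_out[i * N + j] = acc;
      }
    }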
@@ -85,7 +116,29 @@ class AttnMatMul {
                        phi::DenseTensor* d_input,
                        phi::DenseTensor* d_weight,
                        phi::DenseTensor* d_bias,
-                       bool use_addto = false) {
+                       bool use_addto = false,
+                       bool fused = false) {
+#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060
+    if (compute_bias_ && fused) {
+      ComputeFusedGemmEpilogueBackward<T>(dev_ctx_,
+                                          d_output,
+                                          input,
+                                          weight,
+                                          nullptr,
+                                          bsz_seq_,      // M
+                                          output_size_,  // N
+                                          input_size_,   // K
+                                          transA_,
+                                          transB_,
+                                          "none",
+                                          d_input,
+                                          d_weight,
+                                          d_bias,
+                                          use_addto);
+      return;
+    }
+#endif
+
     T alpha = static_cast<T>(1.0);
     T beta_dA = use_addto ? static_cast<T>(1.0) : static_cast<T>(0.0);
     T beta_dB = static_cast<T>(0.0);
......
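
The backward path mirrors this: when fused, ComputeFusedGemmEpilogueBackward produces all three gradients of out = input * weight + bias in one shot, namely d_input = d_output * weight^T, d_weight = input^T * d_output, and d_bias = column-wise sum of d_output, with use_addto = true accumulating into d_input instead of overwriting it. A naive reference sketch of that math for the non-transposed, row-major case (illustration only):

    // d_output: M x N, input: M x K, weight: K x N (M, N, K as in the forward pass).
    for (int i = 0; i < M; ++i) {
      for (int p = 0; p < K; ++p) {
        float acc = use_addto ? d_input[i * K + p] : 0.0f;
        for (int j = 0; j < N; ++j) {
          acc += d_output[i * N + j] * weight[p * N + j];  // d_input = d_output * weight^T
        }
        d_input[i * K + p] = acc;
      }
    }
    for (int p = 0; p < K; ++p) {
      for (int j = 0; j < N; ++j) {
        float acc = 0.0f;
        for (int i = 0; i < M; ++i) {
          acc += input[i * K + p] * d_output[i * N + j];  // d_weight = input^T * d_output
        }
        d_weight[p * N + j] = acc;
      }
    }
    for (int j = 0; j < N; ++j) {
      float acc = 0.0f;
      for (int i = 0; i < M; ++i) {
        acc += d_output[i * N + j];  // d_bias = sum of d_output over rows
      }
      d_bias[j] = acc;
    }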
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/operators/fused/fused_softmax_mask.cu.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
 #include "paddle/phi/kernels/funcs/dropout_impl.cu.h"
......
@@ -209,7 +209,8 @@ void ComputeGatingLinearForward(const framework::ExecutionContext &ctx,
                                 const GateAttentionConfig<T> &config,
                                 const phi::DenseTensor *query,
                                 const phi::DenseTensor *fmha_out,
-                                phi::DenseTensor *gate_out) {
+                                phi::DenseTensor *gate_bias_out,
+                                bool use_fused_matmul_bias) {
   auto *gate_weight = ctx.Input<phi::DenseTensor>("GateWeight");
   auto *gate_bias = ctx.Input<phi::DenseTensor>("GateBias");
@@ -220,14 +221,18 @@ void ComputeGatingLinearForward(const framework::ExecutionContext &ctx,
   int m = config.batch_size * config.seq_len_m * config.seq_len_r;
   int n = config.num_heads * config.head_dim;
   int k = config.q_dim;
-  auto gate_attn_compute =
+  auto gate_linear =
       AttnMatMul<T>(ctx.cuda_device_context(), false, false, m, n, k, true);
-  gate_attn_compute.ComputeForward(
-      gate_weight, query, gate_bias, gate_out, gate_out);
+  gate_linear.ComputeForward(gate_weight,
+                             query,
+                             gate_bias,
+                             gate_bias_out,
+                             gate_bias_out,
+                             use_fused_matmul_bias);
 
   // gate_out = sigmoid(gate_out) * fmha_out
-  std::vector<const phi::DenseTensor *> ins = {gate_out, fmha_out};
-  std::vector<phi::DenseTensor *> outs = {gate_out};
+  std::vector<const phi::DenseTensor *> ins = {gate_bias_out, fmha_out};
+  std::vector<phi::DenseTensor *> outs = {gate_bias_out};
   phi::funcs::ElementwiseKernel<T>(
       ctx.cuda_device_context(), ins, &outs, SigmoidMultiplyFunctor<T>());
 }
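
Functionally the gating step is unchanged: gate_bias_out first holds query * gate_weight + gate_bias (now optionally computed by the fused path), and the elementwise kernel then overwrites it in place with sigmoid(gate_bias_out) * fmha_out. A per-element sketch of what SigmoidMultiplyFunctor computes, together with the gradient the backward pass needs (variable names x, m, dy are illustrative, not from the diff):

    // Forward, per element: x = (query * gate_weight + gate_bias) element,
    // m = matching fmha_out element.
    float s = 1.0f / (1.0f + expf(-x));  // sigmoid(x)
    float y = s * m;                     // stored back into gate_bias_out
    // Backward, per element, given dy = upstream gradient of y:
    float dm = dy * s;                   // gradient w.r.t. fmha_out
    float dx = dy * m * s * (1.0f - s);  // gradient w.r.t. x (pre-sigmoid value)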
@@ -239,10 +244,12 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx,
                                  const phi::DenseTensor *fmha_out,
                                  const phi::DenseTensor *gate_out_grad,
                                  phi::DenseTensor *query_grad,
-                                 phi::DenseTensor *fmha_out_grad) {
+                                 phi::DenseTensor *fmha_out_grad,
+                                 bool use_fused_matmul_bias) {
   const auto *gate_weight = ctx.Input<phi::DenseTensor>("GateWeight");
   const auto *gate_bias = ctx.Input<phi::DenseTensor>("GateBias");
 
   auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
   // Re-compute gate_bias_out
   phi::DenseTensor gate_bias_out;
   gate_bias_out.Resize(config.gate_out_dims);
@@ -251,10 +258,14 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx,
   int m = config.batch_size * config.seq_len_m * config.seq_len_r;
   int n = config.num_heads * config.head_dim;
   int k = config.q_dim;
-  auto gate_attn_compute =
+  auto gate_linear =
       AttnMatMul<T>(ctx.cuda_device_context(), false, false, m, n, k, true);
-  gate_attn_compute.ComputeForward(
-      gate_weight, query, gate_bias, &gate_bias_out, &gate_bias_out);
+  gate_linear.ComputeForward(gate_weight,
+                             query,
+                             gate_bias,
+                             &gate_bias_out,
+                             &gate_bias_out,
+                             use_fused_matmul_bias);
 
   // Gradient of sigmoid(gate_bias_out) * fmha_out
   // Compute inplace and save gate_bias_out_grad to gate_bias_out.
@@ -272,19 +283,22 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx,
   dev_ctx.Alloc<T>(gate_weight_grad, gate_weight_grad->numel() * sizeof(T));
   dev_ctx.Alloc<T>(gate_bias_grad, gate_bias_grad->numel() * sizeof(T));
 
-  gate_attn_compute.ComputeBackward(query,
-                                    gate_weight,
-                                    &gate_bias_out,
-                                    query_grad,
-                                    gate_weight_grad,
-                                    gate_bias_grad);
+  gate_linear.ComputeBackward(query,
+                              gate_weight,
+                              &gate_bias_out,
+                              query_grad,
+                              gate_weight_grad,
+                              gate_bias_grad,
+                              false,
+                              use_fused_matmul_bias);
 }
 
 template <typename T>
 void ComputeOutputLinearForward(const framework::ExecutionContext &ctx,
                                 const GateAttentionConfig<T> &config,
                                 const phi::DenseTensor *fmha_or_gate_out,
-                                phi::DenseTensor *out) {
+                                phi::DenseTensor *out,
+                                bool use_fused_matmul_bias) {
   const auto *out_linear_weight =
       ctx.Input<phi::DenseTensor>("OutLinearWeight");
   const auto *out_linear_bias = ctx.Input<phi::DenseTensor>("OutLinearBias");
@@ -293,17 +307,22 @@ void ComputeOutputLinearForward(const framework::ExecutionContext &ctx,
   int m = config.batch_size * config.seq_len_m * config.seq_len_r;
   int n = config.q_dim;
   int k = config.num_heads * config.head_dim;
-  auto out_linear_compute =
+  auto out_linear =
      AttnMatMul<T>(ctx.cuda_device_context(), false, false, m, n, k, true);
-  out_linear_compute.ComputeForward(
-      out_linear_weight, fmha_or_gate_out, out_linear_bias, out, out);
+  out_linear.ComputeForward(out_linear_weight,
+                            fmha_or_gate_out,
+                            out_linear_bias,
+                            out,
+                            out,
+                            use_fused_matmul_bias);
 }
 
 template <typename T>
 void ComputeOutputLinearBackward(const framework::ExecutionContext &ctx,
                                  const GateAttentionGradConfig<T> &config,
                                  const phi::DenseTensor *input,
-                                 phi::DenseTensor *input_grad) {
+                                 phi::DenseTensor *input_grad,
+                                 bool use_fused_matmul_bias) {
   auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
   const auto *out_grad =
       ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
@@ -323,14 +342,16 @@ void ComputeOutputLinearBackward(const framework::ExecutionContext &ctx,
   int m = config.batch_size * config.seq_len_m * config.seq_len_r;
   int n = config.q_dim;
   int k = config.num_heads * config.head_dim;
-  auto out_linear_compute =
+  auto out_linear =
      AttnMatMul<T>(ctx.cuda_device_context(), false, false, m, n, k, true);
-  out_linear_compute.ComputeBackward(input,
-                                     out_linear_weight,
-                                     out_grad,
-                                     input_grad,
-                                     out_linear_weight_grad,
-                                     out_linear_bias_grad);
+  out_linear.ComputeBackward(input,
+                             out_linear_weight,
+                             out_grad,
+                             input_grad,
+                             out_linear_weight_grad,
+                             out_linear_bias_grad,
+                             false,
+                             use_fused_matmul_bias);
 }
 
 template <typename T>
@@ -358,6 +379,7 @@ class FusedGateAttentionOpKernel : public framework::OpKernel<T> {
     const bool merge_qkv = ctx.Attr<bool>("merge_qkv");
     const bool has_gating = ctx.Attr<bool>("has_gating");
+    bool use_fused_matmul_bias = true;
 
     auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
     AllocWithDebugInfo<T>(dev_ctx, "softmax_out", softmax_out);
     AllocWithDebugInfo<T>(dev_ctx, "fmha_out", fmha_out);
@@ -413,12 +435,14 @@ class FusedGateAttentionOpKernel : public framework::OpKernel<T> {
 
     // 3. Gating Linear
     if (has_gating) {
-      ComputeGatingLinearForward<T>(ctx, config, query, fmha_out, gate_out);
+      ComputeGatingLinearForward<T>(
+          ctx, config, query, fmha_out, gate_out, use_fused_matmul_bias);
     }
 
     // 4. Output Linear
     phi::DenseTensor *fmha_or_gate_out = has_gating ? gate_out : fmha_out;
-    ComputeOutputLinearForward<T>(ctx, config, fmha_or_gate_out, out);
+    ComputeOutputLinearForward<T>(
+        ctx, config, fmha_or_gate_out, out, use_fused_matmul_bias);
   }
 };
@@ -454,6 +478,7 @@ class FusedGateAttentionGradKernel : public framework::OpKernel<T> {
     bool has_gating = ctx.Attr<bool>("has_gating");
     bool merge_qkv = ctx.Attr<bool>("merge_qkv");
+    bool use_fused_matmul_bias = true;
 
     auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
     AllocWithDebugInfo<T>(dev_ctx, "query_grad", query_grad);
@@ -468,7 +493,8 @@ class FusedGateAttentionGradKernel : public framework::OpKernel<T> {
       phi::DenseTensor gate_out_grad;
       gate_out_grad.Resize(config.gate_out_dims);
       AllocWithDebugInfo<T>(dev_ctx, "gate_out_grad", &gate_out_grad);
-      ComputeOutputLinearBackward<T>(ctx, config, gate_out, &gate_out_grad);
+      ComputeOutputLinearBackward<T>(
+          ctx, config, gate_out, &gate_out_grad, use_fused_matmul_bias);
 
       // 2. Gradient of Gating Linear
       // Forward: gate_out = Sigmoid(Linear(fmha_out)) * fmha_out
@@ -478,10 +504,12 @@ class FusedGateAttentionGradKernel : public framework::OpKernel<T> {
                                      fmha_out,
                                      &gate_out_grad,
                                      query_grad,
-                                     &fmha_out_grad);
+                                     &fmha_out_grad,
+                                     use_fused_matmul_bias);
     } else {
       // 1. Gradient of Output Linear: out = Linear(fmha_grad)
-      ComputeOutputLinearBackward<T>(ctx, config, fmha_out, &fmha_out_grad);
+      ComputeOutputLinearBackward<T>(
+          ctx, config, fmha_out, &fmha_out_grad, use_fused_matmul_bias);
     }
 
     // 3. Gradient of FMHA
......
@@ -14,7 +14,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/fused/fused_gemm_epilogue_op.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// This file has been adapted from FasterTransformer file:
// https://github.com/NVIDIA/FasterTransformer/blob/v4.0/fastertransformer/cuda/masked_multihead_attention.cu
// We add License in the head.
......
@@ -14,6 +14,12 @@
 #pragma once
 
+#ifdef PADDLE_WITH_CUDA
+#include <cuda_runtime.h>  // NOLINT
+
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/float16.h"
 
 namespace phi {
 namespace backends {
 namespace gpu {
@@ -69,6 +75,22 @@ namespace gpu {
   for (index_type i = __index__; __index__ < (num); \
        __index__ += __stride__, i = __index__)
 
+template <typename T>
+cudaDataType_t ToCudaDataType() {
+  if (std::is_same<T, float>::value) {
+    return CUDA_R_32F;
+  } else if (std::is_same<T, double>::value) {
+    return CUDA_R_64F;
+  } else if (std::is_same<T, phi::dtype::float16>::value) {
+    return CUDA_R_16F;
+#if CUDA_VERSION >= 11000
+  } else if (std::is_same<T, phi::dtype::bfloat16>::value) {
+    return CUDA_R_16BF;
+#endif
+  }
+}
+
 }  // namespace gpu
 }  // namespace backends
 }  // namespace phi
 
 #endif
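
One caveat on the helper added above: as written, ToCudaDataType<T>() falls off the end without returning a value when T is none of the listed types (or when bfloat16 is requested under CUDA < 11.0), which is undefined behavior if such an instantiation is ever reached. A defensive variant would fail loudly instead; a hedged sketch (the error message and the use of PADDLE_THROW here are illustrative, not part of this commit):

    template <typename T>
    cudaDataType_t ToCudaDataType() {
      if (std::is_same<T, float>::value) {
        return CUDA_R_32F;
      } else if (std::is_same<T, double>::value) {
        return CUDA_R_64F;
      } else if (std::is_same<T, phi::dtype::float16>::value) {
        return CUDA_R_16F;
    #if CUDA_VERSION >= 11000
      } else if (std::is_same<T, phi::dtype::bfloat16>::value) {
        return CUDA_R_16BF;
    #endif
      }
      // Unsupported type: throw instead of returning an indeterminate value.
      PADDLE_THROW(phi::errors::InvalidArgument(
          "Unsupported data type when converting to cudaDataType_t."));
    }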