Unverified commit f74ebd8a, authored by H Haohongxiang, committed by GitHub

optimize performance of offload in dygraph sharding stage2 (#38064)

* update

* fix bugs

* modify code style

* fix bugs of _get_global_group
Parent 61ef56a1
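For orientation, here is a minimal usage sketch distilled from the updated unit test at the bottom of this diff: after this change the sharding wrappers build their communication group from _get_global_group() themselves, so neither ShardingScaler nor the stage-2 wrappers take a group argument any more. The toy model, optimizer and import paths below are illustrative assumptions, not part of the commit.

    # Minimal sketch of the post-commit API (illustrative; import paths and the
    # toy model are assumptions, the call pattern mirrors the updated test below).
    import paddle
    from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2
    from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2
    from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler

    paddle.distributed.init_parallel_env()

    model = paddle.nn.Linear(1000, 1000)  # placeholder model
    optim = paddle.optimizer.AdamW(parameters=model.parameters())

    model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32')
    scaler = paddle.amp.GradScaler(init_loss_scaling=32768)
    scaler = ShardingScaler(scaler)  # no sharding_group argument after this commit

    optim = ShardingOptimizerStage2(
        params=model.parameters(), optim=optim, offload=True)
    model = ShardingStage2(
        model, optim, buffer_max_size=2**21, accumulate_grads=True)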
@@ -127,8 +127,8 @@ def _get_group_map():
     global _group_map
     if not _group_map:
         genv = _get_global_env()
-        _group_map[0] = Group(genv.rank, genv.world_size,
-                              list(range(genv.world_size)))
+        _group_map[0] = Group(
+            genv.rank, genv.world_size, ranks=list(range(genv.world_size)))
     return _group_map
......
@@ -30,11 +30,11 @@ from paddle.optimizer import Optimizer
 from paddle.fluid.clip import ClipGradByGlobalNorm
 from paddle.distributed.collective import _get_global_group
 
-from ...utils.internal_storage import ParamStorage
+from ...utils.internal_storage import ParamStorage, GradStorage
 from ...meta_parallel.sharding.sharding_utils import Type, device_guard, ShardingClipGrad
 
-# CUDA alignment 256 bytes
-alignment = {"gpu": 256, }
+# CUDA alignment 256 bytes, cpu alignment 4096 bytes
+alignment = {"gpu": 256, "cpu": 4096}
 align = {
     Type.fp16.value: 2,
     Type.fp32.value: 4,
@@ -95,10 +95,11 @@ class ShardingOptimizerStage2(Optimizer):
                 filter(lambda x: x.trainable and x.dtype == Type.fp16.value,
                        self._local_params))) > 0
 
-        self.group = group
-        group = _get_global_group() if group is None else group
-        self.world_size = group.nranks
-        self.rank = group.rank
+        self.group = dist.new_group(_get_global_group()
+                                    .ranks) if group is None else group
+
+        self.world_size = self.group.nranks
+        self.rank = self.group.rank
 
         self.broadcast_fp16 = broadcast_fp16
         self.param_storages = {}  # {dtype: {rank: InternalStorage}}
@@ -108,14 +109,18 @@ class ShardingOptimizerStage2(Optimizer):
                 "While using ClipGradByGlobalNorm in ShardingOptimizer, the grad clip of original optimizer will be changed."
             )
             self._optim._grad_clip = ShardingClipGrad(self._optim._grad_clip,
-                                                      group,
-                                                      paddle.get_device())
+                                                      paddle.get_device(),
+                                                      self.group)
 
         if offload:
             assert self._pfp16, "Only support offload strategy while using \'Adam\', \'AdamW\' and \'Momentum\' optimizer with AMP/Pure FP16"
 
         self.offload = offload  # Using for offload
         self.offload_device = "cpu"
+        self.offload_buffer_size = 0
+        self.offload_param2align = {}
+        self.offload_params = None
+        self.offload_grads = None
 
         self._master_params = {}
@@ -131,7 +136,6 @@ class ShardingOptimizerStage2(Optimizer):
                         value=param.cast(dtype=Type.fp32.value).numpy(),
                         place=core.CPUPlace(),
                         stop_gradient=param.stop_gradient)
-            self._optim._master_weights = self._master_params
         else:
             for param in trainable_params:
                 if param.dtype == Type.fp16.value:
@@ -265,13 +269,73 @@ class ShardingOptimizerStage2(Optimizer):
         for d in dtype_to_pop:
             self.param_storages.pop(d)
 
+        if self.offload:
+            self._optim._master_weights = self._master_params
+            cpu_master_params = [p for p in self._master_params.values()]
+            for param in cpu_master_params:
+                size = np.prod(param.shape) * align[Type.fp32.value]
+                remaining = size % alignment[self.offload_device]
+                ali = 0 if remaining == 0 else alignment[
+                    self.offload_device] - remaining
+                align_ = ali // align[Type.fp32.value]
+                self.offload_buffer_size += np.prod(param.shape) + align_
+                self.offload_param2align[param.name] = align_
+
+            if cpu_master_params:
+                with device_guard(self.rank, self.offload_device):
+                    self.offload_params = ParamStorage(
+                        size=self.offload_buffer_size,
+                        dtype=Type.fp32.value,
+                        device=self.offload_device)
+                    self.offload_params.add_rank_params(
+                        cpu_master_params, self.offload_param2align, False)
+                    self.offload_params.buffer.stop_gradient = False
+
+                    self.offload_grads = GradStorage(
+                        size=self.offload_buffer_size,
+                        dtype=Type.fp32.value,
+                        device=self.offload_device,
+                        destination=self.rank,
+                        parm2align=self.offload_param2align,
+                        convert_cpu=True)
+                    for p in cpu_master_params:
+                        self.offload_grads.add_grad(
+                            p, self.offload_param2align[p.name])
+
+                    self._optim._master_weights[
+                        self.offload_params.buffer
+                        .name] = self.offload_params.buffer
+
+    def _offload_acc_grad(self, param_name, grad_fp32_cpu):
+        """accumulate grads with offload strategy"""
+        with device_guard(self.rank, self.offload_device):
+            if param_name in self._master_params.keys():
+                if self._master_params[param_name].grad is None:
+                    self._master_params[param_name]._copy_gradient_from(
+                        grad_fp32_cpu)
+                else:
+                    self._master_params[param_name].grad.add_(grad_fp32_cpu)
+
+            self.offload_params.buffer._copy_gradient_from(
+                self.offload_grads.buffer)
+
+    def _offload_scale_grad(self, scale_size):
+        """scale grads with offload strategy"""
+        with device_guard(self.rank, self.offload_device):
+            self.offload_grads.buffer.scale_(scale=scale_size)
+
+    def _offload_clear_grad(self):
+        """clear grads with offload strategy"""
+        with device_guard(self.rank, self.offload_device):
+            self.offload_grads.buffer.zero_()
+
     def step(self):
         """
         A wrapper for Optimizer's step function to finish the update operation of the optimizer.
         """
         if self.offload:
-            params_list = list(self._master_params.values())
+            params_list = [self.offload_params.buffer]
         else:
             # Synchronize optimizer parameters for the current rank
             params_list = []
@@ -296,14 +360,11 @@ class ShardingOptimizerStage2(Optimizer):
             with device_guard(device=self.offload_device):
                 self._optim.step()
 
-            dev_id = 0 if paddle.get_device() == "cpu" else int(
-                paddle.get_device().split(":")[1])
+            dev_id = int(paddle.get_device().split(":")[1])
             for param in self._local_params:
                 if param.name in self._master_params.keys():
                     param.set_value(self._master_params[param.name].cuda(dev_id)
                                     .cast(dtype=param.dtype))
-                    self._master_params[param.name].clear_gradient(False)
         else:
             self._optim.step()
......
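The offload setup above pads every fp32 master weight so that each one starts at a CPU-friendly 4096-byte boundary inside the flat buffer (alignment["cpu"] = 4096, align[Type.fp32.value] = 4 bytes). A standalone sketch of that padding arithmetic, using a hypothetical 1000-element parameter:

    import numpy as np

    # Mirrors the padding arithmetic in the offload setup above (illustration only).
    alignment = {"gpu": 256, "cpu": 4096}   # bytes
    align = {"fp32": 4}                     # bytes per fp32 element

    param_shape = [10, 100]                 # hypothetical master weight: 1000 elements
    size = np.prod(param_shape) * align["fp32"]                          # 4000 bytes
    remaining = size % alignment["cpu"]                                  # 4000
    pad_bytes = 0 if remaining == 0 else alignment["cpu"] - remaining    # 96
    pad_elems = pad_bytes // align["fp32"]                               # 24 extra fp32 slots
    buffer_elems = np.prod(param_shape) + pad_elems                      # 1024 slots reserved

    print(buffer_elems)  # 1024 -> exactly 4096 bytes, the next CPU alignment boundary

The padded slot count (1024 here) is what gets accumulated into offload_buffer_size and recorded in offload_param2align.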
@@ -51,7 +51,7 @@ class ShardingStage2(nn.Layer):
     # Feature Notes::
     # 1. Unified memory for param and param.grad to InternalStorage.
     # 2. Divide param.grad according to rank to centrally apply for and release GPU memory.
-    # 3. Dynamically adjust training parameters and models
+    # 3. Dynamically adjust training parameters and models.
     # 4. Support offload function.
     # 5. Support the establishment of independent communication groups.
@@ -85,11 +85,11 @@ class ShardingStage2(nn.Layer):
         self._accumulate_grads = accumulate_grads
 
         # Communication related attributes
-        self._group = group
-        group = _get_global_group() if group is None else group
-        self._world_size_scaling = 1.0 / group.nranks
-        assert group.nranks > 1, "Training must be distributed, ranks must be greater than 1"
-        self._rank = group.rank
+        self._group = dist.new_group(_get_global_group()
+                                     .ranks) if group is None else group
+        self._world_size_scaling = 1.0 / self._group.nranks
+        assert self._group.nranks > 1, "Training must be distributed, ranks must be greater than 1"
+        self._rank = self._group.rank
         self._global_root_rank = 0  # picking rank 0 as the reference
         self._default_device = device
@@ -173,37 +173,40 @@ class ShardingStage2(nn.Layer):
         """
         # Release grad storages
         for dtype in self._grad_storages.keys():
-            if self._rank in self._grad_storages[dtype].keys():
-                if not self._offload:
-                    self._grad_storages[dtype][self._rank].buffer.zero_()
+            if not self._offload and self._rank in self._grad_storages[
+                    dtype].keys():
+                self._grad_storages[dtype][self._rank].buffer.zero_()
 
-        # Release params
+        # Release grads of params
         for param in self._trainable_params:
             if param.name in self._param_grads and param.grad is not None:
                 param.clear_gradient()
 
+        # Release grads of master params with offload strategy
+        if self._offload:
+            self._sharding_optimizers[0]._offload_clear_grad()
+
     def _grad_scale(self):
         """
         Before the gradient accumulation, scale the gradient.
         """
-        if self._offload:
-            for param in self._trainable_params:
-                if param.name in self._sharding_optimizers[
-                        0]._master_params.keys():
-                    self._sharding_optimizers[0]._master_params[
-                        param.name].grad.scale_(scale=self._world_size_scaling)
-        else:
-            # Scale grad storages
-            for dtype in self._grad_storages.keys():
-                if self._rank in self._grad_storages[dtype].keys():
-                    self._grad_storages[dtype][self._rank].buffer.scale_(
-                        scale=self._world_size_scaling)
-
-            # Scale params
-            for param in self._trainable_params:
-                if param.name in self._param_grads and param.grad is not None:
-                    param.grad.scale_(scale=self._world_size_scaling)
-                    param._reset_grad_inplace_version(True)
+        # Scale grad storages
+        for dtype in self._grad_storages.keys():
+            if not self._offload and self._rank in self._grad_storages[
+                    dtype].keys():
+                self._grad_storages[dtype][self._rank].buffer.scale_(
+                    scale=self._world_size_scaling)
+
+        # Scale grads of params
+        for param in self._trainable_params:
+            if param.name in self._param_grads and param.grad is not None:
+                param.grad.scale_(scale=self._world_size_scaling)
+                param._reset_grad_inplace_version(True)
+
+        # Scale grads of master params with offload strategy
+        if self._offload:
+            self._sharding_optimizers[0]._offload_scale_grad(
+                self._world_size_scaling)
 
     def _init_internal_storage(self, needs_fresh):
         """
@@ -319,9 +322,9 @@ class ShardingStage2(nn.Layer):
                     if dst_rank != self._rank:
                         param.clear_gradient(False)
                     elif self._offload:
-                        self._sharding_optimizers[0]._master_params[
-                            param.name]._copy_gradient_from(param.grad.cpu(
-                            ).cast(dtype=Type.fp32.value))
+                        self._sharding_optimizers[0]._offload_acc_grad(
+                            param.name,
+                            param.grad.cast(dtype=Type.fp32.value).cpu())
                         param.clear_gradient(False)
 
                 # Synchronize the reduce parameter gradient
@@ -375,11 +378,14 @@ class ShardingStage2(nn.Layer):
                         )
                     elif self._offload:
                         grad_storage.to(device=self._offload_device)
-                        for param in grad_storage._params:
-                            self._sharding_optimizers[0]._master_params[
-                                param.name]._copy_gradient_from(
-                                    param.grad.cast(
-                                        dtype=Type.fp32.value))
+                        for p in grad_storage._params:
+                            self._sharding_optimizers[
+                                0]._offload_acc_grad(
+                                    p.name,
+                                    p.grad.cast(dtype=Type.fp32.value))
+                            p.clear_gradient(False)
+                            p._gradient_set_empty(False)
+                        grad_storage._device = self._default_device
                         grad_storage.buffer.value().get_tensor()._clear(
                         )
......
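For context on the _grad_scale change above: the wrapper pre-scales gradients by self._world_size_scaling = 1.0 / nranks (and, with offload enabled, applies the same factor to the CPU gradient buffer via _offload_scale_grad), so that the subsequent sum-style reduce across ranks produces an average. A toy numpy check of that invariant; all names below are illustrative:

    import numpy as np

    # Toy check of the scale-then-sum-reduce invariant used by _grad_scale:
    # scaling each rank's grad by 1/world_size before a SUM reduce is the same
    # as averaging the unscaled grads.
    world_size = 4
    rank_grads = [np.full(3, float(r + 1)) for r in range(world_size)]

    pre_scaled = [g * (1.0 / world_size) for g in rank_grads]   # _world_size_scaling
    sum_reduced = np.sum(pre_scaled, axis=0)                    # what an all_reduce(SUM) yields

    assert np.allclose(sum_reduced, np.mean(rank_grads, axis=0))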
@@ -28,6 +28,7 @@ from paddle.fluid import layers
 from paddle.fluid.dygraph import to_variable
 from paddle.fluid.framework import dygraph_only
 from paddle.fluid.dygraph import base as imperative_base
+from paddle.distributed.collective import _get_global_group
 
 
 class Taskflow:
@@ -49,10 +50,10 @@ class Type(Enum):
 class ShardingClipGrad:
-    def __init__(self, clip, group, device):
+    def __init__(self, clip, device, group):
         self._clip = clip
-        self._group = group
         self._device = device
+        self._group = group
 
     @imperative_base.no_grad
     def _dygraph_clip(self, params_grads):
@@ -144,7 +145,7 @@ def device_guard(dev_id=0, device="cpu"):
 
 @dygraph_only
-def ShardingScaler(scaler, sharding_group):
+def ShardingScaler(scaler):
     def unscale_method(self, optimizer):
         if not self._enable:
             return
@@ -152,10 +153,10 @@ def ShardingScaler(scaler, sharding_group):
         param_grads_fp16 = []
         param_grads_fp32 = []
-        if getattr(optimizer, '_param_groups', None) and isinstance(
-                optimizer._param_groups[0], dict):
-            for group in optimizer._param_groups:
+        if getattr(optimizer._optim, '_param_groups', None) and isinstance(
+                optimizer._optim._param_groups[0], dict):
+            for group in optimizer._optim._param_groups:
                 for param in group['params']:
                     if param._grad_ivar() is not None:
                         param_grads.append(param._grad_ivar())
@@ -166,31 +167,37 @@ def ShardingScaler(scaler, sharding_group):
                         param_grads_fp32.append(param._grad_ivar())
         else:
             param_grads = [
-                param._grad_ivar() for param in optimizer._parameter_list
+                param._grad_ivar() for param in optimizer._optim._parameter_list
                 if param._grad_ivar() is not None
             ]
             param_grads_fp16 = [
-                param._grad_ivar() for param in optimizer._parameter_list
+                param._grad_ivar() for param in optimizer._optim._parameter_list
                 if (param._grad_ivar() is not None
                     ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP16
                            )
             ]
             param_grads_fp32 = [
-                param._grad_ivar() for param in optimizer._parameter_list
+                param._grad_ivar() for param in optimizer._optim._parameter_list
                 if (param._grad_ivar() is not None
                     ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32
                            )
             ]
         temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool))
         temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool))
-        if len(param_grads_fp16):
-            _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
-                                            param_grads_fp16,
-                                            temp_found_inf_fp16)
-        if len(param_grads_fp32):
-            _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
-                                            param_grads_fp32,
-                                            temp_found_inf_fp32)
+
+        device = "cpu" if optimizer.offload else "gpu"
+        dev_id = 0 if device == "cpu" else int(paddle.get_device().split(":")[
+            1])
+
+        with device_guard(dev_id, device):
+            if len(param_grads_fp16):
+                _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
+                                                param_grads_fp16,
+                                                temp_found_inf_fp16)
+            if len(param_grads_fp32):
+                _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
+                                                param_grads_fp32,
+                                                temp_found_inf_fp32)
+
         self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0
 
         is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
@@ -198,7 +205,7 @@ def ShardingScaler(scaler, sharding_group):
         paddle.distributed.all_reduce(
             is_found_inf,
             op=paddle.distributed.ReduceOp.MAX,
-            group=sharding_group)
+            group=optimizer.group)
         self._found_inf = is_found_inf.numpy()[0]
 
     scaler._unscale = MethodType(unscale_method, scaler)
......
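The patched unscale_method now reaches the gradients through optimizer._optim, runs check_finite_and_unscale under device_guard on the offload device, and still merges the per-rank overflow flags with a MAX all-reduce over optimizer.group. A toy illustration of that flag merge, independent of Paddle; the helper name is hypothetical:

    # Toy illustration of the overflow-flag aggregation done at the end of
    # unscale_method: each rank holds 1 if it saw inf/nan, 0 otherwise, and a
    # MAX reduce marks the step as overflowed if any rank overflowed.
    def aggregate_found_inf(per_rank_flags):
        # per_rank_flags: e.g. [0, 0, 1, 0] gathered from the sharding group
        return max(per_rank_flags)

    assert aggregate_found_inf([0, 0, 0, 0]) == 0   # no rank overflowed -> keep step
    assert aggregate_found_inf([0, 1, 0, 0]) == 1   # any overflow -> skip step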
@@ -69,9 +69,11 @@ class InternalStorage:
                 param._gradient_set_empty(False)
             self.buffer.value().get_tensor()._clear()
             self.buffer = tmp_buffer
+            self._device = device
 
         if dtype is not None:
             self.buffer = self.buffer.cast(dtype=dtype)
+            self._dtype = dtype
 
 
 class ParamStorage(InternalStorage):
@@ -94,7 +96,7 @@ class ParamStorage(InternalStorage):
         self._array_params()
 
     @fluid.dygraph.no_grad
-    def add_rank_params(self, trainable_params, param2align):
+    def add_rank_params(self, trainable_params, param2align, convert_gpu=True):
         """
         Add new parameters to the InternalStorage. Params becomes a view of this InternalStorage buffer.
         """
@@ -108,12 +110,15 @@ class ParamStorage(InternalStorage):
         cpu_param_shape = list()
         for param in trainable_params:
-            p_shape = self._add_param_as_view(param, param2align[param.name])
+            p_shape = self._add_param_as_view(param, param2align[param.name],
+                                              convert_gpu)
             cpu_param_shape.append(p_shape)
 
-        # buffer convert from cpu to cuda
-        dev_id = int(paddle.get_device().split(":")[1])
-        self.buffer = self.buffer.cuda(dev_id)
+        if convert_gpu:
+            # buffer convert from cpu to cuda
+            dev_id = int(paddle.get_device().split(":")[1])
+            self.buffer = self.buffer.cuda(dev_id)
 
         self._fill = 0
         for idx, param in enumerate(trainable_params):
@@ -123,7 +128,7 @@ class ParamStorage(InternalStorage):
             self._param_ids.append(id(param))
 
     @fluid.dygraph.no_grad
-    def _add_param_as_view(self, param, align):
+    def _add_param_as_view(self, param, align, convert_gpu=True):
 
         assert (
             param.dtype == self.buffer.dtype
@@ -147,9 +152,12 @@ class ParamStorage(InternalStorage):
         with device_guard(dev_id, "cpu"):
             tmp_var = core.VarBase(tensor=self.buffer._slice(self._fill,
                                                              var_end))
-            param_cpu = param.cpu()
-            param.value().get_tensor()._clear()
-            tmp_var.set_value(param_cpu)
+            if convert_gpu:
+                param_cpu = param.cpu()
+                param.value().get_tensor()._clear()
+                tmp_var.set_value(param_cpu)
+            else:
+                tmp_var.set_value(param)
 
         self._fill = offset
         return p_shape
@@ -186,10 +194,16 @@ class GradStorage(InternalStorage):
     This is a basic class to simplify the handling of gradient InternalStorages
     """
 
-    def __init__(self, size, dtype, device, destination, parm2align):
+    def __init__(self,
+                 size,
+                 dtype,
+                 device,
+                 destination,
+                 parm2align,
+                 convert_cpu=False):
         if isinstance(size, np.int64):
             size = size.tolist()
-        super().__init__(size, dtype, device)
+        super().__init__(size, dtype, device, convert_cpu)
 
         self._max_size = size
         self._release = False
......
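Taken together, the InternalStorage changes (add_rank_params(..., convert_gpu=False), GradStorage(..., convert_cpu=True), and the tracked _device/_dtype) let the optimizer keep all fp32 master weights and their gradients in two flat CPU buffers and hand out per-parameter views into them. The numpy sketch below illustrates only that flat-buffer-plus-views idea; it is not the Paddle implementation, and every name in it is made up:

    import numpy as np

    # Conceptual stand-in for what ParamStorage/GradStorage provide for offload:
    # pack padded fp32 master weights into one flat CPU buffer and hand out views,
    # so the optimizer can update (and zero/scale grads of) a single tensor.
    CPU_ALIGN, FP32_BYTES = 4096, 4

    def pack_params(params):
        padded = []
        for p in params:
            n = p.size
            rem = (n * FP32_BYTES) % CPU_ALIGN
            pad = 0 if rem == 0 else (CPU_ALIGN - rem) // FP32_BYTES
            padded.append((p, n, pad))
        buffer = np.zeros(sum(n + pad for _, n, pad in padded), dtype=np.float32)
        views, fill = [], 0
        for p, n, pad in padded:
            buffer[fill:fill + n] = p.ravel()
            views.append(buffer[fill:fill + n].reshape(p.shape))  # shares memory
            fill += n + pad
        return buffer, views

    buf, views = pack_params([np.ones((10, 100), np.float32),
                              np.ones((5, 7), np.float32)])
    buf *= 0.5                     # one op on the flat buffer...
    assert views[0][0, 0] == 0.5   # ...is visible through every param view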
@@ -40,24 +40,16 @@ paddle.seed(seed)
 
 def train_mlp(model, offload=False):
-    group = paddle.distributed.new_group([0, 1])
     optimizer = optimizer_setting(model=model, use_pure_fp16=True)
 
     model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32')
     scaler = paddle.amp.GradScaler(init_loss_scaling=32768)
-    scaler = ShardingScaler(scaler, group)
+    scaler = ShardingScaler(scaler)
 
     optimizer = ShardingOptimizerStage2(
-        params=model.parameters(),
-        optim=optimizer,
-        group=group,
-        offload=offload)
+        params=model.parameters(), optim=optimizer, offload=offload)
     model = ShardingStage2(
-        model,
-        optimizer,
-        group=group,
-        buffer_max_size=2**21,
-        accumulate_grads=True)
+        model, optimizer, buffer_max_size=2**21, accumulate_grads=True)
 
     train_reader = paddle.batch(
         reader_decorator(linear_size), batch_size=batch_size, drop_last=True)
......