"[hybrid_dp] API setting is deprecated. Now when dp_degree >= 2, its will be in hybrid dp mode automatically"
"[hybrid_dp] API setting is deprecated. Now when "
)
"dp_degree >= 2, its will be in hybrid dp mode automatically")
assertself.dp_degree>=1
assertdp_degree>=1
ifself.dp_degree>1:
self.hybrid_dp=True
self.hybrid_dp=Trueifdp_degree>1elseFalse
else:
self.sharding_degree=sharding_degree
self.hybrid_dp=False
self.mp_degree=mp_degree
self.pp_degree=pp_degree
# NOTE (JZ-LIANG)
self.dp_degree=dp_degree
# There are 2 kinds of modes for gradient-merge and hybrid-dp in mixed parallelism: [sharding] and [pipeline].
# We distinguish these two modes since the gm/hybrid-dp related allreduce should be inserted in a different place according to the mode to get the best performance:
def_get_hybrid_dp_mode(self):
# sharding: communication within node, and therefore should insert within backward segment to overlap with bw calc, conduct every micro step
""" get
# pipeline: communication across nodes, and therefore should insert in update segment, conduct just once per global step
self.hybrid_dp_mode
self.hybrid_dp_mode=None
self.gradient_merge_mode
self._gradient_merge_acc_step
self.pp_allreduce_in_optimize
"""
strategy=self.user_defined_strategy
sharding_configs=strategy.sharding_configs
# NOTE (JZ-LIANG)
# There are 2 kinds of modes for gradient-merge and hybrid-dp in mixed parallelism: [sharding] and [pipeline].
# We distinguish these two modes since the gm/hybrid-dp related allreduce should be inserted in a different place
# according to the mode to get the best performance:
# sharding: communication within node, and therefore should insert within backward segment
# to overlap with bw calc, conduct every micro step.
# pipeline: communication across nodes, and therefore should insert in update segment,
# conduct just once per global step.
dp_mode=None
# dp here is the pure dp as the outermost parallelism
# dp here is the pure dp as the outermost parallelism
ifself.hybrid_dp:
ifself.hybrid_dp:
assertself.dp_degree>1,"hybrid dp is on, but dp degree is [{}]".format(
self.dp_degree)
ifself.pp_degree>1:
ifself.pp_degree>1:
self.hybrid_dp_mode="pp_hybrid_dp"
dp_mode="pp_hybrid_dp"
else:
else:
assertself.sharding_degree>1,"by now we only support five kind of hybrid dp: sharding_hybrid_dp, mp_sharding_hybrid_dp, pp_hybrid_dp, mp_sharding_pp_hybrid_dp, sharding_pp_hybrid_dp."
assertself.sharding_degree>1, \
self.hybrid_dp_mode="sharding_hybrid_dp"
"by now we only support five kind of hybrid dp: sharding_hybrid_dp, " \