# There are two modes for gradient-merge and hybrid-dp in mixed parallelism: [sharding] and [pipeline].
# We distinguish these two modes because the gm/hybrid-dp related allreduce should be inserted in a different place for each mode to get the best performance:
# sharding: communication is within a node, so the allreduce should be inserted within the backward segment to overlap with the backward calculation; it runs every micro step.
# pipeline: communication is across nodes, so the allreduce should be inserted in the update segment; it runs just once per global step.
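# Illustrative sketch (an assumption, not part of the surrounding logic): how the chosen
# mode could drive where the gm/hybrid-dp allreduce is placed. The helper names
# `_insert_allreduce_in_backward_segment`, `_insert_allreduce_in_update_segment`, and
# the `main_block` argument are hypothetical, for illustration only.
#
#     if self.hybrid_dp_mode == "sharding_hybrid_dp":
#         # intra-node comm: insert in the backward segment to overlap with
#         # backward calc; runs every micro step
#         self._insert_allreduce_in_backward_segment(main_block)
#     elif self.hybrid_dp_mode == "pp_hybrid_dp":
#         # inter-node comm: insert in the update segment; runs once per global step
#         self._insert_allreduce_in_update_segment(main_block)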
self.hybrid_dp_mode = None
# dp here is the pure dp as the outermost parallelism
if self.hybrid_dp:
    assert self.dp_degree > 1, "hybrid dp is on, but dp degree is [{}]".format(
        self.dp_degree)
    if self.pp_degree > 1:
        self.hybrid_dp_mode = "pp_hybrid_dp"
    else:
        assert self.sharding_degree > 1, "by now we only support five kinds of hybrid dp: sharding_hybrid_dp, mp_sharding_hybrid_dp, pp_hybrid_dp, mp_sharding_pp_hybrid_dp, sharding_pp_hybrid_dp."
        # no pipeline parallelism, so fall back to the sharding-based hybrid dp mode
        self.hybrid_dp_mode = "sharding_hybrid_dp"