Unverified commit d53e567a authored by: S ShenLiang, committed by: GitHub

fix bug of recompute in hybridparallel (#35588)

Parent 652da1f4
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/flatten_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
flatten, ops::FlattenKernel<paddle::platform::CUDADeviceContext, float>,
......@@ -50,6 +51,8 @@ REGISTER_OP_CUDA_KERNEL(
flatten_contiguous_range,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
float>,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
double>,
ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
......@@ -63,6 +66,8 @@ REGISTER_OP_CUDA_KERNEL(
flatten_contiguous_range_grad,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
float>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
double>,
ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
......
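The two hunks above add plat::float16 kernel registrations for flatten_contiguous_range and flatten_contiguous_range_grad, so fp16 activations produced under hybrid-parallel recompute can be flattened on GPU. A minimal sanity check, assuming a GPU build of PaddlePaddle (the shapes are illustrative only):

```python
import paddle

# flatten on a float16 tensor dispatches to the flatten_contiguous_range
# kernel, which is now registered for plat::float16 on CUDA.
paddle.set_device('gpu')  # assumes a GPU build of PaddlePaddle
x = paddle.randn([4, 8, 8]).astype('float16')
y = paddle.flatten(x, start_axis=1, stop_axis=2)
print(y.shape, y.dtype)  # [4, 64] paddle.float16
```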
......@@ -133,6 +133,7 @@ def _split_activation(tensor):
# use inplace operation to save memory
data = tensor.flatten_()
part_size = tensor_numel // mp_degree
start = part_size * mp_rank
end = start + part_size
......
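In the recompute hunk above, _split_activation flattens the tensor in place to avoid an extra copy and then keeps only the slice owned by the current model-parallel rank. A simplified sketch of that partitioning arithmetic, assuming mp_degree and mp_rank come from the hybrid-parallel group and ignoring any handling of sizes that do not divide evenly:

```python
import numpy as np

def split_activation_sketch(flat, mp_degree, mp_rank):
    """Return the contiguous slice of a flattened activation owned by mp_rank."""
    part_size = flat.size // mp_degree   # elements per model-parallel rank
    start = part_size * mp_rank          # offset of this rank's slice
    end = start + part_size
    return flat[start:end]

# 12 elements split across 4 ranks: rank 1 keeps elements 3..5.
flat = np.arange(12)
print(split_activation_sketch(flat, mp_degree=4, mp_rank=1))  # [3 4 5]
```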
......@@ -94,6 +94,7 @@ black_list = {
'softmax',
'softmax_with_cross_entropy',
'sigmoid_cross_entropy_with_logits',
'c_softmax_with_cross_entropy',
'cross_entropy',
'cross_entropy2',
# fp16 is slower than fp32, though fp16 is supported.
......
......@@ -45,6 +45,7 @@ BLACK_LIST = {
'softmax',
'softmax_with_cross_entropy',
'sigmoid_cross_entropy_with_logits',
'c_softmax_with_cross_entropy',
'cross_entropy',
'cross_entropy2',
# default fp32 can avoid return inf when the sum value large than 65504
......
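Both AMP black lists gain c_softmax_with_cross_entropy, the collective (model-parallel) variant of softmax_with_cross_entropy, so it stays in fp32 under auto mixed precision like the other cross-entropy ops; as the existing comment notes, fp32 avoids returning inf when the summed value exceeds the fp16 maximum of 65504. For illustration, the same effect can be requested per scope through auto_cast's custom black list (a hedged sketch, assuming a GPU build; the black-listed op name is only an example):

```python
import paddle
import paddle.nn.functional as F

paddle.set_device('gpu')  # assumes a GPU build of PaddlePaddle
net = paddle.nn.Linear(16, 10)
data = paddle.rand([4, 16])
label = paddle.randint(0, 10, [4])

# Ops named in custom_black_list are computed in fp32 inside the scope,
# mirroring what the framework-level black list now does for
# c_softmax_with_cross_entropy.
with paddle.amp.auto_cast(custom_black_list={'softmax_with_cross_entropy'}):
    logits = net(data)                     # matmul may run in fp16
    loss = F.cross_entropy(logits, label)  # kept in fp32 by the black list
print(loss.dtype)
```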