Unverified commit 4da9b87b, authored by zhouweiwei2014, committed by GitHub

[Zero-Dim] Fix functools.reduce to be safer with an initial value, to support empty lists (#53182)

Parent f424162c
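The rationale: functools.reduce raises a TypeError when given an empty sequence and no initial value, and a zero-dim tensor has an empty shape list, so every shape/topology product below now passes the explicit initial value 1. A minimal sketch of the behavior (illustrative only, not taken from the PR's tests):

```python
from functools import reduce

numel_old = lambda shape: reduce(lambda x, y: x * y, shape)     # form being replaced
numel_new = lambda shape: reduce(lambda x, y: x * y, shape, 1)  # form introduced by this commit

print(numel_new([2, 3, 4]))   # 24 -- unchanged for non-empty shapes
print(numel_new([]))          # 1  -- empty shape of a zero-dim tensor

try:
    numel_old([])             # no initial value supplied
except TypeError as err:
    # reduce() raises TypeError on an empty sequence without an initial value
    print("old form fails:", err)
```

The result is unchanged for non-empty shapes, while an empty shape now yields 1, the element count of a zero-dim tensor. Every hunk below applies this same substitution.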
@@ -252,7 +252,7 @@ class CostEstimator:
     def _calculate_bytes(self, sizes, dtype):
         if sizes:
-            total_count = reduce(lambda x, y: x * y, sizes)
+            total_count = reduce(lambda x, y: x * y, sizes, 1)
         else:
             total_count = 0
......
@@ -96,7 +96,7 @@ class TensorCost:
         shape = self.shape
         dtype = self.dtype
-        total_count = reduce(lambda x, y: x * y, shape)
+        total_count = reduce(lambda x, y: x * y, shape, 1)
         if dtype == paddle.float32 or dtype == paddle.int32:
             dtype_factor = 4
......
@@ -336,7 +336,7 @@ class PlanSpace:
         ops = program.global_block().ops
         vars = program.global_block().vars
-        processes = reduce(lambda x, y: x * y, process_mesh_topology)
+        processes = reduce(lambda x, y: x * y, process_mesh_topology, 1)
         global_group = list(range(processes))
         global_process_mesh = None
         pipeline_process_meshes = None
......
@@ -1120,7 +1120,7 @@ class Resharder:
         """Compute the index of process_shape corresponding to the process."""
         relative_process = process_group.index(process)
         process_index = []
-        product = reduce(lambda x, y: x * y, process_shape)
+        product = reduce(lambda x, y: x * y, process_shape, 1)
         for i in range(len(process_shape)):
             idx = relative_process // (product // process_shape[i])
......
@@ -2120,7 +2120,7 @@ class RuleBasedTuner:
         has_used_devices = 0
         self.device_meshes_list.append([])
         for device_mesh in device_meshes:
-            devices = reduce(lambda x, y: x * y, device_mesh)
+            devices = reduce(lambda x, y: x * y, device_mesh, 1)
             processes = list(
                 range(has_used_devices, has_used_devices + devices)
             )
......
@@ -1684,7 +1684,7 @@ def get_standalone_cost_data(distributed_programs):
                     ].split(",")
                     shape = [int(x.strip()) for x in shape]
                     dtype_factor = 1
-                    total_static_input_size += reduce(lambda x, y: x * y, shape)
+                    total_static_input_size += reduce(lambda x, y: x * y, shape, 1)
                     if op.type == "c_embedding":
                         arg_name_lower = (
                             "w" if arg_name_lower == "weight" else "ids"
@@ -1838,7 +1838,7 @@ def get_var_numel(var):
     """
     assert isinstance(var, Variable)
     assert -1 not in var.shape
-    return reduce(lambda x, y: x * y, var.shape)
+    return reduce(lambda x, y: x * y, var.shape, 1)
 def get_lr(optimizer):
......
@@ -62,7 +62,7 @@ class CommunicateTopology:
         self.coordinate = collections.namedtuple(
             'Coordinate', self._parallel_names
         )
-        self._world_size = reduce(lambda x, y: x * y, self._dims)
+        self._world_size = reduce(lambda x, y: x * y, self._dims, 1)
         ranges = [range(d) for d in self._dims]
         all_coordinate = [self.coordinate(*x) for x in product(*ranges)]
......
@@ -113,7 +113,7 @@ class DGCMomentumOptimizer(Optimizer):
         return regular_type, regular_coeff
     def _is_use_dgc(self, param_var, grad_var):
-        var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
+        var_numel = abs(reduce(lambda x, y: x * y, param_var.shape, 1))
         if (
             var_numel < 16384
             or param_var.type == core.VarDesc.VarType.SELECTED_ROWS
......
@@ -111,7 +111,7 @@ class DygraphShardingOptimizer:
         for param in self._parameter_list:
             rank = sizes.index(min(sizes))
             mapping[rank].append(param)
-            numel = reduce(lambda x, y: x * y, param.shape)
+            numel = reduce(lambda x, y: x * y, param.shape, 1)
             assert (
                 numel > 0
             ), "param [{}] should larger than 0, but it is [{}]".format(
......
@@ -898,7 +898,7 @@ def get_var_size(param):
     """
     assert -1 not in param.shape
     return (
-        reduce(lambda x, y: x * y, param.shape)
+        reduce(lambda x, y: x * y, param.shape, 1)
         * DtypeToSize[param.dtype]
         / 1024.0
         / 1024.0
......
@@ -75,8 +75,8 @@ def _get_dpmp_topology(origin_topology, sharding_group):
         sharding_axis = 0
         dp_sharding_topology = dp_sharding_topology[1:]
-    product_dp_sharding = reduce(lambda x, y: x * y, dp_sharding_topology)
-    product_topology = reduce(lambda x, y: x * y, origin_topology)
+    product_dp_sharding = reduce(lambda x, y: x * y, dp_sharding_topology, 1)
+    product_topology = reduce(lambda x, y: x * y, origin_topology, 1)
     if product_topology == product_dp_sharding:
         dpmp_topology = dp_sharding_topology
@@ -274,7 +274,7 @@ class ClipHelper:
         for param in params:
             rank = sizes.index(min(sizes))
             mapping[rank].append(param.name)
-            numel = reduce(lambda x, y: x * y, param.shape)
+            numel = reduce(lambda x, y: x * y, param.shape, 1)
             assert (
                 numel > 0
             ), "param [{}] should larger than 0, but it is [{}]".format(
......
@@ -1661,7 +1661,7 @@ def partition_by_greedy_even(params, group_size):
     for param in params:
         rank = sizes.index(min(sizes))
         mapping[rank].append(param)
-        numel = reduce(lambda x, y: x * y, param.shape)
+        numel = reduce(lambda x, y: x * y, param.shape, 1)
         assert (
             numel > 0
         ), "param [{}] should larger than 0, but it is [{}]".format(
......
@@ -386,7 +386,7 @@ def get_dense_send_context(
             grad = merged[1]
             origin_varnames.append(grad.merged_var.name)
             var = program.global_block().vars[grad.merged_var.name]
-            var_numel += reduce(lambda x, y: x * y, var.shape)
+            var_numel += reduce(lambda x, y: x * y, var.shape, 1)
         grad_name = "Dense@GRAD_" + str(idx)
         aggregate = True
         # print("public get_dense_send_context dense_table:", grad_name,
@@ -422,7 +422,7 @@ def get_dense_send_context(
             grad = merged[1]
             origin_varnames.append(grad.merged_var.name)
             var = program.global_block().vars[grad.merged_var.name]
-            var_numel += reduce(lambda x, y: x * y, var.shape)
+            var_numel += reduce(lambda x, y: x * y, var.shape, 1)
         grad_name = "DataNorm@GRAD_" + str(idx)
         aggregate = True
         # print("public get_dense_send_context data_norm table:", grad_name,
@@ -452,7 +452,7 @@ def get_dense_send_context(
             grad = merged[1]
             origin_varname = grad.merged_var.name
             var = program.global_block().vars[origin_varname]
-            var_numel = reduce(lambda x, y: x * y, var.shape)
+            var_numel = reduce(lambda x, y: x * y, var.shape, 1)
             grad_name = origin_varname
             aggregate = True
             from paddle.fluid.core import CommContext
@@ -503,7 +503,7 @@ def get_geo_trainer_send_context(attrs):
                 True if param_name in distibuted_varnames else False
             )
             var = program.global_block().vars[grad.merged_var.name]
-            var_numel = reduce(lambda x, y: x * y, var.shape[1:])
+            var_numel = reduce(lambda x, y: x * y, var.shape[1:], 1)
             from paddle.fluid.core import CommContext
             print(
@@ -1167,7 +1167,7 @@ def get_communicate_var_info(
     for name in entrance_var_list:
         var = program.global_block().vars[name]
         shape = var.shape
-        recv_var_dim = -1 * reduce(lambda x, y: x * y, shape)
+        recv_var_dim = -1 * reduce(lambda x, y: x * y, shape, 1)
         input_var_reshape_dim.append(recv_var_dim)
         input_var_reshape_name.append(f"{name}.input_reshape@Heter")
@@ -1448,7 +1448,7 @@ dtype_to_size = {
 def get_var_mem_size(var):
-    m_size = reduce(lambda x, y: x * y, var.shape)
+    m_size = reduce(lambda x, y: x * y, var.shape, 1)
     m_size *= dtype_to_size[var.dtype]
     return m_size
......
@@ -117,7 +117,7 @@ def slice_variable(var_list, slice_count, min_block_size):
     blocks = []
     for var in var_list:
         split_count = slice_count
-        var_numel = reduce(lambda x, y: x * y, var.shape)
+        var_numel = reduce(lambda x, y: x * y, var.shape, 1)
         max_pserver_count = int(math.floor(var_numel / float(min_block_size)))
         if max_pserver_count == 0:
             max_pserver_count = 1
@@ -127,7 +127,7 @@ def slice_variable(var_list, slice_count, min_block_size):
         if len(var.shape) >= 2:
             # align by dim1(width)
-            dim1 = reduce(lambda x, y: x * y, var.shape[1:])
+            dim1 = reduce(lambda x, y: x * y, var.shape[1:], 1)
             remains = block_size % dim1
             if remains != 0:
                 block_size += dim1 - remains
@@ -2286,7 +2286,9 @@ WIKI: https://github.com/PaddlePaddle/Fleet/blob/develop/markdown_doc/transpiler
             orig_shape = orig_var.shape
             orig_dim1_flatten = 1
             if len(orig_shape) >= 2:
-                orig_dim1_flatten = reduce(lambda x, y: x * y, orig_shape[1:])
+                orig_dim1_flatten = reduce(
+                    lambda x, y: x * y, orig_shape[1:], 1
+                )
             for i, block in enumerate(split):
                 size = block[1]
......
@@ -5968,7 +5968,7 @@ class PipelineOptimizer:
         }
         assert -1 not in var.shape
         return (
-            reduce(lambda x, y: x * y, var.shape)
+            reduce(lambda x, y: x * y, var.shape, 1)
             * dtype_to_size[var.dtype]
             / 1024.0
             / 1024.0
......
@@ -46,7 +46,7 @@ class TestDygraphWeightNorm(unittest.TestCase):
     def norm_except_dim(self, w, dim=None):
         shape = w.shape
         ndims = len(shape)
-        shape_numel = reduce(lambda x, y: x * y, shape)
+        shape_numel = reduce(lambda x, y: x * y, shape, 1)
         if dim == -1:
             return np.linalg.norm(w, axis=None, keepdims=True).flatten()
         elif dim == 0:
@@ -68,7 +68,7 @@ class TestDygraphWeightNorm(unittest.TestCase):
     def weight_normalize(self, w, dim=None):
         shape = w.shape
         ndims = len(shape)
-        shape_numel = reduce(lambda x, y: x * y, shape)
+        shape_numel = reduce(lambda x, y: x * y, shape, 1)
         v = w
         g = self.norm_except_dim(w, dim)
         g_mul = g
......
@@ -1427,7 +1427,7 @@ class TestGradientTruncated(unittest.TestCase):
         paddle.enable_static()
         to_string = lambda x, i: x + '_' + str(i)
-        numel = lambda input_shape: reduce(lambda x, y: x * y, input_shape)
+        numel = lambda input_shape: reduce(lambda x, y: x * y, input_shape, 1)
         def op1(x):
             value = paddle.tensor.fill_constant([1], "float32", 1)
......
@@ -612,7 +612,7 @@ class TestListIndex(unittest.TestCase):
         np.random.seed(2022)
     def numel(self, shape):
-        return reduce(lambda x, y: x * y, shape)
+        return reduce(lambda x, y: x * y, shape, 1)
     def test_static_graph_list_index(self):
         paddle.enable_static()
......
@@ -117,7 +117,7 @@ class SliceInfo:
         return s
     def numel(self, shape):
-        return reduce(lambda x, y: x * y, shape)
+        return reduce(lambda x, y: x * y, shape, 1)
     def get_offset_stride(self, tensor_shape):
         for index in self.indexes:
......
@@ -652,7 +652,7 @@ class CompileTimeStrategy:
             var = self.origin_main_program.global_block().vars[
                 grad.merged_var.name
             ]
-            var_numel = reduce(lambda x, y: x * y, var.shape[1:])
+            var_numel = reduce(lambda x, y: x * y, var.shape[1:], 1)
             sparse_ctx = core.CommContext(
                 grad_name,
@@ -705,7 +705,7 @@ class CompileTimeStrategy:
                 var = self.origin_main_program.global_block().vars[
                     grad.merged_var.name
                 ]
-                var_numel += reduce(lambda x, y: x * y, var.shape)
+                var_numel += reduce(lambda x, y: x * y, var.shape, 1)
             grad_name = "Dense@Grad"
             trainer_id = self.get_role_id()
             aggregate = True
@@ -734,7 +734,7 @@ class CompileTimeStrategy:
                 var = self.origin_main_program.global_block().vars[
                     origin_varname
                 ]
-                var_numel = reduce(lambda x, y: x * y, var.shape)
+                var_numel = reduce(lambda x, y: x * y, var.shape, 1)
                 grad_name = origin_varname
                 aggregate = True
                 dense_ctx = core.CommContext(
@@ -1058,7 +1058,7 @@ class CompileTimeStrategy:
         blocks = []
         for var in var_list:
             if not uniform:
-                var_numel = reduce(lambda x, y: x * y, var.shape)
+                var_numel = reduce(lambda x, y: x * y, var.shape, 1)
                 split_count = 1
@@ -1077,7 +1077,7 @@ class CompileTimeStrategy:
                 if len(var.shape) >= 2:
                     # align by dim1(width)
-                    dim1 = reduce(lambda x, y: x * y, var.shape[1:])
+                    dim1 = reduce(lambda x, y: x * y, var.shape[1:], 1)
                     remains = block_size % dim1
                     if remains != 0:
                         block_size += dim1 - remains
@@ -1102,7 +1102,7 @@ class CompileTimeStrategy:
                 for i in range(remainder):
                     dim0s[i] = dim0s[i] + 1
-                dim1 = reduce(lambda x, y: x * y, var.shape[1:])
+                dim1 = reduce(lambda x, y: x * y, var.shape[1:], 1)
                 for block_id in range(len(dim0s)):
                     numel = dim0s[block_id] * dim1
......
@@ -1484,7 +1484,7 @@ def get_communicate_var_info(
         # raise ValueError(
         #     "Variable {} not support heter training. its shape is {}".
         #     format(name, shape))
-        recv_var_dim = -1 * reduce(lambda x, y: x * y, shape)
+        recv_var_dim = -1 * reduce(lambda x, y: x * y, shape, 1)
         input_var_reshape_dim.append(recv_var_dim)
         input_var_reshape_name.append(f"{name}.input_reshape@Heter")
@@ -1497,7 +1497,7 @@ def get_communicate_var_info(
     #     # raise ValueError(
     #     #     "Variable {} not support heter training. its shape is {}".
     #     #     format(var_name, shape))
-    #     send_reshape_dim = -1 * reduce(lambda x, y: x * y, shape)
+    #     send_reshape_dim = -1 * reduce(lambda x, y: x * y, shape, 1)
     #     output_var_reshape_dim.append(send_reshape_dim)
     #     output_var_reshape_name.append("{}.output_reshape@Heter".format(
     #         var_name))
......
@@ -65,7 +65,7 @@ class VarStruct:
         self.lod_level = lod_level
         self.persistable = persistable
         self.m_size = 1
-        self.m_size = reduce(lambda x, y: x * y, shape)
+        self.m_size = reduce(lambda x, y: x * y, shape, 1)
         self.m_size *= dtype_to_size[dtype]
     def __str__(self):
......
@@ -546,7 +546,7 @@ def local_response_norm(
         from functools import reduce
-        sum_sizes = reduce(lambda x, y: x * y, sizes[1:])
+        sum_sizes = reduce(lambda x, y: x * y, sizes[1:], 1)
         div = paddle.unsqueeze(paddle.multiply(x, x), axis=1)
         if not channel_last:
......
@@ -155,7 +155,7 @@ def vector_to_parameters(vec, parameters, name=None):
     for param in parameters:
         shape = param.shape
         origin_shapes.append(shape)
-        numel = reduce(lambda x, y: x * y, shape)
+        numel = reduce(lambda x, y: x * y, shape, 1)
         sections.append(numel)
     if len(sections) == 1:
......
@@ -3601,7 +3601,7 @@ def layer_norm(
     # create intput and parameters
     inputs = {'X': input}
     input_shape = input.shape
-    param_shape = [reduce(lambda x, y: x * y, input_shape[begin_norm_axis:])]
+    param_shape = [reduce(lambda x, y: x * y, input_shape[begin_norm_axis:], 1)]
     if scale:
         assert (
             param_attr is not False
......
@@ -46,13 +46,13 @@ class TestOneDNNReshapeTransposeMatmulFusePass(PassAutoScanTest):
         def generate_input2(attrs):
             shape_x = [attrs[3]['batch_size'], attrs[3]['channel'], self.num]
-            input_volume = reduce(lambda x, y: x * y, shape_x)
+            input_volume = reduce(lambda x, y: x * y, shape_x, 1)
             matmul_shape = list(attrs[0]['shape'])
             if 0 in matmul_shape:
                 for i in range(len(matmul_shape)):
                     if matmul_shape[i] == 0:
                         matmul_shape[i] = shape_x[i]
-            shape_volume = reduce(lambda x, y: x * y, matmul_shape)
+            shape_volume = reduce(lambda x, y: x * y, matmul_shape, 1)
             if -1 in matmul_shape:
                 for i in range(len(matmul_shape)):
......
@@ -1252,7 +1252,9 @@ class XPUTestSetValueOp(XPUOpTestWrapper):
         paddle.enable_static()
         to_string = lambda x, i: x + '_' + str(i)
-        numel = lambda input_shape: reduce(lambda x, y: x * y, input_shape)
+        numel = lambda input_shape: reduce(
+            lambda x, y: x * y, input_shape, 1
+        )
         def op1(x):
             value = paddle.tensor.fill_constant([1], "float32", 1)
......