fix reshard bug (#41106)

e494b73b · caozhou · GitHub · ee8eeb45 · e494b73b · e494b73b
Showing 2 changed files with 12 additions and 13 deletions

python/paddle/distributed/auto_parallel/planner.py +9 -12

python/paddle/distributed/auto_parallel/reshard.py +3 -1

File not found.
--- a/python/paddle/distributed/auto_parallel/planner.py
+++ b/python/paddle/distributed/auto_parallel/planner.py
@@ -15,7 +15,6 @@
 import copy
 import time
 import random
-import logging
 from functools import reduce
 from itertools import chain, product
 from collections import OrderedDict
@@ -741,7 +740,7 @@ class MCMC(SearchAlgorithm):
        return best_dist_context, min_cost
    def search(self):
-        logging.info("Start MCMC searching.")
+        print("Start MCMC searching.")
        start_time = time.time()
        train_program = self.serial_program_info.train_program
        cluster = self.serial_program_info.cluster
@@ -757,9 +756,8 @@ class MCMC(SearchAlgorithm):
        searched_pipeline_dist_context = None
        pipeline_min_cost = None
        for process_mesh_topology in process_mesh_topology_list:
-            logging.info(
+            print("MCMC search: search process mesh {} with pipeline mode.".
-                "MCMC search: search process mesh {} with pipeline mode.".
+                  format(process_mesh_topology))
-                format(process_mesh_topology))
            valid_dist_attr_dict, pipeline_process_meshes, global_process_mesh = PlanSpace.enum_valid_dist_attr_for_program(
                train_program, process_mesh_topology, True)
            init_dist_context = self.init_program(
@@ -768,7 +766,7 @@ class MCMC(SearchAlgorithm):
            best_dist_context, cost = self._search_core(valid_dist_attr_dict,
                                                        init_dist_context,
                                                        pipeline_process_meshes)
-            logging.info(
+            print(
                "MCMC search: the min cost is {} in the process mesh {} with pipeline mode.".
                format(cost, process_mesh_topology))
            best_dist_context._dist_op_context = DistributedOperatorContext()
@@ -784,9 +782,8 @@ class MCMC(SearchAlgorithm):
            # if process_mesh_topology shape is 3, include pipeline mode by default
            if len(process_mesh_topology) == 3:
                continue
-            logging.info(
+            print("MCMC search: search process mesh {} without pipeline mode.".
-                "MCMC search: search process mesh {} without pipeline mode.".
+                  format(process_mesh_topology))
-                format(process_mesh_topology))
            valid_dist_attr_dict, pipeline_process_meshes, global_process_mesh = PlanSpace.enum_valid_dist_attr_for_program(
                train_program, process_mesh_topology, False)
            init_dist_context = self.init_program(
@@ -795,7 +792,7 @@ class MCMC(SearchAlgorithm):
            best_dist_context, cost = self._search_core(valid_dist_attr_dict,
                                                        init_dist_context,
                                                        pipeline_process_meshes)
-            logging.info(
+            print(
                "MCMC search: the min cost is {} in the process mesh {} without pipeline mode.".
                format(cost, process_mesh_topology))
            best_dist_context._dist_op_context = DistributedOperatorContext()
@@ -808,7 +805,7 @@ class MCMC(SearchAlgorithm):
        if non_pipeline_min_cost > pipeline_min_cost:
            searched_dist_context = searched_pipeline_dist_context
            min_cost = pipeline_min_cost
-            logging.info(
+            print(
                "Better set FLAGS_benchmark=1 to avoid hang problem in the pipeline mode."
            )
        else:
@@ -820,7 +817,7 @@ class MCMC(SearchAlgorithm):
        for process_mesh in searched_dist_context._process_meshes:
            pg0.add_ranks(process_mesh.processes)
        end_time = time.time()
-        logging.info(
+        print(
            "End MCMC searching: the min cost is {} and the search time is {}s.".
            format(min_cost, end_time - start_time))
        return searched_dist_context, min_cost

--- a/python/paddle/distributed/auto_parallel/reshard.py
+++ b/python/paddle/distributed/auto_parallel/reshard.py
@@ -1239,7 +1239,9 @@ class Resharder:
                    for item in self.has_allgather[var_name]:
                        if op_desc.group == item[0]:
                            tensor_list = [
-                                program.global_block().vars[var_name]
+                                get_var_with_recursion(
+                                    var_name, block,
+                                    self.auto_parallel_main_prog)
                                for var_name in item[1]
                            ]
                            break