From 5c9c1a39a1ffde467f94f2290e7aaf33a0deec18 Mon Sep 17 00:00:00 2001
From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com>
Date: Thu, 12 Jan 2023 10:16:06 +0800
Subject: [PATCH] [AutoParallel] recovery annotation (#49665)

* recovery annotation

* bugfix
---
 .../distributed/auto_parallel/completion.py  |  2 --
 .../auto_parallel/amp_pass_unittest.py       | 38 +++++++++----------
 .../unittests/auto_parallel/engine_api.py    | 20 +++++-----
 .../auto_parallel/test_engine_api.py         |  4 +-
 .../unittests/auto_parallel/test_pass_amp.py |  4 +-
 5 files changed, 33 insertions(+), 35 deletions(-)

diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py
index 123ff0e0206..8960c47c1f5 100644
--- a/python/paddle/distributed/auto_parallel/completion.py
+++ b/python/paddle/distributed/auto_parallel/completion.py
@@ -939,7 +939,6 @@ class Completer:
             self._dist_context._serial_main_program = serial_main_program
 
         if not is_naive_data_parallel(self._dist_context):
-            print("$$$$$$ here 0", flush=True)
             self._dist_context.initialize(with_graph=True)
             self._prepare()
             self._update_process_mesh()
@@ -947,7 +946,6 @@ class Completer:
             # Copy the corresponding distributed attribute from graph to serial_main_program
             self._dist_context.copy_dist_attr_from_graph_to_program()
         else:
-            print("$$$$$$ here 2", flush=True)
             self._logger.info("Default distributed attributed will be set.")
             self._dist_context.initialize(with_graph=False)
             # A fast and special completion for data parallel
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/amp_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/amp_pass_unittest.py
index 917494a19a8..1f90f90b2fb 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/amp_pass_unittest.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/amp_pass_unittest.py
@@ -89,31 +89,31 @@ class TestAMPPass(unittest.TestCase):
         )
 
     def test_amp_pass(self):
-        # # mp2 training
-        # mp_engine = self.get_engine()
-        # history = mp_engine.fit(self.dataset, 3, batch_size=self.batch_size)
-        # mp_losses = np.array(history.history["loss"])
+        # mp2 training
+        mp_engine = self.get_engine()
+        history = mp_engine.fit(self.dataset, 3, batch_size=self.batch_size)
+        mp_losses = np.array(history.history["loss"])
 
         # mp2 amp-o1 training
         amp_o1_engine = self.get_engine(True, "o1")
         history = amp_o1_engine.fit(self.dataset, 3, batch_size=self.batch_size)
         amp_o1_losses = np.array(history.history["loss"])
         amp_o1_engine.evaluate(self.dataset, 3, batch_size=self.batch_size)
-        # # self.check_results(mp_losses, amp_o1_losses)
-
-        # # mp2 amp-o2 training
-        # amp_o2_engine = self.get_engine(True, "o2")
-        # history = amp_o2_engine.fit(self.dataset, 3, batch_size=self.batch_size)
-        # amp_o2_losses = np.array(history.history["loss"])
-        # amp_o2_engine.evaluate(self.dataset, 3, batch_size=self.batch_size)
-        # # self.check_results(mp_losses, amp_o2_losses)
-
-        # # mp2 amp-o3 training
-        # amp_o3_engine = self.get_engine(True, "o3")
-        # history = amp_o3_engine.fit(self.dataset, 3, batch_size=self.batch_size)
-        # amp_o3_losses = np.array(history.history["loss"])
-        # amp_o3_engine.evaluate(self.dataset, 3, batch_size=self.batch_size)
-        # # self.check_results(mp_losses, amp_o3_losses)
+        # self.check_results(mp_losses, amp_o1_losses)
+
+        # mp2 amp-o2 training
+        amp_o2_engine = self.get_engine(True, "o2")
+        history = amp_o2_engine.fit(self.dataset, 3, batch_size=self.batch_size)
+        amp_o2_losses = np.array(history.history["loss"])
+        amp_o2_engine.evaluate(self.dataset, 3, batch_size=self.batch_size)
+        # self.check_results(mp_losses, amp_o2_losses)
+
+        # mp2 amp-o3 training
+        amp_o3_engine = self.get_engine(True, "o3")
+        history = amp_o3_engine.fit(self.dataset, 3, batch_size=self.batch_size)
+        amp_o3_losses = np.array(history.history["loss"])
+        amp_o3_engine.evaluate(self.dataset, 3, batch_size=self.batch_size)
+        # self.check_results(mp_losses, amp_o3_losses)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py
index 003b09f9f37..1ff2cc5822d 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py
@@ -158,9 +158,9 @@ def train_high_level(fetch):
     eval_dataset2 = MyDataset(batch_size)
     engine.evaluate(eval_dataset2, batch_size=batch_size)
 
-    # # predict
-    # test_dataset = MyDataset(batch_size)
-    # outputs = engine.predict(test_dataset, batch_size=batch_size)
+    # predict
+    test_dataset = MyDataset(batch_size)
+    outputs = engine.predict(test_dataset, batch_size=batch_size)
 
     # save
     temp_dir = tempfile.TemporaryDirectory()
@@ -498,10 +498,10 @@ def get_cost_by_spec():
 
 if __name__ == "__main__":
     train_high_level(fetch=True)
-    # train_high_level(fetch=False)
-    # train_low_level()
-    # train_builtin_data_vars()
-    # train_non_builtin_data_vars()
-    # get_cost()
-    # get_cost_by_default_program()
-    # get_cost_by_spec()
+    train_high_level(fetch=False)
+    train_low_level()
+    train_builtin_data_vars()
+    train_non_builtin_data_vars()
+    get_cost()
+    get_cost_by_default_program()
+    get_cost_by_spec()
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py
index d9b584ae21b..68eb819dd8b 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py
@@ -38,8 +38,8 @@ class TestEngineAPI(unittest.TestCase):
                 "paddle.distributed.launch",
                 "--devices",
                 "0,1",
-                # "--log_dir",
-                # tmp_dir.name,
+                "--log_dir",
+                tmp_dir.name,
                 launch_model_path,
             ]
         )
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_amp.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_amp.py
index bb6e3636284..492159e650d 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_amp.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_amp.py
@@ -38,8 +38,8 @@ class TestAMPPass(unittest.TestCase):
                 "paddle.distributed.launch",
                 "--devices",
                 "0,1",
-                # "--log_dir",
-                # tmp_dir.name,
+                "--log_dir",
+                tmp_dir.name,
                 launch_model_path,
             ]
         )
-- 
GitLab
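
The blocks this patch un-comments all drive the auto-parallel Engine's
high-level loop (fit, then evaluate, then predict). For orientation, here is
a minimal sketch of that call sequence as the restored tests run it. It
assumes the `engine` object, the `MyDataset` dataset, and `batch_size` that
engine_api.py sets up (none of which appear in this patch), so it is a
sketch under those assumptions rather than a standalone script:

    import numpy as np

    # Assumed from engine_api.py's setup and not defined here:
    # engine (an auto-parallel Engine), MyDataset, batch_size.
    train_dataset = MyDataset(batch_size)
    # Same call shape as the tests above; `fit` returns a history object
    # whose "loss" series the tests turn into an array and compare.
    history = engine.fit(train_dataset, 3, batch_size=batch_size)
    losses = np.array(history.history["loss"])

    # Fresh datasets for evaluation and inference, as train_high_level does.
    eval_dataset = MyDataset(batch_size)
    engine.evaluate(eval_dataset, batch_size=batch_size)

    test_dataset = MyDataset(batch_size)
    outputs = engine.predict(test_dataset, batch_size=batch_size)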