未验证 提交 22c67d14 编写于 作者: Z ziyoujiyi 提交者: GitHub

统一 ps 开发 - python (#39431)

* delete gloo connect retry

* the_one_ps dirs reconstruct

* .

* .

* create the_one_ps dirs

* create the_one_ps dirs

* create the_one_ps dirs

* create the_one_ps dirs

* create the_one_ps dirs

* create the_one_ps dirs

* the one ps dirs modify

* the one ps dirs modify

* the one ps dirs modify

* the one ps dirs modify

* refactor ps optimize

* refactor ps optimize

* refactor ps optimize

* .

* .

* .

* .

* .

* .

* refactor theoneps

* the_one_ps

* add ps pass unittest

* add ps pass unittest

* ps unitest frame

* ps unittest frame

* ps unittest frame

* ps unittest frame

* ps unittest frame

* ps unittest frame

* ps unittest frame

* ps unittest frame

* ps unittest frame

* ps unittest frame

* ps unittest frame

* ps unittest frame

* ps unittest frame

* ps unittest frame

* ps unittest frame

* ps unittest frame

* ps unittest frame

* add cpu_async_ps_mode test

* add cpu_async_ps_mode test

* add cpu_async_ps_mode test

* ps unittest ready

* ps unittest ready

* solve dist_pass init conflict

* solve import CommContext error

* unittest ok

* implement AllocateFrom

* solve setup.py.in conflict

* solve conflict

* solve conflict

* solve conflict

* .

* .

* cpu-async-ps minimize test ok & gpu minimize test ok
Co-authored-by: Nzkh2016 <zhangkaihuo@baidu.com>
上级 1c44d3e2
...@@ -31,7 +31,6 @@ class ParameterServerOptimizer(MetaOptimizerBase): ...@@ -31,7 +31,6 @@ class ParameterServerOptimizer(MetaOptimizerBase):
self.inner_opt = optimizer self.inner_opt = optimizer
# we do not allow meta optimizer to be inner optimizer currently # we do not allow meta optimizer to be inner optimizer currently
self.meta_optimizers_white_list = [] self.meta_optimizers_white_list = []
self.attrs = {}
self.pass_ctx = PassContext() self.pass_ctx = PassContext()
def _set_basic_info(self, loss, role_maker, user_defined_optimizer, def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
...@@ -40,50 +39,48 @@ class ParameterServerOptimizer(MetaOptimizerBase): ...@@ -40,50 +39,48 @@ class ParameterServerOptimizer(MetaOptimizerBase):
loss, role_maker, user_defined_optimizer, user_defined_strategy) loss, role_maker, user_defined_optimizer, user_defined_strategy)
def _init_ps_pass_context(self, loss, startup_program): def _init_ps_pass_context(self, loss, startup_program):
attrs = {}
# trainer # trainer
self.attrs["env"] = get_dist_env() attrs["env"] = get_dist_env()
self.attrs['loss'] = loss attrs['loss'] = loss
self.attrs['min_block_size'] = 81920 attrs['min_block_size'] = 81920
self.attrs['origin_main_program'] = loss.block.program attrs['origin_main_program'] = loss.block.program
self.attrs['origin_startup_program'] = startup_program attrs['origin_startup_program'] = startup_program
self.attrs['cloned_main'] = loss.block.program.clone() attrs['cloned_main'] = attrs['origin_main_program'].clone()
self.attrs['cloned_startup'] = startup_program.clone() attrs['cloned_startup'] = attrs['origin_startup_program'].clone()
self.attrs['user_defined_strategy'] = self.user_defined_strategy attrs['user_defined_strategy'] = self.user_defined_strategy
self.attrs['trainer'] = TrainerRuntimeConfig(self.user_defined_strategy) attrs['trainer'] = TrainerRuntimeConfig(self.user_defined_strategy)
self.attrs['ps_mode'] = self.attrs['trainer'].mode attrs['ps_mode'] = attrs['trainer'].mode
self.attrs['role_maker'] = self.role_maker attrs['role_maker'] = self.role_maker
self.attrs[ attrs[
'is_heter_ps_mode'] = self.role_maker._is_heter_parameter_server_mode 'is_heter_ps_mode'] = self.role_maker._is_heter_parameter_server_mode
self.attrs['is_worker'] = self.role_maker._is_worker() attrs['is_worker'] = self.role_maker._is_worker()
self.attrs['is_server'] = self.role_maker._is_server() attrs['is_server'] = self.role_maker._is_server()
self.attrs['is_heter_worker'] = self.role_maker._is_heter_worker() attrs['is_heter_worker'] = self.role_maker._is_heter_worker()
self.attrs['use_ps_gpu'] = self.user_defined_strategy.a_sync_configs[ attrs['use_ps_gpu'] = self.user_defined_strategy.a_sync_configs[
"use_ps_gpu"] "use_ps_gpu"]
self.attrs[ attrs['lr_decay_steps'] = self.user_defined_strategy.a_sync_configs[
'lr_decay_steps'] = self.user_defined_strategy.a_sync_configs[
"lr_decay_steps"] "lr_decay_steps"]
self.attrs['k_steps'] = self.user_defined_strategy.a_sync_configs[ attrs['k_steps'] = self.user_defined_strategy.a_sync_configs["k_steps"]
"k_steps"] attrs['launch_barrier'] = self.user_defined_strategy.a_sync_configs[
self.attrs[
'launch_barrier'] = self.user_defined_strategy.a_sync_configs[
"launch_barrier"] "launch_barrier"]
self.attrs['launch_barrier_flag'] = int( attrs['launch_barrier_flag'] = int(
os.getenv("FLAGS_LAUNCH_BARRIER", "1")) os.getenv("FLAGS_LAUNCH_BARRIER", "1"))
build_var_distributed(self.attrs) build_var_distributed(attrs)
# server # server
self.attrs['_main_server'] = fluid.Program() attrs['_main_server'] = fluid.Program()
self.attrs['_startup_server'] = fluid.Program() attrs['_startup_server'] = fluid.Program()
self.attrs['tensor_table'] = {} attrs['tensor_table'] = {}
self.pass_ctx._attrs = self.attrs self.pass_ctx._attrs = attrs
def _is_graph_out(self): def _is_graph_out(self):
return False return False
......
...@@ -115,9 +115,9 @@ class AddLrDecayTablePass(PassBase): ...@@ -115,9 +115,9 @@ class AddLrDecayTablePass(PassBase):
LRScheduler), "must be LRScheduler" LRScheduler), "must be LRScheduler"
ops = get_optimize_ops(attrs['origin_main_program']) ops = get_optimize_ops(attrs['origin_main_program'])
lr_decay_main_program, lr_decay_startup_program, lr_name = _get_lr_sheduler_program( lr_decay_main_program, lr_decay_startup_program, lr_name = self._get_lr_sheduler_program(
attrs['origin_main_program'].lr_sheduler, attrs['lr_decay_steps']) attrs['origin_main_program'].lr_sheduler, attrs['lr_decay_steps'])
_add_tensor_table(attrs, "@LR_DECAY_COUNTER@", lr_name, self._add_tensor_table(attrs, "@LR_DECAY_COUNTER@", lr_name,
lr_decay_startup_program, lr_decay_main_program, lr_decay_startup_program, lr_decay_main_program,
"GlobalStepTable") "GlobalStepTable")
return return
......
...@@ -41,7 +41,7 @@ class PsProgramBuilder(object): ...@@ -41,7 +41,7 @@ class PsProgramBuilder(object):
pass pass
def _build_trainer_programs(self): def _build_trainer_programs(self):
pass raise NotImplementedError
def _build_pserver_programs(self): def _build_pserver_programs(self):
is_sgd_adam = False is_sgd_adam = False
...@@ -60,11 +60,13 @@ class PsProgramBuilder(object): ...@@ -60,11 +60,13 @@ class PsProgramBuilder(object):
def _build_programs(self): def _build_programs(self):
if self.attrs['is_worker']: if self.attrs['is_worker']:
logger.info("start building trainer program")
self._build_trainer_programs() self._build_trainer_programs()
fluid.framework.switch_startup_program(self.cloned_startup) fluid.framework.switch_startup_program(self.cloned_startup)
self.loss.block.program = self.cloned_main self.loss.block.program = self.cloned_main
elif self.attrs['is_server']: elif self.attrs['is_server']:
logger.info("start building pserver program")
self._build_pserver_programs() self._build_pserver_programs()
self.loss.block.program = self.attrs['_main_server'] self.loss.block.program = self.attrs['_main_server']
fluid.framework.switch_startup_program(self.attrs[ fluid.framework.switch_startup_program(self.attrs[
...@@ -73,6 +75,7 @@ class PsProgramBuilder(object): ...@@ -73,6 +75,7 @@ class PsProgramBuilder(object):
class GeoPsProgramBuilder(PsProgramBuilder): # 仅 CPU 模式 class GeoPsProgramBuilder(PsProgramBuilder): # 仅 CPU 模式
def __init__(self, pass_ctx): def __init__(self, pass_ctx):
logger.info("start building geo-ps program")
super(GeoPsProgramBuilder, self).__init__(pass_ctx) super(GeoPsProgramBuilder, self).__init__(pass_ctx)
if self.ps_mode != DistributedMode.GEO: if self.ps_mode != DistributedMode.GEO:
raise ValueError("ps mode: {} not matched {}", raise ValueError("ps mode: {} not matched {}",
...@@ -92,6 +95,7 @@ class GeoPsProgramBuilder(PsProgramBuilder): # 仅 CPU 模式 ...@@ -92,6 +95,7 @@ class GeoPsProgramBuilder(PsProgramBuilder): # 仅 CPU 模式
class CpuSyncPsProgramBuilder(PsProgramBuilder): class CpuSyncPsProgramBuilder(PsProgramBuilder):
def __init__(self, pass_ctx): def __init__(self, pass_ctx):
logger.info("start building cpu-sync-ps program")
super(CpuSyncPsProgramBuilder, self).__init__(pass_ctx) super(CpuSyncPsProgramBuilder, self).__init__(pass_ctx)
if self.ps_mode == DistributedMode.GEO: if self.ps_mode == DistributedMode.GEO:
raise ValueError("ps mode: {} not matched {}", raise ValueError("ps mode: {} not matched {}",
...@@ -130,14 +134,17 @@ class CpuSyncPsProgramBuilder(PsProgramBuilder): ...@@ -130,14 +134,17 @@ class CpuSyncPsProgramBuilder(PsProgramBuilder):
class CpuAsyncPsProgramBuilder(CpuSyncPsProgramBuilder): class CpuAsyncPsProgramBuilder(CpuSyncPsProgramBuilder):
def __init__(self, pass_ctx): def __init__(self, pass_ctx):
logger.info("start building cpu-async-ps program")
super(CpuAsyncPsProgramBuilder, self).__init__(pass_ctx) super(CpuAsyncPsProgramBuilder, self).__init__(pass_ctx)
class GpuPsProgramBuilder(PsProgramBuilder): # 和 geo、sync、async 等模式无关 class GpuPsProgramBuilder(PsProgramBuilder):
def __init__(self, pass_ctx): def __init__(self, pass_ctx):
logger.info("start building gpu-ps program")
super(GpuPsProgramBuilder, self).__init__(pass_ctx) super(GpuPsProgramBuilder, self).__init__(pass_ctx)
def _build_trainer_programs(self): def _build_trainer_programs(self):
add_lr_decay_table_pass = new_pass("add_lr_decay_table_pass", add_lr_decay_table_pass = new_pass("add_lr_decay_table_pass",
self.attrs) self.attrs)
add_lr_decay_table_pass.apply([], [], self.pass_ctx) add_lr_decay_table_pass.apply([], [], self.pass_ctx)
...@@ -152,7 +159,8 @@ class GpuPsProgramBuilder(PsProgramBuilder): # 和 geo、sync、async 等模式 ...@@ -152,7 +159,8 @@ class GpuPsProgramBuilder(PsProgramBuilder): # 和 geo、sync、async 等模式
ps_gpu_pass.apply([self.cloned_main], [None], self.pass_ctx) ps_gpu_pass.apply([self.cloned_main], [None], self.pass_ctx)
ps_transpile_pass = new_pass("ps_transpile_pass", self.attrs) ps_transpile_pass = new_pass("ps_transpile_pass", self.attrs)
ps_transpile_pass.apply([_main], [_startup], self.pass_ctx) ps_transpile_pass.apply([self.cloned_main], [self.cloned_startup],
self.pass_ctx)
self.attrs['origin_main_program'] = self.cloned_main self.attrs['origin_main_program'] = self.cloned_main
self.attrs['origin_startup_program'] = self.cloned_startup self.attrs['origin_startup_program'] = self.cloned_startup
...@@ -165,6 +173,7 @@ class GpuPsProgramBuilder(PsProgramBuilder): # 和 geo、sync、async 等模式 ...@@ -165,6 +173,7 @@ class GpuPsProgramBuilder(PsProgramBuilder): # 和 geo、sync、async 等模式
class HeterAsyncPsProgramBuilder(PsProgramBuilder): class HeterAsyncPsProgramBuilder(PsProgramBuilder):
def __init__(self, pass_ctx): def __init__(self, pass_ctx):
logger.info("start building heter-async-ps program")
super(HeterAsyncPsProgramBuilder, self).__init__(pass_ctx) super(HeterAsyncPsProgramBuilder, self).__init__(pass_ctx)
if self.use_ps_gpu or self.ps_mode == DistributedMode.GEO or self.attrs[ if self.use_ps_gpu or self.ps_mode == DistributedMode.GEO or self.attrs[
'is_heter_ps_mode'] == False: 'is_heter_ps_mode'] == False:
......
...@@ -27,6 +27,10 @@ from paddle.fluid.core import CommContext ...@@ -27,6 +27,10 @@ from paddle.fluid.core import CommContext
import paddle.fluid.framework as framework import paddle.fluid.framework as framework
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
#logging.basicConfig(
# format='%(levelname)s - %(asctime)s - %(pathname)s: %(lineno)s - %(message)s', level=logging.INFO)
#logger = logging.getLogger(__name__)
OP_NAME_SCOPE = "op_namescope" OP_NAME_SCOPE = "op_namescope"
CLIP_OP_NAME_SCOPE = "gradient_clip" CLIP_OP_NAME_SCOPE = "gradient_clip"
STEP_COUNTER = "@PS_STEP_COUNTER@" STEP_COUNTER = "@PS_STEP_COUNTER@"
...@@ -43,6 +47,24 @@ SPARSE_OP_LIST = ["lookup_table", "lookup_table_v2"] ...@@ -43,6 +47,24 @@ SPARSE_OP_LIST = ["lookup_table", "lookup_table_v2"]
SPARSE_OP_TYPE_DICT = {"lookup_table": "W", "lookup_table_v2": "W"} SPARSE_OP_TYPE_DICT = {"lookup_table": "W", "lookup_table_v2": "W"}
def logger_config(log_path, logging_name):
logger = logging.getLogger(logging_name)
logger.setLevel(level=logging.DEBUG)
handler = logging.FileHandler(log_path, mode='a', encoding='UTF-8')
handler.setLevel(logging.INFO)
formatter = logging.Formatter(
'%(levelname)s - %(asctime)s - %(pathname)s: %(lineno)s - %(message)s')
handler.setFormatter(formatter)
console = logging.StreamHandler()
console.setLevel(logging.DEBUG)
logger.addHandler(handler)
logger.addHandler(console)
return logger
logger = logger_config(log_path='/ps_log', logging_name='ps_log')
class DistributedMode: class DistributedMode:
SYNC = 0 SYNC = 0
ASYNC = 1 ASYNC = 1
......
...@@ -36,7 +36,10 @@ class PsPassTestBase(unittest.TestCase): ...@@ -36,7 +36,10 @@ class PsPassTestBase(unittest.TestCase):
def tearDown(self): def tearDown(self):
print('Ps tearDown...') print('Ps tearDown...')
def ps_launch(self, config): def ps_launch(self, config, ps_mode="cpu-ps"):
if ps_mode == "cpu-ps":
os.environ['WITH_DISTRIBUTE'] = 'ON'
cmd = [ cmd = [
sys.executable, sys.executable,
"-u", "-u",
...@@ -50,6 +53,31 @@ class PsPassTestBase(unittest.TestCase): ...@@ -50,6 +53,31 @@ class PsPassTestBase(unittest.TestCase):
config['debug_new_minimize'], "--applied_pass_name", config['debug_new_minimize'], "--applied_pass_name",
config['applied_pass_name'] config['applied_pass_name']
] ]
elif ps_mode == "gpu-ps":
os.environ['FLAGS_LAUNCH_BARRIER'] = '0'
os.environ['PADDLE_PSERVER_NUMS'] = '1'
os.environ['PADDLE_TRAINERS_NUM'] = '1'
os.environ['POD_IP'] = '127.0.0.1'
os.environ['PADDLE_PSERVERS_IP_PORT_LIST'] = '127.0.0.1:29011'
os.environ['PADDLE_PORT'] = '29011'
os.environ['FLAGS_selected_gpus'] = '0,1,2,3,4,5,6,7'
# pserver
# os.environ['TRAINING_ROLE'] = 'PSERVER'
# trainer
os.environ['TRAINING_ROLE'] = 'TRAINER'
os.environ['PADDLE_TRAINER_ID'] = '0'
cmd = [
sys.executable, "-u", "../ps/ps_dnn_trainer.py", "-m",
config['ps_mode_config'], "--run_minimize",
config['run_minimize'], "--run_single_pass",
config['run_single_pass'], "--debug_new_pass",
config['debug_new_pass'], "--debug_new_minimize",
config['debug_new_minimize'], "--applied_pass_name",
config['applied_pass_name']
]
cmd = [shlex.quote(c) for c in cmd] cmd = [shlex.quote(c) for c in cmd]
prepare_python_path_and_return_module(__file__) prepare_python_path_and_return_module(__file__)
exitcode = os.system(' '.join(cmd)) exitcode = os.system(' '.join(cmd))
...@@ -21,6 +21,7 @@ import numpy as np ...@@ -21,6 +21,7 @@ import numpy as np
import paddle import paddle
from ps_pass_test_base import * from ps_pass_test_base import *
from paddle.distributed.ps.utils.public import logger
from paddle.fluid.tests.unittests.ps.ps_dnn_trainer import DnnTrainer from paddle.fluid.tests.unittests.ps.ps_dnn_trainer import DnnTrainer
...@@ -38,30 +39,43 @@ class TestPsTrainerPass(PsPassTestBase): ...@@ -38,30 +39,43 @@ class TestPsTrainerPass(PsPassTestBase):
self.config['applied_pass_name'] = "" self.config['applied_pass_name'] = ""
def setUp(self): def setUp(self):
print('TestPsTrainerPass setUp...') pass
def tearDown(self): def tearDown(self):
print('TestPsTrainerPass tearDown...') pass
def check(self): def check(self):
pass pass
def test_ps_optimizer_minimize(self): def test_ps_optimizer_minimize_cpu(self):
self.init() self.init()
self.config['run_minimize'] = '1' self.config['run_minimize'] = '1'
self.config['debug_new_minimize'] = '0' self.config['debug_new_minimize'] = '0'
self.config['log_dir'] = "/log_old_minimize" self.config['log_dir'] = "/cpu_log_old_minimize"
remove_path_if_exists(self.config['log_dir']) remove_path_if_exists(self.config['log_dir'])
self.ps_launch(self.config) self.ps_launch(self.config)
self.config['debug_new_minimize'] = '1' self.config['debug_new_minimize'] = '1'
self.config['log_dir'] = "/log_new_minimize" self.config['log_dir'] = "/cpu_log_new_minimize"
remove_path_if_exists(self.config['log_dir']) remove_path_if_exists(self.config['log_dir'])
self.ps_launch(self.config) self.ps_launch(self.config)
self.check() self.check()
def test_ps_optimizer_minimize_gpu(self):
self.init()
self.config['run_minimize'] = '1'
self.config['ps_mode_config'] = "../ps/gpu_ps_config.yaml"
self.config['debug_new_minimize'] = '0'
self.ps_launch(self.config, "gpu-ps")
self.config['debug_new_minimize'] = '1'
self.ps_launch(self.config, "gpu-ps")
self.check()
def test_append_send_ops_pass(self): def test_append_send_ops_pass(self):
self.init() self.init()
self.config['run_single_pass'] = '1' self.config['run_single_pass'] = '1'
...@@ -70,12 +84,12 @@ class TestPsTrainerPass(PsPassTestBase): ...@@ -70,12 +84,12 @@ class TestPsTrainerPass(PsPassTestBase):
self.config['debug_new_pass'] = '0' self.config['debug_new_pass'] = '0'
self.config['log_dir'] = "/log_old_" + self.config['applied_pass_name'] self.config['log_dir'] = "/log_old_" + self.config['applied_pass_name']
remove_path_if_exists(self.config['log_dir']) remove_path_if_exists(self.config['log_dir'])
self.ps_launch(self.config) self.ps_launch(self.config, "cpu-ps")
self.config['debug_new_pass'] = '1' self.config['debug_new_pass'] = '1'
self.config['log_dir'] = "/log_new_" + self.config['applied_pass_name'] self.config['log_dir'] = "/log_new_" + self.config['applied_pass_name']
remove_path_if_exists(self.config['log_dir']) remove_path_if_exists(self.config['log_dir'])
self.ps_launch(self.config) self.ps_launch(self.config, "cpu-ps")
self.check() self.check()
...@@ -84,4 +98,5 @@ class TestPsTrainerPass(PsPassTestBase): ...@@ -84,4 +98,5 @@ class TestPsTrainerPass(PsPassTestBase):
if __name__ == '__main__': if __name__ == '__main__':
remove_path_if_exists('/ps_log')
unittest.main() unittest.main()
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# refer to PaddleRec/models/rank/dnn/benchmark.yaml
hyper_parameters: hyper_parameters:
optimizer: optimizer:
class: Adam class: Adam
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# refer to PaddleRec/models/rank/dnn/config_gpubox.yaml
hyper_parameters:
optimizer:
class: Adam
learning_rate: 0.001
strategy: async
sparse_inputs_slots: 27
sparse_feature_number: 1024
sparse_feature_dim: 11
dense_input_dim: 13
fc_sizes: [512, 256, 128, 32]
distributed_embedding: 0
runner:
geo_step: 400
sync_mode: "gpubox"
thread_num: 16
use_gpu: 1
model_path: "../ps_dnn_model.py"
...@@ -33,10 +33,6 @@ from ps_dnn_model import StaticModel ...@@ -33,10 +33,6 @@ from ps_dnn_model import StaticModel
__dir__ = os.path.dirname(os.path.abspath(__file__)) __dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))
logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)
def is_distributed_env(): def is_distributed_env():
node_role = os.getenv("TRAINING_ROLE") node_role = os.getenv("TRAINING_ROLE")
...@@ -140,7 +136,7 @@ class YamlHelper(object): ...@@ -140,7 +136,7 @@ class YamlHelper(object):
if header: if header:
draws += h_format.format(header[0], header[1]) draws += h_format.format(header[0], header[1])
else: else:
draws += h_format.format("PaddleRec Benchmark Envs", "Value") draws += h_format.format("Ps Benchmark Envs", "Value")
draws += line + "\n" draws += line + "\n"
...@@ -163,7 +159,7 @@ def get_user_defined_strategy(config): ...@@ -163,7 +159,7 @@ def get_user_defined_strategy(config):
logger.warn( logger.warn(
"Not Find Distributed env, Change To local train mode. If you want train with fleet, please use [fleetrun] command." "Not Find Distributed env, Change To local train mode. If you want train with fleet, please use [fleetrun] command."
) )
return None #return None
sync_mode = config.get("runner.sync_mode") sync_mode = config.get("runner.sync_mode")
assert sync_mode in ["async", "sync", "geo", "heter", "gpubox"] assert sync_mode in ["async", "sync", "geo", "heter", "gpubox"]
if sync_mode == "sync": if sync_mode == "sync":
...@@ -318,7 +314,6 @@ class DnnTrainer(object): ...@@ -318,7 +314,6 @@ class DnnTrainer(object):
logger.info("worker: {} started".format(fleet.worker_index())) logger.info("worker: {} started".format(fleet.worker_index()))
def run_minimize(self): def run_minimize(self):
logger.info("entering run_minimize")
self.init_fleet_with_gloo() self.init_fleet_with_gloo()
self.model = get_model(self.config) self.model = get_model(self.config)
logger.info("cpu_num: {}".format(os.getenv("CPU_NUM"))) logger.info("cpu_num: {}".format(os.getenv("CPU_NUM")))
...@@ -328,36 +323,32 @@ class DnnTrainer(object): ...@@ -328,36 +323,32 @@ class DnnTrainer(object):
user_defined_strategy = get_user_defined_strategy(self.config) user_defined_strategy = get_user_defined_strategy(self.config)
learning_rate = self.config.get( learning_rate = self.config.get(
"hyper_parameters.optimizer.learning_rate") "hyper_parameters.optimizer.learning_rate")
sync_mode = self.config.get("runner.sync_mode")
inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True) inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True)
if self.config['debug_new_minimize'] == 1: if self.config['debug_new_minimize'] == 1:
logger.info("entering run_minimize -- new")
from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer
ps_optimizer = ParameterServerOptimizer(inner_optimizer) ps_optimizer = ParameterServerOptimizer(inner_optimizer)
ps_optimizer._set_basic_info(loss, self.role_maker, inner_optimizer, ps_optimizer._set_basic_info(loss, self.role_maker, inner_optimizer,
user_defined_strategy) user_defined_strategy)
ps_optimizer.minimize_impl(loss) ps_optimizer.minimize_impl(loss)
else: else:
logger.info("entering run_minimize -- old")
fleet_obj = fleet.distributed_optimizer( fleet_obj = fleet.distributed_optimizer(
inner_optimizer, user_defined_strategy) ## Fleet 对象 inner_optimizer, user_defined_strategy) ## Fleet 对象
fleet_obj.minimize(loss) fleet_obj.minimize(loss)
if fleet.is_server(): if fleet.is_server():
_main_file = '/' + 'run_minimize' + '_debug_minimize:_' + str( _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str(
self.config['debug_new_minimize']) + '_server_main.prototxt' self.config['debug_new_minimize']) + '_server_main.prototxt'
debug_program(_main_file, loss.block.program, 0) debug_program(_main_file, loss.block.program, 0)
elif fleet.is_worker(): elif fleet.is_worker():
_main_file = '/' + 'run_minimize' + '_debug_minimize:_' + str( _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str(
self.config['debug_new_minimize']) + '_worker_main.prototxt' self.config['debug_new_minimize']) + '_worker_main.prototxt'
debug_program(_main_file, loss.block.program, 1) debug_program(_main_file, loss.block.program, 1)
'''
if fleet.is_server():
logger.info("Run Server Begin")
fleet.init_server()
fleet.run_server()
'''
def run_single_pass(self): def run_single_pass(self):
logger.info("entering run_single_pass")
self.init_fleet_with_gloo() self.init_fleet_with_gloo()
self.model = get_model(config) self.model = get_model(config)
input_data = self.model.create_feeds() input_data = self.model.create_feeds()
...@@ -365,22 +356,26 @@ class DnnTrainer(object): ...@@ -365,22 +356,26 @@ class DnnTrainer(object):
loss = self.model._cost loss = self.model._cost
user_defined_strategy = get_user_defined_strategy(config) user_defined_strategy = get_user_defined_strategy(config)
learning_rate = config.get("hyper_parameters.optimizer.learning_rate") learning_rate = config.get("hyper_parameters.optimizer.learning_rate")
sync_mode = self.config.get("runner.sync_mode")
inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True) inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True)
startup_program = paddle.static.default_startup_program() startup_program = paddle.static.default_startup_program()
inner_optimizer.minimize(loss, startup_program) inner_optimizer.minimize(loss, startup_program)
if self.config['debug_new_pass'] == 1: if self.config['debug_new_pass'] == 1:
logger.info("entering run {} - new".format(
str(config["applied_pass_name"])))
from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer
ps_optimizer = ParameterServerOptimizer(inner_optimizer) ps_optimizer = ParameterServerOptimizer(inner_optimizer)
ps_optimizer._set_basic_info(loss, self.role_maker, inner_optimizer, ps_optimizer._set_basic_info(loss, self.role_maker, inner_optimizer,
user_defined_strategy) user_defined_strategy)
ps_optimizer._init_ps_pass_context(loss, startup_program) ps_optimizer._init_ps_pass_context(loss, startup_program)
_main = ps_optimizer.attrs['cloned_main'] _main = ps_optimizer.pass_ctx._attrs['cloned_main']
append_send_ops_pass = new_pass(config["applied_pass_name"], append_send_ops_pass = new_pass(config["applied_pass_name"],
ps_optimizer.attrs) ps_optimizer.pass_ctx._attrs)
append_send_ops_pass.apply([_main], [None], ps_optimizer.pass_ctx) append_send_ops_pass.apply([_main], [None], ps_optimizer.pass_ctx)
else: else:
logger.info("entering run {} - old".format(
str(config["applied_pass_name"])))
from paddle.fluid.incubate.fleet.parameter_server.ir import public as public from paddle.fluid.incubate.fleet.parameter_server.ir import public as public
dist_strategy = get_distributed_strategy(user_defined_strategy) dist_strategy = get_distributed_strategy(user_defined_strategy)
compiled_config = public.CompileTimeStrategy( compiled_config = public.CompileTimeStrategy(
...@@ -393,13 +388,13 @@ class DnnTrainer(object): ...@@ -393,13 +388,13 @@ class DnnTrainer(object):
_main = worker.append_send_ops_pass(_main, compiled_config) _main = worker.append_send_ops_pass(_main, compiled_config)
if fleet.is_server(): if fleet.is_server():
_main_file = '/' + str(config[ _main_file = '/' + sync_mode + "_" + str(config[
"applied_pass_name"]) + '_debug_pass:_' + str(self.config[ "applied_pass_name"]) + '_debug:_' + str(self.config[
'debug_new_pass']) + '_server_main.prototxt' 'debug_new_pass']) + '_server_main.prototxt'
debug_program(_main_file, _main, 0) debug_program(_main_file, _main, 0)
elif fleet.is_worker(): elif fleet.is_worker():
_main_file = '/' + str(config[ _main_file = '/' + sync_mode + "_" + str(config[
"applied_pass_name"]) + '_debug_pass:_' + str(self.config[ "applied_pass_name"]) + '_debug:_' + str(self.config[
'debug_new_pass']) + '_worker_main.prototxt' 'debug_new_pass']) + '_worker_main.prototxt'
debug_program(_main_file, _main, 1) debug_program(_main_file, _main, 1)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册