Unverified commit 1c4e3e5d authored by ziyoujiyi, committed by GitHub

new fleet_desc builder (#39948)

* delete gloo connect retry

* the_one_ps dirs reconstruct

* create the_one_ps dirs

* the_one_ps dirs modify

* refactor ps optimize

* refactor theoneps

* the_one_ps

* add ps pass unittest

* ps unittest frame

* add cpu_async_ps_mode test

* ps unittest ready

* solve dist_pass init conflict

* solve import CommContext error

* unittest ok

* implement AllocateFrom

* solve setup.py.in conflict

* solve conflict

* cpu-async-ps minimize test ok & gpu minimize test ok

* add heter 2stage unittest

* sync/geo test ok & fix heter_worker program ok

* new fleet desc generator

* new fleet_desc builder

* correct ps.proto compile
Co-authored-by: zkh2016 <zhangkaihuo@baidu.com>
Parent f30b3f81
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
\ No newline at end of file
......@@ -235,6 +235,7 @@ if(WITH_PYTHON)
py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto)
py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto)
py_proto_compile(pass_desc_py_proto SRCS pass_desc.proto)
py_proto_compile(ps_py_proto SRCS ps.proto)
#Generate an empty \
#__init__.py to make framework_py_proto as a valid python module.
add_custom_target(fleet_proto_init ALL
......@@ -242,12 +243,13 @@ if(WITH_PYTHON)
COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py
)
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto)
add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto ps_py_proto)
if (NOT WIN32)
add_custom_command(TARGET framework_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
COMMAND cp ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
COMMENT "Copy generated python proto into directory paddle/fluid/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
add_custom_target(fleet_executor_proto_init ALL DEPENDS fleet_proto_init fleet_executor_desc_py_proto
......@@ -259,6 +261,7 @@ if(WITH_PYTHON)
add_custom_command(TARGET framework_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
COMMAND copy /Y *.py ${proto_dstpath}
COMMAND copy /Y ps_pb2.py ${fleet_proto_dstpath}
COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath}
COMMENT "Copy generated python proto into directory paddle/fluid/proto."
COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto."
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package paddle.distributed;
option cc_generic_services = true;
option cc_enable_arenas = true;
message FsClientParameter {
enum FsApiType {
HDFS = 0;
AFS = 1;
}
optional FsApiType fs_type = 1 [ default = HDFS ];
optional string uri = 2; // such as afs://xxx.afs.com:9902
optional string user = 3; // user_name to access fs
optional string passwd = 4; // password
optional int32 buffer_size = 5; // buffer for read/write
optional string hadoop_bin = 51;
optional string afs_conf = 101;
}
message PSParameter {
optional string worker_class = 1;
optional string server_class = 2;
optional string instance_class = 3;
optional string init_gflags = 4 [ default = "" ];
optional WorkerParameter worker_param = 101;
optional ServerParameter server_param = 102;
repeated DownpourTrainerParameter trainer_param = 301;
optional FsClientParameter fs_client_param = 501;
}
message WorkerParameter {
optional DownpourWorkerParameter downpour_worker_param = 1;
}
message DownpourWorkerParameter {
repeated TableParameter downpour_table_param = 1;
}
message DownpourServerParameter {
repeated TableParameter downpour_table_param = 1;
optional ServerServiceParameter service_param = 2;
}
message ServerParameter {
optional DownpourServerParameter downpour_server_param = 1;
}
message DownpourTrainerParameter {
repeated DenseTableParameter dense_table = 1;
repeated SparseTableParameter sparse_table = 2;
optional int32 push_sparse_per_batch = 3;
optional int32 push_dense_per_batch = 4;
repeated string skip_op = 5;
repeated ProgramConfig program_config = 6;
}
message DenseTableParameter {
optional int32 table_id = 1;
repeated string dense_variable_name = 2;
repeated string dense_gradient_variable_name = 3;
optional int32 fea_dim = 4;
}
message SparseTableParameter {
optional int32 table_id = 1;
optional int32 feature_dim = 2;
repeated string slot_key = 3;
repeated string slot_value = 4;
repeated string slot_gradient = 5;
}
message ServerServiceParameter {
optional string server_class = 1 [ default = "BrpcPsServer" ];
optional string client_class = 2 [ default = "BrpcPsClient" ];
optional string service_class = 3 [ default = "BrpcPsService" ];
optional uint32 start_server_port = 4
[ default = 0 ]; // will find an available port from it
optional uint32 server_thread_num = 5 [ default = 12 ];
}
message ProgramConfig {
required string program_id = 1;
repeated int32 push_sparse_table_id = 2;
repeated int32 push_dense_table_id = 3;
repeated int32 pull_sparse_table_id = 4;
repeated int32 pull_dense_table_id = 5;
}
enum TableType {
PS_SPARSE_TABLE = 0;
PS_DENSE_TABLE = 1;
PS_OTHER_TABLE = 2;
}
message TableParameter {
optional uint64 table_id = 1;
optional string table_class = 2;
optional uint64 shard_num = 3 [ default = 1000 ];
optional TableAccessorParameter accessor = 4;
optional TensorAccessorParameter tensor = 5;
optional CommonAccessorParameter common = 6;
optional TableType type = 7;
optional bool compress_in_save = 8 [ default = false ];
}
message TableAccessorParameter {
optional string accessor_class = 1;
optional uint32 fea_dim = 4 [ default = 11 ];
optional uint32 embedx_dim = 5 [ default = 8 ];
optional uint32 embedx_threshold = 6 [ default = 10 ];
optional CtrAccessorParameter ctr_accessor_param = 7;
repeated TableAccessorSaveParameter table_accessor_save_param = 8;
optional SparseCommonSGDRuleParameter embed_sgd_param = 10;
optional SparseCommonSGDRuleParameter embedx_sgd_param = 11;
}
message CtrAccessorParameter {
optional float nonclk_coeff = 1
[ default = 0.1 ]; // to calculate show_click_score
optional float click_coeff = 2
[ default = 1 ]; // to calculate show_click_score
optional float base_threshold = 3 [
default = 1.5
]; // show_click_score > base_threshold, this feature can be saved
optional float delta_threshold = 4
[ default =
0.25 ]; // delta_score > delta_threshold, this feature can be saved
optional float delta_keep_days = 5
[ default =
16 ]; // unseen_day < delta_keep_days, this feature can be saved
optional float show_click_decay_rate = 6 [
default = 0.98
]; // show/click will update to show/click * show_click_decay_rate after a day
optional float delete_threshold = 7
[ default = 0.8 ]; // threshold to shrink a feasign
optional float delete_after_unseen_days = 8
[ default = 30 ]; // if unseen_day > delete_after_unseen_days, this feature
// will be deleted in shrink_model
optional int32 ssd_unseenday_threshold = 9
[ default = 1 ]; // threshold to save ssd
}
message TensorAccessorParameter {
optional string feed_var_name = 1;
optional string fetch_var_name = 2;
optional int64 startup_program_id = 3;
optional int64 main_program_id = 4;
optional string tensor_table_class = 6;
}
message CommonAccessorParameter {
optional string name = 1;
optional string table_name = 2;
repeated string attributes = 3;
repeated string params = 4;
repeated uint32 dims = 5;
repeated string initializers = 6;
optional string entry = 7;
optional int32 trainer_num = 8;
optional bool sync = 9;
optional uint32 table_num = 10;
optional uint32 table_dim = 11;
}
message TableAccessorSaveParameter {
optional uint32 param = 1;
optional string converter = 2;
optional string deconverter = 3;
}
message SparseCommonSGDRuleParameter {
optional string name = 1;
optional SparseNaiveSGDRuleParameter naive = 2;
optional SparseAdagradSGDRuleParameter adagrad = 3;
optional SparseAdamSGDParameter adam = 4;
}
message SparseNaiveSGDRuleParameter { // SparseNaiveSGDRule
optional double learning_rate = 1 [ default = 0.05 ];
optional double initial_range = 2 [ default = 0.0001 ];
repeated float weight_bounds = 3;
}
message SparseAdagradSGDRuleParameter { // SparseAdaGradSGDRule|StdAdaGradSGDRule
optional double learning_rate = 1 [ default = 0.05 ];
optional double initial_g2sum = 2 [ default = 3.0 ];
optional double initial_range = 3 [ default = 0.0001 ];
repeated float weight_bounds = 4;
}
message SparseAdamSGDParameter { // SparseAdamSGDRule
optional double learning_rate = 1 [ default = 0.001 ];
optional double initial_range = 2 [ default = 0.0001 ];
optional double beta1_decay_rate = 3 [ default = 0.9 ];
optional double beta2_decay_rate = 4 [ default = 0.999 ];
optional double ada_epsilon = 5 [ default = 1e-08 ];
repeated float weight_bounds = 6;
}
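This schema is compiled by the CMake rules above into a generated ps_pb2 Python module (copied into paddle/distributed/fleet/proto), and the unit tests below parse such descriptors with protobuf's text_format. A minimal usage sketch, assuming the generated module is importable as in test_the_one_ps.py; the concrete field values here are illustrative only, not the values the runtime actually emits:

from google.protobuf import text_format
from paddle.distributed.fleet.proto import ps_pb2

# Build a PSParameter following the message definitions above.
ps_param = ps_pb2.PSParameter()
ps_param.server_class = "BrpcPsServer"  # illustrative value

# Nested messages are created on first mutation; add one server-side table.
table = ps_param.server_param.downpour_server_param.downpour_table_param.add()
table.table_id = 0
table.table_class = "MemorySparseTable"  # illustrative value
table.shard_num = 1000
table.accessor.accessor_class = "CtrCommonAccessor"  # illustrative value

# Round-trip through the text format that the tests compare.
text = text_format.MessageToString(ps_param)
parsed = ps_pb2.PSParameter()
text_format.Parse(text, parsed)
assert text_format.MessageToString(parsed) == text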
......@@ -54,6 +54,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
attrs['cloned_startup'] = attrs['origin_startup_program'].clone()
attrs['user_defined_strategy'] = self.user_defined_strategy
attrs['valid_strategy'] = self.user_defined_strategy
attrs['trainer'] = TrainerRuntimeConfig(self.user_defined_strategy)
attrs['ps_mode'] = attrs['trainer'].mode
logger.info("ps_mode: {}".format(attrs['ps_mode']))
......
# Directory notes
* Once this refactor is complete, the related fleet files (and directories) in the parent directory can be removed
......@@ -38,5 +38,7 @@ class PsProgramBuilderFactory(object):
elif 'is_fl_ps_mode' in attrs and attrs[
'is_fl_ps_mode'] == DistributedMode.FL:
return globals()['FlPsProgramBuilder'](pass_ctx)
else:
elif attrs['ps_mode'] == DistributedMode.SYNC:
return globals()['CpuSyncPsProgramBuilder'](pass_ctx)
else:
return globals()['CpuAsyncPsProgramBuilder'](pass_ctx)
......@@ -95,11 +95,12 @@ class GeoPsProgramBuilder(PsProgramBuilder): # CPU mode only
class CpuSyncPsProgramBuilder(PsProgramBuilder):
def __init__(self, pass_ctx):
logger.info("start building cpu-sync-ps program")
super(CpuSyncPsProgramBuilder, self).__init__(pass_ctx)
if self.ps_mode == DistributedMode.SYNC:
logger.info("start building cpu-sync-ps program")
if self.ps_mode != DistributedMode.SYNC and self.ps_mode != DistributedMode.ASYNC:
raise ValueError("ps mode: {} not matched {}".format(
self.ps_mode, "PsProgramBuilder"))
def _build_trainer_programs(self):
add_lr_decay_table_pass = new_pass("add_lr_decay_table_pass",
......
......@@ -73,7 +73,9 @@ def logger_config(log_path, logging_name):
return logger
logger = logger_config(log_path='/ps_log', logging_name='ps_log')
ps_log_root_dir = '/ps_log/'
logger = logger_config(
log_path='/ps_usr_print_log', logging_name='ps_usr_print_log')
class DistributedMode:
......
......@@ -627,7 +627,7 @@ set_tests_properties(test_norm_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
set_tests_properties(test_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
if(WITH_DISTRIBUTE)
add_subdirectory(distributed_passes)
add_subdirectory(ps)
add_subdirectory(auto_parallel)
# FIXME(typhoonzero): add these tests back
......
......@@ -23,13 +23,24 @@ import unittest
import numpy as np
from collections import OrderedDict
from paddle.distributed.ps.utils.public import logger
from dist_pass_test_base import prepare_python_path_and_return_module, remove_path_if_exists
from paddle.fluid.tests.unittests.distributed_passes.dist_pass_test_base import prepare_python_path_and_return_module, remove_path_if_exists
import paddle.distributed.fleet as fleet
class PsPassTestBase(unittest.TestCase):
def init(self):
raise NotImplementedError
self.config = {}
self.config['ps_mode_config'] = ""
self.config['worker_num'] = "1"
self.config['server_num'] = "1"
self.config['run_minimize'] = "0"
self.config['run_single_pass'] = "0"
self.config['run_the_one_ps'] = '0'
self.config['debug_new_minimize'] = "0"
self.config['debug_new_pass'] = "0"
self.config['debug_the_one_ps'] = '0'
self.config['log_dir'] = ""
self.config['applied_pass_name'] = ""
def setUp(self):
print('Ps setUp...')
......@@ -37,7 +48,7 @@ class PsPassTestBase(unittest.TestCase):
def tearDown(self):
print('Ps tearDown...')
def ps_launch(self, config, ps_mode="cpu-ps"):
def ps_launch(self, ps_mode="cpu-ps"):
if ps_mode == "cpu-ps" or ps_mode == 'heter-ps':
os.environ['WITH_DISTRIBUTE'] = 'ON'
......@@ -45,23 +56,26 @@ class PsPassTestBase(unittest.TestCase):
sys.executable,
"-u",
] + [
"-m", "launch", "--log_dir", config['log_dir'], "--worker_num",
config['worker_num'], "--server_num", config['server_num']
"-m", "launch", "--log_dir", self.config['log_dir'],
"--worker_num", self.config['worker_num'], "--server_num",
self.config['server_num']
]
if ps_mode == 'heter-ps':
os.environ['FLAGS_START_PORT'] = '12004'
cmd += [
'--heter_worker_num', config['heter_worker_num'],
'--heter_devices', config['heter_devices']
'--heter_worker_num', self.config['heter_worker_num'],
'--heter_devices', self.config['heter_devices']
]
cmd += [
"../ps/ps_dnn_trainer.py", "-m", config['ps_mode_config'],
"--run_minimize", config['run_minimize'], "--run_single_pass",
config['run_single_pass'], "--debug_new_pass",
config['debug_new_pass'], "--debug_new_minimize",
config['debug_new_minimize'], "--applied_pass_name",
config['applied_pass_name']
"../ps/ps_dnn_trainer.py", "-m", self.config['ps_mode_config'],
"--run_minimize", self.config['run_minimize'],
"--run_single_pass", self.config['run_single_pass'],
"--run_the_one_ps", self.config['run_the_one_ps'],
"--debug_new_pass", self.config['debug_new_pass'],
"--debug_new_minimize", self.config['debug_new_minimize'],
"--applied_pass_name", self.config['applied_pass_name'],
"--debug_the_one_ps", self.config['debug_the_one_ps']
]
elif ps_mode == "gpu-ps":
os.environ['FLAGS_LAUNCH_BARRIER'] = '0'
......@@ -80,12 +94,14 @@ class PsPassTestBase(unittest.TestCase):
cmd = [
sys.executable, "-u", "../ps/ps_dnn_trainer.py", "-m",
config['ps_mode_config'], "--run_minimize",
config['run_minimize'], "--run_single_pass",
config['run_single_pass'], "--debug_new_pass",
config['debug_new_pass'], "--debug_new_minimize",
config['debug_new_minimize'], "--applied_pass_name",
config['applied_pass_name']
self.config['ps_mode_config'], "--run_minimize",
self.config['run_minimize'], "--run_single_pass",
self.config['run_single_pass'], "--run_the_one_ps",
self.config['run_the_one_ps'], "--debug_new_pass",
self.config['debug_new_pass'], "--debug_new_minimize",
self.config['debug_new_minimize'], "--applied_pass_name",
self.config['applied_pass_name'], "--debug_the_one_ps",
self.config['debug_the_one_ps']
]
cmd = [shlex.quote(c) for c in cmd]
......
......@@ -21,31 +21,26 @@ import numpy as np
import paddle
from ps_pass_test_base import *
from paddle.distributed.ps.utils.public import logger
from paddle.distributed.ps.utils.public import logger, ps_log_root_dir
from paddle.fluid.tests.unittests.ps.ps_dnn_trainer import DnnTrainer
class TestPsTrainerPass(PsPassTestBase):
def init(self):
self.config = {}
self.config['ps_mode_config'] = ""
self.config['worker_num'] = "1"
self.config['server_num'] = "1"
self.config['run_minimize'] = "0"
self.config['run_single_pass'] = "0"
self.config['debug_new_minimize'] = "0"
self.config['debug_new_pass'] = "0"
self.config['log_dir'] = ""
self.config['applied_pass_name'] = ""
def setUp(self):
pass
def tearDown(self):
pass
def check(self):
pass
def check(self, file1, file2):
with open(file1, 'r', encoding='utf-8') as f:
text1 = f.read()
with open(file2, 'r', encoding='utf-8') as f:
text2 = f.read()
return text1 == text2
def test_ps_optimizer_minimize_cpu_async(self):
self.init()
......@@ -53,16 +48,21 @@ class TestPsTrainerPass(PsPassTestBase):
self.config['run_minimize'] = '1'
self.config['debug_new_minimize'] = '0'
self.config['log_dir'] = "/async_cpu_log_old_minimize"
self.config['log_dir'] = ps_log_root_dir + "async_cpu_log_old_minimize"
remove_path_if_exists(self.config['log_dir'])
self.ps_launch(self.config)
self.ps_launch()
self.config['debug_new_minimize'] = '1'
self.config['log_dir'] = "/async_cpu_log_new_minimize"
self.config['log_dir'] = ps_log_root_dir + "async_cpu_log_new_minimize"
remove_path_if_exists(self.config['log_dir'])
self.ps_launch(self.config)
self.ps_launch()
self.check()
file1 = '/ps_log/async_run_minimize_debug:_0_worker_main.prototxt'
file2 = '/ps_log/async_run_minimize_debug:_1_worker_main.prototxt'
if self.check(file1, file2):
logger.info('test_ps_optimizer_minimize_cpu_async passed!')
else:
logger.error('test_ps_optimizer_minimize_cpu_async failed!')
def test_ps_optimizer_minimize_cpu_sync(self):
self.init()
......@@ -70,16 +70,22 @@ class TestPsTrainerPass(PsPassTestBase):
self.config['run_minimize'] = '1'
self.config['debug_new_minimize'] = '0'
self.config['log_dir'] = "/sync_cpu_log_old_minimize"
self.config['log_dir'] = ps_log_root_dir + "sync_cpu_log_old_minimize"
remove_path_if_exists(self.config['log_dir'])
self.ps_launch(self.config)
self.ps_launch()
self.config['debug_new_minimize'] = '1'
self.config['log_dir'] = "/sync_cpu_log_new_minimize"
self.config['log_dir'] = ps_log_root_dir + "sync_cpu_log_new_minimize"
remove_path_if_exists(self.config['log_dir'])
self.ps_launch(self.config)
self.check()
self.ps_launch()
'''
file1 = '/ps_log/sync_run_minimize_debug:_0_worker_main.prototxt'
file2 = '/ps_log/sync_run_minimize_debug:_1_worker_main.prototxt'
if self.check(file1, file2):
logger.info('test_ps_optimizer_minimize_cpu_sync passed!')
else:
logger.error('test_ps_optimizer_minimize_cpu_sync failed!')
'''
def test_ps_optimizer_minimize_cpu_geo(self):
self.init()
......@@ -87,16 +93,21 @@ class TestPsTrainerPass(PsPassTestBase):
self.config['run_minimize'] = '1'
self.config['debug_new_minimize'] = '0'
self.config['log_dir'] = "/geo_cpu_log_old_minimize"
self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_old_minimize"
remove_path_if_exists(self.config['log_dir'])
self.ps_launch(self.config)
self.ps_launch()
self.config['debug_new_minimize'] = '1'
self.config['log_dir'] = "/geo_cpu_log_new_minimize"
self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_new_minimize"
remove_path_if_exists(self.config['log_dir'])
self.ps_launch(self.config)
self.ps_launch()
self.check()
file1 = '/ps_log/geo_run_minimize_debug:_0_worker_main.prototxt'
file2 = '/ps_log/geo_run_minimize_debug:_1_worker_main.prototxt'
if self.check(file1, file2):
logger.info('test_ps_optimizer_minimize_cpu_geo passed!')
else:
logger.error('test_ps_optimizer_minimize_cpu_geo failed!')
# heter ps two-stage
def test_ps_optimizer_minimize_heter(self):
......@@ -110,14 +121,24 @@ class TestPsTrainerPass(PsPassTestBase):
self.config['ps_mode_config'] = "../ps/heter_ps_config.yaml"
self.config['debug_new_minimize'] = '0'
self.config['log_dir'] = "/heter_log_old_minimize"
self.config['log_dir'] = ps_log_root_dir + "heter_log_old_minimize"
remove_path_if_exists(self.config['log_dir'])
self.ps_launch(self.config, 'heter-ps')
self.ps_launch('heter-ps')
self.config['debug_new_minimize'] = '1'
self.config['log_dir'] = "/heter_log_new_minimize"
self.config['log_dir'] = ps_log_root_dir + "heter_log_new_minimize"
remove_path_if_exists(self.config['log_dir'])
self.ps_launch(self.config, 'heter-ps')
self.ps_launch('heter-ps')
'''
file1 = '/ps_log/heter_run_minimize_debug:_0_worker_main.prototxt'
file2 = '/ps_log/heter_run_minimize_debug:_1_worker_main.prototxt'
file3 = '/ps_log/heter_run_minimize_debug:_0_heter_worker_main.prototxt'
file4 = '/ps_log/heter_run_minimize_debug:_1_heter_worker_main.prototxt'
if self.check(file1, file2) and self.check(file3, file4):
logger.info('test_ps_optimizer_minimize_heter passed!')
else:
logger.error('test_ps_optimizer_minimize_heter failed!')
'''
def test_ps_optimizer_minimize_gpu(self):
self.init()
......@@ -125,29 +146,42 @@ class TestPsTrainerPass(PsPassTestBase):
self.config['ps_mode_config'] = "../ps/gpu_ps_config.yaml"
self.config['debug_new_minimize'] = '0'
self.ps_launch(self.config, "gpu-ps")
self.ps_launch("gpu-ps")
self.config['debug_new_minimize'] = '1'
self.ps_launch(self.config, "gpu-ps")
self.ps_launch("gpu-ps")
self.check()
file1 = '/ps_log/gpubox_run_minimize_debug:_0_worker_main.prototxt'
file2 = '/ps_log/gpubox_run_minimize_debug:_1_worker_main.prototxt'
if self.check(file1, file2):
logger.info('test_ps_optimizer_minimize_gpu passed!')
else:
logger.error('test_ps_optimizer_minimize_gpu failed!')
def test_append_send_ops_pass(self):
self.init()
self.config['run_single_pass'] = '1'
self.config['ps_mode_config'] = "../ps/cpu_async_ps_config.yaml"
self.config['applied_pass_name'] = "append_send_ops_pass"
self.config['debug_new_pass'] = '0'
self.config['log_dir'] = "/log_old_" + self.config['applied_pass_name']
self.config['log_dir'] = ps_log_root_dir + "log_old_" + self.config[
'applied_pass_name']
remove_path_if_exists(self.config['log_dir'])
self.ps_launch(self.config, "cpu-ps")
self.ps_launch("cpu-ps")
self.config['debug_new_pass'] = '1'
self.config['log_dir'] = "/log_new_" + self.config['applied_pass_name']
self.config['log_dir'] = ps_log_root_dir + "log_new_" + self.config[
'applied_pass_name']
remove_path_if_exists(self.config['log_dir'])
self.ps_launch(self.config, "cpu-ps")
self.check()
self.ps_launch("cpu-ps")
file1 = '/ps_log/async_append_send_ops_pass_debug:_0_worker_main.prototxt'
file2 = '/ps_log/async_append_send_ops_pass_debug:_1_worker_main.prototxt'
if self.check(file1, file2):
logger.info('test_append_send_ops_pass passed!')
else:
logger.error('test_append_send_ops_pass failed!')
def test_distributed_ops_pass(self):
pass
......
......@@ -3,6 +3,6 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
list(APPEND TEST_OPS ${TEST_OP})
set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 50)
endforeach(TEST_OP)
set_tests_properties(test_the_one_ps PROPERTIES TIMEOUT 50)
......@@ -264,12 +264,16 @@ def parse_args():
'--run_minimize', type=int, default=0, help="test minimize")
parser.add_argument(
'--run_single_pass', type=int, default=0, help="test single pass")
parser.add_argument(
'--run_the_one_ps', type=int, default=0, help="test the_one_ps")
parser.add_argument(
'--debug_new_minimize', type=int, default=0, help="debug new minimize")
parser.add_argument(
'--debug_new_pass', type=int, default=0, help="debug new pass")
parser.add_argument(
'--applied_pass_name', type=str, default="", help="name of the pass to test")
parser.add_argument(
'--debug_the_one_ps', type=int, default=0, help="debug the_one_ps")
args = parser.parse_args()
args.abs_dir = os.path.dirname(os.path.abspath(args.config_yaml))
......@@ -280,9 +284,11 @@ def parse_args():
config["pure_bf16"] = args.pure_bf16
config['run_minimize'] = args.run_minimize
config['run_single_pass'] = args.run_single_pass
config['run_the_one_ps'] = args.run_the_one_ps
config['debug_new_minimize'] = args.debug_new_minimize
config['debug_new_pass'] = args.debug_new_pass
config['applied_pass_name'] = args.applied_pass_name
config['debug_the_one_ps'] = args.debug_the_one_ps
yaml_helper.print_yaml(config)
return config
......@@ -344,15 +350,15 @@ class DnnTrainer(object):
fleet_obj.minimize(loss)
if fleet.is_server():
_main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str(
_main_file = ps_log_root_dir + sync_mode + '_run_minimize' + '_debug:_' + str(
self.config['debug_new_minimize']) + '_server_main.prototxt'
debug_program(_main_file, loss.block.program)
elif fleet.is_worker():
_main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str(
_main_file = ps_log_root_dir + sync_mode + '_run_minimize' + '_debug:_' + str(
self.config['debug_new_minimize']) + '_worker_main.prototxt'
debug_program(_main_file, loss.block.program)
elif self.role_maker._is_heter_worker():
_main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str(
_main_file = ps_log_root_dir + sync_mode + '_run_minimize' + '_debug:_' + str(
self.config[
'debug_new_minimize']) + '_heter_worker_main.prototxt'
debug_program(_main_file, loss.block.program)
......@@ -397,16 +403,84 @@ class DnnTrainer(object):
_main = worker.append_send_ops_pass(_main, compiled_config)
if fleet.is_server():
_main_file = '/' + sync_mode + "_" + str(config[
_main_file = ps_log_root_dir + sync_mode + "_" + str(config[
"applied_pass_name"]) + '_debug:_' + str(self.config[
'debug_new_pass']) + '_server_main.prototxt'
debug_program(_main_file, _main)
elif fleet.is_worker():
_main_file = '/' + sync_mode + "_" + str(config[
_main_file = ps_log_root_dir + sync_mode + "_" + str(config[
"applied_pass_name"]) + '_debug:_' + str(self.config[
'debug_new_pass']) + '_worker_main.prototxt'
debug_program(_main_file, _main)
def run_the_one_ps(self):
self.init_fleet_with_gloo()
self.model = get_model(self.config)
self.input_data = self.model.create_feeds()
self.metrics = self.model.net(self.input_data)
loss = self.model._cost
user_defined_strategy = get_user_defined_strategy(self.config)
learning_rate = self.config.get(
"hyper_parameters.optimizer.learning_rate")
sync_mode = self.config.get("runner.sync_mode")
inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True)
self.role_maker._generate_role()  # required
if self.config['debug_the_one_ps'] == 1:
logger.info("entering run_the_one_ps -- new")
from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer
ps_optimizer = ParameterServerOptimizer(inner_optimizer)
ps_optimizer._set_basic_info(loss, self.role_maker, inner_optimizer,
user_defined_strategy)
ps_optimizer.minimize_impl(loss)
from paddle.distributed.ps.the_one_ps import TheOnePSRuntime
_runtime_handle = TheOnePSRuntime()  # the refactored TheOnePSRuntime under the ps directory
_runtime_handle._set_basic_info(ps_optimizer.pass_ctx._attrs)
if fleet.is_worker():
worker_desc = _runtime_handle.ps_desc_builder.build_worker_desc(
)
with open(ps_log_root_dir + sync_mode + '_' +
'new_worker_ps_desc', 'w') as f:
f.write(worker_desc)
if fleet.is_server():
server_desc = _runtime_handle.ps_desc_builder.build_server_desc(
)
with open(ps_log_root_dir + sync_mode + '_' +
'new_server_ps_desc', 'w') as f:
f.write(server_desc)
else:
pass
'''
logger.info("entering run_the_one_ps -- old")
fleet_obj = fleet.distributed_optimizer(
inner_optimizer, user_defined_strategy)
fleet_obj.minimize(loss)
if fleet.is_worker():
worker_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=False, is_sync=False)
server_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=True, is_sync=False)
with open(ps_log_root_dir + sync_mode + '_' + 'worker_ps_desc', 'w') as f:
f.write(str(worker_desc) + str(server_desc))
if fleet.is_server():
server_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=True, is_sync=False)
with open(ps_log_root_dir + sync_mode + '_' + 'server_ps_desc', 'w') as f:
f.write(str(server_desc) + str(fleet_obj._runtime_handle._get_fs_client_desc().to_string()))
'''
if fleet.is_server():
_main_file = ps_log_root_dir + sync_mode + '_run_the_one_ps' + '_debug:_' + str(
self.config['debug_the_one_ps']) + '_server_main.prototxt'
debug_program(_main_file, loss.block.program)
elif fleet.is_worker():
_main_file = ps_log_root_dir + sync_mode + '_run_the_one_ps' + '_debug:_' + str(
self.config['debug_the_one_ps']) + '_worker_main.prototxt'
debug_program(_main_file, loss.block.program)
elif self.role_maker._is_heter_worker():
_main_file = ps_log_root_dir + sync_mode + '_run_the_one_ps' + '_debug:_' + str(
self.config['debug_the_one_ps']) + '_heter_worker_main.prototxt'
debug_program(_main_file, loss.block.program)
if __name__ == "__main__":
paddle.enable_static()
......@@ -418,3 +492,5 @@ if __name__ == "__main__":
benchmark_main.run_single_pass()
elif config['run_minimize'] == 1:
benchmark_main.run_minimize()
elif config['run_the_one_ps'] == 1:
benchmark_main.run_the_one_ps()
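For reference, the gpu-ps branch of ps_launch above invokes this trainer directly rather than through the launch module. A standalone equivalent of that path, mirroring the flags wired up in parse_args() (the YAML path and flag values here are illustrative only):

import subprocess
import sys

# Hypothetical direct invocation of the trainer; run from the tests/ps dir.
cmd = [
    sys.executable, "-u", "ps_dnn_trainer.py",
    "-m", "cpu_async_ps_config.yaml",
    "--run_the_one_ps", "1",
    "--debug_the_one_ps", "1",
]
subprocess.check_call(cmd)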
......@@ -22,16 +22,100 @@ import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.tests.unittests.distributed_passes.ps_pass_test_base import *
from paddle.distributed.ps.utils.public import logger, ps_log_root_dir
from ps_dnn_trainer import DnnTrainer
from paddle.distributed.fleet.proto import ps_pb2
from google.protobuf import text_format
class TestTheOnePs(unittest.TestCase):
class TestTheOnePs(PsPassTestBase):
def setUp(self):
print('setUp...')
pass
def tearDown(self):
print('tearDown...')
pass
def test_main(self):
def check(self, file1, file2):
pass
'''
f = open(file1, "rb")
ps_desc_1 = ps_pb2.PSParameter()
text_format.Parse(f.read(), ps_desc_1)
f.close()
f = open(file2, "rb")
ps_desc_2 = ps_pb2.PSParameter()
text_format.Parse(f.read(), ps_desc_2)
f.close()
str1 = text_format.MessageToString(ps_desc_1)
str2 = text_format.MessageToString(ps_desc_2)
#logger.info('### msg10: {}'.format(str1))
#logger.info('### msg20: {}'.format(str2))
if str1 == str2:
return True
else:
return False
'''
def test_ps_cpu_async(self):
self.init()
self.config['ps_mode_config'] = "../ps/cpu_async_ps_config.yaml"
self.config['run_the_one_ps'] = '1'
self.config['debug_the_one_ps'] = '0'
self.config[
'log_dir'] = ps_log_root_dir + "async_cpu_log_old_the_one_ps"
remove_path_if_exists(self.config['log_dir'])
self.ps_launch()
self.config['debug_the_one_ps'] = '1'
self.config[
'log_dir'] = ps_log_root_dir + "async_cpu_log_new_the_one_ps"
remove_path_if_exists(self.config['log_dir'])
self.ps_launch()
desc1 = '/ps_desc_baseline/async_worker_ps_desc'
desc2 = '/ps_log/async_new_worker_ps_desc'
desc3 = '/ps_desc_baseline/async_server_ps_desc'
desc4 = '/ps_log/async_new_server_ps_desc'
if self.check(desc1, desc2):
logger.info('test_ps_cpu_async ps_desc: worker passed!')
else:
logger.info('test_ps_cpu_async ps_desc: worker failed!')
if self.check(desc3, desc4):
logger.info('test_ps_cpu_async ps_desc: server passed!')
else:
logger.info('test_ps_cpu_async ps_desc: server failed!')
def test_ps_cpu_geo(self):
self.init()
self.config['ps_mode_config'] = "../ps/cpu_geo_ps_config.yaml"
self.config['run_the_one_ps'] = '1'
self.config['debug_the_one_ps'] = '0'
self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_old_the_one_ps"
remove_path_if_exists(self.config['log_dir'])
self.ps_launch()
self.config['debug_the_one_ps'] = '1'
self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_new_the_one_ps"
remove_path_if_exists(self.config['log_dir'])
self.ps_launch()
desc1 = '/ps_desc_baseline/geo_worker_ps_desc'
desc2 = '/ps_log/geo_new_worker_ps_desc'
desc3 = '/ps_desc_baseline/geo_server_ps_desc'
desc4 = '/ps_log/geo_new_server_ps_desc'
if self.check(desc1, desc2):
logger.info('test_ps_cpu_geo ps_desc: worker passed!')
else:
logger.info('test_ps_cpu_geo ps_desc: worker failed!')
if self.check(desc3, desc4):
logger.info('test_ps_cpu_geo ps_desc: server passed!')
else:
logger.info('test_ps_cpu_geo ps_desc: server failed!')
if __name__ == '__main__':
......
......@@ -74,6 +74,7 @@ class DNNLayer(nn.Layer):
else:
emb = self.embedding(s_input)
emb = paddle.reshape(emb, shape=[-1, self.sparse_feature_dim])
# emb.stop_gradient = True
sparse_embs.append(emb)
y_dnn = paddle.concat(x=sparse_embs + [dense_inputs], axis=1)
......