# ps_dnn_trainer.py
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.distributed.fleet.base.role_maker as role_maker
from paddle.distributed.ps.utils.ps_program_builder import (
    debug_program,
    logger,
    new_pass,
    ps_log_root_dir,
)
import paddle.distributed.fleet as fleet
import argparse
import sys
import yaml
import copy
import paddle
import os
import ast
import numpy as np
import struct

# "..": a relative entry, so it is resolved against the current working
# directory when the import below runs.
sys.path.append("..")
from ps_dnn_model import StaticModel

# Directory containing this file; its parent is made importable as well.
# NOTE(review): a module-level name ``__dir__`` is what ``dir(module)`` calls
# (PEP 562); binding it to a string looks unintended -- confirm before renaming.
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))


def is_distributed_env():
    """Tell whether the process was started with a training role.

    The decision is based only on the ``TRAINING_ROLE`` environment
    variable, whose current value is also printed.

    Returns:
        bool: True when ``TRAINING_ROLE`` is set, False otherwise.
    """
    role = os.getenv("TRAINING_ROLE")
    print("-- Role: {} --".format(role))
    return role is not None


class YamlHelper(object):
    """Load a YAML config into a flat, dotted-key dict and pretty-print it."""

    def load_yaml(self, yaml_file, other_part=None):
        """Load ``yaml_file`` and keep only the requested top-level sections.

        Args:
            yaml_file: path of the YAML file.
            other_part: optional list of extra key prefixes to keep in
                addition to ``"runner"`` and ``"hyper_parameters"``.

        Returns:
            dict: flattened config (dotted keys) after ``{workspace}``
            substitution.
        """
        part_list = ["runner", "hyper_parameters"]
        if other_part:
            part_list += other_part
        running_config = self.get_all_inters_from_yaml(yaml_file, part_list)
        return self.workspace_adapter(running_config)

    def print_yaml(self, config):
        """Print ``config`` as the table built by ``pretty_print_envs``."""
        print(self.pretty_print_envs(config))

    def parse_yaml(self, config):
        """Parse the YAML file at path ``config`` and return its content.

        Raises:
            ValueError: if ``config`` is not an existing file.
        """
        if not os.path.isfile(config):
            raise ValueError("config {} can not be supported".format(config))

        with open(config, 'r', encoding="utf-8") as rb:
            content = rb.read()

        # ``FullLoader`` only exists from PyYAML 5.1 on; detecting it directly
        # avoids parsing ``yaml.__version__`` (which fails on components that
        # are not plain integers).
        # NOTE(review): ``yaml.load`` should only be used on trusted config
        # files; consider ``yaml.safe_load`` if configs can come from outside.
        full_loader = getattr(yaml, "FullLoader", None)
        if full_loader is not None:
            return yaml.load(content, Loader=full_loader)
        return yaml.load(content)

    def get_all_inters_from_yaml(self, file, filters):
        """Flatten the YAML file and keep keys starting with any filter.

        Nested mappings are flattened into dotted keys, e.g.
        ``{"runner": {"sync_mode": "async"}}`` becomes
        ``{"runner.sync_mode": "async"}``.

        Args:
            file: path of the YAML file.
            filters: iterable of string prefixes; a flattened key is kept
                when it starts with at least one of them.

        Returns:
            dict: the selected flattened entries.
        """
        # An empty YAML document parses to None; treat it as an empty config.
        _envs = self.parse_yaml(file) or {}
        all_flattens = {}

        def flatten_env_namespace(namespace_nests, local_envs):
            for k, v in local_envs.items():
                # str() keeps the join working for non-string YAML keys.
                nests = namespace_nests + [str(k)]
                if isinstance(v, dict):
                    flatten_env_namespace(nests, v)
                else:
                    all_flattens[".".join(nests)] = v

        flatten_env_namespace([], _envs)
        prefixes = tuple(filters)
        return {
            k: v for k, v in all_flattens.items() if k.startswith(prefixes)
        }

    def workspace_adapter(self, config):
        """Replace ``{workspace}`` in string values with ``config["workspace"]``.

        ``config`` is modified in place and returned. When there is no
        ``"workspace"`` entry the values are left untouched instead of
        failing inside ``str.replace``.
        """
        workspace = config.get("workspace")
        if workspace is None:
            return config
        workspace = str(workspace)
        for k, v in config.items():
            if isinstance(v, str) and "{workspace}" in v:
                config[k] = v.replace("{workspace}", workspace)
        return config

    def pretty_print_envs(self, envs, header=None):
        """Render ``envs`` as a two-column text table.

        Args:
            envs: dict with string keys; rows are sorted by key.
            header: optional pair ``(key_title, value_title)``; defaults to
                ``("Ps Benchmark Envs", "Value")``.

        Returns:
            str: the table, surrounded by one leading and one trailing
            newline.
        """
        spacing = 2
        max_v = 45
        # The key column is at least 40 wide and grows with the longest key.
        max_k = max([40] + [len(k) for k in envs])

        row_format = "    |{{:>{}s}}{}{{:^{}s}}|".format(
            max_k, " " * spacing, max_v
        )
        length = max_k + max_v + spacing
        border = "    +" + "=" * length + "+"
        line = "    +" + "-" * length + "+"

        if header:
            title = row_format.format(header[0], header[1])
        else:
            title = row_format.format("Ps Benchmark Envs", "Value")

        rows = [border, title, line]
        for k, v in sorted(envs.items()):
            if isinstance(v, str) and len(v) >= max_v:
                # Keep the tail of over-long strings so the cell stays max_v
                # characters wide ("... " + last 41 characters).
                str_v = "... " + v[-(max_v - 4):]
            else:
                str_v = v
            rows.append(row_format.format(k, str(str_v)))
        rows.append(border)

        return "\n{}\n".format("\n".join(rows))


def get_user_defined_strategy(config):
    """Build a fleet ``DistributedStrategy`` from the flattened config.

    The strategy is selected by ``runner.sync_mode`` (one of ``"async"``,
    ``"sync"``, ``"geo"``, ``"heter"``, ``"gpubox"``) and then filled with the
    trainer-desc, fs-client, adam_d2sum and sparse-table settings read from
    ``config``. ``launch_barrier`` is always set to False in
    ``a_sync_configs``.

    Args:
        config: flattened (dotted-key) config dict.

    Returns:
        The configured ``paddle.distributed.fleet.DistributedStrategy``.

    Raises:
        ValueError: if ``runner.sync_mode`` is missing or not supported.
    """
    if not is_distributed_env():
        # Only a warning: the strategy is still built below.
        logger.warning(
            "Not Find Distributed env, Change To local train mode. If you want train with fleet, please use [fleetrun] command."
        )

    sync_mode = config.get("runner.sync_mode")
    supported_modes = ("async", "sync", "geo", "heter", "gpubox")
    # Explicit check instead of ``assert`` so it still runs under ``python -O``
    # (otherwise ``strategy`` would be unbound further down).
    if sync_mode not in supported_modes:
        raise ValueError(
            "runner.sync_mode must be one of {}, got {!r}".format(
                list(supported_modes), sync_mode
            )
        )

    if sync_mode == "gpubox":
        print("sync_mode = {}".format(sync_mode))

    strategy = paddle.distributed.fleet.DistributedStrategy()
    # Every mode except "sync" runs asynchronously.
    strategy.a_sync = sync_mode != "sync"

    if sync_mode == "async":
        strategy.is_fl_ps_mode = config.get("runner.is_fl_ps_mode") == 1
        if strategy.is_fl_ps_mode:
            strategy.pipeline = False
            micro_num = 1
            strategy.pipeline_configs = {
                "accumulate_steps": micro_num
            }  # num_microbatches
    elif sync_mode == "geo":
        strategy.a_sync_configs = {"k_steps": config.get("runner.geo_step")}
    elif sync_mode == "heter":
        strategy.a_sync_configs = {"heter_worker_device_guard": "gpu"}
        strategy.pipeline = True
        strategy.pipeline_configs = {
            "accumulate_steps": config.get('runner.micro_num')
        }
    elif sync_mode == "gpubox":
        strategy.a_sync_configs = {"use_ps_gpu": 1}

    strategy.trainer_desc_configs = {
        "dump_fields_path": config.get("runner.dump_fields_path", ""),
        "dump_fields": config.get("runner.dump_fields", []),
        "dump_param": config.get("runner.dump_param", []),
        # NOTE(review): unlike its neighbours this key has no "runner."
        # prefix -- confirm whether that is intended.
        "stat_var_names": config.get("stat_var_names", []),
        "local_sparse": config.get("runner.local_sparse", []),
        "remote_sparse": config.get("runner.remote_sparse", []),
    }
    print("strategy:", strategy.trainer_desc_configs)

    if config.get("runner.fs_client.uri") is not None:
        strategy.fs_client_param = {
            "uri": config.get("runner.fs_client.uri", ""),
            "user": config.get("runner.fs_client.user", ""),
            "passwd": config.get("runner.fs_client.passwd", ""),
            "hadoop_bin": config.get("runner.fs_client.hadoop_bin", "hadoop"),
        }
    print("strategy:", strategy.fs_client_param)

    strategy.adam_d2sum = config.get("hyper_parameters.adam_d2sum", True)

    # Group "table_parameters.<table>.<...>" entries by table name; the full
    # dotted key is kept inside each group.
    table_config = {}
    for key in config:
        if key.startswith("table_parameters"):
            table_name = key.split('.')[1]
            table_config.setdefault(table_name, {})[key] = config[key]
    print("table_config:", table_config)
    strategy.sparse_table_configs = table_config
    print("strategy table config:", strategy.sparse_table_configs)

    # Read-modify-write: the dict is fetched, updated and assigned back.
    a_sync_configs = strategy.a_sync_configs
    a_sync_configs["launch_barrier"] = False
    strategy.a_sync_configs = a_sync_configs
    print("launch_barrier: ", strategy.a_sync_configs["launch_barrier"])

    return strategy


def get_distributed_strategy(user_defined_strategy):  # pslib
    """Map a fleet strategy onto the legacy ``StrategyFactory`` strategies.

    The choice depends on ``a_sync`` and ``a_sync_configs["k_steps"]``:

    * not ``a_sync`` and ``k_steps == 0`` -> sync strategy
    * ``a_sync`` and ``k_steps == 0``     -> async strategy
    * ``a_sync`` and ``k_steps > 0``      -> geo strategy with ``k_steps``

    Args:
        user_defined_strategy: object exposing ``a_sync`` and
            ``a_sync_configs`` (with a ``"k_steps"`` entry).

    Returns:
        The strategy object created by ``StrategyFactory``.

    Raises:
        ValueError: for any other combination (negative ``k_steps``, or
            ``k_steps > 0`` without ``a_sync``).
    """
    k_steps = user_defined_strategy.a_sync_configs["k_steps"]
    a_sync = bool(user_defined_strategy.a_sync)

    # Validate first so a bad configuration is reported before the legacy
    # import below is attempted.
    if k_steps < 0 or (k_steps > 0 and not a_sync):
        raise ValueError(
            "k_steps must be a valid value (0, or > 0 together with a_sync), "
            "got k_steps={} with a_sync={}; please check".format(
                k_steps, a_sync
            )
        )

    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import (
        StrategyFactory,
    )

    if k_steps > 0:
        return StrategyFactory.create_geo_strategy(k_steps)
    if a_sync:
        return StrategyFactory.create_async_strategy()
    return StrategyFactory.create_sync_strategy()


def get_model(config):
    """Create the ``StaticModel`` described by ``config``.

    The directory stored under ``config['config_abs_dir']`` is appended to
    ``sys.path`` before the model is constructed.

    Args:
        config: flattened config dict; must contain ``'config_abs_dir'``.

    Returns:
        StaticModel: the model built from ``config``.
    """
    sys.path.append(config['config_abs_dir'])
    return StaticModel(config)


def parse_args(argv=None):
    """Parse the command line and return the merged run configuration.

    The YAML file given by ``-m/--config_yaml`` is loaded through
    ``YamlHelper`` and every command-line option is then stored in the
    resulting dict under its own name, together with ``yaml_path`` and
    ``config_abs_dir`` (the directory of the YAML file). The final config is
    printed.

    Args:
        argv: optional list of argument strings; defaults to ``sys.argv[1:]``
            (argparse behaviour for ``None``).

    Returns:
        dict: flattened config including the command-line options.
    """
    parser = argparse.ArgumentParser("PsTest train script")
    parser.add_argument(
        '-m', '--config_yaml', type=str, required=True, help='config file path'
    )
    parser.add_argument(
        '-bf16',
        '--pure_bf16',
        type=ast.literal_eval,
        default=False,
        help="whether use bf16",
    )

    # (name, type, default, help); the same table drives both the parser and
    # the copy into the config dict so the two cannot drift apart.
    extra_options = (
        ('run_minimize', int, 0, "1: run DnnTrainer.run_minimize"),
        ('run_single_pass', int, 0, "1: run DnnTrainer.run_single_pass"),
        ('run_the_one_ps', int, 0, "1: run DnnTrainer.run_the_one_ps"),
        (
            'debug_new_minimize',
            int,
            0,
            "1: use the new ParameterServerOptimizer path in run_minimize",
        ),
        (
            'debug_new_pass',
            int,
            0,
            "1: apply the pass through new_pass in run_single_pass",
        ),
        (
            'applied_pass_name',
            str,
            "",
            "name of the pass applied by run_single_pass",
        ),
        (
            'debug_the_one_ps',
            int,
            0,
            "1: use the refactored TheOnePSRuntime in run_the_one_ps",
        ),
    )
    for name, arg_type, default, help_text in extra_options:
        parser.add_argument(
            '--' + name, type=arg_type, default=default, help=help_text
        )

    args = parser.parse_args(argv)
    args.abs_dir = os.path.dirname(os.path.abspath(args.config_yaml))

    yaml_helper = YamlHelper()
    config = yaml_helper.load_yaml(args.config_yaml)
    config["yaml_path"] = args.config_yaml
    config["config_abs_dir"] = args.abs_dir
    config["pure_bf16"] = args.pure_bf16
    for name, _, _, _ in extra_options:
        config[name] = getattr(args, name)

    yaml_helper.print_yaml(config)
    return config


def bf16_to_fp32(val):
    """Convert a bfloat16 bit pattern to the ``np.float32`` it represents.

    The 16 bits of ``val`` become the upper 16 bits of a little-endian
    32-bit word, which is then reinterpreted as a float.

    Args:
        val: integer (Python int or NumPy integer scalar) holding the 16-bit
            pattern.

    Returns:
        np.float32: the decoded value.

    Raises:
        struct.error: if the shifted value does not fit in 32 unsigned bits.
    """
    # Coerce to a Python int first: shifting a 16-bit NumPy scalar by 16 can
    # stay 16-bit wide and lose the whole payload.
    bits = int(val) << 16
    return np.float32(struct.unpack('<f', struct.pack('<I', bits))[0])


class DnnTrainer(object):
    """Run one of the parameter-server debug scenarios on the DNN model.

    Each ``run_*`` method builds the network, creates an Adam optimizer,
    transforms the program through either the new or the legacy code path
    (chosen by a ``debug_*`` config flag) and dumps the resulting main
    program with ``debug_program``.
    """

    def __init__(self, config):
        """Store ``config`` and create the ``PaddleCloudRoleMaker``.

        Args:
            config: flattened config dict; must contain ``'pure_bf16'``.
        """
        self.metrics = {}
        self.config = config
        self.input_data = None
        self.reader = None
        self.exe = None
        self.train_result_dict = {"speed": []}
        self.model = None
        self.pure_bf16 = self.config['pure_bf16']
        self.role_maker = role_maker.PaddleCloudRoleMaker()

    def init_fleet_with_gloo(self, use_gloo=False):
        """Initialise fleet and print this process's server/worker index.

        Args:
            use_gloo: when True, ``PADDLE_WITH_GLOO`` is set to ``"1"`` and
                fleet is initialised with this trainer's role maker;
                otherwise ``fleet.init()`` is called without arguments.
        """
        if use_gloo:
            os.environ["PADDLE_WITH_GLOO"] = "1"
            fleet.init(self.role_maker)
        else:
            fleet.init()

        if fleet.is_server():
            print("server: {} started".format(fleet.server_index()))
        else:
            print("worker: {} started".format(fleet.worker_index()))

    def _new_ps_optimizer(self, loss, inner_optimizer, user_defined_strategy):
        """Create a ``ParameterServerOptimizer`` with its basic info set."""
        from paddle.distributed.fleet.meta_optimizers.ps_optimizer import (
            ParameterServerOptimizer,
        )

        ps_optimizer = ParameterServerOptimizer(inner_optimizer)
        ps_optimizer._set_basic_info(
            loss, self.role_maker, inner_optimizer, user_defined_strategy
        )
        return ps_optimizer

    def _dump_main_program(
        self, sync_mode, run_tag, debug_flag, program, include_heter=True
    ):
        """Dump ``program`` to a role-specific prototxt file.

        The file name is ``ps_log_root_dir + sync_mode + run_tag +
        '_debug:_' + str(debug_flag) + '_<role>_main.prototxt'`` where
        ``<role>`` is ``server``, ``worker`` or ``heter_worker``. Nothing is
        written when the process has none of these roles.

        Args:
            sync_mode: value of ``runner.sync_mode``.
            run_tag: scenario part of the file name (with leading ``_``).
            debug_flag: value of the scenario's ``debug_*`` flag.
            program: program passed to ``debug_program``.
            include_heter: whether the heter-worker role is considered.
        """
        if fleet.is_server():
            role = 'server'
        elif fleet.is_worker():
            role = 'worker'
        elif include_heter and self.role_maker._is_heter_worker():
            role = 'heter_worker'
        else:
            return

        _main_file = (
            ps_log_root_dir
            + sync_mode
            + run_tag
            + '_debug:_'
            + str(debug_flag)
            + '_'
            + role
            + '_main.prototxt'
        )
        debug_program(_main_file, program)

    def run_minimize(self):
        """Minimize the loss via the new or legacy optimizer and dump it.

        ``debug_new_minimize == 1`` selects ``ParameterServerOptimizer``;
        any other value uses ``fleet.distributed_optimizer``.
        """
        self.init_fleet_with_gloo()
        self.model = get_model(self.config)
        print("cpu_num: {}".format(os.getenv("CPU_NUM")))
        self.input_data = self.model.create_feeds()
        self.metrics = self.model.net(self.input_data)
        loss = self.model._cost
        user_defined_strategy = get_user_defined_strategy(self.config)
        learning_rate = self.config.get(
            "hyper_parameters.optimizer.learning_rate"
        )
        sync_mode = self.config.get("runner.sync_mode")
        inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True)

        self.role_maker._generate_role()  # required
        if self.config['debug_new_minimize'] == 1:
            print("entering run_minimize -- new")
            ps_optimizer = self._new_ps_optimizer(
                loss, inner_optimizer, user_defined_strategy
            )
            ps_optimizer.minimize_impl(loss)
        else:
            print("entering run_minimize -- old")
            # ``distributed_optimizer`` returns the Fleet object.
            fleet_obj = fleet.distributed_optimizer(
                inner_optimizer, user_defined_strategy
            )
            fleet_obj.minimize(loss)

        self._dump_main_program(
            sync_mode,
            '_run_minimize',
            self.config['debug_new_minimize'],
            loss.block.program,
        )

    def run_single_pass(self):
        """Apply the pass named by ``applied_pass_name`` and dump the result.

        ``debug_new_pass == 1`` applies the pass through ``new_pass`` on the
        optimizer's cloned main program; any other value runs the legacy
        ``append_send_ops_pass`` from ``trainer_pass``.
        """
        self.init_fleet_with_gloo()
        # Uses ``self.config`` throughout so the method does not depend on a
        # module-level ``config`` existing.
        self.model = get_model(self.config)
        self.input_data = self.model.create_feeds()
        self.metrics = self.model.net(self.input_data)
        loss = self.model._cost
        user_defined_strategy = get_user_defined_strategy(self.config)
        learning_rate = self.config.get(
            "hyper_parameters.optimizer.learning_rate"
        )
        sync_mode = self.config.get("runner.sync_mode")
        pass_name = self.config["applied_pass_name"]
        inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True)
        startup_program = paddle.static.default_startup_program()
        inner_optimizer.minimize(loss, startup_program)

        if self.config['debug_new_pass'] == 1:
            print("entering run {} - new".format(str(pass_name)))
            ps_optimizer = self._new_ps_optimizer(
                loss, inner_optimizer, user_defined_strategy
            )
            ps_optimizer._set_origin_programs([loss])
            ps_optimizer._init_ps_pass_context(loss, startup_program)
            _main = ps_optimizer.pass_ctx._attrs['cloned_main']

            applied_pass = new_pass(pass_name, ps_optimizer.pass_ctx._attrs)
            applied_pass.apply([_main], [None], ps_optimizer.pass_ctx)
        else:
            print("entering run {} - old".format(str(pass_name)))
            from paddle.fluid.incubate.fleet.parameter_server.ir import (
                public as public,
            )

            dist_strategy = get_distributed_strategy(user_defined_strategy)
            compiled_config = public.CompileTimeStrategy(
                loss.block.program,
                startup_program,
                dist_strategy,
                self.role_maker,
            )

            _main = compiled_config.origin_main_program.clone()
            from paddle.fluid.incubate.fleet.parameter_server.ir import (
                trainer_pass as worker,
            )

            _main = worker.append_send_ops_pass(_main, compiled_config)

        self._dump_main_program(
            sync_mode,
            "_" + str(pass_name),
            self.config['debug_new_pass'],
            _main,
            include_heter=False,
        )

    def run_the_one_ps(self):
        """Build the PS descriptors with ``TheOnePSRuntime`` and dump them.

        Only ``debug_the_one_ps == 1`` does any optimization work: the loss
        is minimized with ``ParameterServerOptimizer`` and the worker/server
        descriptors are written under ``ps_log_root_dir``. For any other
        value the (unmodified) program is still dumped at the end.
        """
        self.init_fleet_with_gloo()
        self.model = get_model(self.config)
        self.input_data = self.model.create_feeds()
        self.metrics = self.model.net(self.input_data)
        loss = self.model._cost
        user_defined_strategy = get_user_defined_strategy(self.config)
        learning_rate = self.config.get(
            "hyper_parameters.optimizer.learning_rate"
        )
        sync_mode = self.config.get("runner.sync_mode")
        inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True)

        self.role_maker._generate_role()  # required
        if self.config['debug_the_one_ps'] == 1:
            print("entering run_the_one_ps -- new")

            ps_optimizer = self._new_ps_optimizer(
                loss, inner_optimizer, user_defined_strategy
            )
            ps_optimizer.minimize_impl(loss)

            from paddle.distributed.ps.the_one_ps import TheOnePSRuntime

            # Refactored TheOnePSRuntime located under the ps directory.
            _runtime_handle = TheOnePSRuntime()
            _runtime_handle._set_basic_info(ps_optimizer.pass_ctx._attrs)
            if fleet.is_worker():
                worker_desc = (
                    _runtime_handle.ps_desc_builder.build_worker_desc()
                )
                with open(
                    ps_log_root_dir + sync_mode + '_' + 'new_worker_ps_desc',
                    'w',
                ) as f:
                    f.write(worker_desc)
            if fleet.is_server():
                server_desc = (
                    _runtime_handle.ps_desc_builder.build_server_desc()
                )
                with open(
                    ps_log_root_dir + sync_mode + '_' + 'new_server_ps_desc',
                    'w',
                ) as f:
                    f.write(server_desc)

        self._dump_main_program(
            sync_mode,
            '_run_the_one_ps',
            self.config['debug_the_one_ps'],
            loss.block.program,
        )

if __name__ == "__main__":
    # Script entry point: parse the configuration and run the scenario
    # selected by the run_* flags (single pass first, then minimize, then
    # the_one_ps).
    paddle.enable_static()
    config = parse_args()
    print(">>>>>>>>>> python process started")
    # Only export CPU_NUM when a thread count is configured; otherwise the
    # variable would be set to the literal string "None".
    thread_num = config.get("runner.thread_num")
    if thread_num is not None:
        os.environ["CPU_NUM"] = str(thread_num)
    benchmark_main = DnnTrainer(config)
    if config['run_single_pass'] == 1:
        benchmark_main.run_single_pass()
    elif config['run_minimize'] == 1:
        benchmark_main.run_minimize()
    elif config['run_the_one_ps'] == 1:
        benchmark_main.run_the_one_ps()