# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import reduce

import collections
import os
import warnings
import logging
import paddle.fluid as fluid
from paddle.fluid import core
import paddle.fluid.framework as framework

# logging.basicConfig(
#    format='%(levelname)s - %(asctime)s - %(pathname)s: %(lineno)s - %(message)s', level=logging.INFO)
# logger = logging.getLogger(__name__)

Z
ziyoujiyi 已提交
29 30 31 32 33 34 35 36
# Op attribute holding the name scope, and the scope substring that marks an
# op as a gradient-clip op (see get_optimize_ops).
OP_NAME_SCOPE = "op_namescope"
CLIP_OP_NAME_SCOPE = "gradient_clip"
# Variable name used for the step-counter send context (see _step_ctx).
STEP_COUNTER = "@PS_STEP_COUNTER@"
LEARNING_RATE_DECAY_COUNTER = "@LR_DECAY_COUNTER@"

# Attribute names and role values taken from the framework's op proto maker.
OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
RPC_OP_ROLE_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleAttrName()
RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC
op_role = core.op_proto_and_checker_maker.OpRole
op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched
OPT_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Optimize
backward = core.op_proto_and_checker_maker.OpRole.Backward
OP_DEVICE_KEY = core.op_proto_and_checker_maker.kOpDeviceAttrName()

# Devices accepted by find_heter_ops, and the op types it treats as
# communication ops.
DEVICE_LIST = ["cpu", "gpu", "xpu"]
COMMUNICATE_OPS_TYPE = ["send", "recv", "fetch_barrier", "send_barrier"]
# Sparse lookup op types; the dicts map an op type to the input slot that
# holds its table parameter.
SPARSE_OP_LIST = ["lookup_table", "lookup_table_v2"]
SPARSE_OP_TYPE_DICT = {"lookup_table": "W", "lookup_table_v2": "W"}
SPARSE_GRAD_OP_TYPE_DICT = {
    "lookup_table_grad": "W",
    "lookup_table_v2_grad": "W",
}
DEFAULT_DEVICE = 'cpu'

# Name suffixes used to recognise data-norm gradient variables
# (see get_dense_send_context).
DATA_NORM_NAME = [".batch_size", ".batch_sum", ".batch_square_sum"]
DATA_NORM_GRAD_NAME = [x + "@GRAD" for x in DATA_NORM_NAME]

Z
ziyoujiyi 已提交
57

Z
ziyoujiyi 已提交
58 59
def logger_config(log_path, logging_name):
    """Return the logger ``logging_name`` with a file and a console handler.

    Args:
        log_path: Path of the log file. The file handler is created with
            ``delay=True``, so the file is not opened until a record is
            actually emitted.
        logging_name: Name passed to ``logging.getLogger``.

    Returns:
        The configured ``logging.Logger``.

    Note:
        Handlers are added on every call, so calling this twice with the same
        ``logging_name`` attaches a second pair of handlers to that logger.
    """
    logger = logging.getLogger(logging_name)
    # Logger threshold: records below WARNING never reach the handlers.
    logger.setLevel(level=logging.WARNING)

    handler = logging.FileHandler(
        log_path, mode='a', encoding='UTF-8', delay=True
    )
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(levelname)s - %(asctime)s - %(pathname)s: %(lineno)s - %(message)s'
    )
    handler.setFormatter(formatter)

    # The console handler keeps the default formatter.
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)

    logger.addHandler(handler)
    logger.addHandler(console)
    return logger


76
ps_log_root_dir = './ps_log/'
# Module-level logger; its file handler targets ./ps_usr_print_log.
logger = logger_config(
    log_path='./ps_usr_print_log', logging_name='ps_usr_print_log'
)
Z
ziyoujiyi 已提交
80 81


Z
ziyoujiyi 已提交
82 83 84 85 86 87
class DistributedMode:
    """Integer identifiers for the distributed training modes."""

    SYNC = 0
    ASYNC = 1
    HALF_ASYNC = 2
    GEO = 3
    FL = 4
    NU = 5
Z
ziyoujiyi 已提交
89 90


91
class TrainerRuntimeConfig:
    """Communicator settings resolved from a strategy and the environment.

    ``mode`` holds a ``DistributedMode`` value, or ``None`` when the strategy
    matches none of the combinations handled in ``__init__``.
    ``runtime_configs`` maps each ``communicator_*`` key to the value of the
    matching ``FLAGS_communicator_*`` environment variable, or to a default
    when that variable is unset.
    """

    def __init__(self, valid_strategy):
        """Derive the mode and the communicator settings.

        Args:
            valid_strategy: Object exposing ``a_sync`` and
                ``a_sync_configs["k_steps"]``.
        """
        self.mode = None
        num_threads = os.getenv("CPU_NUM", "1")
        send_queue_size = num_threads
        k_steps = valid_strategy.a_sync_configs["k_steps"]

        if k_steps == 0:
            if valid_strategy.a_sync:
                self.mode = DistributedMode.ASYNC
            else:
                self.mode = DistributedMode.SYNC
        elif valid_strategy.a_sync and k_steps > 0:
            self.mode = DistributedMode.GEO
            # In GEO mode the queue defaults follow k_steps, not CPU_NUM.
            send_queue_size = k_steps

        # (key, default) pairs; the environment variable for each key is
        # "FLAGS_" + key. Insertion order is kept because
        # get_communicator_flags may return the keys in this order.
        defaults = [
            ('communicator_max_merge_var_num', send_queue_size),
            ('communicator_send_queue_size', send_queue_size),
            ('communicator_independent_recv_thread', "1"),
            ('communicator_min_send_grad_num_before_recv', num_threads),
            ('communicator_thread_pool_size', "5"),
            ('communicator_send_wait_times', "5"),
            ('communicator_is_sgd_optimizer', "1"),
        ]
        self.runtime_configs = {
            key: os.getenv("FLAGS_" + key, default)
            for key, default in defaults
        }

    def get_communicator_flags(self):
        """Return the communicator flags relevant to the current mode.

        In SYNC / HALF_ASYNC mode, ``communicator_max_merge_var_num`` and
        ``communicator_send_queue_size`` are overwritten in
        ``runtime_configs`` with ``CPU_NUM`` (after printing a warning) when
        they differ from it.

        Returns:
            dict mapping each selected key to ``str(value)``.

        Raises:
            ValueError: If ``mode`` is not None, ASYNC, SYNC, HALF_ASYNC or GEO.
        """
        num_threads = os.getenv("CPU_NUM", "1")
        sync_like = self.mode in (
            DistributedMode.SYNC,
            DistributedMode.HALF_ASYNC,
        )

        if self.mode is None or self.mode == DistributedMode.ASYNC:
            need_keys = self.runtime_configs.keys()
            mode_str = "async"
        elif sync_like:
            mode_str = "sync or half_async"
            need_keys = [
                'communicator_max_merge_var_num',
                'communicator_send_wait_times',
                'communicator_thread_pool_size',
                'communicator_send_queue_size',
            ]
        elif self.mode == DistributedMode.GEO:
            mode_str = "GEO"
            need_keys = [
                'communicator_thread_pool_size',
                'communicator_send_wait_times',
                'communicator_max_merge_var_num',
                'communicator_send_queue_size',
            ]
        else:
            raise ValueError("Unsupported Mode")

        if sync_like:
            # Both settings must equal CPU_NUM in these modes; force them.
            for key in (
                'communicator_max_merge_var_num',
                'communicator_send_queue_size',
            ):
                current = self.runtime_configs[key]
                if current != num_threads:
                    print(
                        'WARNING: In {0} mode, {1} '
                        'must be equal to CPU_NUM. But received, '
                        '{1} = {2}, CPU_NUM = '
                        '{3}. {1} will be forced to {3}.'.format(
                            mode_str, key, current, num_threads
                        )
                    )
                    self.runtime_configs[key] = num_threads

        return {key: str(self.runtime_configs[key]) for key in need_keys}

Z
ziyoujiyi 已提交
199 200 201 202 203

def get_lr_ops(program):
    """Return the learning-rate scheduling ops of ``program``'s global block.

    An op is selected when its role attribute equals the LRSched role, or the
    bitwise OR of the LRSched and Optimize roles.

    Args:
        program: Object whose ``global_block().ops`` is iterated.

    Returns:
        list of matching ops, in block order.
    """
    lr_ops = []
    for op in program.global_block().ops:
        role_id = int(op.attr(RPC_OP_ROLE_ATTR_NAME))
        lr_sched = int(LR_SCHED_OP_ROLE_ATTR_VALUE)
        if role_id in (lr_sched, lr_sched | int(OPT_OP_ROLE_ATTR_VALUE)):
            lr_ops.append(op)
    return lr_ops


211
def get_optimize_ops(_program, remote_sparse=None):
    """Collect the optimizer ops of ``_program``'s global block.

    Args:
        _program: Object whose ``global_block().ops`` is iterated.
        remote_sparse: Optional collection of parameter names. When non-empty,
            only optimizer ops whose first ``Param`` input is in it are kept
            (used for FL: only remote sparse optimize ops are selected).

    Returns:
        list of selected optimizer ops, in block order.

    Side effects:
        Optimizer ops whose name scope contains ``CLIP_OP_NAME_SCOPE`` get
        their ``op_role`` attribute set to Backward and are left out of the
        result.
    """
    if remote_sparse is None:
        remote_sparse = []
    only_remote = len(remote_sparse) > 0

    opt_ops = []
    for op in _program.global_block().ops:
        if not _is_opt_role_op(op):
            continue
        if only_remote and op.input("Param")[0] not in remote_sparse:
            # for fl: only delete remote sparse optimize
            continue
        # delete clip op from opt_ops when run in Parameter Server mode
        if (
            OP_NAME_SCOPE in op.all_attrs()
            and CLIP_OP_NAME_SCOPE in op.attr(OP_NAME_SCOPE)
        ):
            op._set_attr(
                "op_role",
                int(core.op_proto_and_checker_maker.OpRole.Backward),
            )
            continue
        opt_ops.append(op)
    return opt_ops


235 236 237 238 239 240 241 242 243
def get_datanorm_ops(_program):
    """Return every ``data_norm`` op of ``_program``'s global block, in order."""
    return [
        candidate
        for candidate in _program.global_block().ops
        if candidate.type == 'data_norm'
    ]


Z
ziyoujiyi 已提交
244 245 246 247 248 249 250 251 252 253 254 255 256 257
def get_dist_env():
    """Read the trainer topology from environment variables.

    ``PADDLE_TRAINER_ID`` (default ``'0'``) gives the trainer id and
    ``PADDLE_TRAINER_ENDPOINTS`` a comma-separated endpoint list. When the
    endpoint variable is unset or empty, the endpoint fields are empty strings
    and ``num_trainers`` is 0.

    Returns:
        dict with keys ``trainer_id``, ``num_trainers``, ``current_endpoint``
        and ``trainer_endpoints``.

    Raises:
        IndexError: If the trainer id is out of range for the endpoint list.
    """
    trainer_id = int(os.getenv('PADDLE_TRAINER_ID', '0'))
    trainer_endpoints = os.getenv('PADDLE_TRAINER_ENDPOINTS') or ''
    current_endpoint = ''
    num_trainers = 0
    if trainer_endpoints:
        endpoints = trainer_endpoints.split(',')
        current_endpoint = endpoints[trainer_id]
        num_trainers = len(endpoints)

    return {
        'trainer_id': trainer_id,
        'num_trainers': num_trainers,
        'current_endpoint': current_endpoint,
        'trainer_endpoints': trainer_endpoints,
    }


262 263 264 265 266 267 268
def get_role_id(role_maker):
    """Return the role id, falling back to the public ``role_id()`` accessor.

    ``role_maker._role_id()`` is tried first; if it raises any ``Exception``
    the result of ``role_maker.role_id()`` is returned instead.
    """
    try:
        role_id = role_maker._role_id()
    except Exception:
        role_id = role_maker.role_id()
    return role_id


Z
ziyoujiyi 已提交
269 270 271 272 273 274 275
def get_ps_endpoint(role_maker):
    """Return the pserver endpoint at this role's id.

    The private ``_get_pserver_endpoints()`` accessor is tried first; if the
    lookup raises any ``Exception`` (including a failed index), the public
    ``get_pserver_endpoints()`` accessor is used instead.
    """
    try:
        endpoint = role_maker._get_pserver_endpoints()[
            get_role_id(role_maker)
        ]
    except Exception:
        endpoint = role_maker.get_pserver_endpoints()[
            get_role_id(role_maker)
        ]
    return endpoint


W
wangguanqun 已提交
276 277 278 279 280 281 282
def get_ps_endpoints(role_maker):
    """Return all pserver endpoints reported by ``role_maker``.

    ``_get_pserver_endpoints()`` is tried first; on any ``Exception`` the
    public ``get_pserver_endpoints()`` is used instead.
    """
    try:
        endpoints = role_maker._get_pserver_endpoints()
    except Exception:
        endpoints = role_maker.get_pserver_endpoints()
    return endpoints


Z
ziyoujiyi 已提交
283
def get_heter_worker_endpoint(role_maker):
    """Return ``role_maker._get_heter_worker_endpoint()``."""
    return role_maker._get_heter_worker_endpoint()
Z
ziyoujiyi 已提交
285 286 287


def get_trainer_endpoint(role_maker):
    """Return ``role_maker._get_trainer_endpoint()``."""
    return role_maker._get_trainer_endpoint()
Z
ziyoujiyi 已提交
289 290


291 292 293 294
def get_trainer_endpoints(role_maker):
    """Return ``role_maker._get_trainer_endpoints()``."""
    endpoints = role_maker._get_trainer_endpoints()
    return endpoints


Z
ziyoujiyi 已提交
295 296
def get_previous_stage_trainers(role_maker):
    """Return the previous-stage trainers reported by ``role_maker``.

    ``_get_previous_trainers()`` is tried first; on any ``Exception`` the
    public ``get_previous_trainers()`` is used instead.
    """
    try:
        return role_maker._get_previous_trainers()
    except Exception:
        return role_maker.get_previous_trainers()


def is_distributed_sparse_op(op):
    """Return True for a sparse lookup op flagged as distributed.

    An op qualifies when its type is in ``SPARSE_OP_LIST`` or is
    ``"distributed_lookup_table"``, and its ``is_distributed`` attribute is
    exactly ``True``.
    """
    if op.type in SPARSE_OP_LIST or op.type == "distributed_lookup_table":
        return op.attr('is_distributed') is True
    return False


def get_sparse_tablename(op):
    """Return the first variable name bound to the op's ``W`` input slot."""
    table_inputs = op.input("W")
    return table_inputs[0]


def is_sparse_op(op):
    """Return True for a sparse lookup op that is not distributed.

    An op qualifies when either:
      * its type is in ``SPARSE_OP_LIST``, ``is_sparse`` is exactly ``True``
        and ``is_distributed`` is exactly ``False``; or
      * its type is ``"distributed_lookup_table"`` and ``is_distributed`` is
        exactly ``False``.
    """
    if (
        op.type in SPARSE_OP_LIST
        and op.attr('is_sparse') is True
        and op.attr('is_distributed') is False
    ):
        return True

    if (
        op.type == "distributed_lookup_table"
        and op.attr('is_distributed') is False
    ):
        return True

    return False


W
wangguanqun 已提交
336
def get_sparse_tablenames(programs, is_distributed):
    """Collect the distinct sparse table names used across ``programs``.

    Args:
        programs: Iterable of programs; each one's ``global_block().ops`` is
            scanned.
        is_distributed: When true, ops are selected with
            ``is_distributed_sparse_op``; otherwise with ``is_sparse_op``.

    Returns:
        list of unique table names. They are gathered in a set, so the order
        of the result is unspecified.
    """
    tablenames = set()
    for program in programs:
        for op in program.global_block().ops:
            if is_distributed:
                selected = is_distributed_sparse_op(op)
            else:
                selected = is_sparse_op(op)
            if selected:
                tablenames.add(get_sparse_tablename(op))
    return list(tablenames)


def get_trainers(role_maker):
    """Return the worker count reported by ``role_maker``.

    ``_worker_num()`` is tried first; on any ``Exception`` the public
    ``worker_num()`` is used instead.
    """
    try:
        worker_count = role_maker._worker_num()
    except Exception:
        worker_count = role_maker.worker_num()
    return worker_count


357 358 359 360 361 362 363 364
def get_dense_send_context(
    program,
    send_ctx,
    idx,
    merged_dense_pairs,
    trainer_id,
    split_dense_table=False,
):
    """Register dense-gradient send contexts in ``send_ctx``.

    Args:
        program: Program whose global block holds the gradient variables.
        send_ctx: dict updated in place with ``name -> CommContext`` entries.
        idx: First table index to use.
        merged_dense_pairs: Sequence of pairs; element ``[1]`` of each pair
            must expose ``merged_var.name``.
        trainer_id: Trainer id forwarded to every ``CommContext``.
        split_dense_table: When false, all pairs are merged into one
            ``Dense@GRAD_<idx>`` table, plus one ``DataNorm@GRAD_<idx>`` table
            if any gradient name ends with a ``DATA_NORM_GRAD_NAME`` suffix.
            When true, each gradient gets its own table named after the
            gradient variable.

    Returns:
        The next unused table index. ``idx`` is returned unchanged when
        ``merged_dense_pairs`` is empty.
    """
    if len(merged_dense_pairs) < 1:
        return idx

    from paddle.fluid.core import CommContext

    def _numel(varname):
        # Number of elements of the named global-block variable.
        var = program.global_block().vars[varname]
        return reduce(lambda x, y: x * y, var.shape)

    def _register(table_name, numel, origin_names, table_idx, datanorm_flag):
        # ``datanorm_flag`` is the only positional argument that differs
        # between the data-norm table and the other dense tables.
        send_ctx[table_name] = CommContext(
            table_name,
            [table_name],
            ["127.0.0.1:6071"],
            [numel],
            origin_names,
            trainer_id,
            True,
            False,
            False,
            table_idx,
            False,
            datanorm_flag,
            id(program),
            [],
        )

    if split_dense_table:
        # One table per gradient variable.
        for merged in merged_dense_pairs:
            origin_varname = merged[1].merged_var.name
            _register(
                origin_varname,
                _numel(origin_varname),
                [origin_varname],
                idx,
                False,
            )
            idx += 1
        return idx

    # Split the pairs into data-norm gradients and everything else.
    datanorm_suffixes = tuple(DATA_NORM_GRAD_NAME)
    dense_pairs = []
    data_norm_pairs = []
    for merged in merged_dense_pairs:
        if merged[1].merged_var.name.endswith(datanorm_suffixes):
            data_norm_pairs.append(merged)
        else:
            dense_pairs.append(merged)

    # simple dense table (registered even when dense_pairs is empty)
    origin_varnames = [m[1].merged_var.name for m in dense_pairs]
    var_numel = sum(_numel(name) for name in origin_varnames)
    _register("Dense@GRAD_" + str(idx), var_numel, origin_varnames, idx, False)
    idx += 1

    if len(data_norm_pairs) <= 0:
        return idx

    # data norm table
    origin_varnames = [m[1].merged_var.name for m in data_norm_pairs]
    var_numel = sum(_numel(name) for name in origin_varnames)
    _register(
        "DataNorm@GRAD_" + str(idx), var_numel, origin_varnames, idx, True
    )
    idx += 1
    return idx


481 482
def get_geo_trainer_send_context(attrs):
    """Build the sparse send contexts for a trainer running in GEO mode.

    Args:
        attrs: dict read for the keys ``ps_mode``, ``role_maker``,
            ``origin_main_programs``, ``merged_sparse_pairs``,
            ``remote_sparse``, ``tensor_table`` and ``is_worker``.

    Returns:
        dict mapping a context name to its ``CommContext``. A step-counter
        context is added when ``attrs['tensor_table']`` is non-empty and
        ``attrs['is_worker']`` is true.

    Raises:
        ValueError: If ``attrs['ps_mode']`` is not ``DistributedMode.GEO``, or
            if no sparse context was created.
    """
    if attrs['ps_mode'] != DistributedMode.GEO:
        # The previous code referenced an undefined name and passed the
        # unformatted template, so this path raised NameError instead.
        raise ValueError(
            "ps mode: {} not matched {}".format(
                attrs['ps_mode'], "get_geo_trainer_send_context"
            )
        )
    send_ctx = {}
    trainer_id = get_role_id(attrs['role_maker'])
    origin_programs = attrs['origin_main_programs']
    idx = 0  # table idx

    distibuted_varnames = get_sparse_tablenames(origin_programs, True)
    for i, program in enumerate(origin_programs):
        merged_sparse_pairs = attrs['merged_sparse_pairs'][i]
        for param, grad in merged_sparse_pairs:
            grad_name = grad.merged_var.name
            param_name = param.merged_var.name
            if param_name in attrs['remote_sparse']:  # for recall/ncf model
                continue

            is_distributed = param_name in distibuted_varnames
            var = program.global_block().vars[grad.merged_var.name]
            # Product of every dimension except the first.
            var_numel = reduce(lambda x, y: x * y, var.shape[1:])
            from paddle.fluid.core import CommContext

            print(
                "public get_the_geo_send_context sparse: ", grad_name, var_numel
            )
            sparse_ctx = CommContext(
                grad_name,
                [grad_name],
                ["127.0.0.1:6071"],
                [var_numel],
                [grad_name],
                trainer_id,
                True,
                True,
                is_distributed,
                idx,
                False,
                False,
                id(program),
                [],
            )
            idx += 1
            send_ctx[sparse_ctx.var_name()] = sparse_ctx

    if len(send_ctx) == 0:
        raise ValueError("GeoSGD require sparse parameters in your net.")

    if len(attrs['tensor_table']) > 0 and attrs['is_worker']:
        name, ctx = _step_ctx(idx, attrs['role_maker'])
        send_ctx[name] = ctx

    return send_ctx


def _step_ctx(idx, role_maker):
    """Build the send context for the ``STEP_COUNTER`` variable.

    Args:
        idx: Table index given to the context.
        role_maker: Source of the trainer id and the pserver endpoints.

    Returns:
        ``(name, ctx)`` where ``name`` is ``STEP_COUNTER`` and ``ctx`` is a
        ``CommContext`` with one section of size 1 per pserver endpoint.
    """
    name = STEP_COUNTER
    trainer_id = get_role_id(role_maker)
    endpoints = get_ps_endpoints(role_maker)
    endpoint_count = len(endpoints)
    sections = [1] * endpoint_count
    names = [name] * endpoint_count
    from paddle.fluid.core import CommContext

    ctx = CommContext(
        name,
        names,
        endpoints,
        sections,
        [name],
        trainer_id,
        True,
        False,
        False,
        idx,
        True,
        False,
        -1,
        [],
    )
    return name, ctx


568
def get_the_one_send_context(attrs, split_dense_table=False, ep_list=None):
    """Build the sparse, dense and step-counter send contexts.

    Args:
        attrs: dict read for the keys ``role_maker``, ``origin_main_programs``,
            ``merged_sparse_pairs``, ``merged_dense_pairs``, ``remote_sparse``,
            ``tensor_table`` and ``is_worker``.
        split_dense_table: Forwarded to ``get_dense_send_context``.
        ep_list: Endpoint list for the sparse tables; defaults to
            ``["127.0.0.1:6071"]``.

    Returns:
        dict mapping a context name to its ``CommContext``. Sparse tables are
        numbered first, then the dense tables, then the optional step counter.
    """
    if ep_list is None:
        ep_list = ["127.0.0.1:6071"]
    send_ctx = {}
    trainer_id = get_role_id(attrs['role_maker'])
    origin_programs = attrs['origin_main_programs']
    print("is_heter_ps_mode? {}".format(split_dense_table))

    idx = 0
    distibuted_varnames = get_sparse_tablenames(origin_programs, True)
    for prog_idx, program in enumerate(origin_programs):
        merged_sparse_pairs = attrs['merged_sparse_pairs'][prog_idx]
        for param, grad in merged_sparse_pairs:
            grad_name = grad.merged_var.name
            param_name = param.merged_var.name

            remote_sparse_ids = []
            if param_name in attrs['remote_sparse']:  # for recall/ncf model
                remote_sparse_ids.append(idx)

            # One "<param>.block<k>" name per endpoint. The counter has its
            # own name so it cannot clobber the program index above.
            splited_varname = [
                "{}.block{}".format(param_name, block_id)
                for block_id in range(len(ep_list))
            ]

            is_distributed = param_name in distibuted_varnames

            var = program.global_block().vars[grad.merged_var.name]

            shape = list(var.shape)
            # Distributed tables are sent with a leading dimension of 0.
            shape[0] = 0 if is_distributed else shape[0]

            # A gradient already registered by an earlier pair is skipped.
            if grad_name in send_ctx:
                continue
            from paddle.fluid.core import CommContext

            print(
                "public get_the_one_send_context sparse: ",
                grad_name,
                splited_varname,
                shape,
            )
            sparse_ctx = CommContext(
                grad_name,
                splited_varname,
                ep_list,
                shape,
                [grad_name],
                trainer_id,
                True,
                True,
                is_distributed,
                idx,
                False,
                False,
                id(program),
                remote_sparse_ids,
            )

            idx += 1
            send_ctx[sparse_ctx.var_name()] = sparse_ctx

    for prog_idx, program in enumerate(origin_programs):
        merged_dense_pairs = attrs['merged_dense_pairs'][prog_idx]
        idx = get_dense_send_context(
            program,
            send_ctx,
            idx,
            merged_dense_pairs,
            trainer_id,
            split_dense_table,
        )

    if len(attrs['tensor_table']) > 0 and attrs['is_worker']:
        name, ctx = _step_ctx(idx, attrs['role_maker'])
        send_ctx[name] = ctx

    return send_ctx


def find_heter_ops(program, default_device="cpu"):
    if default_device not in DEVICE_LIST:
653 654 655 656 657
        raise ValueError(
            "Given device {} is not in device list {}".format(
                default_device, DEVICE_LIST
            )
        )
Z
ziyoujiyi 已提交
658 659 660 661 662 663 664 665

    def _is_heter_op(op, current_heter_device, default_device="cpu"):
        heter_devices = list(DEVICE_LIST)
        heter_devices.remove(default_device)
        op_device = op.attr("op_device")
        op_type = op.type
        if op_device in heter_devices:
            return True
666 667 668 669
        elif (
            op_type in COMMUNICATE_OPS_TYPE
            and current_heter_device != default_device
        ):
Z
ziyoujiyi 已提交
670 671
            # for distributed communciate ops: send & recv & barrier etc.
            # Todo: need update this method
672
            # op._set_attr('op_device', current_heter_device)
Z
ziyoujiyi 已提交
673
            return True
674
        elif op_device is None or op_device == default_device:
Z
ziyoujiyi 已提交
675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706
            op._set_attr('op_device', default_device)
            return False
        return False

    def _is_same_device(op, pre_device, default_device="cpu"):
        op_device = op.attr("op_device")
        if op_device == pre_device:
            return True
        if pre_device == default_device:
            return True
        return False

    def _append_heter_op(op, current_heter_block_ops, heter_ops):
        op_device = op.attr("op_device")
        if op_device not in heter_ops:
            heter_ops[op_device] = {}
        current_heter_block_ops.append(op)

    origin_porgram = program.clone()
    block = program.global_block()
    '''
       re-place sum op to fix bug for union forward backward op
    '''
    var2idx = {}
    op_list = list(block.ops)
    op_size = len(op_list)

    for i in range(op_size - 1, -1, -1):
        op_list = list(block.ops)
        op = op_list[i]
        if "_grad" in op.type:
            forward_op_type = op.type.split("_grad")[0]
707 708 709 710
            if (
                forward_op_type in SPARSE_OP_TYPE_DICT.keys()
                and op.attr('remote_prefetch') is True
            ):
Z
ziyoujiyi 已提交
711 712
                param_name = op.input(SPARSE_OP_TYPE_DICT[forward_op_type])[0]
                if param_name in var2idx:
713
                    # insert sum op & remove sum op from var2idx and origin place
Z
ziyoujiyi 已提交
714 715 716
                    op_list = list(block.ops)
                    sum_op = op_list[var2idx[param_name]]
                    sum_op_inputs = {
717 718 719 720
                        sum_op.input_names[0]: [
                            block.vars[input]
                            for input in sum_op.input_arg_names
                        ]
Z
ziyoujiyi 已提交
721 722 723 724 725 726 727
                    }
                    sum_op_outputs = {
                        sum_op.output_names[0]: [
                            block.vars[output]
                            for output in sum_op.output_arg_names
                        ]
                    }
728 729 730 731 732 733 734
                    block._insert_op(
                        index=i + 1,
                        type=sum_op.type,
                        inputs=sum_op_inputs,
                        outputs=sum_op_outputs,
                        attrs=sum_op.all_attrs(),
                    )
Z
ziyoujiyi 已提交
735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754
                    block._remove_op(var2idx[param_name] + 1)
                    var2idx.pop(param_name)
                    for var_ in var2idx:
                        var2idx[var_] += 1
            elif forward_op_type == "elementwise_mul":
                """
                get output varname of pre op

                """
                output_vars_no_grad = []
                for key in op.output_names:
                    for varname in op.output(key):
                        if varname == "@EMPTY@":
                            continue
                        if "lod_tensor_blocking_queue" in varname:
                            continue
                        output_vars_no_grad.append(varname.split("@GRAD")[0])
                for no_grad_var in output_vars_no_grad:
                    if no_grad_var in var2idx:
                        """
755
                        insert sum op & remove sum op from var2idx and origin place
756

757
                        """
Z
ziyoujiyi 已提交
758 759 760 761 762 763 764 765 766 767 768 769 770 771
                        op_list = list(block.ops)
                        sum_op = op_list[var2idx[no_grad_var]]
                        sum_op_inputs = {
                            sum_op.input_names[0]: [
                                block.vars[input]
                                for input in sum_op.input_arg_names
                            ]
                        }
                        sum_op_outputs = {
                            sum_op.output_names[0]: [
                                block.vars[output]
                                for output in sum_op.output_arg_names
                            ]
                        }
772 773 774 775 776 777 778
                        block._insert_op(
                            index=i + 1,
                            type=sum_op.type,
                            inputs=sum_op_inputs,
                            outputs=sum_op_outputs,
                            attrs=sum_op.all_attrs(),
                        )
Z
ziyoujiyi 已提交
779 780 781 782 783 784 785 786 787 788 789 790
                        block._remove_op(var2idx[no_grad_var] + 1)
                        var2idx.pop(no_grad_var)
                        for var_ in var2idx:
                            var2idx[var_] += 1
        else:
            if op.type == "sum":
                var = op.output("Out")[0]
                if "@GRAD" in var:
                    origin_var = var.split("@GRAD")[0]
                    pre_op = op_list[i - 1]
                    if "_grad" in pre_op.type:
                        forward_op_type = pre_op.type.split("_grad")[0]
791 792 793 794
                        if (
                            forward_op_type in SPARSE_OP_TYPE_DICT.keys()
                            and pre_op.attr('remote_prefetch') is True
                        ):
795
                            param_name = pre_op.input(
796 797
                                SPARSE_OP_TYPE_DICT[forward_op_type]
                            )[0]
Z
ziyoujiyi 已提交
798
                            if param_name == origin_var and op.attr(
799 800
                                "op_device"
                            ) == pre_op.attr("op_device"):
Z
ziyoujiyi 已提交
801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852
                                continue
                            else:
                                var2idx[origin_var] = i
                        elif forward_op_type == "elementwise_mul":
                            output_vars = []
                            for key in pre_op.output_names:
                                for varname in pre_op.output(key):
                                    if varname == "@EMPTY@":
                                        continue
                                    if "lod_tensor_blocking_queue" in varname:
                                        continue
                                    output_vars.append(varname)
                            input_vars = []
                            for key in op.input_names:
                                for varname in op.input(key):
                                    if varname == "@EMPTY@":
                                        continue
                                    if "lod_tensor_blocking_queue" in varname:
                                        continue
                                    input_vars.append(varname)
                            is_match = False
                            for varname in output_vars:
                                if varname in input_vars:
                                    is_match = True
                                    break
                            if is_match:
                                continue
                            else:
                                var2idx[origin_var] = i
                    else:
                        var2idx[origin_var] = i

    origin_porgram = program.clone()
    block = program.global_block()

    program_block_ops = []
    default_ops = {default_device: {}}
    heter_ops = {}
    block_index = 0

    current_heter_block_ops = []
    current_default_block_ops = []
    current_heter_device = default_device
    is_heter = False
    for op in block.ops:
        if _is_heter_op(op, current_heter_device, default_device):
            # for gpu/xpu-op
            is_heter = True

            # for cpu-op block append
            if len(current_default_block_ops) > 1:
                default_ops[default_device][
853 854
                    block_index
                ] = current_default_block_ops
Z
ziyoujiyi 已提交
855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898
                program_block_ops.append(current_default_block_ops)
                current_default_block_ops = []
                block_index += 1

            if _is_same_device(op, current_heter_device, default_device):
                # for gpu-op, gpu-op -> gpu-op,...
                current_heter_device = op.attr("op_device")
                _append_heter_op(op, current_heter_block_ops, heter_ops)
            else:
                # for gpu-op -> xpu-op, ...
                op_device = current_heter_block_ops[0].attr("op_device")
                heter_ops[op_device][block_index] = current_heter_block_ops
                program_block_ops.append(current_heter_block_ops)
                block_index += 1
                current_heter_block_ops = []
                current_heter_device = op.attr("op_device")
                _append_heter_op(op, current_heter_block_ops, heter_ops)

        elif is_heter:
            # for gpu/xpu-op -> cpu-op
            op_device = current_heter_block_ops[0].attr("op_device")
            heter_ops[op_device][block_index] = current_heter_block_ops
            program_block_ops.append(current_heter_block_ops)
            block_index += 1
            current_heter_block_ops = []
            current_heter_device = default_device
            is_heter = False
            current_default_block_ops.append(op)
        else:
            # for cpu-op
            current_default_block_ops.append(op)

    if current_default_block_ops != []:
        default_ops[default_device][block_index] = current_default_block_ops
        program_block_ops.append(current_default_block_ops)

    if current_heter_block_ops != []:
        op_device = current_heter_block_ops[0].attr("op_device")
        heter_ops[op_device][block_index] = current_heter_block_ops
        program_block_ops.append(current_heter_block_ops)

    if len(heter_ops) == 0:
        warnings.warn(
            "No heterogeneous OP was found in your program , "
899 900
            " please using fluid.device_guard() to run OPs on different device."
        )
Z
ziyoujiyi 已提交
901 902 903 904 905 906 907 908 909

    total_heter_ops = 0
    heter_blocks = 0
    for device in heter_ops.keys():
        heter_block_dict = heter_ops[device]
        heter_blocks += len(heter_block_dict)
        for _, heter_block in heter_block_dict.items():
            total_heter_ops += len(heter_block)
    print(
910 911 912 913
        "There are {} OPs in your main_program, and contains {} heter-OPs which is made up of {} heter-blocks.".format(
            len(block.ops), total_heter_ops, heter_blocks
        )
    )
Z
ziyoujiyi 已提交
914 915 916 917 918 919 920

    return origin_porgram, heter_ops, default_ops, program_block_ops


def union_forward_gradient_op(program_block_ops_list):
    """Pair every forward op block with its corresponding backward op block.

    Before analyzing the inputs and outputs of each block, the forward ops
    and their gradient ops are united so that no unnecessary variable has to
    be transmitted between blocks.

    Block ``i`` becomes the forward half and block ``len - 1 - i`` the
    backward half of the same entry.  The middle block is split op by op:
    ops whose type contains ``"_grad"`` and ``sum`` ops go to the backward
    half (the ``sum`` placement is the fix for the 2emb model), all other
    ops go to the forward half.

    Args:
        program_block_ops_list: list of op lists; its length must be odd.

    Returns:
        A list of ``{"forward": [...], "backward": [...]}`` dicts, with the
        split middle block as the last entry.

    Raises:
        AssertionError: if the number of blocks is even (zero included).
    """
    block_length = len(program_block_ops_list)
    # Raised explicitly rather than with ``assert`` so that the check is not
    # stripped when Python runs with -O.
    if block_length % 2 == 0:
        raise AssertionError(
            "the length of program_block_ops_list should be odd"
        )

    middle_index = block_length // 2
    union_program_block_ops_list = [
        {
            "forward": program_block_ops_list[i],
            "backward": program_block_ops_list[block_length - 1 - i],
        }
        for i in range(middle_index)
    ]

    middle_block = {"forward": [], "backward": []}
    for op in program_block_ops_list[middle_index]:
        is_backward_op = "_grad" in op.type or op.type == "sum"
        middle_block["backward" if is_backward_op else "forward"].append(op)
    union_program_block_ops_list.append(middle_block)
    return union_program_block_ops_list


def find_block_joints(program, program_block_ops_list, heter_ops):
    """Compute the per-block variable detail in three chained steps.

    The detail is first built by ``find_entrance_exit_private``, then passed
    through ``entrance_exit_check`` and finally through
    ``delete_block_useless_exit``; the result of the last step is returned.
    """
    detail = find_entrance_exit_private(program, program_block_ops_list)
    detail = entrance_exit_check(
        program, program_block_ops_list, detail, heter_ops
    )
    return delete_block_useless_exit(program, program_block_ops_list, detail)


964 965 966 967 968 969 970 971 972 973 974 975 976 977
def find_ops_list_input_output(program, ops_list):
    """Return the distinct input and output variable names of ``ops_list``.

    Variables are resolved against ``program.global_block().vars``.

    Returns:
        ``(input_names, output_names)``; each is a de-duplicated list whose
        order is whatever iterating the intermediate ``set`` yields.
    """
    consumed_names = []
    produced_names = []
    for op in ops_list:
        consumed_names.extend(
            get_varlist_from_op_map(
                _get_input_map_from_op(program.global_block().vars, op)
            )
        )
        produced_names.extend(
            get_varlist_from_op_map(
                _get_output_map_from_op(program.global_block().vars, op)
            )
        )

    return list(set(consumed_names)), list(set(produced_names))


Z
ziyoujiyi 已提交
978 979 980 981
def find_entrance_exit_private(program, program_block_ops_list):
    """Classify the variables of every block into entrance/exit/private sets.

    Args:
        program: program whose global block is used to resolve variables.
        program_block_ops_list: list of ``{"forward": ops, "backward": ops}``.

    Returns:
        One dict per block with the keys ``"forward"`` and ``"backward"``;
        each maps to a dict with ``"entrance"`` (only read), ``"exit"`` (only
        written), ``"private"`` (read and written) and ``"persistables"``.
    """

    def _classify(ops):
        used, made = find_ops_list_input_output(program, ops)
        # screen_persistables removes the names it returns from the list it
        # is given, so ``used`` and ``made`` shrink here.
        persistable = screen_persistables(program, used) + screen_persistables(
            program, made
        )
        private = list(set(used) & set(made))
        return {
            "entrance": list(set(used) - set(private)),
            "exit": list(set(made) - set(private)),
            "private": private,
            "persistables": persistable,
        }

    return [
        {
            "forward": _classify(block_op_list["forward"]),
            "backward": _classify(block_op_list["backward"]),
        }
        for block_op_list in program_block_ops_list
    ]


1029 1030 1031
def entrance_exit_check(
    program, program_block_ops_list, block_var_detail, heter_ops
):
    """Align each block's exit list with the entrance list of its consumer.

    ``block_var_detail`` is modified in place (its lists are sorted and
    extended) and is also returned.  ``program``, ``program_block_ops_list``
    and ``heter_ops`` are accepted but not read here.

    Forward direction (last block down to block 1): non-gradient variables
    that the block's backward part reads but its forward part does not know
    are added to the forward entrance; anything the forward entrance needs
    that the previous block does not export is added to the previous block's
    exit, and to its entrance unless it already has it as entrance/private.

    Backward direction (block 0 upwards): the same propagation towards block
    ``index + 1``, restricted to names containing ``"@GRAD"``.
    """
    stage_count = len(block_var_detail)

    # ---- forward ----
    for index in range(stage_count - 1, 0, -1):
        producer = block_var_detail[index - 1]["forward"]
        producer_exit = producer["exit"]
        producer_exit.sort()

        consumer = block_var_detail[index]["forward"]
        consumer_entrance = consumer["entrance"]
        grad_part_entrance = block_var_detail[index]["backward"]["entrance"]

        # Snapshot taken before the entrance list is extended below.
        known_forward_vars = (
            consumer["entrance"] + consumer["private"] + consumer["exit"]
        )
        for name in grad_part_entrance:
            if "@GRAD" not in name and name not in known_forward_vars:
                consumer_entrance.append(name)
        consumer_entrance.sort()

        if producer_exit == consumer_entrance:
            continue

        shared = list(set(producer_exit) & set(consumer_entrance))
        missing = list(set(consumer_entrance) - set(shared))
        # Variables in different stages must not be ignored: they do not
        # live in the same program and device.

        producer_private = producer["private"]
        producer_entrance = producer["entrance"]
        for name in missing:
            if not (name in producer_private or name in producer_entrance):
                producer_entrance.append(name)
            producer_exit.append(name)
            if name not in consumer_entrance:
                consumer_entrance.append(name)

    # ---- backward ----
    for index in range(stage_count - 1):
        producer = block_var_detail[index + 1]["backward"]
        producer_exit = producer["exit"]
        producer_exit.sort()

        consumer_entrance = block_var_detail[index]["backward"]["entrance"]
        consumer_entrance.sort()

        if producer_exit == consumer_entrance:
            continue

        shared = list(set(producer_exit) & set(consumer_entrance))
        missing = list(set(consumer_entrance) - set(shared))
        # Only gradient variables are propagated in the backward direction.
        non_grad = [name for name in missing if "@GRAD" not in name]
        missing = list(set(missing).difference(set(non_grad)))

        producer_private = producer["private"]
        producer_entrance = producer["entrance"]
        for name in missing:
            if not (name in producer_private or name in producer_entrance):
                producer_entrance.append(name)
            producer_exit.append(name)

    return block_var_detail


1115 1116 1117
def delete_block_useless_exit(
    program, program_block_ops_list, block_var_detail
):
    """Drop exit variables that the neighbouring block does not take in.

    For the forward part, a block's exit list keeps only names found in the
    next block's forward entrance; for the backward part, only names found
    in the previous block's backward entrance.  The forward exit of the last
    block and the backward exit of block 0 are left untouched.  The lists
    are filtered in place and ``block_var_detail`` is returned.
    ``program`` and ``program_block_ops_list`` are not read here.
    """
    stage_count = len(block_var_detail)

    def _keep_consumed(exit_vars, consumer_entrance):
        # Slice assignment keeps the same list object that callers hold.
        exit_vars[:] = [
            name for name in exit_vars if name in consumer_entrance
        ]

    # forward
    for index in range(stage_count - 1):
        _keep_consumed(
            block_var_detail[index]["forward"]["exit"],
            block_var_detail[index + 1]["forward"]["entrance"],
        )

    # backward
    for index in range(stage_count - 1, 0, -1):
        _keep_consumed(
            block_var_detail[index]["backward"]["exit"],
            block_var_detail[index - 1]["backward"]["entrance"],
        )

    return block_var_detail


1149 1150 1151
def get_communicate_var_info(
    program, block_index, entrance_var_list, type="forward"
):
    """Describe the variables a block receives from its neighbouring block.

    Data flow on the receiving side:
    Heter_SERVER_BLOCK_index@JOINT_VAR -> slice ->
    var@Heter_SERVER_BLOCK@INPUT_RESHAPE_VAR -> reshape -> var

    Args:
        program: program whose global block holds the entrance variables.
        block_index: index of the receiving block.
        entrance_var_list: names of the received variables.  NOTE: this list
            is sorted in place, so the caller sees the sorted order.
        type: ``"forward"`` names the joint after blocks
            ``block_index - 1`` and ``block_index``; any other value names
            it after ``block_index + 1`` and ``block_index``.

    Returns:
        dict with ``"input_var_reshape_dim"`` (``-1`` times the product of
        each variable's shape), ``"input_var_reshape_name"``
        (``"<name>.input_reshape@Heter"``) and ``"block_input_var_name"``.
    """
    if type == "forward":
        block_input_var_name = "forward_joint_{}_{}@Heter".format(
            block_index - 1, block_index
        )
    else:
        block_input_var_name = "backward_joint_{}_{}@Heter".format(
            block_index + 1, block_index
        )

    entrance_var_list.sort()

    input_var_reshape_dim = []
    input_var_reshape_name = []
    for name in entrance_var_list:
        shape = program.global_block().vars[name].shape
        # Seeding the product with 1 makes an empty shape count as a single
        # element instead of making reduce() raise TypeError.
        element_count = reduce(lambda x, y: x * y, shape, 1)
        input_var_reshape_dim.append(-1 * element_count)
        input_var_reshape_name.append("{}.input_reshape@Heter".format(name))

    return {
        "input_var_reshape_dim": input_var_reshape_dim,
        "input_var_reshape_name": input_var_reshape_name,
        "block_input_var_name": block_input_var_name,
    }


def add_vars_by_var_list(var_name_list, origin_program, program, block):
    """Clone the named variables from ``origin_program`` where still missing.

    A name is skipped when it already exists in ``program``'s global block
    or in ``block``.  Otherwise the variable is looked up in
    ``origin_program``'s global block and cloned (``force_persistable=False``)
    into ``program``'s global block if it is persistable, else into ``block``.
    """
    for name in var_name_list:
        if name in program.global_block().vars or name in block.vars:
            continue

        source_var = origin_program.global_block().vars[name]
        destination = program.global_block() if source_var.persistable else block
        destination._clone_variable(source_var, force_persistable=False)


def _get_output_map_from_op(varmap, op):
    """Returns a dict from op output name to the vars in varmap."""
    iomap = collections.OrderedDict()
    for key in op.output_names:
        vars = []
        for varname in op.output(key):
            if varname == "@EMPTY@":
                continue
            if "lod_tensor_blocking_queue" in varname:
                continue
            vars.append(varmap[varname])
        if len(vars) == 1:
            iomap[key] = vars[0]
        else:
            iomap[key] = vars
    return iomap


1216 1217
def get_varlist_from_op_map(var_map):
    """Flatten a slot map into a flat list of variable names.

    Args:
        var_map: mapping whose values are either a single variable or a
            ``list`` of variables; each variable exposes ``.name``.

    Returns:
        The ``.name`` of every variable, in mapping order (duplicates kept).
    """
    var_list = []
    for entry in var_map.values():
        # A non-list value is treated as one variable.
        members = entry if isinstance(entry, list) else [entry]
        var_list.extend(var.name for var in members)
    return var_list


def _get_input_map_from_op(varmap, op):
    """Returns a dict from op input name to the vars in varmap."""
    iomap = collections.OrderedDict()
    for key in op.input_names:
        vars = []
        for varname in op.input(key):
            if varname == "@EMPTY@":
                continue
            if "lod_tensor_blocking_queue" in varname:
                continue
            vars.append(varmap[varname])
        if len(vars) == 1:
            iomap[key] = vars[0]
        else:
            iomap[key] = vars
    return iomap


def screen_persistables(program, var_list):
    """Remove persistable variables from ``var_list`` and return their names.

    A name containing ``"@GRAD"`` is judged by the variable named by the text
    before the first ``"@GRAD"``, but only when the last ``"@"``-separated
    piece is exactly ``"GRAD"``; other ``"@GRAD"`` names are left alone.
    Every other name is judged by its own variable in the global block.

    ``var_list`` is mutated in place.

    Returns:
        The names that were removed, in their original order.
    """
    persistable_names = []
    for name in var_list:
        lookup_name = name
        if "@GRAD" in name:
            if name.split("@")[-1] != "GRAD":
                continue
            lookup_name = name.split("@GRAD")[0]

        if fluid.io.is_persistable(program.global_block().vars[lookup_name]):
            persistable_names.append(name)

    for name in persistable_names:
        var_list.remove(name)
    return persistable_names


Z
ziyoujiyi 已提交
1264 1265 1266 1267
def block_append_op(program, origin_program, block, op):
    """Append a copy of ``op`` to ``block``, cloning the variables it needs.

    Inputs are resolved against ``origin_program``'s global-block variables
    overlaid with ``block.vars``; outputs against ``origin_program``'s
    global-block variables only.  Each resolved variable that is in neither
    ``program``'s global block nor ``block`` is cloned into the global block
    when persistable, otherwise into ``block``.

    Returns:
        The appended op when ``op.type`` does not contain ``"_grad"``.  For a
        grad op the op desc is copied instead and nothing is returned.
    """

    def _clone_missing(slot_map):
        for entry in slot_map.values():
            members = entry if isinstance(entry, list) else [entry]
            for var in members:
                if (
                    var.name in program.global_block().vars
                    or var.name in block.vars
                ):
                    continue
                owner = program.global_block() if var.persistable else block
                owner._clone_variable(var, force_persistable=False)

    lookup = origin_program.global_block().vars.copy()
    lookup.update(block.vars)
    inputs = _get_input_map_from_op(lookup, op)
    _clone_missing(inputs)

    outputs = _get_output_map_from_op(origin_program.global_block().vars, op)
    _clone_missing(outputs)

    if "_grad" not in op.type:
        # forward op
        return block.append_op(
            type=op.type, inputs=inputs, outputs=outputs, attrs=op.all_attrs()
        )

    # grad op: copy the desc and mark it with the Backward role
    source_desc = op.desc
    backward_role = core.op_proto_and_checker_maker.OpRole.Backward
    device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()

    grad_desc = block.desc.append_op()
    grad_desc.copy_from(source_desc)
    grad_desc._set_attr(RPC_OP_ROLE_ATTR_NAME, backward_role)

    # carry over the device guard when the source op has one
    if op.desc.has_attr(device_attr_name):
        grad_desc._set_attr(device_attr_name, source_desc.attr(device_attr_name))
    block._sync_with_cpp()


def get_next_stage_trainers(role_maker):
    """Return the next-stage trainers reported by ``role_maker``.

    ``role_maker._get_next_trainers()`` is tried first; if it raises any
    ``Exception``, ``role_maker.get_next_trainers()`` is used instead.
    """
    try:
        next_trainers = role_maker._get_next_trainers()
    except Exception:
        next_trainers = role_maker.get_next_trainers()
    return next_trainers


1329 1330 1331 1332 1333 1334 1335 1336 1337 1338
def insert_communicate_op(
    orign_program,
    role_maker,
    heter_block,
    stage_id,
    first_op_index,
    block_var_detail,
    device,
    is_forward=True,
):
    """Insert a ``send_and_recv`` op into ``heter_block`` at ``first_op_index``.

    For the forward direction the sent variables are the forward entrance of
    ``block_var_detail[stage_id]`` and the joint is named for block
    ``stage_id + 1``; otherwise they are the backward exit of
    ``block_var_detail[stage_id - 1]`` and the joint is named for block
    ``stage_id - 1``.  ``get_communicate_var_info`` sorts that list in place,
    so the op's ``X`` input is the first name in sorted order.

    Returns:
        The (sorted) list of variable names that the op sends.
    """
    next_heter_worker_endpoints = get_next_stage_trainers(role_maker)
    previous_heter_worker_endpoints = get_previous_stage_trainers(role_maker)

    if is_forward:
        mode = "forward"
        entrance_var = block_var_detail[stage_id]["forward"]["entrance"]
        comm_info = get_communicate_var_info(
            orign_program, stage_id + 1, entrance_var
        )
    else:
        mode = "backward"
        entrance_var = block_var_detail[stage_id - 1]["backward"]["exit"]
        comm_info = get_communicate_var_info(
            orign_program, stage_id - 1, entrance_var, "backward"
        )

    op_inputs = {"X": heter_block.vars[entrance_var[0]]}
    op_attrs = {
        "mode": mode,
        "send_var_name": entrance_var + ["microbatch_id"],
        "recv_var_name": [],
        "message_name": comm_info["block_input_var_name"],
        "next_endpoints": next_heter_worker_endpoints,
        "previous_endpoints": previous_heter_worker_endpoints,
        "trainer_id": get_role_id(role_maker),
        "op_device": device,
        RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
    }
    heter_block._insert_op(
        index=first_op_index,
        type="send_and_recv",
        inputs=op_inputs,
        outputs={"Out": []},
        attrs=op_attrs,
    )

    return entrance_var


1381
def get_the_one_recv_context(context, is_dense=True, split_dense_table=False):
    """Map table ids to the parameter names to receive for each table.

    The send contexts come from ``get_the_one_send_context``.  With
    ``is_dense`` set, sparse and tensor-table contexts are skipped and
    ``split_dense_table`` is forwarded; otherwise only sparse contexts are
    kept and the send contexts are requested with
    ``split_dense_table=False, ep_list=None``.

    Returns:
        dict from ``ctx.table_id()`` to the parameter names obtained by
        translating ``ctx.origin_varnames()`` through
        ``context["grad_name_to_param_name"]``.
    """
    if is_dense:
        send_ctx = get_the_one_send_context(
            context, split_dense_table=split_dense_table
        )
    else:
        send_ctx = get_the_one_send_context(
            context, split_dense_table=False, ep_list=None
        )

    recv_id_maps = {}
    for _, ctx in send_ctx.items():
        if is_dense:
            if ctx.is_sparse() or ctx.is_tensor_table():
                continue
        elif not ctx.is_sparse():
            continue

        recv_id_maps[ctx.table_id()] = [
            context["grad_name_to_param_name"][grad_varname]
            for grad_varname in ctx.origin_varnames()
        ]
    return recv_id_maps


def _get_varname_parts(varname):
    # returns origin, blockid, trainerid
    orig_var_name = ""
    trainer_part = ""
    block_part = ""
    trainer_idx = varname.find(".trainer_")
    if trainer_idx >= 0:
1426
        trainer_part = varname[trainer_idx + 1 :]
Z
ziyoujiyi 已提交
1427 1428 1429 1430
    else:
        trainer_idx = len(varname)
    block_index = varname.find(".block")
    if block_index >= 0:
1431
        block_part = varname[block_index + 1 : trainer_idx]
Z
ziyoujiyi 已提交
1432 1433
    else:
        block_index = len(varname)
1434
    orig_var_name = varname[0 : min(block_index, trainer_idx)]
Z
ziyoujiyi 已提交
1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463
    return orig_var_name, block_part, trainer_part


# Size in bytes of one element for each supported variable dtype.
# get_var_mem_size multiplies a variable's element count by this value;
# a dtype that is not listed here makes that lookup raise KeyError.
dtype_to_size = {
    core.VarDesc.VarType.FP16: 2,
    core.VarDesc.VarType.FP32: 4,
    core.VarDesc.VarType.FP64: 8,
    core.VarDesc.VarType.INT16: 2,
    core.VarDesc.VarType.INT32: 4,
    core.VarDesc.VarType.INT64: 8,
    core.VarDesc.VarType.BOOL: 1,
    core.VarDesc.VarType.UINT8: 1,
}


def get_var_mem_size(var):
    """Return the size of ``var``: product of ``var.shape`` times element size.

    The element size is looked up in ``dtype_to_size`` by ``var.dtype``.

    Raises:
        KeyError: if ``var.dtype`` has no entry in ``dtype_to_size``.
    """
    # Seeding the product with 1 makes an empty shape count as one element
    # instead of making reduce() raise TypeError.
    element_count = reduce(lambda x, y: x * y, var.shape, 1)
    return element_count * dtype_to_size[var.dtype]


class MergedVariable:
    """Plain record grouping a merged variable with its source variables."""

    def __init__(self, merged, ordered, offsets):
        # merged  -> merged_var:   the variable standing for the whole group
        # ordered -> ordered_vars: the variables that make up the group
        # offsets -> offsets:      one entry per ordered variable
        #   NOTE(review): presumably each variable's offset inside the merged
        #   one; build_var_distributed only ever passes [0] - confirm.
        self.merged_var, self.ordered_vars, self.offsets = (
            merged,
            ordered,
            offsets,
        )


def build_var_distributed(context):
    """Record the parameter/gradient pairs of every origin program in ``context``.

    Reads ``context['origin_main_programs']`` and, for each program, obtains
    its sparse and dense ``(param, grad)`` pairs from ``get_param_grads``.

    Keys written to ``context``:
        origin_sparse_pairs / origin_dense_pairs: one list of
            ``(param, grad)`` tuples per program.
        merged_sparse_pairs / merged_dense_pairs: one list per program of
            ``(MergedVariable, MergedVariable)`` tuples, each wrapping a
            single variable with offsets ``[0]``.
        merged_variables_pairs: set to an empty list and not filled here.
        merged_variable_map: variable name -> variable for every wrapped
            parameter and gradient.
        param_name_to_grad_name / grad_name_to_param_name: name lookups over
            all programs.
    """
    origin_programs = context['origin_main_programs']

    param_name_to_grad_name = {}
    grad_name_to_param_name = {}
    for list_key in (
        "origin_sparse_pairs",
        "origin_dense_pairs",
        "merged_sparse_pairs",
        'merged_dense_pairs',
        "merged_variables_pairs",
    ):
        context[list_key] = []
    context["merged_variable_map"] = {}

    def _wrap_pairs(pairs):
        return [
            (
                MergedVariable(param, [param], [0]),
                MergedVariable(grad, [grad], [0]),
            )
            for param, grad in pairs
        ]

    for origin_program in origin_programs:
        sparse_pairs, dense_pairs = get_param_grads(origin_program)

        origin_for_sparse = [(param, grad) for param, grad in sparse_pairs]
        origin_for_dense = [(param, grad) for param, grad in dense_pairs]

        merged_dense_pairs = _wrap_pairs(origin_for_dense)
        merged_sparse_pairs = _wrap_pairs(origin_for_sparse)

        # Dense pairs are registered before sparse pairs.
        for merged_pair in merged_dense_pairs + merged_sparse_pairs:
            for merged in merged_pair:
                context["merged_variable_map"][
                    merged.merged_var.name
                ] = merged.merged_var

        # The name lookups take sparse pairs first, then dense pairs.
        for param, grad in origin_for_sparse + origin_for_dense:
            param_name_to_grad_name[param.name] = grad.name
            grad_name_to_param_name[grad.name] = param.name

        context["origin_sparse_pairs"].append(origin_for_sparse)
        context["origin_dense_pairs"].append(origin_for_dense)
        context["merged_sparse_pairs"].append(merged_sparse_pairs)
        context['merged_dense_pairs'].append(merged_dense_pairs)

    context["param_name_to_grad_name"] = param_name_to_grad_name
    context["grad_name_to_param_name"] = grad_name_to_param_name
W
wangguanqun 已提交
1548

Z
ziyoujiyi 已提交
1549 1550 1551 1552 1553 1554

def _is_opt_role_op(op):
    """Tell whether ``op`` carries the Optimize op role.

    NOTE: the decision relies solely on the op-role attribute: it must be
    present in ``op.attr_names`` and, converted to ``int``, equal the
    ``OpRole.Optimize`` value.
    """
    op_maker = core.op_proto_and_checker_maker
    optimize_role = op_maker.OpRole.Optimize
    role_attr_name = op_maker.kOpRoleAttrName()
    if role_attr_name not in op.attr_names:
        return False
    return int(op.all_attrs()[role_attr_name]) == int(optimize_role)


def get_param_grads(origin_program):
    """Collect (param, grad) variable pairs from the optimize ops.

    The ops of ``origin_program``'s global block are scanned twice: first to
    gather the parameter names read by remote-prefetch ops listed in
    ``SPARSE_OP_TYPE_DICT``, then to pair every optimized parameter with its
    gradient using the ``OP_ROLE_VAR_ATTR_NAME`` attribute.

    Side effect: an optimize-role op whose ``OP_NAME_SCOPE`` attribute
    contains ``CLIP_OP_NAME_SCOPE`` has its ``"op_role"`` attribute rewritten
    to the Backward role and contributes no pair.

    Args:
        origin_program: program whose global block is inspected.

    Returns:
        tuple: ``(sparse_param_grads, dense_param_grads)``; each is a list of
        ``(param_var, grad_var)`` tuples looked up in the global block's
        ``vars``. Each parameter name is reported at most once.
    """

    def _get_sparse_varnames():
        # A set is enough here: the names are only used for membership tests.
        names = set()
        for op in origin_program.global_block().ops:
            if (
                op.type in SPARSE_OP_TYPE_DICT
                and op.attr('remote_prefetch') is True
            ):
                names.add(op.input(SPARSE_OP_TYPE_DICT[op.type])[0])
        return names

    def _get_params_grads(sparse_varnames):
        global_block = origin_program.global_block()
        origin_var_dict = global_block.vars
        role_id = int(core.op_proto_and_checker_maker.OpRole.Backward)

        sparse_param_grads = []
        dense_param_grads = []
        optimize_params = set()

        for op in global_block.ops:
            if not _is_opt_role_op(op):
                continue

            # delete clip op from opt_ops when run in Parameter Server mode
            if (
                OP_NAME_SCOPE in op.all_attrs()
                and CLIP_OP_NAME_SCOPE in op.attr(OP_NAME_SCOPE)
            ):
                op._set_attr("op_role", role_id)
                continue

            # The attribute holds the parameter name first, its gradient
            # name second; an empty value means there is nothing to pair.
            role_vars = op.attr(OP_ROLE_VAR_ATTR_NAME)
            if not role_vars:
                continue
            param_name, grad_name = role_vars[0], role_vars[1]

            # Keep only the first optimize op seen for each parameter.
            if param_name in optimize_params:
                continue
            optimize_params.add(param_name)

            param_grad = (
                origin_var_dict[param_name],
                origin_var_dict[grad_name],
            )
            if param_name in sparse_varnames:
                sparse_param_grads.append(param_grad)
            else:
                dense_param_grads.append(param_grad)
        return sparse_param_grads, dense_param_grads

    sparse_varnames = _get_sparse_varnames()
    sparse_param_grads, dense_param_grads = _get_params_grads(sparse_varnames)

    return sparse_param_grads, dense_param_grads


1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643
def delete_ops(block, ops):
    """Remove each op in ``ops`` from ``block``, best effort.

    For every op, its current index in ``block.ops`` is looked up and passed
    to ``block._remove_op``. Any exception raised for one op (for example the
    op not being in the block) is printed and the remaining ops are still
    processed.
    """
    for stale_op in ops:
        try:
            # Re-snapshot the op list every time: earlier removals shift
            # the indices of the ops that follow.
            current_ops = list(block.ops)
            block._remove_op(current_ops.index(stale_op))
        except Exception as e:
            print(e)


def find_send_op(program):
    """Return the ops of ``program``'s global block whose type is "send".

    The ops are returned in the order they appear in the block.
    """
    return [op for op in program.global_block().ops if op.type == "send"]


def find_op_input_output(program, block, op):
    """Return the de-duplicated input and output variable lists of ``op``.

    The input/output maps are built from ``block.vars`` with
    ``_get_input_map_from_op`` / ``_get_output_map_from_op`` and flattened
    with ``get_varlist_from_op_map``. ``program`` is accepted but not used.

    Returns:
        tuple: ``(input_var_list, output_var_list)``, each a list without
        duplicates; element order follows set iteration order.
    """
    input_map = _get_input_map_from_op(block.vars, op)
    unique_inputs = set(get_varlist_from_op_map(input_map))

    output_map = _get_output_map_from_op(block.vars, op)
    unique_outputs = set(get_varlist_from_op_map(output_map))

    return list(unique_inputs), list(unique_outputs)


1644
def add_send_op(program, block, _vars):
    """Append one "send" op per table to ``block`` for the given grad names.

    Each name in ``_vars`` that ends with ``"@GRAD"`` and is an input of a
    "send" op in ``program``'s global block is grouped by that send op's
    ``table_id``. For every table, a new "send" op is appended to ``block``
    taking the grouped variables (looked up in ``block.vars``) as input and a
    freshly created control variable as output.

    Args:
        program: program whose global-block send ops describe the tables.
        block: block that receives the new send ops and output variables.
        _vars: iterable of variable names to consider.

    Returns:
        list: the names from ``_vars`` that were attached to a send op, in
        iteration order.
    """

    def _get_send_op_dict():
        # Map every input variable of an existing send op to that op.
        send_op_dict = {}
        for send_op in find_send_op(program):
            input_list, _ = find_op_input_output(
                program, program.global_block(), send_op
            )
            for var in input_list:
                send_op_dict[var] = send_op
        return send_op_dict

    send_grad_var_list = []
    send_op_dict = _get_send_op_dict()
    table_dict = {}
    for persistable_var in _vars:
        # Equivalent to requiring "@GRAD" in the name and "GRAD" as the last
        # "@"-separated component.
        if not persistable_var.endswith("@GRAD"):
            continue
        if persistable_var not in send_op_dict:
            continue
        send_op = send_op_dict[persistable_var]
        table_id = send_op.attr('table_id')
        send_grad_var_list.append(persistable_var)
        if table_id not in table_dict:
            # The first send op seen for a table fixes its attributes.
            table_dict[table_id] = {
                'var_list': [],
                'is_sparse': send_op.attr('is_sparse'),
                'send_varnames': send_op.attr('send_varnames'),
            }
        table_dict[table_id]['var_list'].append(persistable_var)

    for table_id, table_info in table_dict.items():
        dummy_output = block.create_var(
            name=framework.generate_control_dev_var_name()
        )
        send_input_vars = [
            block.vars[union_var] for union_var in table_info['var_list']
        ]
        block.append_op(
            type="send",
            inputs={"X": send_input_vars},
            outputs={"Out": dummy_output},
            attrs={
                "send_varnames": table_info['send_varnames'],
                # Use the sparsity recorded for this table; the variable left
                # over from the loop above belongs to the last name iterated
                # and may describe a different table.
                "is_sparse": table_info['is_sparse'],
                "table_id": table_id,
                RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
            },
        )

    return send_grad_var_list


1701 1702 1703 1704 1705 1706
def get_vars_name_in_block(block):
    """Return the keys of ``block.vars`` as a new list."""
    return list(block.vars.keys())


1707
# reserve static_var
1708 1709 1710 1711 1712
def delete_trainer_useless_var(program, static_var):
    """Remove global-block variables that no op uses and that are not reserved.

    A variable is kept when it is an input or output of any op in the global
    block (as reported by ``find_op_input_output``) or when its name is
    listed in ``static_var``. Every other variable name in the global block
    is removed with ``_remove_var``.

    Args:
        program: program whose global block is pruned in place.
        static_var: iterable of variable names that must be kept.

    Returns:
        list: names of the removed variables (order follows set iteration).
    """
    global_block = program.global_block()

    # Accumulate into a single set instead of rebuilding a list and a set
    # for every op.
    useful_vars = set(static_var)
    for op in global_block.ops:
        input_var_list, output_var_list = find_op_input_output(
            program, global_block, op
        )
        useful_vars.update(input_var_list)
        useful_vars.update(output_var_list)

    program_useless_var_list = list(
        set(get_vars_name_in_block(global_block)).difference(useful_vars)
    )
    for var in program_useless_var_list:
        global_block._remove_var(var)
    return program_useless_var_list


1730 1731 1732
def create_backward_block(
    program, origin_program, bp_ops_list, block_var_detail
):
    """Create a new block in ``program`` and fill it with the given ops.

    The block is created with ``program._create_block`` using the index of
    the currently last block. Every op in ``bp_ops_list`` is appended via
    ``block_append_op``, except "send" ops that name a variable found neither
    in the global block nor in the new block. Afterwards the backward
    "entrance" and "exit" variables of ``block_var_detail[0]`` are added with
    ``add_vars_by_var_list``.

    Returns:
        The newly created block.
    """
    pre_block_idx = program.num_blocks - 1
    heter_block = program._create_block(pre_block_idx)

    for op in bp_ops_list:
        if op.type == "send":
            # Skip a send op as soon as one of its variables is unknown to
            # both the global block and the new block.
            has_unknown_var = any(
                varname not in program.global_block().vars
                and varname not in heter_block.vars
                for varname in op.attr('send_varnames')
            )
            if has_unknown_var:
                continue
        block_append_op(program, origin_program, heter_block, op)

    backward_detail = block_var_detail[0]["backward"]
    entrance_vars = backward_detail["entrance"]
    add_vars_by_var_list(entrance_vars, origin_program, program, heter_block)
    exit_vars = backward_detail["exit"]
    add_vars_by_var_list(exit_vars, origin_program, program, heter_block)
    return heter_block


1758 1759
def is_backward_op(op):
    """Tell whether the Backward bit is set in ``op``'s role attribute.

    Returns:
        bool: False when the op has no role attribute; otherwise whether the
        bitwise AND of the role value and ``OpRole.Backward`` is non-zero.
    """
    if op_role_attr_name not in op.attr_names:
        return False
    # Bitwise test: the role value may combine Backward with other flags.
    # bool() keeps the truthiness of the raw AND result while giving callers
    # a consistent return type.
    return bool(int(op.attr(op_role_attr_name)) & int(op_role.Backward))
1762 1763 1764


def is_forward_op(op):
    """Tell whether ``op``'s role attribute is exactly ``OpRole.Forward``.

    Returns:
        bool: False when the op has no role attribute; otherwise whether the
        role value equals ``OpRole.Forward`` after ``int`` conversion.
    """
    if op_role_attr_name not in op.attr_names:
        return False
    return int(op.attr(op_role_attr_name)) == int(op_role.Forward)
1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809


def is_push_sparse_op(op):
    """Return True when ``op.type`` is 'distributed_push_sparse'."""
    op_type = op.type
    return op_type == 'distributed_push_sparse'


def get_distributed_push_sparse_op_list(block):
    """Return the ops of ``block`` accepted by ``is_push_sparse_op``.

    Ops are visited by index, with the count taken from
    ``block.desc.op_size()``, and returned in block order.
    """
    indexed_ops = (block.ops[idx] for idx in range(block.desc.op_size()))
    return [op for op in indexed_ops if is_push_sparse_op(op)]


def get_bp_op_list(block):
    """Return the ops of ``block`` accepted by ``is_backward_op``.

    Ops are visited by index, with the count taken from
    ``block.desc.op_size()``, and returned in block order.
    """
    indexed_ops = (block.ops[idx] for idx in range(block.desc.op_size()))
    return [op for op in indexed_ops if is_backward_op(op)]


def delete_same_ops(block, ops):
    """Remove from ``block`` the first op that prints the same as each given op.

    Ops are compared through their ``str()`` form. For every entry of
    ``ops``, at most one op is removed from the block: the first whose string
    matches. Exceptions raised while handling one entry are printed and the
    remaining entries are still processed.
    """
    not_found = object()
    for wanted in ops:
        try:
            hit = next(
                (cand for cand in block.ops if str(cand) == str(wanted)),
                not_found,
            )
            if hit is not not_found:
                block._remove_op(list(block.ops).index(hit))
        except Exception as e:
            print(e)


def check_program(program):
    """Verify that every variable referenced by an op can be found.

    For each op of each block in ``program.blocks``, all input and output
    argument names from ``op.desc`` are looked up with
    ``block._find_var_recursive``. Prints 'program checked valid' when every
    lookup succeeds.

    Raises:
        ValueError: on the first name whose lookup returns a falsy value; the
            message names the variable and the index of the block.
    """
    for block_idx, block in enumerate(program.blocks):
        for op in block.ops:
            input_var_names = op.desc.input_arg_names()
            output_var_names = op.desc.output_arg_names()
            for var_name in input_var_names + output_var_names:
                if not block._find_var_recursive(str(var_name)):
                    raise ValueError(
                        'var: {} needed by op is not found in block: {}'.format(
                            str(var_name), block_idx
                        )
                    )
    print('program checked valid')


1821
def debug_program(file, program):
    """Write ``str(program)`` to ``file``, creating its parent directory.

    Args:
        file: path of the file to (over)write.
        program: object whose string form is written.
    """
    # py >= 3.2
    parent_dir = os.path.dirname(file)
    # A bare file name has an empty dirname and os.makedirs('') raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file, 'w+') as f:
        f.write(str(program))
1826 1827 1828 1829 1830 1831 1832 1833


def is_distributed_env():
    """Return True when the ``TRAINING_ROLE`` environment variable is set.

    Only presence matters: an empty value still counts as set.
    """
    return os.getenv("TRAINING_ROLE") is not None