# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License

from paddle.distributed.fleet.meta_optimizers.common import OpRole

from ..cost import (
    Reshape2GradOpCost,
    Reshape2OpCost,
    build_comp_costs_from_descs,
    build_comp_desc_from_dist_op,
    build_dp_costs,
)
from ..utils import (
    compute_compatible_and_update_dim_mapping,
    is_dim_shard,
    set_dist_op_desc_original_id,
)
from .common import (
    DistributedOperatorImpl,
    DistributedOperatorImplContainer,
    is_parameter_related,
    register_distributed_operator_impl,
    register_distributed_operator_impl_container,
)
from .dist_default import DistributedDefaultImpl0


class DistributedReshape2(DistributedOperatorImplContainer):
    def __init__(self, op_type):
        super().__init__(op_type)


register_distributed_operator_impl_container(DistributedReshape2("reshape2"))


class DistributedReshapeImpl0(DistributedOperatorImpl):
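    # Registered below as "add_one_dim_back": Out has exactly one more
    # trailing dimension than X, and that trailing dimension must stay
    # unsharded.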
    def __init__(self, name):
        super().__init__(name)
        self._forward_implemented = True
        self._backward_implemented = False

    def calc_cost(self, op_role, dist_op, ctx, cluster):
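        # Dispatch to the backward or forward cost estimator based on op role.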
        cost = None
        if int(op_role) == int(OpRole.Backward):
            cost = self.calc_bwd_cost(dist_op, ctx, cluster)
        else:
            cost = self.calc_fwd_cost(dist_op, ctx, cluster)
        assert cost is not None
        return cost

    def calc_fwd_cost(self, dist_op, ctx, cluster):
        res = []
        op = dist_op.serial_op
        dist_attr = dist_op.dist_attr

        shape_list = op.desc.attr("shape")
        # got dist attribute info
        dim_mapping = dist_attr.get_output_dims_mapping(op.output("Out")[0])
        process_mesh_shape = dist_attr.process_mesh.topology

        # modify target shape
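        # Each axis sharded over the mesh keeps only 1/mesh_dim of the global
        # extent, so divide the target shape accordingly (e.g. a global dim of
        # 1024 split over a mesh axis of size 4 becomes 256 locally).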
        for idx, axis in enumerate(dim_mapping):
            if axis >= 0:
                if len(shape_list) > idx:
                    shape_list[idx] = (
                        shape_list[idx] // process_mesh_shape[axis]
                    )

        # calc comp op cost
        desc_mapping = build_comp_desc_from_dist_op(
            dist_op=dist_op, dist_context=ctx
        )
        processes = dist_attr.process_mesh.processes
        for key in desc_mapping:
            desc_mapping[key]["shape"] = shape_list

        cost_mapping = build_comp_costs_from_descs(
            Reshape2OpCost, ctx, processes, desc_mapping, cluster
        )
        res.append(cost_mapping)

        return res

    def calc_bwd_cost(self, dist_op, ctx, cluster):
        # calc comp op cost
        res = []
        desc_mapping = build_comp_desc_from_dist_op(
            dist_op=dist_op, dist_context=ctx
        )
        dist_attr = dist_op.dist_attr
        process_mesh = dist_attr.process_mesh
        processes = process_mesh.processes
        op_type = dist_op.serial_op.type

        cost_mapping = build_comp_costs_from_descs(
            Reshape2GradOpCost, ctx, processes, desc_mapping, cluster
        )
        res.append(cost_mapping)

        backward_op = dist_op.serial_op
        main_block = backward_op.block
        need_gradient_allreduce = False
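        # If the batch axis (dim 0) of a parameter-related input is sharded
        # across the mesh, each rank only produces its local contribution to
        # the gradient, so add the data-parallel allreduce cost via
        # build_dp_costs.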
        for input_name in backward_op.desc.input_names():
            for varname in backward_op.desc.input(input_name):
                if "@GRAD" not in varname and is_parameter_related(
                    varname, main_block
                ):
                    # NOTE: use the dims_mapping of the backward op's own input
                    # var, not that of the same-named var on the forward op.
                    var_dim_mapping = dist_attr.get_input_dims_mapping(varname)

                    mesh_shape = process_mesh.topology
                    batch_size_axis = var_dim_mapping[0]
                    if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1:
                        parallel_axis = batch_size_axis
                        attrs = {"use_calc_stream": True}
                        var_names = [varname + "@GRAD"]
                        build_dp_costs(
                            res,
                            dist_op,
                            ctx,
                            var_names,
                            attrs,
                            parallel_axis,
                            cluster,
                        )

        return res

    def is_input_compatible(self, dist_op):
        op_desc = dist_op.serial_op.desc
        op_dist_attr = dist_op.dist_attr
        x_name = op_desc.input('X')[0]
        out_name = op_desc.output('Out')[0]
        x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)

        if len(x_dims_mapping) != len(out_dims_mapping) - 1:
            return False

        return True

    def is_output_compatible(self, dist_op):
        op_desc = dist_op.serial_op.desc
        op_dist_attr = dist_op.dist_attr
        x_name = op_desc.input('X')[0]
        out_name = op_desc.output('Out')[0]
        x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)

        if len(x_dims_mapping) != len(out_dims_mapping) - 1:
            return False

        if is_dim_shard(out_dims_mapping[-1]):
            return False

        return True

    def is_auto_compatible(self, dist_op):
        if (not self.is_input_compatible(dist_op)) or (
            not self.is_output_compatible(dist_op)
        ):
            return False

        op_desc = dist_op.serial_op.desc
        op_dist_attr = dist_op.dist_attr
        x_name = op_desc.input('X')[0]
        out_name = op_desc.output('Out')[0]
        x_shape_name = op_desc.output('XShape')[0]
        x_shape_dims_mapping = op_dist_attr.get_output_dims_mapping(
            x_shape_name
        )
        x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)

        for idx, dim_mapping in enumerate(out_dims_mapping[:-1]):
            if x_dims_mapping[idx] != dim_mapping:
                return False
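        # XShape holds the pre-reshape shape with one extra leading dimension,
        # which must stay unsharded (-1); its remaining axes must mirror X's
        # mapping.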

        if x_shape_dims_mapping[0] != -1:
            return False

        if x_shape_dims_mapping[1:] != x_dims_mapping[:]:
            return False

        return True

    def update_dims_mapping(self, dist_op):
        changed = False
        op_desc = dist_op.serial_op.desc
        op_dist_attr = dist_op.dist_attr
        x_name = op_desc.input('X')[0]
        out_name = op_desc.output('Out')[0]
        x_shape_name = op_desc.output('XShape')[0]
        x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
        x_shape_dims_mapping = op_dist_attr.get_output_dims_mapping(
            x_shape_name
        )

        for i in range(len(x_dims_mapping)):
            dim_changed = compute_compatible_and_update_dim_mapping(
                [x_dims_mapping, out_dims_mapping], [i, i]
            )
            if dim_changed:
                changed = True

        for i in range(len(x_dims_mapping)):
            x_shape_dims_mapping[i + 1] = x_dims_mapping[i]

        return changed

    @staticmethod
    def forward(ctx, *args, **kwargs):
        """
        kwargs: inputname_mapping & outputname_mapping
        """

        dist_op_context = ctx.dist_op_context
        main_block = dist_op_context.work_block
        src_op = dist_op_context.cur_src_op
        rank_id = dist_op_context.rank_id
        op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
        assert (
            op_dist_attr is not None
        ), "op [{}] doesn't have a dist attribute!".format(str(src_op))

        # check validity of inputs / outputs
        for input_name in src_op.desc.input_names():
            assert input_name in kwargs, "input [{}] is not given".format(
                input_name
            )
            assert len(kwargs[input_name]) == len(
                src_op.desc.input(input_name)
            ), "number of tensors for input [{}] does not match".format(
                input_name
            )
        for output_name in src_op.desc.output_names():
            assert output_name in kwargs, "output [{}] is not given".format(
                output_name
            )
            assert len(kwargs[output_name]) == len(
                src_op.desc.output(output_name)
            ), "number of tensors for output [{}] does not match".format(
                output_name
            )

        X_var = main_block._var_recursive(kwargs['X'][0])
        Out_var = main_block._var_recursive(kwargs['Out'][0])
        XShape_var = main_block._var_recursive(kwargs['XShape'][0])
        shape_list = src_op.desc.attr("shape")
        ShapeTensor_var_list = []
        for name in kwargs['ShapeTensor']:
            ShapeTensor_var_list.append(name)
        Shape_var_list = []
        for name in kwargs['Shape']:
            Shape_var_list.append(name)

        # got dist attribute info
        dim_mapping = op_dist_attr.get_output_dims_mapping(Out_var.name)
        process_mesh_shape = op_dist_attr.process_mesh.topology

        # modify target shape
        for idx, axis in enumerate(dim_mapping):
            if axis >= 0:
                if len(shape_list) > idx:
                    shape_list[idx] = (
                        shape_list[idx] // process_mesh_shape[axis]
                    )

        # create op
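        # Copy the serial reshape2 desc into the dist main block, rebind its
        # inputs/outputs to the dist vars, and overwrite only the 'shape'
        # attribute with the local (per-rank) target shape.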
        new_op_desc = main_block.append_op(type='nop').desc
        new_op_desc.copy_from(src_op.desc)
        set_dist_op_desc_original_id(new_op_desc, src_op.desc, ctx)
        new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list)
        new_op_desc.set_input('Shape', Shape_var_list)
        new_op_desc.set_input('X', [X_var.name])
        new_op_desc.set_output('XShape', [XShape_var.name])
        new_op_desc.set_output('Out', [Out_var.name])
        new_op_desc._set_attr('shape', shape_list)

    @staticmethod
    def backward(ctx, *args, **kwargs):
        DistributedDefaultImpl0.backward(ctx, *args, **kwargs)


class DistributedReshapeImpl1(DistributedOperatorImpl):
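    # Registered below as "remove_one_dim_back": X has exactly one more
    # trailing dimension than Out, and that trailing dimension must stay
    # unsharded.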
    def __init__(self, name):
        super().__init__(name)
        self._forward_implemented = True
        self._backward_implemented = False

    def calc_cost(self, op_role, dist_op, ctx, cluster):
        cost = None
        if int(op_role) == int(OpRole.Backward):
            cost = self.calc_bwd_cost(dist_op, ctx, cluster)
        else:
            cost = self.calc_fwd_cost(dist_op, ctx, cluster)
        assert cost is not None
        return cost

    def calc_fwd_cost(self, dist_op, ctx, cluster):
        res = []
        op = dist_op.serial_op
        dist_attr = dist_op.dist_attr

        shape_list = op.desc.attr("shape")
        # got dist attribute info
        dim_mapping = dist_attr.get_output_dims_mapping(op.output("Out")[0])
        process_mesh_shape = dist_attr.process_mesh.topology

        # modify target shape
        for idx, axis in enumerate(dim_mapping):
            if axis >= 0:
                if len(shape_list) > idx:
                    shape_list[idx] = (
                        shape_list[idx] // process_mesh_shape[axis]
                    )

        # calc comp op cost
        desc_mapping = build_comp_desc_from_dist_op(
            dist_op=dist_op, dist_context=ctx
        )
        processes = dist_attr.process_mesh.processes
        for key in desc_mapping:
            desc_mapping[key]["shape"] = shape_list

        cost_mapping = build_comp_costs_from_descs(
            Reshape2OpCost, ctx, processes, desc_mapping, cluster
        )
        res.append(cost_mapping)

        return res

    def calc_bwd_cost(self, dist_op, ctx, cluster):
        # calc comp op cost
        res = []
        desc_mapping = build_comp_desc_from_dist_op(
            dist_op=dist_op, dist_context=ctx
        )
        dist_attr = dist_op.dist_attr
        process_mesh = dist_attr.process_mesh
        processes = process_mesh.processes
        op_type = dist_op.serial_op.type

        cost_mapping = build_comp_costs_from_descs(
            Reshape2GradOpCost, ctx, processes, desc_mapping, cluster
        )
        res.append(cost_mapping)

        backward_op = dist_op.serial_op
        main_block = backward_op.block
        need_gradient_allreduce = False
        for input_name in backward_op.desc.input_names():
            for varname in backward_op.desc.input(input_name):
                if "@GRAD" not in varname and not is_parameter_related(
                    varname, main_block
                ):
                    # NOTE: use the dims_mapping of the backward op's own input
                    # var, not that of the same-named var on the forward op.
                    var_dim_mapping = dist_attr.get_input_dims_mapping(varname)

                    mesh_shape = process_mesh.topology
                    batch_size_axis = var_dim_mapping[0]
                    if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1:
                        parallel_axis = batch_size_axis
                        attrs = {"use_calc_stream": True}
                        var_names = [varname + "@GRAD"]
                        build_dp_costs(
                            res,
                            dist_op,
                            ctx,
                            var_names,
                            attrs,
                            parallel_axis,
                            cluster,
                        )

        return res

    def is_input_compatible(self, dist_op):
        op_desc = dist_op.serial_op.desc
        op_dist_attr = dist_op.dist_attr
        x_name = op_desc.input('X')[0]
        out_name = op_desc.output('Out')[0]
        x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)

        if len(x_dims_mapping) != len(out_dims_mapping) + 1:
            return False

        if is_dim_shard(x_dims_mapping[-1]):
            return False

        return True

    def is_output_compatible(self, dist_op):
        op_desc = dist_op.serial_op.desc
        op_dist_attr = dist_op.dist_attr
        x_name = op_desc.input('X')[0]
        out_name = op_desc.output('Out')[0]
        x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)

        if len(x_dims_mapping) != len(out_dims_mapping) + 1:
            return False

        return True

    def is_auto_compatible(self, dist_op):
        if (not self.is_input_compatible(dist_op)) or (
            not self.is_output_compatible(dist_op)
        ):
            return False

        op_desc = dist_op.serial_op.desc
        op_dist_attr = dist_op.dist_attr
        x_name = op_desc.input('X')[0]
        out_name = op_desc.output('Out')[0]
        x_shape_name = op_desc.output('XShape')[0]
        x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
        x_shape_dims_mapping = op_dist_attr.get_output_dims_mapping(
            x_shape_name
        )

        if is_dim_shard(x_dims_mapping[-1]):
            return False

        for idx, item in enumerate(x_dims_mapping[:-1]):
            if out_dims_mapping[idx] != item:
                return False

        if x_shape_dims_mapping[0] != -1:
            return False

        if x_shape_dims_mapping[1:] != x_dims_mapping[:]:
            return False

        return True

    def update_dims_mapping(self, dist_op):
        changed = False
        op_desc = dist_op.serial_op.desc
        op_dist_attr = dist_op.dist_attr
        x_name = op_desc.input('X')[0]
        out_name = op_desc.output('Out')[0]
        x_shape_name = op_desc.output('XShape')[0]
        x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
        x_shape_dims_mapping = op_dist_attr.get_output_dims_mapping(
            x_shape_name
        )

        for i in range(len(out_dims_mapping)):
            dim_changed = compute_compatible_and_update_dim_mapping(
                [x_dims_mapping, out_dims_mapping], [i, i]
            )
            if dim_changed:
                changed = True

        for i in range(len(x_dims_mapping)):
            x_shape_dims_mapping[i + 1] = x_dims_mapping[i]

        return changed

    @staticmethod
    def forward(ctx, *args, **kwargs):
        """
        kwargs: inputname_mapping & outputname_mapping
        """

        dist_op_context = ctx.dist_op_context
        main_block = dist_op_context.work_block
        src_op = dist_op_context.cur_src_op
        rank_id = dist_op_context.rank_id
        op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
        assert (
            op_dist_attr is not None
        ), "op [{}] doesn't have a dist attribute!".format(str(src_op))

        # check validity of inputs / outputs
        for input_name in src_op.desc.input_names():
            assert input_name in kwargs, "input [{}] is not given".format(
                input_name
            )
            assert len(kwargs[input_name]) == len(
                src_op.desc.input(input_name)
            ), "number of tensors for input [{}] does not match".format(
                input_name
            )
        for output_name in src_op.desc.output_names():
            assert output_name in kwargs, "output [{}] is not given".format(
                output_name
            )
            assert len(kwargs[output_name]) == len(
                src_op.desc.output(output_name)
            ), "number of tensors for output [{}] does not match".format(
                output_name
            )

        X_var = main_block._var_recursive(kwargs['X'][0])
        Out_var = main_block._var_recursive(kwargs['Out'][0])
        XShape_var = main_block._var_recursive(kwargs['XShape'][0])
        shape_list = src_op.desc.attr("shape")
        ShapeTensor_var_list = []
        for name in kwargs['ShapeTensor']:
            ShapeTensor_var_list.append(name)
        Shape_var_list = []
        for name in kwargs['Shape']:
            Shape_var_list.append(name)

        # got dist attribute info
        dim_mapping = op_dist_attr.get_output_dims_mapping(Out_var.name)
        process_mesh_shape = op_dist_attr.process_mesh.topology

        # modify target shape
        for idx, axis in enumerate(dim_mapping):
            if axis >= 0:
                if len(shape_list) > idx:
                    shape_list[idx] = (
                        shape_list[idx] // process_mesh_shape[axis]
                    )

        # create op
        new_op_desc = main_block.append_op(type='nop').desc
        new_op_desc.copy_from(src_op.desc)
        set_dist_op_desc_original_id(new_op_desc, src_op.desc, ctx)
        new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list)
        new_op_desc.set_input('Shape', Shape_var_list)
        new_op_desc.set_input('X', [X_var.name])
        new_op_desc.set_output('XShape', [XShape_var.name])
        new_op_desc.set_output('Out', [Out_var.name])
        new_op_desc._set_attr('shape', shape_list)

    @staticmethod
    def backward(ctx, *args, **kwargs):
        DistributedDefaultImpl0.backward(ctx, *args, **kwargs)


class DistributedReshapeImpl2(DistributedOperatorImpl):
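    # Registered below as "same_dim_shape": X and Out keep the same number of
    # dimensions.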
    def __init__(self, name):
        super().__init__(name)
        self._forward_implemented = True
        self._backward_implemented = False

    def calc_cost(self, op_role, dist_op, ctx, cluster):
        cost = None
        if int(op_role) == int(OpRole.Backward):
            cost = self.calc_bwd_cost(dist_op, ctx, cluster)
        else:
            cost = self.calc_fwd_cost(dist_op, ctx, cluster)
        assert cost is not None
        return cost

    def calc_fwd_cost(self, dist_op, ctx, cluster):
        res = []
        op = dist_op.serial_op
        dist_attr = dist_op.dist_attr

        shape_list = op.desc.attr("shape")
        # got dist attribute info
        dim_mapping = dist_attr.get_output_dims_mapping(op.output("Out")[0])
        process_mesh_shape = dist_attr.process_mesh.topology

        # modify target shape
        for idx, axis in enumerate(dim_mapping):
            if axis >= 0:
                if len(shape_list) > idx:
                    shape_list[idx] = (
                        shape_list[idx] // process_mesh_shape[axis]
                    )

        # calc comp op cost
        desc_mapping = build_comp_desc_from_dist_op(
            dist_op=dist_op, dist_context=ctx
        )
        processes = dist_attr.process_mesh.processes
        for key in desc_mapping:
            desc_mapping[key]["shape"] = shape_list

        cost_mapping = build_comp_costs_from_descs(
            Reshape2OpCost, ctx, processes, desc_mapping, cluster
        )
        res.append(cost_mapping)

        return res

    def calc_bwd_cost(self, dist_op, ctx, cluster):
        # calc comp op cost
        res = []
        desc_mapping = build_comp_desc_from_dist_op(
            dist_op=dist_op, dist_context=ctx
        )
        dist_attr = dist_op.dist_attr
        process_mesh = dist_attr.process_mesh
        processes = process_mesh.processes
        op_type = dist_op.serial_op.type

        cost_mapping = build_comp_costs_from_descs(
            Reshape2GradOpCost, ctx, processes, desc_mapping, cluster
        )
        res.append(cost_mapping)

        backward_op = dist_op.serial_op
        main_block = backward_op.block
        need_gradient_allreduce = False
        for input_name in backward_op.desc.input_names():
            for varname in backward_op.desc.input(input_name):
                if "@GRAD" not in varname and not is_parameter_related(
                    varname, main_block
                ):
                    # NOTE: use the dims_mapping of the backward op's own input
                    # var, not that of the same-named var on the forward op.
                    var_dim_mapping = dist_attr.get_input_dims_mapping(varname)

                    mesh_shape = process_mesh.topology
                    batch_size_axis = var_dim_mapping[0]
                    if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1:
                        parallel_axis = batch_size_axis
                        attrs = {"use_calc_stream": True}
                        var_names = [varname + "@GRAD"]
                        build_dp_costs(
                            res,
                            dist_op,
                            ctx,
                            var_names,
                            attrs,
                            parallel_axis,
                            cluster,
                        )

        return res

    def is_input_compatible(self, dist_op):
        op_desc = dist_op.serial_op.desc
        op_dist_attr = dist_op.dist_attr
        x_name = op_desc.input('X')[0]
        out_name = op_desc.output('Out')[0]
        x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)

        if len(x_dims_mapping) != len(out_dims_mapping):
            return False

        return True

    def is_output_compatible(self, dist_op):
        op_desc = dist_op.serial_op.desc
        op_dist_attr = dist_op.dist_attr
        out_name = op_desc.output('Out')[0]
        x_name = op_desc.input('X')[0]
        x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)

        if len(x_dims_mapping) != len(out_dims_mapping):
            return False

        return True

    def is_auto_compatible(self, dist_op):
        if (not self.is_input_compatible(dist_op)) or (
            not self.is_output_compatible(dist_op)
        ):
            return False

        op_desc = dist_op.serial_op.desc
        op_dist_attr = dist_op.dist_attr
        x_name = op_desc.input('X')[0]
        out_name = op_desc.output('Out')[0]
        x_shape_name = op_desc.output('XShape')[0]
        x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
        x_shape_dims_mapping = op_dist_attr.get_output_dims_mapping(
            x_shape_name
        )

        for idx, item in enumerate(x_dims_mapping[:-1]):
            if out_dims_mapping[idx] != item:
                return False

        if x_shape_dims_mapping[0] != -1:
            return False

        if x_shape_dims_mapping[1:] != out_dims_mapping[:]:
            return False

        return True

    def update_dims_mapping(self, dist_op):
        changed = False
        op_desc = dist_op.serial_op.desc
        op_dist_attr = dist_op.dist_attr
        x_name = op_desc.input('X')[0]
        out_name = op_desc.output('Out')[0]
        x_shape_name = op_desc.output('XShape')[0]
        x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
        x_shape_dims_mapping = op_dist_attr.get_output_dims_mapping(
            x_shape_name
        )

        for i in range(len(out_dims_mapping) - 1):
            dim_changed = compute_compatible_and_update_dim_mapping(
                [x_dims_mapping, out_dims_mapping], [i, i]
            )
            if dim_changed:
                changed = True

        for i in range(len(out_dims_mapping)):
            x_shape_dims_mapping[i + 1] = out_dims_mapping[i]

        return changed

    @staticmethod
    def forward(ctx, *args, **kwargs):
        """
        kwargs: inputname_mapping & outputname_mapping
        """

        dist_op_context = ctx.dist_op_context
        main_block = dist_op_context.work_block
        src_op = dist_op_context.cur_src_op
        op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
        assert (
            op_dist_attr is not None
        ), "op [{}] doesn't have a dist attribute!".format(str(src_op))

        # check validity of inputs / outputs
        for input_name in src_op.desc.input_names():
            assert input_name in kwargs, "input [{}] is not given".format(
                input_name
            )
            assert len(kwargs[input_name]) == len(
                src_op.desc.input(input_name)
            ), "number of tensors for input [{}] does not match".format(
                input_name
            )
        for output_name in src_op.desc.output_names():
            assert output_name in kwargs, "output [{}] is not given".format(
                output_name
            )
            assert len(kwargs[output_name]) == len(
                src_op.desc.output(output_name)
            ), "number of tensors for output [{}] does not match".format(
                output_name
            )

        X_var = main_block._var_recursive(kwargs['X'][0])
        Out_var = main_block._var_recursive(kwargs['Out'][0])
        XShape_var = main_block._var_recursive(kwargs['XShape'][0])
        shape_list = src_op.desc.attr("shape")
        ShapeTensor_var_list = []
        for name in kwargs['ShapeTensor']:
            ShapeTensor_var_list.append(name)
        Shape_var_list = []
        for name in kwargs['Shape']:
            Shape_var_list.append(name)

        # got dist attribute info
        out_dim_mapping = op_dist_attr.get_output_dims_mapping(Out_var.name)
        process_mesh_shape = op_dist_attr.process_mesh.topology

        # modify target shape
        for idx, axis in enumerate(out_dim_mapping):
            if axis >= 0:
                if len(shape_list) > idx:
                    shape_list[idx] = (
                        shape_list[idx] // process_mesh_shape[axis]
                    )

        # create op
        new_op_desc = main_block.append_op(type='nop').desc
        new_op_desc.copy_from(src_op.desc)
        set_dist_op_desc_original_id(new_op_desc, src_op.desc, ctx)
        new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list)
        new_op_desc.set_input('Shape', Shape_var_list)
        new_op_desc.set_input('X', [X_var.name])
        new_op_desc.set_output('XShape', [XShape_var.name])
        new_op_desc.set_output('Out', [Out_var.name])
        new_op_desc._set_attr('shape', shape_list)

    @staticmethod
    def backward(ctx, *args, **kwargs):
        DistributedDefaultImpl0.backward(ctx, *args, **kwargs)


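# Register the three concrete implementations for reshape2; the auto-parallel
# planner selects among them according to the compatibility checks defined
# above.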
register_distributed_operator_impl(
    "reshape2", DistributedReshapeImpl0("add_one_dim_back")
)
register_distributed_operator_impl(
    "reshape2", DistributedReshapeImpl1("remove_one_dim_back")
)
register_distributed_operator_impl(
    "reshape2", DistributedReshapeImpl2("same_dim_shape")
)