dist_reshape.py 29.6 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

C
caozhou 已提交
15
from paddle.distributed.fleet.meta_optimizers.common import OpRole
16

17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
from ..cost import (
    Reshape2GradOpCost,
    Reshape2OpCost,
    build_comp_costs_from_descs,
    build_comp_desc_from_dist_op,
    build_dp_costs,
)
from ..utils import (
    compute_compatible_and_update_dim_mapping,
    is_dim_shard,
    set_dist_op_desc_original_id,
)
from .common import (
    DistributedOperatorImpl,
    DistributedOperatorImplContainer,
    is_parameter_related,
    register_distributed_operator_impl,
    register_distributed_operator_impl_container,
)
from .dist_default import DistributedDefaultImpl0

38

39
class DistributedReshape2(DistributedOperatorImplContainer):
    """Operator-impl container type used for the ``reshape2`` op."""

    def __init__(self, op_type):
        """Forward ``op_type`` unchanged to the container base class."""
        super().__init__(op_type)
42 43


44
# Register the container instance that is created for the "reshape2" op type.
register_distributed_operator_impl_container(DistributedReshape2("reshape2"))
45 46 47 48


class DistributedReshapeImpl0(DistributedOperatorImpl):
    """Distributed ``reshape2`` implementation for one extra output dim.

    The implementation is compatible only when the dims mapping of ``Out``
    has exactly one more entry than the dims mapping of ``X`` and the last
    output dimension is not sharded.
    """

    def __init__(self, name):
        super().__init__(name)
        self._forward_implemented = True
        self._backward_implemented = False

    @staticmethod
    def _x_out_dims_mappings(dist_op):
        """Return ``(x_dims_mapping, out_dims_mapping)`` of ``dist_op``."""
        op_desc = dist_op.serial_op.desc
        op_dist_attr = dist_op.dist_attr
        x_name = op_desc.input('X')[0]
        out_name = op_desc.output('Out')[0]
        return (
            op_dist_attr.get_input_dims_mapping(x_name),
            op_dist_attr.get_output_dims_mapping(out_name),
        )

    @staticmethod
    def _shard_target_shape(shape_list, dims_mapping, mesh_shape):
        """Divide, in place, every sharded entry of ``shape_list``.

        An entry is divided by the size of the mesh axis it is mapped to;
        mapping entries beyond ``len(shape_list)`` are ignored.
        """
        for idx, axis in enumerate(dims_mapping):
            if axis >= 0 and len(shape_list) > idx:
                shape_list[idx] = shape_list[idx] // mesh_shape[axis]
        return shape_list

    def calc_cost(self, op_role, dist_op, ctx, cluster):
        """Return the backward cost for ``OpRole.Backward``, else forward."""
        if int(op_role) == int(OpRole.Backward):
            cost = self.calc_bwd_cost(dist_op, ctx, cluster)
        else:
            cost = self.calc_fwd_cost(dist_op, ctx, cluster)
        assert cost is not None
        return cost

    def calc_fwd_cost(self, dist_op, ctx, cluster):
        """Return a one-element list holding the forward computation cost.

        The ``shape`` attribute is sharded according to the dims mapping of
        ``Out`` and written into every computation description before the
        costs are built.
        """
        op = dist_op.serial_op
        dist_attr = dist_op.dist_attr

        shape_list = op.desc.attr("shape")
        dim_mapping = dist_attr.get_output_dims_mapping(op.output("Out")[0])
        process_mesh_shape = dist_attr.process_mesh.shape
        self._shard_target_shape(shape_list, dim_mapping, process_mesh_shape)

        # calc comp op cost
        desc_mapping = build_comp_desc_from_dist_op(
            dist_op=dist_op, dist_context=ctx
        )
        processes = dist_attr.process_mesh.process_ids
        for key in desc_mapping:
            desc_mapping[key]["shape"] = shape_list

        cost_mapping = build_comp_costs_from_descs(
            Reshape2OpCost, ctx, processes, desc_mapping, cluster
        )
        return [cost_mapping]

    def calc_bwd_cost(self, dist_op, ctx, cluster):
        """Return the backward cost list.

        The first entry is the computation cost of the grad op. For every
        input that is neither a gradient nor parameter-related and whose
        first dimension is sharded over a mesh axis larger than one,
        ``build_dp_costs`` is invoked with ``res`` for ``<input>@GRAD``.
        """
        res = []
        desc_mapping = build_comp_desc_from_dist_op(
            dist_op=dist_op, dist_context=ctx
        )
        dist_attr = dist_op.dist_attr
        process_mesh = dist_attr.process_mesh
        processes = process_mesh.process_ids

        cost_mapping = build_comp_costs_from_descs(
            Reshape2GradOpCost, ctx, processes, desc_mapping, cluster
        )
        res.append(cost_mapping)

        backward_op = dist_op.serial_op
        main_block = backward_op.block
        for input_name in backward_op.desc.input_names():
            for varname in backward_op.desc.input(input_name):
                # Only non-gradient, non-parameter inputs are considered,
                # matching the sibling reshape implementations in this file.
                if "@GRAD" in varname or is_parameter_related(
                    varname, main_block
                ):
                    continue

                # NOTE input var's dim_mapping of backward op should be the
                # same with input var instead of corresponding varname of
                # forward op
                var_dim_mapping = dist_attr.get_input_dims_mapping(varname)
                mesh_shape = process_mesh.shape
                batch_size_axis = var_dim_mapping[0]
                if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1:
                    build_dp_costs(
                        res,
                        dist_op,
                        ctx,
                        [varname + "@GRAD"],
                        {"use_calc_stream": True},
                        batch_size_axis,
                        cluster,
                    )

        return res

    def is_input_compatible(self, dist_op):
        """True when ``X`` has one fewer mapped dim than ``Out``."""
        x_dims_mapping, out_dims_mapping = self._x_out_dims_mappings(dist_op)
        return len(x_dims_mapping) == len(out_dims_mapping) - 1

    def is_output_compatible(self, dist_op):
        """True when ``Out`` has one extra dim and its last dim is unsharded."""
        x_dims_mapping, out_dims_mapping = self._x_out_dims_mappings(dist_op)
        if len(x_dims_mapping) != len(out_dims_mapping) - 1:
            return False
        return not is_dim_shard(out_dims_mapping[-1])

    def is_auto_compatible(self, dist_op):
        """Check that the mappings of ``X``, ``Out`` and ``XShape`` agree.

        Besides the input/output checks, every dim of ``X`` must equal the
        corresponding leading dim of ``Out``, ``XShape[0]`` must be ``-1``
        and ``XShape[1:]`` must equal the mapping of ``X``.
        """
        if not self.is_input_compatible(dist_op):
            return False
        if not self.is_output_compatible(dist_op):
            return False

        x_dims_mapping, out_dims_mapping = self._x_out_dims_mappings(dist_op)
        x_shape_name = dist_op.serial_op.desc.output('XShape')[0]
        x_shape_dims_mapping = dist_op.dist_attr.get_output_dims_mapping(
            x_shape_name
        )

        # Lengths are equal here because the compatibility checks passed.
        if any(
            x_dim != out_dim
            for x_dim, out_dim in zip(x_dims_mapping, out_dims_mapping[:-1])
        ):
            return False

        if x_shape_dims_mapping[0] != -1:
            return False

        return x_shape_dims_mapping[1:] == x_dims_mapping[:]

    def update_dims_mapping(self, dist_op):
        """Make ``X`` and ``Out`` mappings compatible; return whether changed.

        ``XShape[1:]`` is set to mirror ``X``. The mappings are written back
        to the dist attr only when a dimension changed.
        """
        changed = False
        op_desc = dist_op.serial_op.desc
        op_dist_attr = dist_op.dist_attr
        x_name = op_desc.input('X')[0]
        out_name = op_desc.output('Out')[0]
        x_shape_name = op_desc.output('XShape')[0]
        x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
        x_shape_dims_mapping = op_dist_attr.get_output_dims_mapping(
            x_shape_name
        )

        for i in range(len(x_dims_mapping)):
            if compute_compatible_and_update_dim_mapping(
                [x_dims_mapping, out_dims_mapping], [i, i]
            ):
                changed = True

        for i, x_dim in enumerate(x_dims_mapping):
            x_shape_dims_mapping[i + 1] = x_dim

        if changed:
            op_dist_attr.set_input_dims_mapping(x_name, x_dims_mapping)
            op_dist_attr.set_output_dims_mapping(out_name, out_dims_mapping)
            op_dist_attr.set_output_dims_mapping(
                x_shape_name, x_shape_dims_mapping
            )

        return changed

    @staticmethod
    def forward(ctx, *args, **kwargs):
        """Append a copy of the source op with a sharded ``shape`` attribute.

        kwargs: inputname_mapping & outputname_mapping
        """
        dist_op_context = ctx.dist_op_context
        main_block = dist_op_context.work_block
        src_op = dist_op_context.cur_src_op
        op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
        assert (
            op_dist_attr is not None
        ), f"forward op [{src_op}] don't have dist attribute !"

        # check validation of inputs / outputs
        for input_name in src_op.desc.input_names():
            assert input_name in kwargs, f"input [{input_name}] is not given"
            assert len(kwargs[input_name]) == len(
                src_op.desc.input(input_name)
            ), f"number of tensor for input [{input_name}] is not match"
        for output_name in src_op.desc.output_names():
            assert (
                output_name in kwargs
            ), f"output [{output_name}] is not given"
            assert len(kwargs[output_name]) == len(
                src_op.desc.output(output_name)
            ), f"number of tensor for output [{output_name}] is not match"

        x_var = main_block._var_recursive(kwargs['X'][0])
        out_var = main_block._var_recursive(kwargs['Out'][0])
        x_shape_var = main_block._var_recursive(kwargs['XShape'][0])
        shape_list = src_op.desc.attr("shape")
        shape_tensor_names = list(kwargs['ShapeTensor'])
        shape_names = list(kwargs['Shape'])

        # got dist attribute info
        dim_mapping = op_dist_attr.get_output_dims_mapping(out_var.name)
        process_mesh_shape = op_dist_attr.process_mesh.shape

        # modify target shape
        DistributedReshapeImpl0._shard_target_shape(
            shape_list, dim_mapping, process_mesh_shape
        )

        # create op
        new_op = main_block.append_op(type='nop')
        new_op_desc = new_op.desc
        new_op_desc.copy_from(src_op.desc)
        set_dist_op_desc_original_id(new_op_desc, src_op.desc, ctx)
        new_op_desc.set_input('ShapeTensor', shape_tensor_names)
        new_op_desc.set_input('Shape', shape_names)
        new_op_desc.set_input('X', [x_var.name])
        new_op_desc.set_output('XShape', [x_shape_var.name])
        new_op_desc.set_output('Out', [out_var.name])
        new_op_desc._set_attr('shape', shape_list)
        # TODO: should we add a new dist attr for the new op here?

    @staticmethod
    def backward(ctx, *args, **kwargs):
        """Delegate to ``DistributedDefaultImpl0.backward``."""
        DistributedDefaultImpl0.backward(ctx, *args, **kwargs)
302

303 304 305

class DistributedReshapeImpl1(DistributedOperatorImpl):
    """Distributed ``reshape2`` implementation for one fewer output dim.

    The implementation is compatible only when the dims mapping of ``X``
    has exactly one more entry than the dims mapping of ``Out`` and the last
    input dimension is not sharded.
    """

    def __init__(self, name):
        super().__init__(name)
        self._forward_implemented = True
        self._backward_implemented = False

    @staticmethod
    def _x_out_dims_mappings(dist_op):
        """Return ``(x_dims_mapping, out_dims_mapping)`` of ``dist_op``."""
        op_desc = dist_op.serial_op.desc
        op_dist_attr = dist_op.dist_attr
        x_name = op_desc.input('X')[0]
        out_name = op_desc.output('Out')[0]
        return (
            op_dist_attr.get_input_dims_mapping(x_name),
            op_dist_attr.get_output_dims_mapping(out_name),
        )

    @staticmethod
    def _shard_target_shape(shape_list, dims_mapping, mesh_shape):
        """Divide, in place, every sharded entry of ``shape_list``.

        An entry is divided by the size of the mesh axis it is mapped to;
        mapping entries beyond ``len(shape_list)`` are ignored.
        """
        for idx, axis in enumerate(dims_mapping):
            if axis >= 0 and len(shape_list) > idx:
                shape_list[idx] = shape_list[idx] // mesh_shape[axis]
        return shape_list

    def calc_cost(self, op_role, dist_op, ctx, cluster):
        """Return the backward cost for ``OpRole.Backward``, else forward."""
        if int(op_role) == int(OpRole.Backward):
            cost = self.calc_bwd_cost(dist_op, ctx, cluster)
        else:
            cost = self.calc_fwd_cost(dist_op, ctx, cluster)
        assert cost is not None
        return cost

    def calc_fwd_cost(self, dist_op, ctx, cluster):
        """Return a one-element list holding the forward computation cost.

        The ``shape`` attribute is sharded according to the dims mapping of
        ``Out`` and written into every computation description before the
        costs are built.
        """
        op = dist_op.serial_op
        dist_attr = dist_op.dist_attr

        shape_list = op.desc.attr("shape")
        dim_mapping = dist_attr.get_output_dims_mapping(op.output("Out")[0])
        process_mesh_shape = dist_attr.process_mesh.shape
        self._shard_target_shape(shape_list, dim_mapping, process_mesh_shape)

        # calc comp op cost
        desc_mapping = build_comp_desc_from_dist_op(
            dist_op=dist_op, dist_context=ctx
        )
        processes = dist_attr.process_mesh.process_ids
        for key in desc_mapping:
            desc_mapping[key]["shape"] = shape_list

        cost_mapping = build_comp_costs_from_descs(
            Reshape2OpCost, ctx, processes, desc_mapping, cluster
        )
        return [cost_mapping]

    def calc_bwd_cost(self, dist_op, ctx, cluster):
        """Return the backward cost list.

        The first entry is the computation cost of the grad op. For every
        input that is neither a gradient nor parameter-related and whose
        first dimension is sharded over a mesh axis larger than one,
        ``build_dp_costs`` is invoked with ``res`` for ``<input>@GRAD``.
        """
        res = []
        desc_mapping = build_comp_desc_from_dist_op(
            dist_op=dist_op, dist_context=ctx
        )
        dist_attr = dist_op.dist_attr
        process_mesh = dist_attr.process_mesh
        processes = process_mesh.process_ids

        cost_mapping = build_comp_costs_from_descs(
            Reshape2GradOpCost, ctx, processes, desc_mapping, cluster
        )
        res.append(cost_mapping)

        backward_op = dist_op.serial_op
        main_block = backward_op.block
        for input_name in backward_op.desc.input_names():
            for varname in backward_op.desc.input(input_name):
                # Skip gradients and parameter-related inputs.
                if "@GRAD" in varname or is_parameter_related(
                    varname, main_block
                ):
                    continue

                # NOTE input var's dim_mapping of backward op should be the
                # same with input var instead of corresponding varname of
                # forward op
                var_dim_mapping = dist_attr.get_input_dims_mapping(varname)
                mesh_shape = process_mesh.shape
                batch_size_axis = var_dim_mapping[0]
                if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1:
                    build_dp_costs(
                        res,
                        dist_op,
                        ctx,
                        [varname + "@GRAD"],
                        {"use_calc_stream": True},
                        batch_size_axis,
                        cluster,
                    )

        return res

    def is_input_compatible(self, dist_op):
        """True when ``X`` has one extra dim and its last dim is unsharded."""
        x_dims_mapping, out_dims_mapping = self._x_out_dims_mappings(dist_op)
        if len(x_dims_mapping) != len(out_dims_mapping) + 1:
            return False
        return not is_dim_shard(x_dims_mapping[-1])

    def is_output_compatible(self, dist_op):
        """True when ``Out`` has one fewer mapped dim than ``X``."""
        x_dims_mapping, out_dims_mapping = self._x_out_dims_mappings(dist_op)
        return len(x_dims_mapping) == len(out_dims_mapping) + 1

    def is_auto_compatible(self, dist_op):
        """Check that the mappings of ``X``, ``Out`` and ``XShape`` agree.

        Besides the input/output checks, the last dim of ``X`` must be
        unsharded, every other dim of ``X`` must equal the corresponding dim
        of ``Out``, ``XShape[0]`` must be ``-1`` and ``XShape[1:]`` must
        equal the mapping of ``X``.
        """
        if not self.is_input_compatible(dist_op):
            return False
        if not self.is_output_compatible(dist_op):
            return False

        x_dims_mapping, out_dims_mapping = self._x_out_dims_mappings(dist_op)
        x_shape_name = dist_op.serial_op.desc.output('XShape')[0]
        x_shape_dims_mapping = dist_op.dist_attr.get_output_dims_mapping(
            x_shape_name
        )

        if is_dim_shard(x_dims_mapping[-1]):
            return False

        # Lengths are equal here because the compatibility checks passed.
        if any(
            out_dim != x_dim
            for x_dim, out_dim in zip(x_dims_mapping[:-1], out_dims_mapping)
        ):
            return False

        if x_shape_dims_mapping[0] != -1:
            return False

        return x_shape_dims_mapping[1:] == x_dims_mapping[:]

    def update_dims_mapping(self, dist_op):
        """Make ``X`` and ``Out`` mappings compatible; return whether changed.

        ``XShape[1:]`` is set to mirror ``X``. The mappings are written back
        to the dist attr only when a dimension changed.
        """
        changed = False
        op_desc = dist_op.serial_op.desc
        op_dist_attr = dist_op.dist_attr
        x_name = op_desc.input('X')[0]
        out_name = op_desc.output('Out')[0]
        x_shape_name = op_desc.output('XShape')[0]
        x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
        x_shape_dims_mapping = op_dist_attr.get_output_dims_mapping(
            x_shape_name
        )

        for i in range(len(out_dims_mapping)):
            if compute_compatible_and_update_dim_mapping(
                [x_dims_mapping, out_dims_mapping], [i, i]
            ):
                changed = True

        for i, x_dim in enumerate(x_dims_mapping):
            x_shape_dims_mapping[i + 1] = x_dim

        if changed:
            op_dist_attr.set_input_dims_mapping(x_name, x_dims_mapping)
            op_dist_attr.set_output_dims_mapping(out_name, out_dims_mapping)
            op_dist_attr.set_output_dims_mapping(
                x_shape_name, x_shape_dims_mapping
            )

        return changed

    @staticmethod
    def forward(ctx, *args, **kwargs):
        """Append a copy of the source op with a sharded ``shape`` attribute.

        kwargs: inputname_mapping & outputname_mapping
        """
        dist_op_context = ctx.dist_op_context
        main_block = dist_op_context.work_block
        src_op = dist_op_context.cur_src_op
        op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
        assert (
            op_dist_attr is not None
        ), f"forward op [{src_op}] don't have dist attribute !"

        # check validation of inputs / outputs
        for input_name in src_op.desc.input_names():
            assert input_name in kwargs, f"input [{input_name}] is not given"
            assert len(kwargs[input_name]) == len(
                src_op.desc.input(input_name)
            ), f"number of tensor for input [{input_name}] is not match"
        for output_name in src_op.desc.output_names():
            assert (
                output_name in kwargs
            ), f"output [{output_name}] is not given"
            assert len(kwargs[output_name]) == len(
                src_op.desc.output(output_name)
            ), f"number of tensor for output [{output_name}] is not match"

        x_var = main_block._var_recursive(kwargs['X'][0])
        out_var = main_block._var_recursive(kwargs['Out'][0])
        x_shape_var = main_block._var_recursive(kwargs['XShape'][0])
        shape_list = src_op.desc.attr("shape")
        shape_tensor_names = list(kwargs['ShapeTensor'])
        shape_names = list(kwargs['Shape'])

        # got dist attribute info
        dim_mapping = op_dist_attr.get_output_dims_mapping(out_var.name)
        process_mesh_shape = op_dist_attr.process_mesh.shape

        # modify target shape
        DistributedReshapeImpl1._shard_target_shape(
            shape_list, dim_mapping, process_mesh_shape
        )

        # create op
        new_op = main_block.append_op(type='nop')
        new_op_desc = new_op.desc
        new_op_desc.copy_from(src_op.desc)
        set_dist_op_desc_original_id(new_op_desc, src_op.desc, ctx)
        new_op_desc.set_input('ShapeTensor', shape_tensor_names)
        new_op_desc.set_input('Shape', shape_names)
        new_op_desc.set_input('X', [x_var.name])
        new_op_desc.set_output('XShape', [x_shape_var.name])
        new_op_desc.set_output('Out', [out_var.name])
        new_op_desc._set_attr('shape', shape_list)
        # TODO: should we add a new dist attr for the new op here?

    @staticmethod
    def backward(ctx, *args, **kwargs):
        """Delegate to ``DistributedDefaultImpl0.backward``."""
        DistributedDefaultImpl0.backward(ctx, *args, **kwargs)
562

563

564 565
class DistributedReshapeImpl2(DistributedOperatorImpl):
    """Distributed ``reshape2`` implementation for an unchanged dim count.

    The implementation is compatible only when the dims mappings of ``X``
    and ``Out`` have the same number of entries.
    """

    def __init__(self, name):
        super().__init__(name)
        self._forward_implemented = True
        self._backward_implemented = False

    @staticmethod
    def _x_out_dims_mappings(dist_op):
        """Return ``(x_dims_mapping, out_dims_mapping)`` of ``dist_op``."""
        op_desc = dist_op.serial_op.desc
        op_dist_attr = dist_op.dist_attr
        x_name = op_desc.input('X')[0]
        out_name = op_desc.output('Out')[0]
        return (
            op_dist_attr.get_input_dims_mapping(x_name),
            op_dist_attr.get_output_dims_mapping(out_name),
        )

    @staticmethod
    def _shard_target_shape(shape_list, dims_mapping, mesh_shape):
        """Divide, in place, every sharded entry of ``shape_list``.

        An entry is divided by the size of the mesh axis it is mapped to;
        mapping entries beyond ``len(shape_list)`` are ignored.
        """
        for idx, axis in enumerate(dims_mapping):
            if axis >= 0 and len(shape_list) > idx:
                shape_list[idx] = shape_list[idx] // mesh_shape[axis]
        return shape_list

    def calc_cost(self, op_role, dist_op, ctx, cluster):
        """Return the backward cost for ``OpRole.Backward``, else forward."""
        if int(op_role) == int(OpRole.Backward):
            cost = self.calc_bwd_cost(dist_op, ctx, cluster)
        else:
            cost = self.calc_fwd_cost(dist_op, ctx, cluster)
        assert cost is not None
        return cost

    def calc_fwd_cost(self, dist_op, ctx, cluster):
        """Return a one-element list holding the forward computation cost.

        The ``shape`` attribute is sharded according to the dims mapping of
        ``Out`` and written into every computation description before the
        costs are built.
        """
        op = dist_op.serial_op
        dist_attr = dist_op.dist_attr

        shape_list = op.desc.attr("shape")
        dim_mapping = dist_attr.get_output_dims_mapping(op.output("Out")[0])
        process_mesh_shape = dist_attr.process_mesh.shape
        self._shard_target_shape(shape_list, dim_mapping, process_mesh_shape)

        # calc comp op cost
        desc_mapping = build_comp_desc_from_dist_op(
            dist_op=dist_op, dist_context=ctx
        )
        processes = dist_attr.process_mesh.process_ids
        for key in desc_mapping:
            desc_mapping[key]["shape"] = shape_list

        cost_mapping = build_comp_costs_from_descs(
            Reshape2OpCost, ctx, processes, desc_mapping, cluster
        )
        return [cost_mapping]

    def calc_bwd_cost(self, dist_op, ctx, cluster):
        """Return the backward cost list.

        The first entry is the computation cost of the grad op. For every
        input that is neither a gradient nor parameter-related and whose
        first dimension is sharded over a mesh axis larger than one,
        ``build_dp_costs`` is invoked with ``res`` for ``<input>@GRAD``.
        """
        res = []
        desc_mapping = build_comp_desc_from_dist_op(
            dist_op=dist_op, dist_context=ctx
        )
        dist_attr = dist_op.dist_attr
        process_mesh = dist_attr.process_mesh
        processes = process_mesh.process_ids

        cost_mapping = build_comp_costs_from_descs(
            Reshape2GradOpCost, ctx, processes, desc_mapping, cluster
        )
        res.append(cost_mapping)

        backward_op = dist_op.serial_op
        main_block = backward_op.block
        for input_name in backward_op.desc.input_names():
            for varname in backward_op.desc.input(input_name):
                # Skip gradients and parameter-related inputs.
                if "@GRAD" in varname or is_parameter_related(
                    varname, main_block
                ):
                    continue

                # NOTE input var's dim_mapping of backward op should be the
                # same with input var instead of corresponding varname of
                # forward op
                var_dim_mapping = dist_attr.get_input_dims_mapping(varname)
                mesh_shape = process_mesh.shape
                batch_size_axis = var_dim_mapping[0]
                if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1:
                    build_dp_costs(
                        res,
                        dist_op,
                        ctx,
                        [varname + "@GRAD"],
                        {"use_calc_stream": True},
                        batch_size_axis,
                        cluster,
                    )

        return res

    def is_input_compatible(self, dist_op):
        """True when ``X`` and ``Out`` have the same number of mapped dims."""
        x_dims_mapping, out_dims_mapping = self._x_out_dims_mappings(dist_op)
        return len(x_dims_mapping) == len(out_dims_mapping)

    def is_output_compatible(self, dist_op):
        """True when ``X`` and ``Out`` have the same number of mapped dims."""
        x_dims_mapping, out_dims_mapping = self._x_out_dims_mappings(dist_op)
        return len(x_dims_mapping) == len(out_dims_mapping)

    def is_auto_compatible(self, dist_op):
        """Check that the mappings of ``X``, ``Out`` and ``XShape`` agree.

        Besides the input/output checks, every dim of ``X`` except the last
        must equal the corresponding dim of ``Out``, ``XShape[0]`` must be
        ``-1`` and ``XShape[1:]`` must equal the mapping of ``Out``.
        """
        if not self.is_input_compatible(dist_op):
            return False
        if not self.is_output_compatible(dist_op):
            return False

        x_dims_mapping, out_dims_mapping = self._x_out_dims_mappings(dist_op)
        x_shape_name = dist_op.serial_op.desc.output('XShape')[0]
        x_shape_dims_mapping = dist_op.dist_attr.get_output_dims_mapping(
            x_shape_name
        )

        # The last dimension is intentionally left out of this comparison,
        # as in the original loop over ``x_dims_mapping[:-1]``.
        if any(
            out_dim != x_dim
            for x_dim, out_dim in zip(x_dims_mapping[:-1], out_dims_mapping)
        ):
            return False

        if x_shape_dims_mapping[0] != -1:
            return False

        return x_shape_dims_mapping[1:] == out_dims_mapping[:]

    def update_dims_mapping(self, dist_op):
        """Make ``X`` and ``Out`` mappings compatible; return whether changed.

        All dims but the last are reconciled. ``XShape[1:]`` is set to mirror
        ``Out``. The mappings are written back to the dist attr only when a
        dimension changed.
        """
        changed = False
        op_desc = dist_op.serial_op.desc
        op_dist_attr = dist_op.dist_attr
        x_name = op_desc.input('X')[0]
        out_name = op_desc.output('Out')[0]
        x_shape_name = op_desc.output('XShape')[0]
        x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
        x_shape_dims_mapping = op_dist_attr.get_output_dims_mapping(
            x_shape_name
        )

        for i in range(len(out_dims_mapping) - 1):
            if compute_compatible_and_update_dim_mapping(
                [x_dims_mapping, out_dims_mapping], [i, i]
            ):
                changed = True

        for i, out_dim in enumerate(out_dims_mapping):
            x_shape_dims_mapping[i + 1] = out_dim

        if changed:
            op_dist_attr.set_input_dims_mapping(x_name, x_dims_mapping)
            op_dist_attr.set_output_dims_mapping(out_name, out_dims_mapping)
            op_dist_attr.set_output_dims_mapping(
                x_shape_name, x_shape_dims_mapping
            )

        return changed

    @staticmethod
    def forward(ctx, *args, **kwargs):
        """Append a copy of the source op with a sharded ``shape`` attribute.

        kwargs: inputname_mapping & outputname_mapping
        """
        dist_op_context = ctx.dist_op_context
        main_block = dist_op_context.work_block
        src_op = dist_op_context.cur_src_op
        op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
        assert (
            op_dist_attr is not None
        ), f"forward op [{src_op}] don't have dist attribute !"

        # check validation of inputs / outputs
        for input_name in src_op.desc.input_names():
            assert input_name in kwargs, f"input [{input_name}] is not given"
            assert len(kwargs[input_name]) == len(
                src_op.desc.input(input_name)
            ), f"number of tensor for input [{input_name}] is not match"
        for output_name in src_op.desc.output_names():
            assert (
                output_name in kwargs
            ), f"output [{output_name}] is not given"
            assert len(kwargs[output_name]) == len(
                src_op.desc.output(output_name)
            ), f"number of tensor for output [{output_name}] is not match"

        x_var = main_block._var_recursive(kwargs['X'][0])
        out_var = main_block._var_recursive(kwargs['Out'][0])
        x_shape_var = main_block._var_recursive(kwargs['XShape'][0])
        shape_list = src_op.desc.attr("shape")
        shape_tensor_names = list(kwargs['ShapeTensor'])
        shape_names = list(kwargs['Shape'])

        # got dist attribute info
        out_dim_mapping = op_dist_attr.get_output_dims_mapping(out_var.name)
        process_mesh_shape = op_dist_attr.process_mesh.shape

        # modify target shape
        DistributedReshapeImpl2._shard_target_shape(
            shape_list, out_dim_mapping, process_mesh_shape
        )

        # create op
        new_op = main_block.append_op(type='nop')
        new_op_desc = new_op.desc
        new_op_desc.copy_from(src_op.desc)
        set_dist_op_desc_original_id(new_op_desc, src_op.desc, ctx)
        new_op_desc.set_input('ShapeTensor', shape_tensor_names)
        new_op_desc.set_input('Shape', shape_names)
        new_op_desc.set_input('X', [x_var.name])
        new_op_desc.set_output('XShape', [x_shape_var.name])
        new_op_desc.set_output('Out', [out_var.name])
        new_op_desc._set_attr('shape', shape_list)
        # TODO: should we add a new dist attr for the new op here?

    @staticmethod
    def backward(ctx, *args, **kwargs):
        """Delegate to ``DistributedDefaultImpl0.backward``."""
        DistributedDefaultImpl0.backward(ctx, *args, **kwargs)


817
# Register the three reshape2 implementations in their original order; each
# instance is constructed immediately before it is registered.
for _impl_cls, _impl_name in (
    (DistributedReshapeImpl0, "add_one_dim_back"),
    (DistributedReshapeImpl1, "remove_one_dim_back"),
    (DistributedReshapeImpl2, "same_dim_shape"),
):
    register_distributed_operator_impl("reshape2", _impl_cls(_impl_name))
del _impl_cls, _impl_name