From 7a5004cc49624ca25a21ef5d1194f36855d7913b Mon Sep 17 00:00:00 2001
From: Yi Huaijie
Date: Mon, 18 May 2020 20:06:35 +0800
Subject: [PATCH] init the slices of an Initializer on different devices

---
 mindspore/common/api.py                           | 12 ++-
 mindspore/common/initializer.py                   | 14 ++++
 mindspore/common/parameter.py                     | 29 ++++++-
 mindspore/nn/cell.py                              | 13 +++-
 mindspore/parallel/_tensor.py                     | 15 ++++
 .../parallel/test_initializer_weight_slice.py     | 75 +++++++++++++++++++
 6 files changed, 153 insertions(+), 5 deletions(-)
 create mode 100644 tests/ut/python/parallel/test_initializer_weight_slice.py

diff --git a/mindspore/common/api.py b/mindspore/common/api.py
index 16df9a00e..01451b694 100644
--- a/mindspore/common/api.py
+++ b/mindspore/common/api.py
@@ -328,8 +328,13 @@ class _Executor:
 
     def _params_init_data(self, obj, params):
         if params is not None:
-            for _, param in params.items():
-                param.init_data()
+            for key, param in params.items():
+                if key not in obj.parameter_layout_dict:
+                    logger.info("Layout dict does not contain the key %s.", key)
+                    param.init_data()
+                else:
+                    layout = obj.parameter_layout_dict[key]
+                    param.init_data(layout)
         obj.init_parameters_data()
 
     def compile(self, obj, *args, phase='predict', params=None, do_convert=True, auto_parallel_mode=False):
@@ -377,10 +382,11 @@ class _Executor:
         if not do_convert:
             return phase, True
 
+        if auto_parallel_mode and "train" in phase:
+            obj.parameter_layout_dict = self._executor.get_parameter_layout(phase)
         self._params_init_data(obj, params)
         if not enable_debug_runtime or enable_ge:
             if auto_parallel_mode and "train" in phase:
-                obj.parameter_layout_dict = self._executor.get_parameter_layout(phase)
                 obj.load_parameter_slice(params)
 
         # the following GE init process is not needed when use vm or ms backend
diff --git a/mindspore/common/initializer.py b/mindspore/common/initializer.py
index 352fdebe8..5c8c18989 100644
--- a/mindspore/common/initializer.py
+++ b/mindspore/common/initializer.py
@@ -41,6 +41,7 @@ class Initializer:
         self._kwargs = kwargs
         self.shape = None
         self.dtype = None
+        self._seed = None
 
     def _initialize(self, *kwargs):
         raise NotImplementedError('Must be overridden!')
@@ -48,6 +49,15 @@ class Initializer:
     def __call__(self, arr):
         return self._initialize(arr)
 
+    @property
+    def seed(self):
+        return self._seed
+
+    @seed.setter
+    def seed(self, seed_):
+        """Set the random seed."""
+        self._seed = seed_
+
     @property
     def shape(self):
         return self._shape
@@ -65,6 +75,7 @@ class Initializer:
         self._dtype = dtype
 
     def to_tensor(self):
+        """Get the tensor format data of this Initializer."""
         arr = None
         try:
             arr = np.ndarray(self.shape)
@@ -72,7 +83,10 @@ class Initializer:
             msg = "Error shape={}".format(self.shape)
             logger.error(msg)
             raise ValueError(msg)
+        if self._seed is not None:
+            np.random.seed(self.seed)
         self.__call__(arr)
+        self._seed = None
         return Tensor(arr, dtype=self.dtype)
 
 def _register(*aliases):
diff --git a/mindspore/common/parameter.py b/mindspore/common/parameter.py
index ee4551549..f425d882b 100644
--- a/mindspore/common/parameter.py
+++ b/mindspore/common/parameter.py
@@ -20,6 +20,7 @@ from .initializer import initializer, Initializer
 from .tensor import Tensor, MetaTensor
 from .._checkparam import _check_str_by_regular
 from ..parallel._utils import _set_clone_info, _CloneInfo
+from ..parallel._tensor import _get_seed
 
 __all__ = ['Parameter', 'ParameterTuple']
 
@@ -55,6 +56,7 @@ class Parameter:
         self.requires_grad = requires_grad
         self.layerwise_parallel = layerwise_parallel
         self._is_init = False
+        self._sliced = False
         self.clone_info = _CloneInfo()
 
     def __repr__(self):
@@ -91,6 +93,11 @@ class Parameter:
             raise ValueError("The type of the name should be `str` or `None`.")
         self._name = name_
 
+    @property
+    def sliced(self):
+        """Get slice status of the parameter."""
+        return self._sliced
+
     @property
     def is_init(self):
         """Get init status of the parameter."""
@@ -196,11 +203,31 @@ class Parameter:
 
         self.default_input = data
 
-    def init_data(self):
+    def init_data(self, layout=None):
+        """
+        Initialize the data of the parameter.
+
+        Args:
+            layout (list[list[int]]): Parameter slice layout [dev_mat, tensor_map, slice_shape].
+                dev_mat (list[int]): device matrix.
+                tensor_map (list[int]): tensor map.
+                slice_shape (list[int]): shape of the slice.
+        """
         if not isinstance(self.default_input, MetaTensor):
             return
+        if layout is not None:
+            if not isinstance(layout, list):
+                raise TypeError("The layout should be a list! layout is {}."
+                                .format(layout))
+            if len(layout) != 3:
+                raise ValueError("The length of layout must be 3! layout is {}."
+                                 .format(layout))
+            self.init_mode.shape = layout[2]
+            self.init_mode.seed = int(_get_seed(layout[0], layout[1]))
+
         self.default_input = self.init_mode.to_tensor()
         self.init_mode = None
+        self._sliced = True
 
 
 class ParameterTuple(tuple):
diff --git a/mindspore/nn/cell.py b/mindspore/nn/cell.py
index c95160620..30ff8d025 100755
--- a/mindspore/nn/cell.py
+++ b/mindspore/nn/cell.py
@@ -249,6 +249,9 @@ class Cell:
                 if key not in self.parameter_layout_dict:
                     logger.info("layout dict does not contain the key %s", key)
                     continue
+                if self.parameters_dict()[key].sliced:
+                    logger.info("Param %s is from initializer, already sliced.", key)
+                    continue
                 layout = self.parameter_layout_dict[key]
                 new_tensor = _load_tensor_by_layout(tensor, layout)
                 self.parameters_dict()[key].set_parameter_data(new_tensor)
@@ -258,6 +261,9 @@ class Cell:
                 if key not in self.parameter_layout_dict:
                     logger.info("layout dict does not contain the key %s", key)
                     continue
+                if params[key].sliced:
+                    logger.info("Param %s is from initializer, already sliced.", key)
+                    continue
                 layout = self.parameter_layout_dict[key]
                 new_tensor = _load_tensor_by_layout(tensor, layout)
                 params[key].set_parameter_data(new_tensor)
@@ -398,7 +404,12 @@ class Cell:
 
     def init_parameters_data(self, recurse=True):
         for param in self.get_parameters(expand=recurse):
-            param.init_data()
+            if param.name not in self.parameter_layout_dict:
+                logger.info("Layout dict does not contain the key %s.", param.name)
+                param.init_data()
+            else:
+                layout = self.parameter_layout_dict[param.name]
+                param.init_data(layout)
 
     def parameters_dict(self, recurse=True):
         """
diff --git a/mindspore/parallel/_tensor.py b/mindspore/parallel/_tensor.py
index 019ec4aa1..073ad9809 100644
--- a/mindspore/parallel/_tensor.py
+++ b/mindspore/parallel/_tensor.py
@@ -168,6 +168,21 @@ def _chunk_tensor_by_strategy(np_tensor, strategy):
         raise ValueError("The length of np_tensor does not match the length of strategy!")
     return _chunk_tensor(np_tensor, strategy, len(strategy))
 
+def _get_seed(dev_mat, tensor_map):
+    """
+    Get the random seed for the current slice.
+
+    Args:
+        dev_mat (list): The device matrix of devices.
+        tensor_map (list): The split strategy of the tensor.
+
+    Returns:
+        Integer, the local random seed for this device.
+    """
+    rank = get_rank()
+    tensor_strategy = _get_tensor_strategy(dev_mat, tensor_map)
+    tensor_slice_seed = _get_tensor_slice_index(dev_mat, tensor_strategy, tensor_map, rank)
+    return tensor_slice_seed
 
 def _load_tensor(tensor, dev_mat, tensor_map):
     """
diff --git a/tests/ut/python/parallel/test_initializer_weight_slice.py b/tests/ut/python/parallel/test_initializer_weight_slice.py
new file mode 100644
index 000000000..7f63c5a65
--- /dev/null
+++ b/tests/ut/python/parallel/test_initializer_weight_slice.py
@@ -0,0 +1,75 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from mindspore import context
+import mindspore.nn as nn
+from mindspore.ops import operations as P
+from mindspore import Tensor, Parameter
+import mindspore as ms
+import mindspore.common.api as me
+from mindspore.common.initializer import initializer
+from hccl_test.manage.api import Hccl
+
+
+def test_initializer_weight_slice():
+    class Net(nn.Cell):
+        def __init__(self, strategy1, strategy2, weight):
+            super().__init__()
+            self.weight = Parameter(weight, "w1")
+            self.matmul = P.MatMul(transpose_a=False, transpose_b=True).set_strategy(strategy1)
+            self.relu = P.ReLU().set_strategy(strategy2)
+
+        def construct(self, x):
+            out = self.matmul(x, self.weight)
+            out = self.relu(out)
+            return out
+
+    def get_slice(rank):
+        hccl = Hccl()
+        rank_save = hccl.rank_id
+        hccl.rank_id = rank
+        context.reset_auto_parallel_context()
+        context.set_auto_parallel_context(device_num=8, global_rank=0)
+        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
+        strategy1 = ((2, 1), (4, 1))
+        strategy2 = ((2, 4),)
+        context.set_context(mode=context.GRAPH_MODE)
+        exe = me._executor
+
+        x = Tensor(np.ones([32, 32]), dtype=ms.float32)
+        weight = initializer("Uniform", [64, 32], ms.float32)
+        net = Net(strategy1, strategy2, weight)
+        net.set_auto_parallel()
+        exe.compile(net, x, auto_parallel_mode=True, phase='train')
+        hccl.rank_id = rank_save
+        return net.parameters_dict()['w1'].data.asnumpy()
+
+    slice0 = get_slice(0)
+    slice1 = get_slice(1)
+    slice4 = get_slice(4)
+    slice_shape = slice0.shape
+
+    slice0 = slice0.flatten()
+    slice1 = slice1.flatten()
+    slice4 = slice4.flatten()
+    expect_slice_shape = (16, 32)
+
+    assert expect_slice_shape == slice_shape
+    assert all(slice0 == slice4)
+    assert any(slice0 != slice1)
+
+
+if __name__ == '__main__':
+    test_initializer_weight_slice()
-- 
GitLab