Commit 3c205729 authored by panyifeng

support multi param for tuple grad

Parent bc4b1c24
......@@ -59,7 +59,8 @@ class UndeterminedShapeType {
public:
explicit UndeterminedShapeType(const std::string &env_str) {
// param_name indices_shape indices_type values_shape values_type dense_shape
// export UNDETERMINED_SPARSE_SHAPE_TYPES="w1:2:Int32:2 1 2:Float32:3 1 2"
// export UNDETERMINED_SPARSE_SHAPE_TYPES="sparse_key_w1:2:Int32:2 1 2:Float32:3 1 2;sparse_key_w2:2:Int32:2 1
// 2:Float32:3 1 2"
std::vector<string> fields;
string tmp;
std::stringstream input(env_str);
......@@ -115,6 +116,20 @@ std::vector<int> UndeterminedShapeType::GetShape(const std::string &shape_str) {
}
const size_t UndeterminedShapeType::fields_num = 6;
std::unordered_map<std::string, UndeterminedShapeType> g_undetermined_configs;
void InitUndeterminedFromEnv(const std::string &sparse_shape_types) {
if (!g_undetermined_configs.empty()) {
return;
}
std::string tmp;
std::stringstream input(sparse_shape_types);
while (std::getline(input, tmp, ';')) {
auto config = UndeterminedShapeType(tmp);
g_undetermined_configs.insert(std::make_pair(config.param_name(), config));
MS_LOG(DEBUG) << "Undetermined config from env: " << tmp;
}
}
AbstractBasePtr InferImplEnvGetItem(const AnalysisEnginePtr &, const PrimitivePtr &primitive,
const AbstractBasePtrList &args_spec_list) {
MS_EXCEPTION_IF_NULL(primitive);
......@@ -128,27 +143,33 @@ AbstractBasePtr InferImplEnvGetItem(const AnalysisEnginePtr &, const PrimitivePt
MS_LOG(EXCEPTION) << "EnvGetItem evaluator args[1] should be a SymbolicKeyInstance but: " << key->ToString();
}
if (key->sparse_grad()) {
if (!key->sparse_grad().empty()) {
// Will be fixed once undetermined type ready
auto sparse_shape_types = common::GetEnv("UNDETERMINED_SPARSE_SHAPE_TYPES");
if (sparse_shape_types.empty()) {
sparse_shape_types = "w1:2:Int32:2 1 2:Float32:3 1 2";
sparse_shape_types = "sparse_key_w1:2:Int32:2 1 2:Float32:3 1 2;sparse_key_w2:2:Int32:2 1 2:Float32:3 1 2";
}
MS_LOG(DEBUG) << "EnvGetItem is sparse_grad " << key->ToString() << ", Undetermined shape is "
<< sparse_shape_types;
InitUndeterminedFromEnv(sparse_shape_types);
auto shape_types = UndeterminedShapeType(sparse_shape_types);
auto shape_types = g_undetermined_configs.find(key->sparse_grad());
if (shape_types == g_undetermined_configs.end()) {
MS_LOG(EXCEPTION) << "Param " << key->ToString()
<< " has sparse_grad, but shape/type is not configured in env UNDETERMINED_SPARSE_SHAPE_TYPES: "
<< sparse_shape_types;
}
MS_LOG(DEBUG) << "EnvGetItem is sparse_grad " << key->ToString();
AbstractBasePtrList sparse_list;
// indices
auto indices_ele = std::make_shared<AbstractScalar>(kAnyValue, shape_types.indices_type());
auto indices = std::make_shared<AbstractTensor>(indices_ele, std::make_shared<Shape>(shape_types.indices_shape()));
auto indices_ele = std::make_shared<AbstractScalar>(kAnyValue, shape_types->second.indices_type());
auto indices =
std::make_shared<AbstractTensor>(indices_ele, std::make_shared<Shape>(shape_types->second.indices_shape()));
sparse_list.emplace_back(indices);
// values
auto dout_ele = std::make_shared<AbstractScalar>(kAnyValue, shape_types.values_type());
auto dout = std::make_shared<AbstractTensor>(dout_ele, std::make_shared<Shape>(shape_types.values_shape()));
auto dout_ele = std::make_shared<AbstractScalar>(kAnyValue, shape_types->second.values_type());
auto dout = std::make_shared<AbstractTensor>(dout_ele, std::make_shared<Shape>(shape_types->second.values_shape()));
sparse_list.emplace_back(dout);
// dense_shape
sparse_list.emplace_back(std::make_shared<AbstractTuple>(shape_types.dense_shape()));
sparse_list.emplace_back(std::make_shared<AbstractTuple>(shape_types->second.dense_shape()));
return std::make_shared<AbstractTuple>(sparse_list);
}
......
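For reference, the multi-parameter config packs one entry per parameter into UNDETERMINED_SPARSE_SHAPE_TYPES, entries separated by ';' and the six fields (param_name, indices_shape, indices_type, values_shape, values_type, dense_shape) separated by ':'. A minimal Python sketch, not part of the commit, showing how such a string decomposes into per-parameter configs (field meanings taken from the comment in the UndeterminedShapeType constructor above):

```python
import os

def parse_sparse_shape_types(env_str):
    """Split 'name:indices_shape:indices_type:values_shape:values_type:dense_shape' entries."""
    configs = {}
    for entry in env_str.split(";"):
        # Six ':'-separated fields per entry, matching UndeterminedShapeType::fields_num.
        name, indices_shape, indices_type, values_shape, values_type, dense_shape = entry.split(":")
        configs[name] = {
            "indices_shape": [int(x) for x in indices_shape.split()],
            "indices_type": indices_type,
            "values_shape": [int(x) for x in values_shape.split()],
            "values_type": values_type,
            "dense_shape": [int(x) for x in dense_shape.split()],
        }
    return configs

# Default mirrors the fallback string used in InferImplEnvGetItem above.
env = os.environ.get(
    "UNDETERMINED_SPARSE_SHAPE_TYPES",
    "sparse_key_w1:2:Int32:2 1 2:Float32:3 1 2;sparse_key_w2:2:Int32:2 1 2:Float32:3 1 2")
print(parse_sparse_shape_types(env)["sparse_key_w1"])
```

Like `g_undetermined_configs`, the resulting map is keyed by the parameter's `sparse_grad` string, which is why each parameter can now carry its own shape/type configuration.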
......@@ -229,7 +229,8 @@ bool AbstractSpecializeAction(const ResourcePtr &res) {
if (param_node->has_default()) {
auto param_value = std::dynamic_pointer_cast<ParamValuePy>(param_node->default_param());
AbstractBasePtr ptr = abstract::FromValue(parse::data_converter::PyDataToValue(param_value->value()), true);
auto sparse_grad = py::cast<bool>(parse::python_adapter::GetPyObjAttr(param_value->value(), "sparse_grad"));
auto sparse_grad =
py::cast<std::string>(parse::python_adapter::GetPyObjAttr(param_value->value(), "sparse_grad"));
ptr->set_sparse_grad(sparse_grad);
parallel::ParallelParameterContextRestoreInNoTraining(func_graph, param_node, ptr);
......
......@@ -44,7 +44,7 @@ class AbstractBase : public Base {
public:
explicit AbstractBase(const ValuePtr &value = nullptr, const TypePtr &type = kAnyType,
const BaseShapePtr &shape = kNoShape)
: value_(value), type_(type), shape_(shape), sparse_grad_(false) {}
: value_(value), type_(type), shape_(shape), sparse_grad_("") {}
~AbstractBase() override = default;
MS_DECLARE_PARENT(AbstractBase, Base)
......@@ -53,13 +53,13 @@ class AbstractBase : public Base {
virtual bool operator==(const AbstractBase &other) const;
void set_value(const ValuePtr &value) { value_ = value; }
void set_sparse_grad(const bool &sparse_grad) { sparse_grad_ = sparse_grad; }
void set_sparse_grad(const std::string &sparse_grad) { sparse_grad_ = sparse_grad; }
void set_type(const TypePtr &type) { type_ = type; }
void set_shape(const BaseShapePtr &shape) { shape_ = shape; }
void set_value_desc(const std::string &desc) { value_desc_ = desc; }
const std::string &value_desc() const { return value_desc_; }
ValuePtr GetValueTrack() const { return value_; }
bool sparse_grad() const { return sparse_grad_; }
const std::string &sparse_grad() const { return sparse_grad_; }
TypePtr GetTypeTrack() const { return type_; }
BaseShapePtr GetShapeTrack() const { return shape_; }
......@@ -87,7 +87,7 @@ class AbstractBase : public Base {
TypePtr type_;
BaseShapePtr shape_;
std::string value_desc_; // store initial value description for error report
bool sparse_grad_;
std::string sparse_grad_;
};
class AbstractScalar : public AbstractBase {
......
......@@ -51,9 +51,9 @@ class Parameter:
requires_grad (bool): True if the parameter requires gradient. Default: True.
layerwise_parallel (bool): A kind of model parallel mode. When layerwise_parallel is true in parallel mode,
broadcast and gradients communication would not be applied on parameters. Default: False.
sparse_grad (bool): True if the parameter's gradient is sparse. Default: False.
sparse_grad (str): Set if the parameter's gradient is sparse. Default: empty.
"""
def __init__(self, default_input, name, requires_grad=True, layerwise_parallel=False, sparse_grad=False):
def __init__(self, default_input, name, requires_grad=True, layerwise_parallel=False, sparse_grad=""):
self.set_parameter_data(default_input)
self.name = name
self.requires_grad = requires_grad
......@@ -181,9 +181,9 @@ class Parameter:
return self._sparse_grad
@sparse_grad.setter
def sparse_grad(self, value=True):
if not isinstance(value, bool):
raise TypeError("`sparse_grad` parameter must be bool type")
def sparse_grad(self, value=""):
if not isinstance(value, str):
raise TypeError("`sparse_grad` parameter must be str type")
self._sparse_grad = value
@property
......
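With `sparse_grad` now a string key instead of a bool, a parameter opts into sparse gradients by naming the entry that describes it in UNDETERMINED_SPARSE_SHAPE_TYPES. An illustrative sketch of the intended usage, mirroring the test changes further down; it assumes a MindSpore build containing this commit, and the key name `sparse_key_w1` is only an example:

```python
import numpy as np
from mindspore import Tensor, Parameter

# The string key ties the parameter to its entry in UNDETERMINED_SPARSE_SHAPE_TYPES.
w1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)),
               name="w1", sparse_grad="sparse_key_w1")
# An empty string (the default) keeps the gradient dense.
w2 = Parameter(Tensor(np.ones([2, 1, 2]).astype(np.float32)), name="w2")
```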
......@@ -156,7 +156,7 @@ class Adam(Optimizer):
To improve parameter groups performance, the customized order of parameters can be supported.
The sparse strategy is applied while the SparseGatherV2 operator being used for forward network and the
`sparse_grad` of `Parameter` being set as True. The sparse feature is under continuous development. The sparse
`sparse_grad` of `Parameter` being set. The sparse feature is under continuous development. The sparse
behavior is currently performed on the CPU, weight decay is not supported.
Args:
......
......@@ -72,7 +72,7 @@ class FTRL(Optimizer):
Note:
The sparse strategy is applied while the SparseGatherV2 operator being used for forward network and the
`sparse_grad` of `Parameter` being set as True. The sparse feature is under continuous development. The sparse
`sparse_grad` of `Parameter` being set. The sparse feature is under continuous development. The sparse
behavior is currently performed on the CPU, weight decay is not supported.
Args:
......
......@@ -92,9 +92,10 @@ class LazyAdam(Optimizer):
applied on the parameters if `weight_decay` > 0 and the 'beta' and 'gamma' are not in the name of parameters.
The sparse strategy is applied while the SparseGatherV2 operator being used for forward network and the
`sparse_grad` of `Parameter` being set as True. The sparse behavior, to be noted, is not equivalent to the
`sparse_grad` of `Parameter` being set. The sparse behavior, to be noted, is not equivalent to the
original Adam algorithm, as only the current indices params will be updated. The sparse feature is under
continuous development. The sparse behavior is currently performed on the CPU, weight decay is not supported.
continuous development. The sparse behavior is currently performed on the CPU, weight decay is
not supported.
Args:
params (Union[list[Parameter], list[dict]]): When the `params` is a list of `Parameter` which will be updated,
......
......@@ -241,6 +241,7 @@ class HyperMap(HyperMap_):
return func(*args_list)
return tuple(map(hypermap, *args_list))
class Map(Map_):
"""
Map will apply the set operation on input sequences.
......@@ -271,37 +272,12 @@ class Map(Map_):
Map_.__init__(self)
def __call__(self, *args):
func = args[0]
count = 0
count_max = 1
args_list = args[1:]
if self.ops is not None:
func = self.ops
args_list = args
for item in args_list:
if isinstance(item, (tuple, list)):
count_max = len(item)
break
def get_item(x):
nonlocal count
if isinstance(x, (tuple, list)):
return x[count]
return x
for i in range(count_max):
true_args = tuple(map(get_item, args_list))
func(*true_args)
count = i + 1
return True
def register(self, *type_names):
"""Register a function for the given type string."""
def deco(fn):
self.register_fn(type_names, fn)
return fn
return deco
func = self.ops
args_list = args
if self.ops is None:
func = args[0]
args_list = args[1:]
return tuple(map(func, *args_list))
class _ListAppend(ListAppend_):
......
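The rewritten `Map.__call__` drops the manual index bookkeeping and simply maps the function element-wise across the input sequences, returning a tuple. A pure-Python sketch of the resulting semantics (illustration only, not the graph-mode implementation):

```python
# Apply func element-wise across the input sequences and collect a tuple,
# matching the new `tuple(map(func, *args_list))` behavior.
def map_call(func, *sequences):
    return tuple(map(func, *sequences))

# e.g. combining two gradient tuples element-wise
print(map_call(lambda x, y: x + y, (1, 2, 3), (10, 20, 30)))  # (11, 22, 33)
```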
......@@ -53,7 +53,8 @@ class NetWithSparseGatherV2(nn.Cell):
""" NetWithSparseGatherV2 definition """
def __init__(self):
super(NetWithSparseGatherV2, self).__init__()
self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="weight1", sparse_grad=True)
self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)),
name="weight1", sparse_grad="sparse_key_w1")
self.weight2 = Parameter(Tensor(np.ones([2, 1, 2]).astype((np.float32))), name="weight2")
self.axis = 0
self.gather = P.SparseGatherV2()
......
......@@ -154,8 +154,8 @@ def test_AdamWeightDecaySparse():
class NetWithSparseGatherV2(nn.Cell):
def __init__(self):
super(NetWithSparseGatherV2, self).__init__()
self.w1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="w1", sparse_grad=True)
self.w2 = Parameter(Tensor(np.ones([2, 1, 2]).astype(np.float32)), name="w2")
self.w1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="w1", sparse_grad="sparse_key_w1")
self.w2 = Parameter(Tensor(np.ones([2, 1, 2]).astype(np.float32)), name="w2", sparse_grad="sparse_key_w2")
self.gatherv2 = P.SparseGatherV2()
self.axis = 0
def construct(self, indices):
......
......@@ -41,7 +41,8 @@ class NetWithSparseGatherV2(nn.Cell):
""" NetWithSparseGatherV2 definition """
def __init__(self):
super(NetWithSparseGatherV2, self).__init__()
self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="weight1", sparse_grad=True)
self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)),
name="weight1", sparse_grad="sparse_key_w1")
self.weight2 = Parameter(Tensor(np.ones([2, 1, 2]).astype((np.float32))), name="weight2")
self.axis = 0
self.gather = P.SparseGatherV2()
......
......@@ -43,7 +43,8 @@ class NetWithSparseGatherV2(nn.Cell):
""" NetWithSparseGatherV2 definition """
def __init__(self):
super(NetWithSparseGatherV2, self).__init__()
self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="weight1", sparse_grad=True)
self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)),
name="weight1", sparse_grad="sparse_key_w1")
self.weight2 = Parameter(Tensor(np.ones([2, 1, 2]).astype((np.float32))), name="weight2")
self.axis = 0
self.gather = P.SparseGatherV2()
......
......@@ -40,7 +40,8 @@ class NetWithSparseGatherV2(nn.Cell):
""" NetWithSparseGatherV2 definition """
def __init__(self):
super(NetWithSparseGatherV2, self).__init__()
self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="weight1", sparse_grad=True)
self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="weight1",
sparse_grad="sparse_key_w1")
self.weight2 = Parameter(Tensor(np.ones([2, 1, 2]).astype(np.float32)), name="weight2")
self.axis = 0
self.gather = P.SparseGatherV2()
......