Unverified commit e5bc2eec authored by Qi Li, committed by GitHub

[NPU] add FLAGS_npu_storage_format env to enable npu storage format, test=develop (#48774)

Parent c6a2b0fd
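
For context: the Python checks added below read the flag directly from the process environment, so it can be toggled per run without rebuilding. A minimal sketch of enabling it (assuming a Paddle build with a custom NPU device plugin):

    import os
    # must be set before the patched code paths execute
    os.environ['FLAGS_npu_storage_format'] = '1'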
......@@ -1041,6 +1041,18 @@ PADDLE_DEFINE_EXPORTED_string(jit_engine_type,
"Predictor",
"Choose default funciton type in JitLayer.");
#ifdef PADDLE_WITH_CUSTOM_DEVICE
/**
* Custom Device NPU related FLAG
* Name: FLAGS_npu_storage_format
* Since Version: 2.5.0
* Value Range: bool, default=false
* Example: FLAGS_npu_storage_format=true will enable the NPU storage format.
* Note: Enable NPU Storage Format for Ascend910 performance improvement.
*/
PADDLE_DEFINE_EXPORTED_bool(npu_storage_format, false, "");
#endif
#ifdef PADDLE_WITH_CUDNN_FRONTEND
/**
* CUDNNv8 related FLAG
......
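
Because the flag is declared with PADDLE_DEFINE_EXPORTED_bool, it should also be visible through Paddle's Python flag API; a hedged sketch of inspecting it (assumes the usual paddle.get_flags behavior):

    import paddle
    # expected to return something like {'FLAGS_npu_storage_format': False}
    print(paddle.get_flags(['FLAGS_npu_storage_format']))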
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import inspect
import numpy as np
import warnings
......@@ -379,7 +380,11 @@ def monkey_patch_varbase():
new_ivar = self._grad_ivar()
# TODO(qili93): temporary for Ascend NPU performance, to be removed along with npu_identity op
if 'npu' in get_all_custom_device_type():
if (
os.environ.get('FLAGS_npu_storage_format', None)
in [1, '1', True, 'True', 'true']
and 'npu' in get_all_custom_device_type()
):
new_ivar = paddle.incubate._npu_identity(x=new_ivar, format=-1)
new_ivar = new_ivar._copy_to(core.CPUPlace(), True)
if self._grad_ivar().type == core.VarDesc.VarType.SELECTED_ROWS:
......
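
Note that os.environ.get only ever returns a string (or None), so of the values in the membership test above, only '1', 'True' and 'true' can actually match; the int and bool entries are dead. A hypothetical helper, not part of this commit, expressing the same check:

    import os

    def _npu_storage_format_enabled():
        # hypothetical helper: environment values are always strings, so this
        # matches exactly the cases the membership test in the diff can hit
        return os.environ.get('FLAGS_npu_storage_format') in ('1', 'True', 'true')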
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from paddle import _C_ops, _legacy_C_ops, get_flags, in_dynamic_mode
from paddle.device import (
get_all_custom_device_type,
......@@ -149,7 +151,11 @@ def _conv_nd(
new_shape[channel_dim] = -1
bias = bias.reshape(new_shape)
# TODO(qili93): temporary for Ascend NPU performance, to be removed along with npu_identity op
if 'npu' in get_all_custom_device_type():
if (
os.environ.get('FLAGS_npu_storage_format', None)
in [1, '1', True, 'True', 'true']
and 'npu' in get_all_custom_device_type()
):
with no_grad():
bias_storage = _C_ops.npu_identity(
bias, 3
......@@ -747,7 +753,11 @@ def conv2d(
+ [1 for i in range(len(x.shape) - channel_dim - 1)],
)
# TODO(qili93): temporary for Ascend NPU performance, to be removed along with npu_identity op
if 'npu' in get_all_custom_device_type():
if (
os.environ.get('FLAGS_npu_storage_format', None)
in [1, '1', True, 'True', 'true']
and 'npu' in get_all_custom_device_type()
):
with no_grad():
bias_storage = _C_ops.npu_identity(
bias, 3
......
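
Both conv paths above gate the bias re-layout behind the same environment check; a hedged sketch of a call that would exercise it (hypothetical shapes, and it assumes the PaddleCustomDevice NPU plugin has registered an 'npu' device type):

    import os
    os.environ['FLAGS_npu_storage_format'] = '1'

    import paddle
    import paddle.nn.functional as F

    paddle.set_device('npu')           # assumes an 'npu' custom device is available
    x = paddle.rand([1, 3, 32, 32])
    w = paddle.rand([8, 3, 3, 3])
    b = paddle.rand([8])
    y = F.conv2d(x, w, bias=b)         # the bias path above converts b via npu_identity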
......@@ -28,6 +28,7 @@
# TODO: define normalization api
import numbers
import os
import warnings
import numpy as np
......@@ -681,7 +682,11 @@ class _BatchNormBase(Layer):
self._variance.stop_gradient = True
# TODO(qili93): temporary for Ascend NPU performance, to be removed along with npu_identity op
if 'npu' in get_all_custom_device_type():
if (
os.environ.get('FLAGS_npu_storage_format', None)
in [1, '1', True, 'True', 'true']
and 'npu' in get_all_custom_device_type()
):
with no_grad():
weight_trans = _C_ops.npu_identity(
self.weight, 3
......
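
The _BatchNormBase change applies the same conversion to the affine weight at layer construction time, under no_grad; a similar hedged sketch (again assuming an 'npu' custom device is available):

    import os
    os.environ['FLAGS_npu_storage_format'] = '1'

    import paddle
    paddle.set_device('npu')           # assumes an 'npu' custom device is available
    bn = paddle.nn.BatchNorm2D(8)      # weight is re-laid-out in __init__ when the flag is on
    y = bn(paddle.rand([4, 8, 16, 16]))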