diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
old mode 100644
new mode 100755
index 639605a64ed2898403825aa7f6defa9ede3b1f41..f0aa42495161e02ce2b03ec2d182820dd1b81a82
--- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
@@ -88,7 +88,8 @@ class TestSimpleRNN(unittest.TestCase):
         if self.time_major:
             mask = paddle.transpose(mask, [1, 0])
         y2, h2 = rnn2(paddle.to_tensor(x), sequence_length=seq_len)
-        y2 = paddle.multiply(y2, mask, axis=0)
+        mask = paddle.unsqueeze(mask, -1)
+        y2 = paddle.multiply(y2, mask)
 
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
@@ -174,7 +175,8 @@ class TestGRU(unittest.TestCase):
         if self.time_major:
             mask = paddle.transpose(mask, [1, 0])
         y2, h2 = rnn2(paddle.to_tensor(x), sequence_length=seq_len)
-        y2 = paddle.multiply(y2, mask, axis=0)
+        mask = paddle.unsqueeze(mask, -1)
+        y2 = paddle.multiply(y2, mask)
 
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
@@ -259,7 +261,8 @@ class TestLSTM(unittest.TestCase):
         if self.time_major:
             mask = paddle.transpose(mask, [1, 0])
         y2, (h2, c2) = rnn2(paddle.to_tensor(x), sequence_length=seq_len)
-        y2 = paddle.multiply(y2, mask, axis=0)
+        mask = paddle.unsqueeze(mask, -1)
+        y2 = paddle.multiply(y2, mask)
 
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
@@ -343,5 +346,6 @@ def load_tests(loader, tests, pattern):
                     suite.addTest(test_class(time_major, direction, device))
     return suite
 
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py
old mode 100644
new mode 100755
index f2a3da3ff6efe92fcb445696da787cd2f45689c6..950d942b7917e17a106ddcb5bb071d193d965e63
--- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py
@@ -151,7 +151,8 @@ class TestSimpleRNN(unittest.TestCase):
                 if self.time_major:
                     mask = paddle.transpose(mask, [1, 0])
                 y, h = rnn2(x_data, sequence_length=seq_len)
-                y = paddle.multiply(y, mask, axis=0)
+                mask = paddle.unsqueeze(mask, -1)
+                y = paddle.multiply(y, mask)
 
         feed_dict = {x_data.name: x, seq_len.name: sequence_length}
 
@@ -297,7 +298,8 @@ class TestGRU(unittest.TestCase):
                 if self.time_major:
                     mask = paddle.transpose(mask, [1, 0])
                 y, h = rnn2(x_data, sequence_length=seq_len)
-                y = paddle.multiply(y, mask, axis=0)
+                mask = paddle.unsqueeze(mask, -1)
+                y = paddle.multiply(y, mask)
 
         feed_dict = {x_data.name: x, seq_len.name: sequence_length}
 
@@ -445,7 +447,8 @@ class TestLSTM(unittest.TestCase):
                 if self.time_major:
                     mask = paddle.transpose(mask, [1, 0])
                 y, (h, c) = rnn2(x_data, sequence_length=seq_len)
-                y = paddle.multiply(y, mask, axis=0)
+                mask = paddle.unsqueeze(mask, -1)
+                y = paddle.multiply(y, mask)
 
         feed_dict = {x_data.name: x, seq_len.name: sequence_length}
 
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_wrappers.py b/python/paddle/fluid/tests/unittests/rnn/test_wrappers.py
old mode 100644
new mode 100755
index 0fa76c9bcb1b761570ceedd5012728f5ed3ca017..85aebf86ed9bac23aa4075f65825044c43bfea68
--- a/python/paddle/fluid/tests/unittests/rnn/test_wrappers.py
+++ b/python/paddle/fluid/tests/unittests/rnn/test_wrappers.py
@@ -89,7 +89,8 @@ class TestRNNWrapper(unittest.TestCase):
         if self.time_major:
             mask = paddle.transpose(mask, [1, 0])
         y2, h2 = rnn2(paddle.to_tensor(x), sequence_length=seq_len)
-        y2 = paddle.multiply(y2, mask, axis=0)
+        mask = paddle.unsqueeze(mask, -1)
+        y2 = paddle.multiply(y2, mask)
 
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
@@ -169,7 +170,8 @@ class TestBiRNNWrapper(unittest.TestCase):
         if self.time_major:
             mask = paddle.transpose(mask, [1, 0])
         y2, (fw_h2, bw_h2) = rnn2(paddle.to_tensor(x), sequence_length=seq_len)
-        y2 = paddle.multiply(y2, mask, axis=0)
+        mask = paddle.unsqueeze(mask, -1)
+        y2 = paddle.multiply(y2, mask)
 
         np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
         np.testing.assert_allclose(fw_h1, fw_h2.numpy(), atol=1e-8, rtol=1e-5)
diff --git a/python/paddle/fluid/tests/unittests/test_multiply.py b/python/paddle/fluid/tests/unittests/test_multiply.py
index 09a2007c1adb31ffb84082de425328fb2f3b6e40..72e5a4453f2911ef4f6796e343e380a7bdda5f4e 100755
--- a/python/paddle/fluid/tests/unittests/test_multiply.py
+++ b/python/paddle/fluid/tests/unittests/test_multiply.py
@@ -13,176 +13,84 @@
 # limitations under the License.
 
 from __future__ import print_function
-import paddle
-import paddle.tensor as tensor
-import paddle.fluid as fluid
-from paddle.fluid import Program, program_guard
-import numpy as np
 import unittest
 
+import numpy as np
+
+import paddle
+import paddle.tensor as tensor
+from paddle.static import Program, program_guard
 
-class TestMultiplyAPI(unittest.TestCase):
-    """TestMultiplyAPI."""
 
-    def __run_static_graph_case(self, x_data, y_data, axis=-1):
+class TestMultiplyApi(unittest.TestCase):
+    def _run_static_graph_case(self, x_data, y_data):
         with program_guard(Program(), Program()):
             paddle.enable_static()
             x = paddle.static.data(
                 name='x', shape=x_data.shape, dtype=x_data.dtype)
             y = paddle.static.data(
                 name='y', shape=y_data.shape, dtype=y_data.dtype)
-            res = tensor.multiply(x, y, axis=axis)
-
-            place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            outs = exe.run(fluid.default_main_program(),
-                           feed={'x': x_data,
-                                 'y': y_data},
-                           fetch_list=[res])
-            res = outs[0]
-            return res
-
-    def __run_static_graph_case_with_numpy_input(self, x_data, y_data, axis=-1):
-        with program_guard(Program(), Program()):
-            paddle.enable_static()
+            res = tensor.multiply(x, y)
 
-            res = tensor.multiply(x_data, y_data, axis=axis)
-            place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
-            ) else fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            outs = exe.run(fluid.default_main_program(),
+            place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda(
+            ) else paddle.CPUPlace()
+            exe = paddle.static.Executor(place)
+            outs = exe.run(paddle.static.default_main_program(),
                            feed={'x': x_data,
                                  'y': y_data},
                            fetch_list=[res])
             res = outs[0]
             return res
 
-    def __run_dynamic_graph_case(self, x_data, y_data, axis=-1):
+    def _run_dynamic_graph_case(self, x_data, y_data):
         paddle.disable_static()
         x = paddle.to_tensor(x_data)
         y = paddle.to_tensor(y_data)
-        res = paddle.multiply(x, y, axis=axis)
-        return res.numpy()
-
-    def __run_dynamic_graph_case_with_numpy_input(self, x_data, y_data,
-                                                  axis=-1):
-        paddle.disable_static()
-        res = paddle.multiply(x_data, y_data, axis=axis)
+        res = paddle.multiply(x, y)
         return res.numpy()
 
     def test_multiply(self):
-        """test_multiply."""
         np.random.seed(7)
         # test static computation graph: 1-d array
         x_data = np.random.rand(200)
         y_data = np.random.rand(200)
-        res = self.__run_static_graph_case(x_data, y_data)
-        self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
-
-        # test static computation graph: 1-d array
-        x_data = np.random.rand(200)
-        y_data = np.random.rand(200)
-        res = self.__run_static_graph_case_with_numpy_input(x_data, y_data)
+        res = self._run_static_graph_case(x_data, y_data)
         self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
 
         # test static computation graph: 2-d array
         x_data = np.random.rand(2, 500)
         y_data = np.random.rand(2, 500)
-        res = self.__run_static_graph_case(x_data, y_data)
-        self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
-
-        # test static computation graph with_primitives: 2-d array
-        x_data = np.random.rand(2, 500)
-        y_data = np.random.rand(2, 500)
-        res = self.__run_static_graph_case_with_numpy_input(x_data, y_data)
+        res = self._run_static_graph_case(x_data, y_data)
         self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
 
         # test static computation graph: broadcast
         x_data = np.random.rand(2, 500)
         y_data = np.random.rand(500)
-        res = self.__run_static_graph_case(x_data, y_data)
+        res = self._run_static_graph_case(x_data, y_data)
         self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
 
-        # test static computation graph with_primitives: broadcast
-        x_data = np.random.rand(2, 500)
-        y_data = np.random.rand(500)
-        res = self.__run_static_graph_case_with_numpy_input(x_data, y_data)
-        self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
-
-        # test static computation graph: broadcast with axis
-        x_data = np.random.rand(2, 300, 40)
-        y_data = np.random.rand(300)
-        res = self.__run_static_graph_case(x_data, y_data, axis=1)
-        expected = np.multiply(x_data, y_data[..., np.newaxis])
-        self.assertTrue(np.allclose(res, expected))
-
-        # test static computation graph with_primitives: broadcast with axis
-        x_data = np.random.rand(2, 300, 40)
-        y_data = np.random.rand(300)
-        res = self.__run_static_graph_case_with_numpy_input(
-            x_data, y_data, axis=1)
-        expected = np.multiply(x_data, y_data[..., np.newaxis])
-        self.assertTrue(np.allclose(res, expected))
-
         # test dynamic computation graph: 1-d array
         x_data = np.random.rand(200)
         y_data = np.random.rand(200)
-        res = self.__run_dynamic_graph_case(x_data, y_data)
-        self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
-
-        # test dynamic numpy input computation graph: 1-d array
-        x_data = np.random.rand(200)
-        y_data = np.random.rand(200)
-        res = self.__run_dynamic_graph_case_with_numpy_input(x_data, y_data)
+        res = self._run_dynamic_graph_case(x_data, y_data)
         self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
 
         # test dynamic computation graph: 2-d array
         x_data = np.random.rand(20, 50)
         y_data = np.random.rand(20, 50)
-        res = self.__run_dynamic_graph_case(x_data, y_data)
-        self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
-
-        # test dynamic numpy input computation graph: 1-d array
-        x_data = np.random.rand(20, 50)
-        y_data = np.random.rand(20, 50)
-        res = self.__run_dynamic_graph_case_with_numpy_input(x_data, y_data)
+        res = self._run_dynamic_graph_case(x_data, y_data)
         self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
 
         # test dynamic computation graph: broadcast
         x_data = np.random.rand(2, 500)
         y_data = np.random.rand(500)
-        res = self.__run_dynamic_graph_case(x_data, y_data)
+        res = self._run_dynamic_graph_case(x_data, y_data)
         self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
 
-        # test dynamic computation graph with numpy tensor: broadcast
-        x_data = np.random.rand(2, 500)
-        y_data = np.random.rand(500)
-        res = self.__run_dynamic_graph_case_with_numpy_input(x_data, y_data)
-        self.assertTrue(np.allclose(res, np.multiply(x_data, y_data)))
-
-        # test dynamic computation graph: broadcast with axis
-        x_data = np.random.rand(2, 300, 40)
-        y_data = np.random.rand(300)
-        res = self.__run_dynamic_graph_case(x_data, y_data, axis=1)
-        expected = np.multiply(x_data, y_data[..., np.newaxis])
-        self.assertTrue(np.allclose(res, expected))
-
-        # test dynamic computation graph with numpy tensor: broadcast with axis
-        x_data = np.random.rand(2, 300, 40)
-        y_data = np.random.rand(300)
-        res = self.__run_dynamic_graph_case_with_numpy_input(
-            x_data, y_data, axis=1)
-        expected = np.multiply(x_data, y_data[..., np.newaxis])
-        self.assertTrue(np.allclose(res, expected))
-
 
 class TestMultiplyError(unittest.TestCase):
-    """TestMultiplyError."""
-
     def test_errors(self):
-        """test_errors."""
         # test static computation graph: dtype can not be int8
         paddle.enable_static()
         with program_guard(Program(), Program()):
@@ -226,6 +134,35 @@ class TestMultiplyError(unittest.TestCase):
             y = paddle.to_tensor(y_data)
             self.assertRaises(TypeError, paddle.multiply, x, y)
 
+        # test dynamic computation graph: inputs must be Tensor type
+        x_data = np.random.randn(200).astype(np.int64)
+        y_data = np.random.randn(200).astype(np.float64)
+        y = paddle.to_tensor(y_data)
+        self.assertRaises(TypeError, paddle.multiply, x_data, y)
+
+        # test dynamic computation graph: inputs must be Tensor type
+        x_data = np.random.randn(200).astype(np.int64)
+        y_data = np.random.randn(200).astype(np.float64)
+        x = paddle.to_tensor(x_data)
+        self.assertRaises(TypeError, paddle.multiply, x, y_data)
+
+        # test dynamic computation graph: inputs must be Tensor type
+        x_data = np.random.randn(200).astype(np.float32)
+        y_data = np.random.randn(200).astype(np.float32)
+        x = paddle.to_tensor(x_data)
+        self.assertRaises(TypeError, paddle.multiply, x, y_data)
+
+        # test dynamic computation graph: inputs must be Tensor type
+        x_data = np.random.randn(200).astype(np.float32)
+        y_data = np.random.randn(200).astype(np.float32)
+        x = paddle.to_tensor(x_data)
+        self.assertRaises(TypeError, paddle.multiply, x_data, y)
+
+        # test dynamic computation graph: inputs must be Tensor type
+        x_data = np.random.randn(200).astype(np.float32)
+        y_data = np.random.randn(200).astype(np.float32)
+        self.assertRaises(TypeError, paddle.multiply, x_data, y_data)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
old mode 100644
new mode 100755
index 7bfe51c2ec5cbe4f107a116c4b74864a96fcdb61..b3ed491a54e5a9cfc45ea1b7f4b6f3f5b46bc1e9
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -166,7 +166,7 @@ def binary_cross_entropy(input, label, weight=None, reduction='mean',
     if weight is not None:
         if isinstance(weight, paddle.static.Variable):
             weight_name = name if reduction is 'none' else None
-            out = paddle.multiply(out, weight, axis=-1, name=weight_name)
+            out = paddle.multiply(out, weight, name=weight_name)
         else:
             raise ValueError(
                 "The weight is not a Tensor, please convert to Tensor.")
diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py
old mode 100644
new mode 100755
index b14fb3e21200d5c11ca925b83090eb5ead61c4c4..89a7a53b0aa81b604ce2aa929feb5f344de0aecd
--- a/python/paddle/nn/utils/weight_norm_hook.py
+++ b/python/paddle/nn/utils/weight_norm_hook.py
@@ -18,7 +18,6 @@
 from ...fluid import dygraph
 from ...fluid import layers as F
 from ...fluid.layer_helper import LayerHelper
 from ...fluid.data_feeder import check_variable_and_dtype
-from ...tensor.math import multiply
 
 __all__ = ['weight_norm', 'remove_weight_norm']
@@ -86,7 +85,8 @@ def _weight_norm(v, g, dim):
         v_normalized = F.l2_normalize(p_matrix, axis=1)
         v_normalized = F.reshape(v_normalized, transposed_shape)
         v_normalized = F.transpose(v_normalized, perm)
-    weight = multiply(v_normalized, g, axis=dim if dim is not None else -1)
+    weight = F.elementwise_mul(
+        v_normalized, g, axis=dim if dim is not None else -1)
     return weight
 
 
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index e7b72fe95bca6505e734b3e628493d819461bb57..f6d5c83ef20ff7a52cc8cf7477065efe05d7132e 100755
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -421,7 +421,7 @@ mod = remainder  #DEFINE_ALIAS
 floor_mod = remainder  #DEFINE_ALIAS
 
 
-def multiply(x, y, axis=-1, name=None):
+def multiply(x, y, name=None):
     """
     multiply two tensors element-wise. The equation is:
 
@@ -445,20 +445,20 @@
 
             import paddle
 
-            paddle.disable_static()
             x = paddle.to_tensor([[1, 2], [3, 4]])
             y = paddle.to_tensor([[5, 6], [7, 8]])
             res = paddle.multiply(x, y)
-            print(res.numpy()) # [[5, 12], [21, 32]]
+            print(res) # [[5, 12], [21, 32]]
 
             x = paddle.to_tensor([[[1, 2, 3], [1, 2, 3]]])
-            y = paddle.to_tensor([1, 2])
-            res = paddle.multiply(x, y, axis=1)
-            print(res.numpy()) # [[[1, 2, 3], [2, 4, 6]]]
+            y = paddle.to_tensor([2])
+            res = paddle.multiply(x, y)
+            print(res) # [[[2, 4, 6], [2, 4, 6]]]
 
     """
     op_type = 'elementwise_mul'
     act = None
+    axis = -1
 
     if x.dtype != y.dtype:
         raise TypeError(
@@ -467,19 +467,12 @@
     if in_dygraph_mode():
         if not isinstance(x, (paddle.Tensor)):
-            x = paddle.to_tensor(x)
-        if not isinstance(y, (paddle.Tensor)):
-            y = paddle.to_tensor(y)
+            raise TypeError(
+                'Input x must be tensor type, but received type of x: %s'
+                % (type(x)))
+
         return _elementwise_op_in_dygraph(
             x, y, axis=axis, act=act, op_name=op_type)
-
-    if not isinstance(x, (paddle.Tensor, Variable)):
-        x = paddle.static.data(
-            name='x', shape=x.shape, dtype=x.dtype)
-    if not isinstance(y, (paddle.Tensor, Variable)):
-        y = paddle.static.data(
-            name='y', shape=y.shape, dtype=y.dtype)
-
     return _elementwise_op(LayerHelper(op_type, **locals()))
 
 
 def maximum(x, y, axis=-1, name=None):
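
Note (not part of the patch): every test change above follows one migration pattern. With the `axis` argument removed, `paddle.multiply` only performs standard right-aligned broadcasting, so a `[T, B]` sequence mask can no longer be left-aligned against a `[T, B, H]` RNN output via `axis=0`; the mask instead gets an explicit trailing singleton dimension first. A minimal sketch of the before/after call sites, using only APIs that appear in this patch (the shapes and variable names here are illustrative, not taken from the tests):

    import numpy as np
    import paddle

    paddle.disable_static()

    T, B, H = 4, 2, 3                              # steps, batch, hidden
    y = paddle.to_tensor(np.random.rand(T, B, H))  # time-major RNN output
    mask = paddle.to_tensor(np.random.rand(T, B))  # per-step sequence mask

    # Old call site (rejected after this patch):
    #     y = paddle.multiply(y, mask, axis=0)     # left-aligned broadcast
    # New call site: make the alignment explicit, then broadcast normally.
    mask = paddle.unsqueeze(mask, -1)              # [T, B] -> [T, B, 1]
    y = paddle.multiply(y, mask)                   # broadcasts over H
    assert y.shape == [T, B, H]

Two related behavior changes ride along: in dygraph mode `paddle.multiply` now raises TypeError for non-Tensor (e.g. raw numpy) inputs, which the new `TestMultiplyError` cases cover, and `weight_norm_hook` keeps axis-style alignment by calling `F.elementwise_mul` directly instead of going through `multiply`.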