diff --git a/python/paddle/distributed/auto_parallel/static/cost/comp_op_cost.py b/python/paddle/distributed/auto_parallel/static/cost/comp_op_cost.py
index ea6d2ef571ca99214be5a852ba422aaa5f59cb9b..1039a7b2305b2e060314ddecc978f8cf6f67f3e8 100644
--- a/python/paddle/distributed/auto_parallel/static/cost/comp_op_cost.py
+++ b/python/paddle/distributed/auto_parallel/static/cost/comp_op_cost.py
@@ -471,6 +471,14 @@ class ScaleOpCost(CompOpCost):
         super().__init__(op=op, op_desc=op_desc, cluster=cluster)
 
 
+@register_op_cost
+class ShapeOpCost(CompOpCost):
+    OP_TYPE = "shape"
+
+    def __init__(self, op=None, op_desc=None, cluster=None):
+        super().__init__(op=op, op_desc=op_desc, cluster=cluster)
+
+
 @register_op_cost
 class SliceOpCost(CompOpCost):
     OP_TYPE = "slice"
diff --git a/python/paddle/distribution/bernoulli.py b/python/paddle/distribution/bernoulli.py
index 9ae721dad4e03d7e99317b3ebcd22fbf3d347c52..1b365bbcd313c0264f8de0729844f872e1cfd6f8 100644
--- a/python/paddle/distribution/bernoulli.py
+++ b/python/paddle/distribution/bernoulli.py
@@ -18,7 +18,7 @@ import numpy as np
 import paddle
 from paddle.distribution import exponential_family
 from paddle.fluid.data_feeder import check_type, convert_dtype
-from paddle.fluid.layers import tensor
+from paddle.fluid.framework import Variable
 from paddle.framework import in_dynamic_mode
 from paddle.nn.functional import (
     binary_cross_entropy_with_logits,
@@ -97,7 +97,7 @@ class Bernoulli(exponential_family.ExponentialFamily):
             check_type(
                 probs,
                 'probs',
-                (float, tensor.Variable),
+                (float, Variable),
                 self.name,
             )
 
@@ -180,7 +180,7 @@ class Bernoulli(exponential_family.ExponentialFamily):
             check_type(
                 shape,
                 'shape',
-                (np.ndarray, tensor.Variable, list, tuple),
+                (np.ndarray, Variable, list, tuple),
                 name,
             )
 
@@ -259,7 +259,7 @@ class Bernoulli(exponential_family.ExponentialFamily):
             check_type(
                 shape,
                 'shape',
-                (np.ndarray, tensor.Variable, list, tuple),
+                (np.ndarray, Variable, list, tuple),
                 name,
             )
             check_type(
@@ -318,7 +318,7 @@ class Bernoulli(exponential_family.ExponentialFamily):
         """
         name = self.name + '_cdf'
         if not in_dynamic_mode():
-            check_type(value, 'value', tensor.Variable, name)
+            check_type(value, 'value', Variable, name)
 
         value = self._check_values_dtype_in_probs(self.probs, value)
         probs, value = paddle.broadcast_tensors([self.probs, value])
@@ -356,7 +356,7 @@ class Bernoulli(exponential_family.ExponentialFamily):
         """
         name = self.name + '_log_prob'
         if not in_dynamic_mode():
-            check_type(value, 'value', tensor.Variable, name)
+            check_type(value, 'value', Variable, name)
 
         value = self._check_values_dtype_in_probs(self.probs, value)
         logits, value = paddle.broadcast_tensors([self.logits, value])
@@ -395,7 +395,7 @@ class Bernoulli(exponential_family.ExponentialFamily):
         """
         name = self.name + '_prob'
         if not in_dynamic_mode():
-            check_type(value, 'value', tensor.Variable, name)
+            check_type(value, 'value', Variable, name)
 
         return self.log_prob(value).exp(name=name)
 
diff --git a/python/paddle/distribution/categorical.py b/python/paddle/distribution/categorical.py
index 3eea67157f715f9ebcd296a5bcb838cb2d227a6d..1af187c2cfed78afacdbcb3d9c674c52884cac38 100644
--- a/python/paddle/distribution/categorical.py
+++ b/python/paddle/distribution/categorical.py
@@ -17,7 +17,7 @@ import numpy as np
 import paddle
 from paddle.distribution import distribution
 from paddle.fluid.data_feeder import check_type, convert_dtype
-from paddle.fluid.layers import tensor
+from paddle.fluid.framework import Variable
 from paddle.framework import in_dynamic_mode
 from paddle.tensor import multinomial
 
@@ -100,7 +100,7 @@ class Categorical(distribution.Distribution):
             check_type(
                 logits,
                 'logits',
-                (np.ndarray, tensor.Variable, list, tuple),
+                (np.ndarray, Variable, list, tuple),
                 'Categorical',
             )
 
diff --git a/python/paddle/distribution/distribution.py b/python/paddle/distribution/distribution.py
index 023bd53d7b4283d6fe3b4357926f5298bd1afd23..68d468accee55b3efeaa8dc3e273e077d52ec7cf 100644
--- a/python/paddle/distribution/distribution.py
+++ b/python/paddle/distribution/distribution.py
@@ -26,7 +26,7 @@ import numpy as np
 import paddle
 from paddle import _C_ops
 from paddle.fluid.data_feeder import check_variable_and_dtype, convert_dtype
-from paddle.fluid.layers import tensor
+from paddle.fluid.framework import Variable
 from paddle.framework import in_dynamic_mode
 
 
@@ -150,7 +150,7 @@ class Distribution:
         is_variable = False
         is_number = False
         for arg in args:
-            if isinstance(arg, tensor.Variable):
+            if isinstance(arg, Variable):
                 is_variable = True
             else:
                 is_number = True
@@ -176,9 +176,7 @@ class Distribution:
         tmp = 0.0
 
         for arg in args:
-            if not isinstance(
-                arg, (float, list, tuple, np.ndarray, tensor.Variable)
-            ):
+            if not isinstance(arg, (float, list, tuple, np.ndarray, Variable)):
                 raise TypeError(
                     "Type of input args must be float, list, tuple, numpy.ndarray or Tensor, but received type {}".format(
                         type(arg)
diff --git a/python/paddle/distribution/normal.py b/python/paddle/distribution/normal.py
index 7ba987819a3a2acd97c4211829fa4ade95346b91..07b1b810d9b6eb4fb5e5f91406e53a57de5bfabf 100644
--- a/python/paddle/distribution/normal.py
+++ b/python/paddle/distribution/normal.py
@@ -20,7 +20,7 @@ import numpy as np
 import paddle
 from paddle.distribution import distribution
 from paddle.fluid.data_feeder import check_type, convert_dtype
-from paddle.fluid.layers import tensor
+from paddle.fluid.framework import Variable
 from paddle.framework import in_dynamic_mode
 from paddle.tensor import random
 
@@ -91,13 +91,13 @@ class Normal(distribution.Distribution):
             check_type(
                 loc,
                 'loc',
-                (int, float, np.ndarray, tensor.Variable, list, tuple),
+                (int, float, np.ndarray, Variable, list, tuple),
                 'Normal',
             )
             check_type(
                 scale,
                 'scale',
-                (int, float, np.ndarray, tensor.Variable, list, tuple),
+                (int, float, np.ndarray, Variable, list, tuple),
                 'Normal',
             )
 
@@ -174,9 +174,9 @@ class Normal(distribution.Distribution):
         name = self.name + '_sample'
         if -1 in batch_shape:
             output_shape = shape + batch_shape
-            zero_tmp = tensor.fill_constant_batch_size_like(
-                self.loc + self.scale, batch_shape + shape, self.dtype, 0.0
-            )
+            fill_shape = list(batch_shape + shape)
+            fill_shape[0] = paddle.shape(self.loc + self.scale)[0].item()
+            zero_tmp = paddle.full(fill_shape, 0.0, self.dtype)
             zero_tmp_reshape = paddle.reshape(zero_tmp, output_shape)
 
             zero_tmp_shape = paddle.shape(zero_tmp_reshape)
@@ -234,9 +234,10 @@ class Normal(distribution.Distribution):
         name = self.name + '_entropy'
         batch_shape = list((self.loc + self.scale).shape)
         if -1 in batch_shape:
-            zero_tmp = tensor.fill_constant_batch_size_like(
-                self.loc + self.scale, batch_shape, self.dtype, 0.0
-            )
+            fill_shape = list(batch_shape)
+            fill_shape[0] = paddle.shape(self.loc + self.scale)[0].item()
+            fill_dtype = (self.loc + self.scale).dtype
+            zero_tmp = paddle.full(fill_shape, 0.0, fill_dtype)
         else:
             zero_tmp = paddle.full(batch_shape, 0.0, self.dtype)
         return paddle.add(
diff --git a/python/paddle/distribution/uniform.py b/python/paddle/distribution/uniform.py
index 5619258efcf9142032e3f76f4605e2a2ccf9c17a..dbd27fd14728f4bba2bd8480777f9bc8d9f995fd 100644
--- a/python/paddle/distribution/uniform.py
+++ b/python/paddle/distribution/uniform.py
@@ -18,7 +18,7 @@ import paddle
 from paddle import _C_ops
 from paddle.distribution import distribution
 from paddle.fluid.data_feeder import check_type, convert_dtype
-from paddle.fluid.layers import tensor
+from paddle.fluid.framework import Variable
 from paddle.framework import in_dynamic_mode
 from paddle.tensor import random
 
@@ -105,13 +105,13 @@ class Uniform(distribution.Distribution):
             check_type(
                 low,
                 'low',
-                (int, float, np.ndarray, tensor.Variable, list, tuple),
+                (int, float, np.ndarray, Variable, list, tuple),
                 'Uniform',
             )
             check_type(
                 high,
                 'high',
-                (int, float, np.ndarray, tensor.Variable, list, tuple),
+                (int, float, np.ndarray, Variable, list, tuple),
                 'Uniform',
             )
 
@@ -169,9 +169,9 @@ class Uniform(distribution.Distribution):
         batch_shape = list((self.low + self.high).shape)
         if -1 in batch_shape:
             output_shape = shape + batch_shape
-            zero_tmp = tensor.fill_constant_batch_size_like(
-                self.low + self.high, batch_shape + shape, self.dtype, 0.0
-            )
+            fill_shape = list(batch_shape + shape)
+            fill_shape[0] = paddle.shape(self.low + self.high)[0].item()
+            zero_tmp = paddle.full(fill_shape, 0.0, self.dtype)
             uniform_random_tmp = random.uniform_random_batch_size_like(
                 zero_tmp,
                 zero_tmp.shape,
diff --git a/python/paddle/fluid/layers/__init__.py b/python/paddle/fluid/layers/__init__.py
index b91d7de093c88cb54d5381d9b0df75d773af40bc..c5eb01ff763835d28762b4be97a0b3dcf027da47 100644
--- a/python/paddle/fluid/layers/__init__.py
+++ b/python/paddle/fluid/layers/__init__.py
@@ -16,8 +16,6 @@ from . import nn
 from .nn import *
 from . import io
 from .io import *
-from . import tensor
-from .tensor import *
 from . import math_op_patch
 from .math_op_patch import *
 from .learning_rate_scheduler import *
@@ -27,5 +25,4 @@ from ..layer_helper import LayerHelper
 __all__ = []
 __all__ += nn.__all__
 __all__ += io.__all__
-__all__ += tensor.__all__
 __all__ += learning_rate_scheduler.__all__
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index 050cc774ab7c75f8255a6810f5f134ee1e57dbb9..59f25c63b744a8b9bd865ea2b0ae0d28da59c1d4 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -25,7 +25,6 @@ import numbers
 
 import paddle
 from . import nn
-from . import tensor
 from ..framework import (
     default_main_program,
     Parameter,
@@ -488,7 +487,7 @@ def cosine_decay(learning_rate, step_each_epoch, epochs):
             learning_rate = base_lr, step_each_epoch=10000, epochs=120)
     """
     check_type(
-        learning_rate, 'learning_rate', (float, tensor.Variable), 'cosine_decay'
+        learning_rate, 'learning_rate', (float, Variable), 'cosine_decay'
     )
 
     with default_main_program()._lr_schedule_guard():
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
deleted file mode 100644
index 06cfbf1cecb390401576025a50ef14f475d8e785..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/layers/tensor.py
+++ /dev/null
@@ -1,125 +0,0 @@
-#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unlessf required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle
-import numpy
-import warnings
-
-from ..layer_helper import LayerHelper
-from ..framework import (
-    _current_expected_place,
-    convert_np_dtype_to_dtype_,
-    _create_tensor,
-    in_dygraph_mode,
-)
-from ..framework import Variable
-from ..core import VarDesc
-from .. import core
-from .layer_function_generator import templatedoc
-from ..data_feeder import (
-    check_variable_and_dtype,
-    check_type,
-    check_dtype,
-    convert_dtype,
-)
-from paddle.utils import deprecated
-
-from paddle import _C_ops, _legacy_C_ops
-
-__all__ = [
-    'fill_constant_batch_size_like',
-]
-
-
-@deprecated(since='1.8.0', update_to="paddle.fluid.layers.fill_constant")
-@templatedoc()
-def fill_constant_batch_size_like(
-    input,
-    shape,
-    dtype,
-    value,
-    input_dim_idx=0,
-    output_dim_idx=0,
-    force_cpu=False,
-):
-    """
-    This OP creates a Tesnor according the shape and dtype, and initializes the
-    Tensor with the constants provided in ``value``. When the input is LoDTensor
-    and the input_dim_idx is 0, the output_dim_idx dimension is set to the value
-    of the batch_size input by the input, the Stop_gradient attribute of the created
-    Tensor is False by default.
-
-    Args:
-        input(Variable): Tensor which data type is float32, float64, int32 and int64.
-        shape(list): The shape of Tensor to be created, Tensor's shape may be changed
-            according the input.
-        dtype(np.dtype|core.VarDesc.VarType|str): The data type of created Tensor which
-            can be float32, float64, int32, int64.
-        value(float|int): The constant value used to initialize the Tensor to be created.
-        input_dim_idx(int): When the value is 0 and the input is LoDTensor, the output_dim_idx
-            dimension of the created Tensor is set to the batch_size value of input.
-            The default value is 0.
-        output_dim_idx(int): Used to specify which dimension of Tensor is created to be set
-            the value of batch_size of input Tensor. The default value is 0.
-        force_cpu(bool): data should be on CPU if it's true, default value is False.
-
-    Returns:
-        Variable: Tensor which will be created according to dtype.
-
-    Examples:
-
-        .. code-block:: python
-
-             import paddle
-             import paddle.fluid as fluid
-             like = paddle.full(shape=[1,2], fill_value=10, dtype='int64') #like=[[10, 10]]
-             data = fluid.layers.fill_constant_batch_size_like(
-                    input=like, shape=[1], value=0, dtype='int64') #like=[[10, 10]] data=[0]
-
-    """
-    if in_dygraph_mode():
-        if not isinstance(dtype, core.VarDesc.VarType):
-            dtype = convert_np_dtype_to_dtype_(dtype)
-
-        place = _current_expected_place()
-        if force_cpu:
-            place = core.CPUPlace()
-        out = _C_ops.full_batch_size_like(
-            input, shape, dtype, value, input_dim_idx, output_dim_idx, place
-        )
-        out.stop_gradient = True
-        return out
-    else:
-        helper = LayerHelper("fill_constant_batch_size_like", **locals())
-        out = helper.create_variable_for_type_inference(dtype=dtype)
-        attrs = {
-            'shape': shape,
-            'dtype': out.dtype,
-            'value': float(value),
-            'input_dim_idx': input_dim_idx,
-            'output_dim_idx': output_dim_idx,
-            'force_cpu': force_cpu,
-        }
-        if convert_dtype(dtype) in ['int64', 'int32']:
-            attrs['str_value'] = str(int(value))
-        else:
-            attrs['str_value'] = str(float(value))
-        helper.append_op(
-            type='fill_constant_batch_size_like',
-            inputs={'Input': input},
-            outputs={'Out': [out]},
-            attrs=attrs,
-        )
-        out.stop_gradient = True
-        return out
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index b46f3da08cbb5c6fe1b781390a07dad92e6587f1..3f27d310624d3f32e9a8018ffd9d1275b9a27206 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -48,7 +48,6 @@ from .dygraph.learning_rate_scheduler import (
     _LearningRateEpochDecay,
 )
 from paddle.fluid import core
-from paddle.fluid.layers import tensor
 from functools import reduce
 from functools import cmp_to_key
 from .wrapped_decorator import signature_safe_contextmanager
diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py
index 3ef01f836fc6dc66f80891721b86e740bcca3950..991df623d96f15db7eb244283c877b2cc5fea18b 100644
--- a/python/paddle/nn/layer/rnn.py
+++ b/python/paddle/nn/layer/rnn.py
@@ -600,7 +600,9 @@ class RNNCellBase(Layer):
 
         class Shape:
             def __init__(self, shape):
-                self.shape = shape if shape[0] == -1 else ([-1] + list(shape))
+                self.shape = (
+                    list(shape) if shape[0] == -1 else ([-1] + list(shape))
+                )
 
         # nested structure of shapes
         states_shapes = self.state_shape if shape is None else shape
@@ -621,16 +623,35 @@ class RNNCellBase(Layer):
             states_dtypes = paddle.utils.map_structure(
                 lambda shape: dtype, states_shapes
             )
+        fill_shapes = states_shapes
+        if batch_ref.shape[batch_dim_idx] > 0:
+            if isinstance(fill_shapes, list):
+                for s in fill_shapes[0]:
+                    s.shape[0] = batch_ref.shape[batch_dim_idx]
+            elif isinstance(fill_shapes, tuple):
+                for s in fill_shapes:
+                    s.shape[0] = batch_ref.shape[batch_dim_idx]
+            else:
+                fill_shapes.shape[0] = batch_ref.shape[batch_dim_idx]
+        else:
+            if isinstance(fill_shapes, list):
+                for s in fill_shapes[0]:
+                    s.shape[0] = paddle.shape(batch_ref)[batch_dim_idx].item()
+            elif isinstance(fill_shapes, tuple):
+                for s in fill_shapes:
+                    s.shape[0] = paddle.shape(batch_ref)[batch_dim_idx].item()
+            else:
+                fill_shapes.shape[0] = paddle.shape(batch_ref)[
+                    batch_dim_idx
+                ].item()
 
         init_states = paddle.utils.map_structure(
-            lambda shape, dtype: paddle.fluid.layers.fill_constant_batch_size_like(
-                input=batch_ref,
+            lambda shape, dtype: paddle.full(
                 shape=shape.shape,
+                fill_value=init_value,
                 dtype=dtype,
-                value=init_value,
-                input_dim_idx=batch_dim_idx,
             ),
-            states_shapes,
+            fill_shapes,
             states_dtypes,
         )
         return init_states
@@ -1534,7 +1555,6 @@ class RNNBase(LayerList):
                 'Reserve': reserve,
                 'DropoutState': self._dropout_state,
             }
-
             self._helper.append_op(
                 type="rnn", inputs=inputs, outputs=outputs, attrs=attrs
             )
@@ -1555,11 +1575,15 @@ class RNNBase(LayerList):
                 -1,
                 self.hidden_size,
             )
+
+            fill_shape = list(state_shape)
+            if inputs.shape[batch_index] > 0:
+                fill_shape[1] = inputs.shape[batch_index]
+            else:
+                fill_shape[1] = paddle.shape(inputs)[batch_index].item()
             initial_states = tuple(
                 [
-                    paddle.fluid.layers.fill_constant_batch_size_like(
-                        inputs, state_shape, dtype, 0, batch_index, 1
-                    )
+                    paddle.full(shape=fill_shape, fill_value=0, dtype=dtype)
                     for _ in range(self.state_components)
                 ]
             )
diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py
index cab257f2e0eb69d356068fd452fdaf6614e6900f..335b47d2599b95d033e4d7f10a8fc72b83acdbd1 100644
--- a/python/paddle/nn/layer/transformer.py
+++ b/python/paddle/nn/layer/transformer.py
@@ -23,7 +23,6 @@ import paddle
 from paddle.fluid.data_feeder import convert_dtype
 
 from ... import tensor
-from ...fluid import layers
 from ...framework import ParamAttr
 from .. import functional as F
 from .common import Dropout, Linear
@@ -342,18 +341,10 @@ class MultiHeadAttention(Layer):
             k, v = self.compute_kv(key, value)
             return self.StaticCache(k, v)
         elif value is None:  # incremental_state
-            k = layers.fill_constant_batch_size_like(
-                input=key,
-                shape=[-1, self.num_heads, 0, self.head_dim],
-                dtype=key.dtype,
-                value=0,
-            )
-            v = layers.fill_constant_batch_size_like(
-                input=key,
-                shape=[-1, self.num_heads, 0, self.head_dim],
-                dtype=key.dtype,
-                value=0,
-            )
+            fill_shape = [-1, self.num_heads, 0, self.head_dim]
+            fill_shape[0] = paddle.shape(key)[0].item()
+            k = paddle.full(fill_shape, 0, key.dtype)
+            v = paddle.full(fill_shape, 0, key.dtype)
             return self.Cache(k, v)
         else:
             # incremental_state with initial value, mainly for usage like UniLM
diff --git a/test/auto_parallel/test_base_cost.py b/test/auto_parallel/test_base_cost.py
index c9e3e64c6a8dfe9c75cb993060b5baee9c72526b..62c695b9e1d2eecb6ffe30b6cc22fb3bd7155451 100644
--- a/test/auto_parallel/test_base_cost.py
+++ b/test/auto_parallel/test_base_cost.py
@@ -101,10 +101,9 @@ def mlp_forward(train_program, start_program):
         label = static.data(
             name="label", shape=[batch_size, 1], dtype='float32'
         )
-
-        fill_constant_out = paddle.fluid.layers.fill_constant_batch_size_like(
-            input=input, shape=[batch_size], value=1, dtype="int32"
-        )
+        fill_shape = [batch_size]
+        fill_shape[0] = input.shape[0]
+        fill_constant_out = paddle.full(fill_shape, 1, dtype="int32")
         embedding = paddle.nn.Embedding(10, hidden_size, sparse=True)
         embedding_out = embedding(fill_constant_out)
 
diff --git a/test/auto_parallel/test_dist_op_cost.py b/test/auto_parallel/test_dist_op_cost.py
index 4d7cca7e5b3329b6b6800d60bf7b59bb90de2008..b5ac2249873516f6f7eb69896417ec95a4f51879 100644
--- a/test/auto_parallel/test_dist_op_cost.py
+++ b/test/auto_parallel/test_dist_op_cost.py
@@ -75,9 +75,9 @@ class TestDistOpCost(unittest.TestCase):
                 auto.shard_tensor(
                     x, auto.ProcessMesh([0, 1], dim_names=["x"]), ["x", None]
                 )
-                tmp = paddle.fluid.layers.fill_constant_batch_size_like(
-                    input=x, shape=[2, 8], value=1, dtype='float32'
-                )
+                fill_shape = [2, 8]
+                fill_shape[0] = x.shape[0]
+                tmp = paddle.full(fill_shape, fill_value=1, dtype='float32')
                 weight_attr = paddle.ParamAttr()
                 linear = paddle.nn.Linear(8, 1, weight_attr=weight_attr)
                 linear_out = linear(x)
@@ -97,6 +97,8 @@ class TestDistOpCost(unittest.TestCase):
                 op.type != "matmul_v2"
                 and op.type != "matmul_v2_grad"
                 and op.type != "sgd"
+                and op.type != "shape"
+                and op.type != "slice"
             ):
                 dist_op = dist_context.get_dist_op_for_program(op)
                 op_dist_attr = dist_op.dist_attr
@@ -137,9 +139,9 @@ class TestDistOpCost(unittest.TestCase):
                     ["x", None],
                 )
                 # embedding
-                tmp = paddle.fluid.layers.fill_constant_batch_size_like(
-                    input=x, shape=[4], value=1, dtype='int32'
-                )
+                fill_shape = [4]
+                fill_shape[0] = x.shape[0]
+                tmp = paddle.full(shape=fill_shape, fill_value=1, dtype='int32')
                 embedding = paddle.nn.Embedding(10, 8)
                 out = embedding(tmp)
                 # row parallel embedding
@@ -206,23 +208,24 @@ class TestDistOpCost(unittest.TestCase):
         cluster = Cluster()
         cluster.gen_default_config_cluster(device_count=2)
         for idx, op in enumerate(ops):
-            dist_op = dist_context.get_dist_op_for_program(op)
-            op_dist_attr = dist_op.dist_attr
-            processes = op_dist_attr.process_mesh.process_ids
-            if is_elementwise_op(op.type):
-                container = get_distributed_operator_impl_container(
-                    "elementwise"
-                )
-            else:
-                container = get_distributed_operator_impl_container(
-                    op_dist_attr.impl_type
-                )
+            if op.type != "shape" and op.type != "slice":
+                dist_op = dist_context.get_dist_op_for_program(op)
+                op_dist_attr = dist_op.dist_attr
+                processes = op_dist_attr.process_mesh.process_ids
+                if is_elementwise_op(op.type):
+                    container = get_distributed_operator_impl_container(
+                        "elementwise"
+                    )
+                else:
+                    container = get_distributed_operator_impl_container(
+                        op_dist_attr.impl_type
+                    )
 
-            dist_impl = container.impls[op_dist_attr.impl_idx]
-            dist_op_cost = dist_impl.calc_cost(
-                op.attr('op_role'), dist_op, dist_context, cluster
-            )
-            self.assertTrue(dist_op_cost)
+                dist_impl = container.impls[op_dist_attr.impl_idx]
+                dist_op_cost = dist_impl.calc_cost(
+                    op.attr('op_role'), dist_op, dist_context, cluster
+                )
+                self.assertTrue(dist_op_cost)
 
     def test_dist_op_cost_part3(self):
         def make_program():
@@ -245,9 +248,9 @@ class TestDistOpCost(unittest.TestCase):
                     ["x", None],
                 )
                 # embedding
-                tmp = paddle.fluid.layers.fill_constant_batch_size_like(
-                    input=x, shape=[4], value=1, dtype='int32'
-                )
+                fill_shape = [4]
+                fill_shape[0] = x.shape[0]
+                tmp = paddle.full(shape=fill_shape, fill_value=1, dtype='int32')
                 embedding = paddle.nn.Embedding(10, 8)
                 out = embedding(tmp)
                 # row parallel embedding
@@ -315,23 +318,24 @@ class TestDistOpCost(unittest.TestCase):
         cluster = Cluster()
         cluster.gen_default_config_cluster(device_count=2)
         for idx, op in enumerate(ops):
-            dist_op = dist_context.get_dist_op_for_program(op)
-            op_dist_attr = dist_op.dist_attr
-            processes = op_dist_attr.process_mesh.process_ids
-            if is_elementwise_op(op.type):
-                container = get_distributed_operator_impl_container(
-                    "elementwise"
-                )
-            else:
-                container = get_distributed_operator_impl_container(
-                    op_dist_attr.impl_type
-                )
+            if op.type != "shape" and op.type != "slice":
+                dist_op = dist_context.get_dist_op_for_program(op)
+                op_dist_attr = dist_op.dist_attr
+                processes = op_dist_attr.process_mesh.process_ids
+                if is_elementwise_op(op.type):
+                    container = get_distributed_operator_impl_container(
+                        "elementwise"
+                    )
+                else:
+                    container = get_distributed_operator_impl_container(
+                        op_dist_attr.impl_type
+                    )
 
-            dist_impl = container.impls[op_dist_attr.impl_idx]
-            dist_op_cost = dist_impl.calc_cost(
-                op.attr('op_role'), dist_op, dist_context, cluster
-            )
-            self.assertTrue(dist_op_cost)
+                dist_impl = container.impls[op_dist_attr.impl_idx]
+                dist_op_cost = dist_impl.calc_cost(
+                    op.attr('op_role'), dist_op, dist_context, cluster
+                )
+                self.assertTrue(dist_op_cost)
 
     def test_dist_op_cost_part4(self):
         def make_program():
@@ -353,9 +357,9 @@ class TestDistOpCost(unittest.TestCase):
                     ["x", None],
                 )
                 # embedding
-                tmp = paddle.fluid.layers.fill_constant_batch_size_like(
-                    input=x, shape=[4], value=1, dtype='int32'
-                )
+                fill_shape = [4]
+                fill_shape[0] = x.shape[0]
+                tmp = paddle.full(shape=fill_shape, fill_value=1, dtype='int32')
                 embedding = paddle.nn.Embedding(10, 8)
                 out = embedding(tmp)
                 # row parallel embedding
@@ -423,23 +427,24 @@ class TestDistOpCost(unittest.TestCase):
         cluster = Cluster()
         cluster.gen_default_config_cluster(device_count=2)
         for idx, op in enumerate(ops):
-            dist_op = dist_context.get_dist_op_for_program(op)
-            op_dist_attr = dist_op.dist_attr
-            processes = op_dist_attr.process_mesh.process_ids
-            if is_elementwise_op(op.type):
-                container = get_distributed_operator_impl_container(
-                    "elementwise"
-                )
-            else:
-                container = get_distributed_operator_impl_container(
-                    op_dist_attr.impl_type
-                )
+            if op.type != "shape" and op.type != "slice":
+                dist_op = dist_context.get_dist_op_for_program(op)
+                op_dist_attr = dist_op.dist_attr
+                processes = op_dist_attr.process_mesh.process_ids
+                if is_elementwise_op(op.type):
+                    container = get_distributed_operator_impl_container(
+                        "elementwise"
+                    )
+                else:
+                    container = get_distributed_operator_impl_container(
+                        op_dist_attr.impl_type
+                    )
 
-            dist_impl = container.impls[op_dist_attr.impl_idx]
-            dist_op_cost = dist_impl.calc_cost(
-                op.attr('op_role'), dist_op, dist_context, cluster
-            )
-            self.assertTrue(dist_op_cost)
+                dist_impl = container.impls[op_dist_attr.impl_idx]
+                dist_op_cost = dist_impl.calc_cost(
+                    op.attr('op_role'), dist_op, dist_context, cluster
+                )
+                self.assertTrue(dist_op_cost)
 
 
 if __name__ == "__main__":
diff --git a/test/auto_parallel/test_while_op_partition.py b/test/auto_parallel/test_while_op_partition.py
index cbab4cf981f610b521e92ce06a5260c6a19d4498..fd8edc6eba7c1da380820a18c887bd7730d5c183 100644
--- a/test/auto_parallel/test_while_op_partition.py
+++ b/test/auto_parallel/test_while_op_partition.py
@@ -145,8 +145,18 @@ def get_program():
         auto.shard_tensor(label, _g_process_mesh, [None, None, None])
 
         # fill constant bsz like
-        tmp = paddle.fluid.layers.fill_constant_batch_size_like(
-            input=input, shape=[-1, 16, 0, 48], dtype='float32', value=0
+        block = train_program.current_block()
+        fill_shape = [-1, 16, 0, 48]
+        tmp = block.create_var(name='tmp', dtype='float32')
+        block.append_op(
+            type='fill_constant_batch_size_like',
+            outputs={'Out': [tmp]},
+            inputs={'Input': [input]},
+            attrs={
+                'shape': fill_shape,
+                'value': 0,
+            },
+            stop_gradient=True,
         )
         auto.shard_tensor(tmp, _g_process_mesh, [None, 'x', None, None])
 
@@ -369,7 +379,6 @@ class TestMLP(unittest.TestCase):
             train_program, start_program, dist_context
         )
         dist_context.block_state.parse_forward_blocks(train_program)
-
         dist_main_prog, dist_startup_prog = partition(
             train_program, start_program, dist_context
         )
@@ -388,8 +397,8 @@ class TestMLP(unittest.TestCase):
         self.assertTrue("c_allreduce_sum" in sub_block_ops)
 
         # test fill_constant_batch_size_like
-
         self.assertIsNotNone(fill_op)
+
         ref_shape = [-1, 8, 0, 48]
         shape = fill_op.attr("shape")
         self.assertTrue(ref_shape == shape)
diff --git a/test/legacy_test/auto_parallel_gpt_model.py b/test/legacy_test/auto_parallel_gpt_model.py
index 28e63db4bf150d00e327807a1b4b89640b06500d..1be27f9bc803ab43a2296f58cefb392d156ecede 100644
--- a/test/legacy_test/auto_parallel_gpt_model.py
+++ b/test/legacy_test/auto_parallel_gpt_model.py
@@ -18,7 +18,6 @@ import paddle
 import paddle.nn.functional as F
 from paddle import nn, tensor
 from paddle.distributed.fleet import auto
-from paddle.fluid import layers
 from paddle.nn.layer.transformer import _convert_param_attr_to_list
 
 paddle.enable_static()
@@ -212,18 +211,10 @@ class MultiHeadAttention(nn.Layer):
             k, v = self.compute_kv(key, value)
             return self.StaticCache(k, v)
         elif value is None:  # incremental_state
-            k = layers.fill_constant_batch_size_like(
-                input=key,
-                shape=[-1, self.num_heads, 0, self.head_dim],
-                dtype=key.dtype,
-                value=0,
-            )
-            v = layers.fill_constant_batch_size_like(
-                input=key,
-                shape=[-1, self.num_heads, 0, self.head_dim],
-                dtype=key.dtype,
-                value=0,
-            )
+            fill_shape = [-1, self.num_heads, 0, self.head_dim]
+            fill_shape[0] = paddle.shape(key)[0].item()
+            k = paddle.full(shape=fill_shape, fill_value=0, dtype=key.dtype)
+            v = paddle.full(shape=fill_shape, fill_value=0, dtype=key.dtype)
             return self.Cache(k, v)
         else:
             # incremental_state with initial value, mainly for usage like UniLM
diff --git a/test/legacy_test/dist_fleet_simnet_bow.py b/test/legacy_test/dist_fleet_simnet_bow.py
index 8ee220682e474dc705e3fcf94ad4125eba5ce577..5885f39569450b4025543304ab74b63e990ed480 100644
--- a/test/legacy_test/dist_fleet_simnet_bow.py
+++ b/test/legacy_test/dist_fleet_simnet_bow.py
@@ -68,17 +68,17 @@ def get_acc(cos_q_nt, cos_q_pt, batch_size):
 
 
 def get_loss(cos_q_pt, cos_q_nt):
+    fill_shape = [-1, 1]
+    fill_shape[0] = paddle.shape(cos_q_pt)[0].item()
+
     loss_op1 = paddle.subtract(
-        fluid.layers.fill_constant_batch_size_like(
-            input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'
-        ),
+        paddle.full(shape=fill_shape, fill_value=margin, dtype='float32'),
         cos_q_pt,
     )
     loss_op2 = paddle.add(loss_op1, cos_q_nt)
+    fill_shape[0] = paddle.shape(cos_q_pt)[0].item()
     loss_op3 = paddle.maximum(
-        fluid.layers.fill_constant_batch_size_like(
-            input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
-        ),
+        paddle.full(shape=fill_shape, fill_value=0.0, dtype='float32'),
         loss_op2,
     )
     avg_cost = paddle.mean(loss_op3)
diff --git a/test/legacy_test/test_auto_parallel_completion_gpt.py b/test/legacy_test/test_auto_parallel_completion_gpt.py
index cc09ac989e1da1edc8ae7981ae0770feb98b42b5..cd00ae2c736ed6c8c0368a9770c5a311d8d2abce 100644
--- a/test/legacy_test/test_auto_parallel_completion_gpt.py
+++ b/test/legacy_test/test_auto_parallel_completion_gpt.py
@@ -23,7 +23,6 @@ from paddle.distributed.auto_parallel.static.dist_context import (
     DistributedContext,
 )
 from paddle.distributed.fleet import auto
-from paddle.fluid import layers
 from paddle.nn.layer.transformer import _convert_param_attr_to_list
 
 paddle.enable_static()
@@ -172,18 +171,10 @@ class MultiHeadAttention(nn.Layer):
             k, v = self.compute_kv(key, value)
             return self.StaticCache(k, v)
         elif value is None:  # incremental_state
-            k = layers.fill_constant_batch_size_like(
-                input=key,
-                shape=[-1, self.num_heads, 0, self.head_dim],
-                dtype=key.dtype,
-                value=0,
-            )
-            v = layers.fill_constant_batch_size_like(
-                input=key,
-                shape=[-1, self.num_heads, 0, self.head_dim],
-                dtype=key.dtype,
-                value=0,
-            )
+            fill_shape = [-1, self.num_heads, 0, self.head_dim]
+            fill_shape[0] = paddle.shape(key)[0].item()
+            k = paddle.full(shape=fill_shape, fill_value=0, dtype=key.dtype)
+            v = paddle.full(shape=fill_shape, fill_value=0, dtype=key.dtype)
             return self.Cache(k, v)
         else:
             # incremental_state with initial value, mainly for usage like UniLM
diff --git a/test/legacy_test/test_auto_parallel_partitioner_gpt.py b/test/legacy_test/test_auto_parallel_partitioner_gpt.py
index 66c0eb3ea74c61f0fa43fc2c80ed68e4ab1904ba..0828cafa60b8172c81f57de70b1e266694e7b4ff 100644
--- a/test/legacy_test/test_auto_parallel_partitioner_gpt.py
+++ b/test/legacy_test/test_auto_parallel_partitioner_gpt.py
@@ -28,7 +28,6 @@ from paddle.distributed.auto_parallel.static.process_group import (
 )
 from paddle.distributed.auto_parallel.static.utils import _get_comm_group
 from paddle.distributed.fleet import auto
-from paddle.fluid import layers
 from paddle.nn.layer.transformer import _convert_param_attr_to_list
 
 paddle.enable_static()
@@ -218,18 +217,10 @@ class MultiHeadAttention(nn.Layer):
             k, v = self.compute_kv(key, value)
             return self.StaticCache(k, v)
         elif value is None:  # incremental_state
-            k = layers.fill_constant_batch_size_like(
-                input=key,
-                shape=[-1, self.num_heads, 0, self.head_dim],
-                dtype=key.dtype,
-                value=0,
-            )
-            v = layers.fill_constant_batch_size_like(
-                input=key,
-                shape=[-1, self.num_heads, 0, self.head_dim],
-                dtype=key.dtype,
-                value=0,
-            )
+            fill_shape = [-1, self.num_heads, 0, self.head_dim]
+            fill_shape[0] = paddle.shape(key)[0].item()
+            k = paddle.full(shape=fill_shape, fill_value=0, dtype=key.dtype)
+            v = paddle.full(shape=fill_shape, fill_value=0, dtype=key.dtype)
             return self.Cache(k, v)
         else:
             # incremental_state with initial value, mainly for usage like UniLM
diff --git a/test/legacy_test/test_dist_fleet_minimize.py b/test/legacy_test/test_dist_fleet_minimize.py
index 3eb44988d8872a751dc7bc5c9b81ad182cc91f9f..59751a9604156b279ba9906064364ffc5e8b242f 100644
--- a/test/legacy_test/test_dist_fleet_minimize.py
+++ b/test/legacy_test/test_dist_fleet_minimize.py
@@ -49,17 +49,19 @@ class TestPSMinimize(unittest.TestCase):
             return acc
 
         def get_loss(cos_q_pt, cos_q_nt):
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(cos_q_pt)[0].item()
             loss_op1 = paddle.subtract(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'
+                paddle.full(
+                    shape=fill_shape, fill_value=margin, dtype='float32'
                 ),
                 cos_q_pt,
             )
             loss_op2 = paddle.add(loss_op1, cos_q_nt)
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(loss_op2)[0].item()
             loss_op3 = paddle.maximum(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
-                ),
+                paddle.full(shape=fill_shape, fill_value=0.0, dtype='float32'),
                 loss_op2,
             )
             avg_cost = paddle.mean(loss_op3)
diff --git a/test/legacy_test/test_dist_fleet_ps.py b/test/legacy_test/test_dist_fleet_ps.py
index 8266a02dfbbad06a7aa61fbb33a0506cf8497ec4..eb423b3c341fc8245d7a3904077a6675d0b011c5 100644
--- a/test/legacy_test/test_dist_fleet_ps.py
+++ b/test/legacy_test/test_dist_fleet_ps.py
@@ -49,17 +49,19 @@ class TestPSPassWithBow(unittest.TestCase):
             return acc
 
         def get_loss(cos_q_pt, cos_q_nt):
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(cos_q_pt)[0].item()
             loss_op1 = paddle.subtract(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'
+                paddle.full(
+                    shape=fill_shape, fill_value=margin, dtype='float32'
                 ),
                 cos_q_pt,
             )
             loss_op2 = paddle.add(loss_op1, cos_q_nt)
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(loss_op2)[0].item()
             loss_op3 = paddle.maximum(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
-                ),
+                paddle.full(shape=fill_shape, fill_value=0.0, dtype='float32'),
                 loss_op2,
             )
             avg_cost = paddle.mean(loss_op3)
diff --git a/test/legacy_test/test_dist_fleet_ps11.py b/test/legacy_test/test_dist_fleet_ps11.py
index 755636d0ab244fb6041a901974ccab2997f8612c..d5a4c64423f7398de65d28cd97b7f9f5bdbe8f0c 100755
--- a/test/legacy_test/test_dist_fleet_ps11.py
+++ b/test/legacy_test/test_dist_fleet_ps11.py
@@ -49,17 +49,19 @@ class TestPSPassWithBow(unittest.TestCase):
             return acc
 
         def get_loss(cos_q_pt, cos_q_nt):
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(cos_q_pt)[0].item()
             loss_op1 = paddle.subtract(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'
+                paddle.full(
+                    shape=fill_shape, fill_value=margin, dtype='float32'
                 ),
                 cos_q_pt,
             )
             loss_op2 = paddle.add(loss_op1, cos_q_nt)
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(loss_op2)[0].item()
             loss_op3 = paddle.maximum(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
-                ),
+                paddle.full(shape=fill_shape, fill_value=0.0, dtype='float32'),
                 loss_op2,
             )
             avg_cost = paddle.mean(loss_op3)
diff --git a/test/legacy_test/test_dist_fleet_ps12.py b/test/legacy_test/test_dist_fleet_ps12.py
index 1b7b30780cb493e0db3cce3452e26da4f6bcbaed..dc0d0325a5a525be76a6a1ea0bad32a3e1c60213 100644
--- a/test/legacy_test/test_dist_fleet_ps12.py
+++ b/test/legacy_test/test_dist_fleet_ps12.py
@@ -52,17 +52,19 @@ class TestPSPassWithBow(unittest.TestCase):
             return acc
 
         def get_loss(cos_q_pt, cos_q_nt):
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(cos_q_pt)[0].item()
             loss_op1 = paddle.subtract(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'
+                paddle.full(
+                    shape=fill_shape, fill_value=margin, dtype='float32'
                 ),
                 cos_q_pt,
             )
             loss_op2 = paddle.add(loss_op1, cos_q_nt)
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(loss_op2)[0].item()
             loss_op3 = paddle.maximum(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
-                ),
+                paddle.full(shape=fill_shape, fill_value=0.0, dtype='float32'),
                 loss_op2,
             )
             avg_cost = paddle.mean(loss_op3)
diff --git a/test/legacy_test/test_dist_fleet_ps13.py b/test/legacy_test/test_dist_fleet_ps13.py
index 3cb1dec9ae96dd7e0c252761bd2db4652f6fb1bf..2fbdbeba47f77eb1e24cef7f7802bfde37baa5ee 100644
--- a/test/legacy_test/test_dist_fleet_ps13.py
+++ b/test/legacy_test/test_dist_fleet_ps13.py
@@ -53,17 +53,19 @@ class TestPSPassWithBow(unittest.TestCase):
             return acc
 
         def get_loss(cos_q_pt, cos_q_nt):
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(cos_q_pt)[0].item()
             loss_op1 = paddle.subtract(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'
+                paddle.full(
+                    shape=fill_shape, fill_value=margin, dtype='float32'
                 ),
                 cos_q_pt,
             )
             loss_op2 = paddle.add(loss_op1, cos_q_nt)
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(loss_op2)[0].item()
             loss_op3 = paddle.maximum(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
-                ),
+                paddle.full(shape=fill_shape, fill_value=0.0, dtype='float32'),
                 loss_op2,
             )
             avg_cost = paddle.mean(loss_op3)
diff --git a/test/legacy_test/test_dist_fleet_ps2.py b/test/legacy_test/test_dist_fleet_ps2.py
index c6bbaee3a20132dc9ecbcd52b261b5324bb1179e..f27e4172d12945d5cd84371b9d961aded124dfa8 100644
--- a/test/legacy_test/test_dist_fleet_ps2.py
+++ b/test/legacy_test/test_dist_fleet_ps2.py
@@ -52,17 +52,19 @@ class TestPSPassWithBow(unittest.TestCase):
             return acc
 
         def get_loss(cos_q_pt, cos_q_nt):
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(cos_q_pt)[0].item()
             loss_op1 = paddle.subtract(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'
+                paddle.full(
+                    shape=fill_shape, fill_value=margin, dtype='float32'
                 ),
                 cos_q_pt,
             )
             loss_op2 = paddle.add(loss_op1, cos_q_nt)
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(loss_op2)[0].item()
             loss_op3 = paddle.maximum(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
-                ),
+                paddle.full(shape=fill_shape, fill_value=0.0, dtype='float32'),
                 loss_op2,
             )
             avg_cost = paddle.mean(loss_op3)
diff --git a/test/legacy_test/test_dist_fleet_ps3.py b/test/legacy_test/test_dist_fleet_ps3.py
index 15f0bc363db47f51d62848be95a3edeb40d98753..9f1ff73b830187d2c358162a105ecd28a959b9b3 100644
--- a/test/legacy_test/test_dist_fleet_ps3.py
+++ b/test/legacy_test/test_dist_fleet_ps3.py
@@ -49,17 +49,19 @@ class TestPSPassWithBow(unittest.TestCase):
             return acc
 
         def get_loss(cos_q_pt, cos_q_nt):
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(cos_q_pt)[0].item()
             loss_op1 = paddle.subtract(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'
+                paddle.full(
+                    shape=fill_shape, fill_value=margin, dtype='float32'
                 ),
                 cos_q_pt,
             )
             loss_op2 = paddle.add(loss_op1, cos_q_nt)
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(loss_op2)[0].item()
             loss_op3 = paddle.maximum(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
-                ),
+                paddle.full(shape=fill_shape, fill_value=0.0, dtype='float32'),
                 loss_op2,
             )
             avg_cost = paddle.mean(loss_op3)
diff --git a/test/legacy_test/test_dist_fleet_ps4.py b/test/legacy_test/test_dist_fleet_ps4.py
index b3c8dedf3ee243b95abd2a1d0dad79ab3ae95b33..3d401885815ddc8ef565ab87feb6e5d420358aa3 100644
--- a/test/legacy_test/test_dist_fleet_ps4.py
+++ b/test/legacy_test/test_dist_fleet_ps4.py
@@ -49,17 +49,19 @@ class TestPSPassWithBow(unittest.TestCase):
             return acc
 
         def get_loss(cos_q_pt, cos_q_nt):
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(cos_q_pt)[0].item()
             loss_op1 = paddle.subtract(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'
+                paddle.full(
+                    shape=fill_shape, fill_value=margin, dtype='float32'
                 ),
                 cos_q_pt,
             )
             loss_op2 = paddle.add(loss_op1, cos_q_nt)
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(loss_op2)[0].item()
             loss_op3 = paddle.maximum(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
-                ),
+                paddle.full(shape=fill_shape, fill_value=0.0, dtype='float32'),
                 loss_op2,
             )
             avg_cost = paddle.mean(loss_op3)
diff --git a/test/legacy_test/test_dist_fleet_ps5.py b/test/legacy_test/test_dist_fleet_ps5.py
index 5eeab8dac744310d292418d1a7ace41070ee72b9..efc70346ab159bda7b5140698f6212f517ece8cb 100644
--- a/test/legacy_test/test_dist_fleet_ps5.py
+++ b/test/legacy_test/test_dist_fleet_ps5.py
@@ -49,17 +49,19 @@ class TestPSPassWithBow(unittest.TestCase):
             return acc
 
         def get_loss(cos_q_pt, cos_q_nt):
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(cos_q_pt)[0].item()
             loss_op1 = paddle.subtract(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'
+                paddle.full(
+                    shape=fill_shape, fill_value=margin, dtype='float32'
                 ),
                 cos_q_pt,
             )
             loss_op2 = paddle.add(loss_op1, cos_q_nt)
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(loss_op2)[0].item()
             loss_op3 = paddle.maximum(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
-                ),
+                paddle.full(shape=fill_shape, fill_value=0.0, dtype='float32'),
                 loss_op2,
             )
             avg_cost = paddle.mean(loss_op3)
diff --git a/test/legacy_test/test_dist_fleet_ps6.py b/test/legacy_test/test_dist_fleet_ps6.py
index f8eaafe3cc32d97ba18af148f927914bc0aa1137..c4be4348c0c7e1422b95a367a7524c14f6321e32 100644
--- a/test/legacy_test/test_dist_fleet_ps6.py
+++ b/test/legacy_test/test_dist_fleet_ps6.py
@@ -49,17 +49,19 @@ class TestPSPassWithBow(unittest.TestCase):
             return acc
 
         def get_loss(cos_q_pt, cos_q_nt):
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(cos_q_pt)[0].item()
             loss_op1 = paddle.subtract(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'
+                paddle.full(
+                    shape=fill_shape, fill_value=margin, dtype='float32'
                 ),
                 cos_q_pt,
             )
             loss_op2 = paddle.add(loss_op1, cos_q_nt)
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(loss_op2)[0].item()
             loss_op3 = paddle.maximum(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
-                ),
+                paddle.full(shape=fill_shape, fill_value=0.0, dtype='float32'),
                 loss_op2,
             )
             avg_cost = paddle.mean(loss_op3)
diff --git a/test/legacy_test/test_dist_fleet_spmt.py b/test/legacy_test/test_dist_fleet_spmt.py
index 6d9d6cd86df6383f96d1fa83df681372bbccaa32..17e6c03693d4aaa385cefd456f39311453349505 100644
--- a/test/legacy_test/test_dist_fleet_spmt.py
+++ b/test/legacy_test/test_dist_fleet_spmt.py
@@ -47,17 +47,19 @@ class TestSPMT(unittest.TestCase):
             return acc
 
         def get_loss(cos_q_pt, cos_q_nt):
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(cos_q_pt)[0].item()
             loss_op1 = paddle.subtract(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'
+                paddle.full(
+                    shape=fill_shape, fill_value=margin, dtype='float32'
                 ),
                 cos_q_pt,
             )
             loss_op2 = paddle.add(loss_op1, cos_q_nt)
+            fill_shape = [-1, 1]
+            fill_shape[0] = paddle.shape(loss_op2)[0].item()
             loss_op3 = paddle.maximum(
-                fluid.layers.fill_constant_batch_size_like(
-                    input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
-                ),
+                paddle.full(shape=fill_shape, fill_value=0.0, dtype='float32'),
                 loss_op2,
             )
             avg_cost = paddle.mean(loss_op3)
diff --git a/test/legacy_test/test_dist_transpiler.py b/test/legacy_test/test_dist_transpiler.py
index ed23ecd294e4452a96ed16a7ed0aa3294376c91f..73ca10308eb87e3012f325473c35fdca3d49d3b1 100644
--- a/test/legacy_test/test_dist_transpiler.py
+++ b/test/legacy_test/test_dist_transpiler.py
@@ -422,11 +422,15 @@ class TestFakeInit(TranspilerTest):
         neg_matmul_re = paddle.reshape(neg_matmul, shape=[-1, neg_num])
         neg_logits = paddle.add(neg_matmul_re, neg_emb_b_vec)
         # nce loss
-        label_ones = fluid.layers.fill_constant_batch_size_like(
-            true_logits, shape=[-1, 1], value=1.0, dtype='float32'
-        )
-        label_zeros = fluid.layers.fill_constant_batch_size_like(
-            true_logits, shape=[-1, neg_num], value=0.0, dtype='float32'
+        fill_shape = [-1, 1]
+        fill_shape[0] = paddle.shape(true_logits)[0].item()
+        label_ones = paddle.full(
+            shape=fill_shape, fill_value=1.0, dtype='float32'
+        )
+        fill_shape = [-1, neg_num]
+        fill_shape[0] = paddle.shape(true_logits)[0].item()
+        label_zeros = paddle.full(
+            shape=fill_shape, fill_value=0.0, dtype='float32'
         )
 
         true_xent = paddle.nn.functional.binary_cross_entropy_with_logits(
diff --git a/test/legacy_test/test_fill_constant_batch_size_like.py b/test/legacy_test/test_fill_constant_batch_size_like.py
deleted file mode 100644
index bd077c87984ac9e324519f5fc4d2ef4d26b7be58..0000000000000000000000000000000000000000
--- a/test/legacy_test/test_fill_constant_batch_size_like.py
+++ /dev/null
@@ -1,124 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-from eager_op_test import OpTest, convert_float_to_uint16
-
-import paddle
-from paddle.fluid import core
-from paddle.fluid.framework import convert_np_dtype_to_dtype_
-
-paddle.enable_static()
-
-
-def fill_constant_batch_size_like(
-    input,
-    shape,
-    value,
-    data_type,
-    input_dim_idx=0,
-    output_dim_idx=0,
-    force_cpu=False,
-):
-    return paddle.fluid.layers.fill_constant_batch_size_like(
-        input, shape, data_type, value, input_dim_idx, output_dim_idx, force_cpu
-    )
-
-
-class TestFillConstantBatchSizeLike1(OpTest):
-    # test basic
-    def setUp(self):
-        self.op_type = "fill_constant_batch_size_like"
-        self.python_api = fill_constant_batch_size_like
-        self.init_dtype()
-        self.init_data()
-
-        input = np.zeros(self.shape)
-        out = np.full_like(input, self.value, self.dtype)
-
-        self.inputs = {'Input': input}
-        self.outputs = {'Out': out}
-        self.attrs = {
-            'shape': self.shape,
-            'dtype': convert_np_dtype_to_dtype_(self.dtype),
-            'value': self.value,
-            'input_dim_idx': self.input_dim_idx,
-            'output_dim_idx': self.output_dim_idx,
-            'force_cpu': self.force_cpu,
-        }
-
-    def init_dtype(self):
-        self.dtype = np.float32
-
-    def init_data(self):
-        self.shape = [10, 10]
-        self.value = 100
-        self.input_dim_idx = 0
-        self.output_dim_idx = 0
-        self.force_cpu = False
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFillConstantBatchSizeLikeFP16Op(TestFillConstantBatchSizeLike1):
-    def init_dtype(self):
-        self.dtype = np.float16
-
-
-@unittest.skipIf(
-    not core.is_compiled_with_cuda() or not core.supports_bfloat16(),
-    "core is not compiled with CUDA or place do not support bfloat16",
-)
-class TestFillConstantBatchSizeLikeBF16Op(OpTest):
-    # test bf16
-    def setUp(self):
-        self.op_type = "fill_constant_batch_size_like"
-        self.python_api = fill_constant_batch_size_like
-        self.init_data()
-
-        input = np.zeros(self.shape).astype("float32")
-        input_bf16 = convert_float_to_uint16(input)
-        out = np.full_like(input, self.value, np.float32)
-        out_bf16 = convert_float_to_uint16(out)
-
-        self.inputs = {'Input': input_bf16}
-        self.outputs = {'Out': out_bf16}
-        self.attrs = {
-            'shape': self.shape,
-            'dtype': convert_np_dtype_to_dtype_(self.dtype),
-            'value': self.value,
-            'input_dim_idx': self.input_dim_idx,
-            'output_dim_idx': self.output_dim_idx,
-            'force_cpu': self.force_cpu,
-        }
-
-    def init_data(self):
-        self.shape = [10, 10]
-        self.dtype = np.uint16
-        self.value = 100
-        self.input_dim_idx = 0
-        self.output_dim_idx = 0
-        self.force_cpu = False
-
-    def test_check_output(self):
-        place = core.CUDAPlace(0)
-        self.check_output_with_place(place)
-
-
-if __name__ == "__main__":
-    paddle.enable_static()
-    unittest.main()
diff --git a/test/legacy_test/test_layers.py b/test/legacy_test/test_layers.py
index ded9e08da74cf72ebd151dc81db6a14cb710e99d..44986f0b122ab739c84d66fde28864fcdcfa045c 100644
--- a/test/legacy_test/test_layers.py
+++ b/test/legacy_test/test_layers.py
@@ -2133,16 +2133,6 @@ class TestBook(LayerTest):
             )
             return out
 
-    def test_fill_constant_batch_size_like(self):
-        with self.static_graph():
-            like = paddle.tensor.fill_constant(
-                shape=[1, 200], value=10, dtype='int64'
-            )
-            out = layers.fill_constant_batch_size_like(
-                input=like, shape=[2, 3300], value=1315454564656, dtype='int64'
-            )
-            return out
-
     def test_shuffle_batch(self):
         # TODO(minqiyang): dygraph do not support lod now
         with self.static_graph():