diff --git a/ppcls/arch/backbone/model_zoo/levit.py b/ppcls/arch/backbone/model_zoo/levit.py
index e4be6f43bca89a28be33193adda1d445cebd1f7c..55c5817f5c4fc9c188fc99897a46e6b925793570 100644
--- a/ppcls/arch/backbone/model_zoo/levit.py
+++ b/ppcls/arch/backbone/model_zoo/levit.py
@@ -45,12 +45,13 @@ __all__ = list(MODEL_URLS.keys())
 def cal_attention_biases(attention_biases, attention_bias_idxs):
     gather_list = []
     attention_bias_t = paddle.transpose(attention_biases, (1, 0))
-    for idx in attention_bias_idxs:
-        gather = paddle.gather(attention_bias_t, idx)
+    nums = attention_bias_idxs.shape[0]
+    for idx in range(nums):
+        gather = paddle.gather(attention_bias_t, attention_bias_idxs[idx])
         gather_list.append(gather)
     shape0, shape1 = attention_bias_idxs.shape
-    return paddle.transpose(paddle.concat(gather_list), (1, 0)).reshape(
-        (0, shape0, shape1))
+    gather = paddle.concat(gather_list)
+    return paddle.transpose(gather, (1, 0)).reshape((0, shape0, shape1))
 
 
 class Conv2d_BN(nn.Sequential):
@@ -127,11 +128,12 @@ class Residual(nn.Layer):
 
     def forward(self, x):
         if self.training and self.drop > 0:
-            return x + self.m(x) * paddle.rand(
-                x.size(0), 1, 1,
-                device=x.device).ge_(self.drop).div(1 - self.drop).detach()
+            return paddle.add(x,
+                              self.m(x) * paddle.rand(
+                                  x.size(0), 1, 1, device=x.device).ge_(
+                                      self.drop).div(1 - self.drop).detach())
         else:
-            return x + self.m(x)
+            return paddle.add(x, self.m(x))
 
 
 class Attention(nn.Layer):
@@ -203,9 +205,9 @@ class Attention(nn.Layer):
                                                     self.attention_bias_idxs)
         else:
             attention_biases = self.ab
-        attn = ((q @k_transpose) * self.scale + attention_biases)
+        attn = (paddle.matmul(q, k_transpose) * self.scale + attention_biases)
         attn = F.softmax(attn)
-        x = paddle.transpose(attn @v, perm=[0, 2, 1, 3])
+        x = paddle.transpose(paddle.matmul(attn, v), perm=[0, 2, 1, 3])
         x = paddle.reshape(x, [B, N, self.dh])
         x = self.proj(x)
         return x
@@ -219,8 +221,11 @@ class Subsample(nn.Layer):
 
     def forward(self, x):
         B, N, C = x.shape
-        x = paddle.reshape(x, [B, self.resolution, self.resolution,
-                               C])[:, ::self.stride, ::self.stride]
+        #x = paddle.reshape(x, [B, self.resolution, self.resolution,
+        #                       C])[:, ::self.stride, ::self.stride]
+        x = paddle.reshape(x, [B, self.resolution, self.resolution, C])
+        end1, end2 = x.shape[1], x.shape[2]
+        x = x[:, 0:end1:self.stride, 0:end2:self.stride]
         x = paddle.reshape(x, [B, -1, C])
         return x
 
@@ -315,13 +320,14 @@ class AttentionSubsample(nn.Layer):
         else:
             attention_biases = self.ab
 
-        attn = (q @paddle.transpose(
-            k, perm=[0, 1, 3, 2])) * self.scale + attention_biases
+        attn = (paddle.matmul(
+            q, paddle.transpose(
+                k, perm=[0, 1, 3, 2]))) * self.scale + attention_biases
         attn = F.softmax(attn)
 
         x = paddle.reshape(
             paddle.transpose(
-                (attn @v), perm=[0, 2, 1, 3]), [B, -1, self.dh])
+                paddle.matmul(attn, v), perm=[0, 2, 1, 3]), [B, -1, self.dh])
         x = self.proj(x)
         return x
 
@@ -422,6 +428,7 @@ class LeViT(nn.Layer):
         x = paddle.transpose(x, perm=[0, 2, 1])
         x = self.blocks(x)
         x = x.mean(1)
+        x = paddle.reshape(x, [-1, 384])
         if self.distillation:
             x = self.head(x), self.head_dist(x)
             if not self.training:
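
The patch consistently replaces the `@` operator with paddle.matmul, the `+` on tensors with paddle.add, tensor iteration with integer indexing plus paddle.gather, and open-ended strided slices with explicit start/stop bounds, presumably to keep the LeViT forward pass friendly to static-graph export. Below is a minimal standalone sketch (not part of the patch, and not the author's test code) that exercises the two central rewrites in isolation; the shapes are illustrative values chosen here, not the LeViT configuration.

import paddle

num_heads, num_points, N = 4, 16, 8
attention_biases = paddle.randn([num_heads, num_points])
attention_bias_idxs = paddle.randint(0, num_points, [N, N])  # int64 indices


def cal_attention_biases(attention_biases, attention_bias_idxs):
    # Gather rows of the transposed (num_points, num_heads) bias table for
    # each row of the index matrix, then restore the (num_heads, N, N) layout
    # expected when the biases are added to the attention scores.
    gather_list = []
    attention_bias_t = paddle.transpose(attention_biases, (1, 0))
    for idx in range(attention_bias_idxs.shape[0]):
        gather_list.append(
            paddle.gather(attention_bias_t, attention_bias_idxs[idx]))
    shape0, shape1 = attention_bias_idxs.shape
    gather = paddle.concat(gather_list)
    # A 0 in the target shape keeps the corresponding input dim (num_heads).
    return paddle.transpose(gather, (1, 0)).reshape((0, shape0, shape1))


biases = cal_attention_biases(attention_biases, attention_bias_idxs)
print(biases.shape)  # [4, 8, 8] -> (num_heads, N, N)

# paddle.matmul(a, b) computes the same result as the `@` operator.
q = paddle.randn([2, 4, 8, 16])
k_transpose = paddle.randn([2, 4, 16, 8])
assert paddle.allclose(paddle.matmul(q, k_transpose), q @ k_transpose).item()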