From 44f50ecb91fc61d35ee1e83b0fe7f18fe6d55f86 Mon Sep 17 00:00:00 2001 From: NazgulLee Date: Fri, 14 Jun 2019 16:19:04 +0800 Subject: [PATCH] 1. fix add bias logic; 2. fix several typo (#1687) --- .../paddle-mobile-metallib/Common.metal | 10 ++ .../paddle-mobile-metallib/ConcatKernel.metal | 13 ++ .../ConvAddReluMetal.metal | 153 +++++++++++++++--- .../paddle-mobile-metallib/Elementwise.metal | 10 -- .../ElementwiseAddPreluKernel.metal | 10 -- .../Src/Common/MetalExtension.swift | 15 +- .../Src/Common/PaddleMobileUnitTest.swift | 2 +- .../paddle-mobile/Src/Framework/Loader.swift | 2 +- .../Src/Operators/ConvAddReluOp.swift | 8 +- .../paddle-mobile/Src/Operators/FeedOp.swift | 2 +- .../Kernels/ConvAddAddPreluKernel.swift | 2 +- .../Kernels/ConvAddBatchNormReluKernel.swift | 2 +- .../Src/Operators/Kernels/ConvAddKernel.swift | 4 +- .../Kernels/ConvAddPreluKernel.swift | 2 +- .../Operators/Kernels/ConvAddReluKernel.swift | 52 ++++-- .../Operators/Kernels/ConvBNReluKernel.swift | 2 +- .../Src/Operators/Kernels/ConvKernel.swift | 6 +- .../Operators/Kernels/ConvReluKernel.swift | 4 +- .../Kernels/ElementwiseAddKernel.swift | 46 +++--- .../Src/Operators/Kernels/ReluKernel.swift | 5 + .../Src/Operators/Kernels/ScaleOpKernel.swift | 4 +- 21 files changed, 259 insertions(+), 95 deletions(-) diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal index 418dcb4396..185370c519 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal @@ -107,6 +107,15 @@ inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) { } } +struct ElementwiseAddParam { + int32_t fast; + int32_t axis; + int32_t ylen; + int32_t xdim[4]; + int32_t xtrans[4]; + int32_t ydim[4]; + int32_t ytrans[4]; +}; struct MetalConvParam { short offsetX; @@ -122,4 +131,5 @@ struct MetalConvParam { ushort oC; ushort hasAddOp; ushort hasReluOp; + ElementwiseAddParam addParam; }; diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal index 671b912bb2..55362f44de 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal @@ -204,3 +204,16 @@ struct ConcatParam { #undef N #undef R #undef V + +#define V VY +#define R 4 +#define N 3 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R +#undef V diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal index de85897c10..d487e00fa3 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal @@ -17,6 +17,56 @@ using namespace metal; +half4 getBiasHalf(uint3 gid, constant ElementwiseAddParam &addParam, texture2d_array biasTexture) { + half4 output; + if (addParam.fast) { + output = biasTexture.read(gid.xy, gid.z); + } else { + int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; + int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; + int32_t xtrans[4] = {addParam.xtrans[0], addParam.xtrans[1], addParam.xtrans[2], addParam.xtrans[3]}; + int32_t ytrans[4] = {addParam.ytrans[0], 
addParam.ytrans[1], addParam.ytrans[2], addParam.ytrans[3]}; + int32_t yshift = 4 - addParam.ylen - addParam.axis; + for (int n = 0; n < 4; n++) { + x_xyzn[3] = n; + xyzn2abcd(addParam.xdim[3], x_xyzn, x_abcd); + invtrans(xtrans, x_abcd, t_abcd); + for (int k = addParam.axis; k < (addParam.axis + addParam.ylen); k++) { + y_abcd[yshift+k] = t_abcd[k]; + } + trans(ytrans, y_abcd, t_abcd); + abcd2xyzn(addParam.ydim[3], t_abcd, y_xyzn); + output[n] = biasTexture.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; + } + } + return output; +} + +float4 getBias(uint3 gid, constant ElementwiseAddParam &addParam, texture2d_array biasTexture) { + float4 output; + if (addParam.fast) { + output = float4(biasTexture.read(gid.xy, gid.z)); + } else { + int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; + int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; + int32_t xtrans[4] = {addParam.xtrans[0], addParam.xtrans[1], addParam.xtrans[2], addParam.xtrans[3]}; + int32_t ytrans[4] = {addParam.ytrans[0], addParam.ytrans[1], addParam.ytrans[2], addParam.ytrans[3]}; + int32_t yshift = 4 - addParam.ylen - addParam.axis; + for (int n = 0; n < 4; n++) { + x_xyzn[3] = n; + xyzn2abcd(addParam.xdim[3], x_xyzn, x_abcd); + invtrans(xtrans, x_abcd, t_abcd); + for (int k = addParam.axis; k < (addParam.axis + addParam.ylen); k++) { + y_abcd[yshift+k] = t_abcd[k]; + } + trans(ytrans, y_abcd, t_abcd); + abcd2xyzn(addParam.ydim[3], t_abcd, y_xyzn); + output[n] = biasTexture.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; + } + } + return output; +} + #pragma mark - convAdd kernel void conv_add_relu_1x1(texture2d_array inTexture [[texture(0)]], texture2d_array biasTexture [[texture(1)]], @@ -39,7 +89,11 @@ kernel void conv_add_relu_1x1(texture2d_array inTexture [ uint input_arr_size = inTexture.get_array_size(); uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0); + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = getBias(gid, addParam, biasTexture); + } float4 input; for (uint i = 0; i < input_arr_size; ++i) { @@ -83,7 +137,11 @@ kernel void conv_add_relu_3x3(texture2d_array inTexture [ uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0); + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = getBias(gid, addParam, biasTexture); + } ushort dilation_x = param.dilationX; ushort dilation_y = param.dilationY; @@ -146,7 +204,11 @@ kernel void group_conv_add_relu_3x3(texture2d_array inTex const uint kernelHXW = 9; - float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0); + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = getBias(gid, addParam, biasTexture); + } ushort dilation_x = param.dilationX; ushort dilation_y = param.dilationY; @@ -205,7 +267,11 @@ kernel void conv_add_relu_5x1(texture2d_array inTexture [ uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 output = param.hasAddOp == 1 ? 
biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0); + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = getBias(gid, addParam, biasTexture); + } ushort dilation_y = param.dilationY; float4 input[5]; @@ -262,7 +328,11 @@ kernel void conv_add_relu_1x5(texture2d_array inTexture [ uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0); + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = getBias(gid, addParam, biasTexture); + } ushort dilation_x = param.dilationX; float4 input[5]; @@ -313,7 +383,13 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array i constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); const uint kernelHXW = 9; uint weithTo = gid.z * kernelHXW * 4; - float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0); + + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = getBias(gid, addParam, biasTexture); + } + float4 inputs[9]; inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); @@ -358,7 +434,11 @@ kernel void conv_add_relu_1x1_half(texture2d_array inTextu uint input_arr_size = inTexture.get_array_size(); uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0); + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = float4(getBiasHalf(gid, addParam, biasTexture)); + } float4 input; for (uint i = 0; i < input_arr_size; ++i) { @@ -399,11 +479,15 @@ kernel void conv_add_relu_3x3_half(texture2d_array inTextu uint input_arr_size = inTexture.get_array_size(); uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 output = param.hasAddOp == 1 ? 
float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0); - + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = float4(getBiasHalf(gid, addParam, biasTexture)); + } + ushort dilation_x = param.dilationX; ushort dilation_y = param.dilationY; - + half4 input[9]; for (uint i = 0; i < input_arr_size; ++i) { input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); @@ -418,13 +502,13 @@ kernel void conv_add_relu_3x3_half(texture2d_array inTextu for (int j = 0; j < 9; ++j) { half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; output.x += dot(float4(input[j]), float4(weight_x)); - + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; output.y += dot(float4(input[j]), float4(weight_y)); - + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; output.z += dot(float4(input[j]), float4(weight_z)); - + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; output.w += dot(float4(input[j]), float4(weight_w)); } @@ -452,7 +536,11 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array i const uint kernelHXW = 9; - float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0); + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = float4(getBiasHalf(gid, addParam, biasTexture)); + } ushort dilation_x = param.dilationX; ushort dilation_y = param.dilationY; @@ -505,7 +593,13 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array= param.oC) { - return; + break; } half I[16]; for (int i = 0; i < 16; ++i) { @@ -644,13 +738,14 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array inTextu uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0); - + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = float4(getBiasHalf(gid, addParam, biasTexture)); + } + ushort dilation_y = param.dilationY; half4 input[5]; @@ -747,8 +846,12 @@ kernel void conv_add_relu_1x5_half(texture2d_array inTextu uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 output = param.hasAddOp == 1 ? 
float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0); - + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = float4(getBiasHalf(gid, addParam, biasTexture)); + } + ushort dilation_x = param.dilationX; half4 input[5]; diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal index 40cad28df1..45559cb0e8 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal @@ -17,16 +17,6 @@ using namespace metal; -struct ElementwiseAddParam { - int32_t fast; - int32_t axis; - int32_t ylen; - int32_t xdim[4]; - int32_t xtrans[4]; - int32_t ydim[4]; - int32_t ytrans[4]; -}; - kernel void elementwise_add(texture2d_array inputX [[texture(0)]], texture2d_array inputY [[texture(1)]], texture2d_array outTexture [[texture(2)]], diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal index cca11e8086..c688674918 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal @@ -16,16 +16,6 @@ #include "Common.metal" using namespace metal; -struct ElementwiseAddParam { - int32_t fast; - int32_t axis; - int32_t ylen; - int32_t xdim[4]; - int32_t xtrans[4]; - int32_t ydim[4]; - int32_t ytrans[4]; -}; - #define P float #define PRELU_CHANNEL prelu_channel diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift index c09669137c..a966086c8e 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift @@ -287,7 +287,13 @@ extension MTLDevice { var rcount: Int = (ndim[0] * ndim[3] + 3) / 4 rcount = rcount * 4 * ndim[1] * ndim[2] var nvalue: [Float32] = .init(repeating: 0.0, count: rcount) - + var value32: [Float32]? + if value is [Float16] { + var value16 = value as! [Float16] + value32 = float16To32(input: &value16, count: value.count) + } else { + value32 = value as? 
[Float32] + } for i0 in 0..: Loaderable { } while (false) } else { fseek(file, MemoryLayout.size * tensorDescSize, SEEK_CUR) + nowIndex += MemoryLayout.size * tensorDescSize } - nowIndex += MemoryLayout.size * tensorDescSize /* 这里没有根据 Data Type 去判断, 而是从外部泛型直接指定了精度 diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift index 72b8f6e4ed..72be568a67 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift @@ -24,6 +24,11 @@ class ConvAddReluParam: OpParam { paddings = try ConvAddReluParam.getAttr(key: "paddings", attrs: opDesc.attrs) dilations = try ConvAddReluParam.getAttr(key: "dilations", attrs: opDesc.attrs) groups = try ConvAddReluParam.getAttr(key: "groups", attrs: opDesc.attrs) + do { + axis = try ConvAddReluParam.getAttr(key: "axis", attrs: opDesc.attrs) + } catch { + axis = -1 + } do { y = try ConvAddReluParam.inputY(inputs: opDesc.paraInputs, from: inScope) } catch { @@ -32,7 +37,7 @@ class ConvAddReluParam: OpParam { let device = input.metalTexture!.device y = Texture.init(device: device, inDim: yTensor.dim) let value: [P] = Array(UnsafeBufferPointer(start: yTensor.data.pointer, count: yTensor.dim.numel())) - y?.metalTexture = device.tensor2texture(value: value, dim: yTensor.dim.dims, transpose: [0, 2, 3, 1], inComputePrecision: GlobalConfig.shared.computePrecision) + y?.metalTexture = device.tensor2texture(value: value, dim: yTensor.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision) self.yTensor = yTensor } catch { } @@ -49,6 +54,7 @@ class ConvAddReluParam: OpParam { let paddings: [Int32] let dilations: [Int32] let groups: Int + let axis: Int var y: Texture? var yTensor: Tensor
<P>
? diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift index 0d9510d2b0..5022d31205 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift @@ -64,7 +64,7 @@ class FeedOp: Operator, FeedPa func delogOutput() { print(" \(type) output: ") print(para.output.metalTexture) - print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[3], h: para.output.padToFourDim[2], w: para.output.padToFourDim[1])).strideArray()) + print(para.output.toTensor().strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift index 6f7b093b49..7313837c12 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift @@ -135,7 +135,7 @@ class ConvAddAddPreluKernel: Kernel, Computable { let iC = param.input.tensorDim[1]; let fC = param.filter.tensorDim[1]; let oC = param.output.tensorDim[1]; - let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0)) + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam()) // print("metal param: ") // print(inMetalParam) diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift index 82ccfa99fd..a10e1939df 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift @@ -98,7 +98,7 @@ class ConvAddBatchNormReluKernel: Kernel, Computable, Test let iC = param.input.tensorDim[1]; let fC = param.filter.tensorDim[1]; let oC = param.output.tensorDim[1]; - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0)) + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam()) var invs: [P] = [] let varianceContents = 
param.variance.buffer.contents().assumingMemoryBound(to: P.self) diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift index bd270273a4..eefffd817a 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift @@ -16,11 +16,11 @@ import Foundation import MetalPerformanceShaders class ConvAddKernel: ConvAddReluKernel
<P>
{ - override func hasAddOp() -> Bool { + override class func hasAddOp() -> Bool { return true } - override func hasReluOp() -> Bool { + override class func hasReluOp() -> Bool { return false } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift index 6821992995..6c6f926162 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift @@ -135,7 +135,7 @@ class ConvAddPreluKernel: Kernel, Computable { let iC = param.input.tensorDim[1]; let fC = param.filter.tensorDim[1]; let oC = param.output.tensorDim[1]; - let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0)) + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam()) // print("metal param: ") // print(inMetalParam) diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift index bfc481a877..4b742f94d5 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift @@ -29,6 +29,7 @@ public struct MetalConvParam { let oC: UInt16 let hasAddOp: UInt16 let hasReluOp: UInt16 + let addParam: ElementwiseAddMetalParam } @available(iOS 11.0, *) @@ -124,7 +125,7 @@ class ConvAddReluKernel: Kernel, Computable { if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggressiveOptimization) { let inputChannel = param.input.tensorDim[1] let outputChannel = param.output.tensorDim[1] - if (inputChannel == 1 || inputChannel > 4) && (outputChannel == 1 || outputChannel > 4) { + if inputChannel > 4 && outputChannel > 4 { shouldUseMPS = true } } @@ -135,6 +136,11 @@ class ConvAddReluKernel: Kernel, Computable { if !isDepthWise && param.groups > 1 { shouldUseMPS = false } + if type(of: self).hasAddOp() { + if !(type(of: self).canAddUseMPS(param: param)) { + shouldUseMPS = false + } + } if shouldUseMPS { super.init(device: device, inFunctionName: nil, initContext: initContext) setupWithMPS(device: device, param: param) @@ -195,11 +201,11 @@ class ConvAddReluKernel: Kernel, Computable { param.input.useMPS = true param.output.useMPS = true if #available(iOS 11.3, *) { - if param.y != nil { + if type(of: self).hasAddOp() && type(of: self).canMPSAddByElement(param: param) && !type(of: self).canMPSAddByChannel(param: param) { mpsAddOp = MPSCNNAdd(device: device) - if hasReluOp() { - mpsReluOp = MPSCNNNeuronReLU(device: device, a: 0.0) - } + } + if type(of: self).hasReluOp() { + mpsReluOp = MPSCNNNeuronReLU(device: device, a: 0.0) } } let neuronFilter: MPSCNNNeuron? = param.y != nil ? 
nil : (neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) @@ -217,7 +223,11 @@ class ConvAddReluKernel: Kernel, Computable { desc.strideInPixelsX = Int(param.stride[0]) desc.strideInPixelsY = Int(param.stride[1]) let _ = param.filter.convert(converter: MPSPointerConverter
<P>
.init()) - let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: param.yTensor) + var biasTerms: Tensor
<P>
? = nil + if type(of: self).hasAddOp() && type(of: self).canMPSAddByChannel(param: param) { + biasTerms = param.yTensor + } + let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: biasTerms) let conv = MPSCNNConvolution.init(device: device, weights: dataSource) conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0) @@ -233,7 +243,11 @@ class ConvAddReluKernel: Kernel, Computable { let iC = param.input.tensorDim[1]; let fC = param.filter.tensorDim[1]; let oC = param.output.tensorDim[1]; - let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 1 : 0)) + var addParam = ElementwiseAddMetalParam() + if let inputY = param.y { + addParam = ElementwiseAddKernel
<P>
.metalParamFrom(inputX: param.output, inputY: inputY, axis: param.axis) + } + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(type(of: self).hasAddOp() ? 1 : 0), hasReluOp: UInt16(type(of: self).hasReluOp() ? 1 : 0), addParam: addParam) metalParam = inMetalParam if type(of: self).isWinoGrad(functionName: functionName) { @@ -304,7 +318,7 @@ class ConvAddReluKernel: Kernel, Computable { } open func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? { - if hasReluOp() { + if type(of: self).hasReluOp() { if #available(iOS 10.0, *) { return MPSCNNNeuronReLU(device: device, a: 0) } @@ -312,11 +326,29 @@ class ConvAddReluKernel: Kernel, Computable { return nil } - open func hasAddOp() -> Bool { + open class func canAddUseMPS(param: ConvAddReluParam
<P>
) -> Bool { + return canMPSAddByChannel(param: param) || canMPSAddByElement(param: param) + } + + private class func canMPSAddByChannel(param: ConvAddReluParam
<P>
) -> Bool { + if let yTensor = param.yTensor, yTensor.dim.cout() == 1 { + return true + } + return false + } + + private class func canMPSAddByElement(param: ConvAddReluParam
<P>
) -> Bool { + if let y = param.y, y.dim.dims == param.input.dim.dims { + return true + } + return false + } + + open class func hasAddOp() -> Bool { return true } - open func hasReluOp() -> Bool { + open class func hasReluOp() -> Bool { return true } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift index 6d20bf6f9f..d70caca5f4 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift @@ -105,7 +105,7 @@ class ConvBNReluKernel: Kernel, Computable, Testable { let iC = param.input.tensorDim[1]; let fC = param.filter.tensorDim[1]; let oC = param.output.tensorDim[1]; - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0)) + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam()) var invs: [P] = [] let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift index 11f8a2683c..19dc193ac4 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift @@ -66,7 +66,7 @@ class ConvKernel: Kernel, Computable { throw PaddleMobileError.predictError(message: " encode is nil") } encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setTexture(param.output.metalTexture, index: 2) encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) encoder.setBuffer(blankTensor?.buffer, offset: 0, index: 2) @@ -111,7 +111,7 @@ class ConvKernel: Kernel, Computable { let iC = param.input.tensorDim[1]; let fC = param.filter.tensorDim[1]; let oC = param.output.tensorDim[1]; - let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 1 : 0)) + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 
1 : 0), addParam: ElementwiseAddMetalParam()) metalParam = inMetalParam if type(of: self).isWinoGrad(functionName: functionName) { @@ -130,7 +130,7 @@ class ConvKernel: Kernel, Computable { } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] { if useAggressiveOptimization { let couldUseWinograd = param.filter.width == 3 && param.filter.height == 3 - && param.filter.n == 16 && param.stride[0] == 1 && param.stride[1] == 1 + && param.filter.n <= 16 && param.stride[0] == 1 && param.stride[1] == 1 && param.dilations[0] == 1 && param.dilations[1] == 1 if couldUseWinograd { return "depthwise_conv_add_relu_3x3_half_winograd" diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvReluKernel.swift index 0b73deb1b0..9937ca158b 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvReluKernel.swift @@ -16,11 +16,11 @@ import Foundation import MetalPerformanceShaders class ConvReluKernel: ConvAddReluKernel
<P>
{ - override func hasAddOp() -> Bool { + override class func hasAddOp() -> Bool { return false } - override func hasReluOp() -> Bool { + override class func hasReluOp() -> Bool { return true } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift index acddad5bb1..c1ee435e3a 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift @@ -34,27 +34,8 @@ class ElementwiseAddKernel: Kernel, Computable { throw error } - metalParam = ElementwiseAddMetalParam.init() + metalParam = ElementwiseAddKernel.metalParamFrom(inputX: param.inputX, inputY: param.inputY, axis: param.axis) - let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) } - let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) } - let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) } - let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) } - - metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3]) - metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3]) - metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3]) - metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3]) - if param.axis == -1 { - metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout()) - } else { - metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis) - } - metalParam.ylen = Int32(param.inputY.tensorDim.cout()) - if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) { - // print("===> elementwise_add fast!!!") - metalParam.fast = 1 - } if GlobalConfig.shared.computePrecision == .Float32 { super.init(device: device, inFunctionName: "elementwise_add", initContext: initContext) } else if GlobalConfig.shared.computePrecision == .Float16 { @@ -75,4 +56,29 @@ class ElementwiseAddKernel: Kernel, Computable { encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) encoder.endEncoding() } + + static func metalParamFrom(inputX: Texture, inputY: Texture, axis: Int) -> ElementwiseAddMetalParam { + var metalParam = ElementwiseAddMetalParam.init() + + let xdim: [Int32] = (0..<4).map { Int32(inputX.dim[$0]) } + let ydim: [Int32] = (0..<4).map { Int32(inputY.dim[$0]) } + let xtrans: [Int32] = (0..<4).map { Int32(inputX.transpose[$0]) } + let ytrans: [Int32] = (0..<4).map { Int32(inputY.transpose[$0]) } + + metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3]) + metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3]) + metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3]) + metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3]) + if axis == -1 { + metalParam.axis = 4 - Int32(inputY.tensorDim.cout()) + } else { + metalParam.axis = 4 - Int32(inputX.tensorDim.cout()) + Int32(axis) + } + metalParam.ylen = Int32(inputY.tensorDim.cout()) + if (inputX.dim == inputY.dim) && (inputX.transpose == inputY.transpose) { + // print("===> elementwise_add fast!!!") + metalParam.fast = 1 + } + return metalParam + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift index 0e2f4983cb..1f370db649 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift @@ -26,6 
+26,11 @@ class ReluKernel: Kernel, Computable{ } required init(device: MTLDevice, param: ReluParam
<P>
, initContext: InitContext) throws { + do { + try param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) + } catch let error { + throw error + } if GlobalConfig.shared.computePrecision == .Float32 { super.init(device: device, inFunctionName: "relu", initContext: initContext) } else if GlobalConfig.shared.computePrecision == .Float16 { diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift index fc63c08ebc..483bedcb08 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift @@ -34,10 +34,10 @@ class ScaleOpKernel: Kernel, Computable{ } var shouldUseMPS = false - if initContext.useMPS && param.biasAfterScale { + if initContext.useMPS && param.biasAfterScale && param.input.tensorDim.cout() == 4 && param.output.tensorDim.cout() == 4 { let inputChannel = param.input.tensorDim[1] let outputChannel = param.output.tensorDim[1] - if (inputChannel == 1 || inputChannel > 4) && (outputChannel == 1 || outputChannel > 4) { + if (inputChannel > 4) && (outputChannel > 4) { shouldUseMPS = true } } -- GitLab