Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
b53a20aa
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
b53a20aa
编写于
5月 23, 2019
作者:
Y
Yanzhan Yang
提交者:
GitHub
5月 23, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
1.merge conv, conv add, conv add relu into one implementation. (#1661)
上级
881dcb2e
变更
18
展开全部
隐藏空白更改
内联
并排
Showing
18 changed file
with
500 addition
and
1250 deletion
+500
-1250
metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj
...metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj
+0
-8
metal/paddle-mobile-metallib/paddle-mobile-metallib/ActivationKernel.metal
...le-metallib/paddle-mobile-metallib/ActivationKernel.metal
+0
-1
metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
...addle-mobile-metallib/paddle-mobile-metallib/Common.metal
+2
-0
metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal
...mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal
+0
-622
metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
...le-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
+40
-33
metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal
...e-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal
+0
-280
metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift
...obile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift
+1
-1
metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift
...paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift
+1
-30
metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift
...le-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift
+34
-2
metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift
metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift
+0
-1
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift
...ile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift
+1
-1
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
...-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
+1
-1
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
...le/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
+1
-1
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
...e/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
+5
-222
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
...dle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
+1
-1
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
...ddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
+246
-12
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
...addle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
+1
-1
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
...bile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
+166
-33
未找到文件。
metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj
浏览文件 @
b53a20aa
...
...
@@ -40,7 +40,6 @@
FCC15DFD221E69E100DC3CB2
/* Elementwise.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DD4221E69DF00DC3CB2
/* Elementwise.metal */
;
};
FCC15DFE221E69E100DC3CB2
/* ReshapeKernel.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DD5221E69DF00DC3CB2
/* ReshapeKernel.metal */
;
};
FCC15DFF221E69E100DC3CB2
/* Scale.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DD6221E69DF00DC3CB2
/* Scale.metal */
;
};
FCC15E00221E69E100DC3CB2
/* ConvKernel.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DD7221E69DF00DC3CB2
/* ConvKernel.metal */
;
};
FCC15E01221E69E100DC3CB2
/* PriorBoxKernel.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DD8221E69DF00DC3CB2
/* PriorBoxKernel.metal */
;
};
FCC15E02221E69E100DC3CB2
/* BatchNormRelu.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DD9221E69E000DC3CB2
/* BatchNormRelu.metal */
;
};
FCC15E03221E69E100DC3CB2
/* TransposeKernel.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DDA221E69E000DC3CB2
/* TransposeKernel.metal */
;
};
...
...
@@ -53,7 +52,6 @@
FCC15E0A221E69E100DC3CB2
/* ElementwiseAddPreluKernel.inc.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DE1221E69E100DC3CB2
/* ElementwiseAddPreluKernel.inc.metal */
;
};
FCC15E0B221E69E100DC3CB2
/* FetchKernel.inc.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DE2221E69E100DC3CB2
/* FetchKernel.inc.metal */
;
};
FCC15E0C221E69E100DC3CB2
/* BufferToTexture.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DE3221E69E100DC3CB2
/* BufferToTexture.metal */
;
};
FCC15E0D221E69E100DC3CB2
/* ConvAddMetal.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DE4221E69E100DC3CB2
/* ConvAddMetal.metal */
;
};
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
...
...
@@ -93,7 +91,6 @@
FCC15DD4221E69DF00DC3CB2
/* Elementwise.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
Elementwise.metal
;
sourceTree
=
"<group>"
;
};
FCC15DD5221E69DF00DC3CB2
/* ReshapeKernel.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
ReshapeKernel.metal
;
sourceTree
=
"<group>"
;
};
FCC15DD6221E69DF00DC3CB2
/* Scale.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
Scale.metal
;
sourceTree
=
"<group>"
;
};
FCC15DD7221E69DF00DC3CB2
/* ConvKernel.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
ConvKernel.metal
;
sourceTree
=
"<group>"
;
};
FCC15DD8221E69DF00DC3CB2
/* PriorBoxKernel.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
PriorBoxKernel.metal
;
sourceTree
=
"<group>"
;
};
FCC15DD9221E69E000DC3CB2
/* BatchNormRelu.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
BatchNormRelu.metal
;
sourceTree
=
"<group>"
;
};
FCC15DDA221E69E000DC3CB2
/* TransposeKernel.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
TransposeKernel.metal
;
sourceTree
=
"<group>"
;
};
...
...
@@ -106,7 +103,6 @@
FCC15DE1221E69E100DC3CB2
/* ElementwiseAddPreluKernel.inc.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
ElementwiseAddPreluKernel.inc.metal
;
sourceTree
=
"<group>"
;
};
FCC15DE2221E69E100DC3CB2
/* FetchKernel.inc.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
FetchKernel.inc.metal
;
sourceTree
=
"<group>"
;
};
FCC15DE3221E69E100DC3CB2
/* BufferToTexture.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
BufferToTexture.metal
;
sourceTree
=
"<group>"
;
};
FCC15DE4221E69E100DC3CB2
/* ConvAddMetal.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
ConvAddMetal.metal
;
sourceTree
=
"<group>"
;
};
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
...
...
@@ -170,11 +166,9 @@
FCC15DE0221E69E100DC3CB2
/* ConcatKernel.inc.metal */
,
FCC15DCA221E69DE00DC3CB2
/* ConcatKernel.metal */
,
FCC15DBE221E69DD00DC3CB2
/* ConvAddBNReluKernel.metal */
,
FCC15DE4221E69E100DC3CB2
/* ConvAddMetal.metal */
,
FCC15DDB221E69E000DC3CB2
/* ConvAddPrelu.inc.metal */
,
FCC15DD3221E69DF00DC3CB2
/* ConvAddPreluKernel.metal */
,
FCC15DCF221E69DE00DC3CB2
/* ConvBNReluKernel.metal */
,
FCC15DD7221E69DF00DC3CB2
/* ConvKernel.metal */
,
FCC15DC5221E69DE00DC3CB2
/* ConvTransposeKernel.metal */
,
FCC15DD4221E69DF00DC3CB2
/* Elementwise.metal */
,
FCC15DE1221E69E100DC3CB2
/* ElementwiseAddPreluKernel.inc.metal */
,
...
...
@@ -309,12 +303,10 @@
FCC15E03221E69E100DC3CB2
/* TransposeKernel.metal in Sources */
,
FCC15DFE221E69E100DC3CB2
/* ReshapeKernel.metal in Sources */
,
16FBFB3E22925D040025B406
/* ActivationKernel.metal in Sources */
,
FCC15E0D221E69E100DC3CB2
/* ConvAddMetal.metal in Sources */
,
FCC15DF7221E69E100DC3CB2
/* ReshapeKernel.inc.metal in Sources */
,
FCC15DE5221E69E100DC3CB2
/* ReluKernel.metal in Sources */
,
FCC15DEF221E69E100DC3CB2
/* Macro.metal in Sources */
,
FCC15E02221E69E100DC3CB2
/* BatchNormRelu.metal in Sources */
,
FCC15E00221E69E100DC3CB2
/* ConvKernel.metal in Sources */
,
FCC15E01221E69E100DC3CB2
/* PriorBoxKernel.metal in Sources */
,
FCC15DEA221E69E100DC3CB2
/* ElementwiseAddPreluKernel.metal in Sources */
,
FCC15DED221E69E100DC3CB2
/* PoolKernel.inc.metal in Sources */
,
...
...
metal/paddle-mobile-metallib/paddle-mobile-metallib/ActivationKernel.metal
浏览文件 @
b53a20aa
...
...
@@ -13,7 +13,6 @@
limitations under the License. */
#include <metal_stdlib>
#include <metal_math>
using namespace metal;
kernel void exp(texture2d_array<float, access::sample> inTexture [[texture(0)]],
...
...
metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
浏览文件 @
b53a20aa
...
...
@@ -120,4 +120,6 @@ struct MetalConvParam {
ushort iC;
ushort fC;
ushort oC;
ushort hasAddOp;
ushort hasReluOp;
};
metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal
已删除
100644 → 0
浏览文件 @
881dcb2e
此差异已折叠。
点击以展开。
metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
浏览文件 @
b53a20aa
...
...
@@ -40,7 +40,7 @@ kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output =
biase[gid.z]
;
float4 output =
param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0)
;
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
...
...
@@ -57,7 +57,7 @@ kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(relu, gid.xy, gid.z);
}
...
...
@@ -85,7 +85,7 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output =
biase[gid.z]
;
float4 output =
param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0)
;
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
...
...
@@ -125,7 +125,7 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [
output.w += dot(input[j], weight_w);
}
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(relu, gid.xy, gid.z);
}
...
...
@@ -148,7 +148,7 @@ kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTex
const uint kernelHXW = 9;
float4 output =
biase[gid.z]
;
float4 output =
param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0)
;
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
...
...
@@ -180,7 +180,7 @@ kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTex
}
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(relu, gid.xy, gid.z);
}
...
...
@@ -208,7 +208,7 @@ kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output =
biase[gid.z]
;
float4 output =
param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0)
;
ushort dilation_y = param.dilationY;
float4 input[5];
...
...
@@ -238,7 +238,7 @@ kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [
output.w += dot(input[j], weight_w);
}
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(relu, gid.xy, gid.z);
}
...
...
@@ -266,7 +266,7 @@ kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output =
biase[gid.z]
;
float4 output =
param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0)
;
ushort dilation_x = param.dilationX;
float4 input[5];
...
...
@@ -296,7 +296,7 @@ kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [
output.w += dot(input[j], weight_w);
}
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(relu, gid.xy, gid.z);
}
...
...
@@ -318,7 +318,7 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> i
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output =
biase[gid.z]
;
float4 output =
param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0)
;
float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
...
...
@@ -336,7 +336,7 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> i
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(relu, gid.xy, gid.z);
}
...
...
@@ -364,7 +364,7 @@ kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTextu
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output =
float4(biase[gid.z]
);
float4 output =
param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0
);
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
...
...
@@ -381,7 +381,7 @@ kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTextu
float4 weight_w = float4(weights[weithTo + 3 * kernelHXW * input_arr_size + i]);
output.w += dot(input, weight_w);
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(half4(relu), gid.xy, gid.z);
}
...
...
@@ -406,7 +406,7 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output =
float4(biase[gid.z]
);
float4 output =
param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0
);
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
...
...
@@ -436,7 +436,7 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
output.w += dot(float4(input[j]), float4(weight_w));
}
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(half4(relu), gid.xy, gid.z);
}
...
...
@@ -459,7 +459,7 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> i
const uint kernelHXW = 9;
half4 output = biase[gid.z]
;
float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0)
;
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
...
...
@@ -486,13 +486,13 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> i
input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), input_array_index)[input_array_item_index];
for (int j = 0; j < 9; ++j) {
half weight = weights[(output_c * kernelHXW + j) * filter_array_size * 4 + i];
output[c] +=
input[j] * weight
;
output[c] +=
float(input[j]) * float(weight)
;
}
}
}
half4 relu = fmax(output, 0.0)
;
outTexture.write(
relu
, gid.xy, gid.z);
float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(
half4(relu)
, gid.xy, gid.z);
}
kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
...
...
@@ -512,7 +512,7 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sampl
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output =
float4(biase[gid.z]
);
float4 output =
param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0
);
half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
...
...
@@ -530,8 +530,9 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sampl
output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
}
output = fmax(output, 0.0);
outTexture.write(half4(output), gid.xy, gid.z);
float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(half4(relu), gid.xy, gid.z);
}
kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, access::sample> inTexture [[texture(0)]],
...
...
@@ -640,11 +641,18 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce
res[3][c] = T[7] - T[11] + T[15] + tmp1 - tmp2;
}
half4 base = biase[tc];
outTexture.write(fmax(res[0] + base, 0.0), uint2(tx, ty), tc);
outTexture.write(fmax(res[1] + base, 0.0), uint2(tx + 1, ty), tc);
outTexture.write(fmax(res[2] + base, 0.0), uint2(tx, ty + 1), tc);
outTexture.write(fmax(res[3] + base, 0.0), uint2(tx + 1, ty + 1), tc);
if (param.hasReluOp == 1) {
half4 base = biase[tc];
outTexture.write(fmax(res[0] + base, 0.0), uint2(tx, ty), tc);
outTexture.write(fmax(res[1] + base, 0.0), uint2(tx + 1, ty), tc);
outTexture.write(fmax(res[2] + base, 0.0), uint2(tx, ty + 1), tc);
outTexture.write(fmax(res[3] + base, 0.0), uint2(tx + 1, ty + 1), tc);
} else {
outTexture.write(res[0], uint2(tx, ty), tc);
outTexture.write(res[1], uint2(tx + 1, ty), tc);
outTexture.write(res[2], uint2(tx, ty + 1), tc);
outTexture.write(res[3], uint2(tx + 1, ty + 1), tc);
}
}
kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
...
...
@@ -653,7 +661,6 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
...
...
@@ -671,7 +678,7 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output =
float4(biase[gid.z]
);
float4 output =
param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0
);
ushort dilation_y = param.dilationY;
half4 input[5];
...
...
@@ -701,7 +708,7 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu
output.w += dot(float4(input[j]), float4(weight_w));
}
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(half4(relu), gid.xy, gid.z);
}
...
...
@@ -729,7 +736,7 @@ kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTextu
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output =
float4(biase[gid.z]
);
float4 output =
param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0
);
ushort dilation_x = param.dilationX;
half4 input[5];
...
...
@@ -759,6 +766,6 @@ kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTextu
output.w += dot(float4(input[j]), float4(weight_w));
}
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(half4(relu), gid.xy, gid.z);
}
metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal
已删除
100644 → 0
浏览文件 @
881dcb2e
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
// conv
#pragma mark -- conv
kernel void conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
outTexture.write(output, gid.xy, gid.z);
}
kernel void depthwise_conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device float *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(0.0);
float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
float4 input = inputs[j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
half4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(float4(input[j]), float4(weight_x));
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(float4(input[j]), float4(weight_y));
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(float4(input[j]), float4(weight_z));
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(float4(input[j]), float4(weight_w));
}
}
outTexture.write(half4(output), gid.xy, gid.z);
}
kernel void depthwise_conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device half *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(0.0);
half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
half4 input = inputs[j];
output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]);
output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]);
output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
}
outTexture.write(half4(output), gid.xy, gid.z);
}
// 1x1 convolution over an RGBA-sliced feature map (half precision).
// Each thread produces one half4 texel of the output: gid.xy is the spatial
// position, gid.z selects the output channel slice (4 channels per slice).
// Weights layout: for output slice z, the buffer holds 4 * input_arr_size
// consecutive half4 vectors — one group of input_arr_size vectors per output
// channel within the slice.
kernel void conv_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
                          texture2d_array<half, access::write> outTexture [[texture(1)]],
                          constant MetalConvParam &param [[buffer(0)]],
                          const device half4 *weights [[buffer(1)]],
                          uint3 gid [[thread_position_in_grid]]) {
    // Threads dispatched past the output extent do nothing.
    if (gid.x >= outTexture.get_width() ||
        gid.y >= outTexture.get_height() ||
        gid.z >= outTexture.get_array_size()) {
        return;
    }
    // Map the output coordinate back to the input coordinate; clamp_to_zero
    // makes out-of-range taps read as zero (implicit zero padding).
    ushort2 stride = ushort2(param.strideX, param.strideY);
    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
    const uint kernelHXW = 1;  // 1x1 kernel: a single spatial tap.
    uint input_arr_size = inTexture.get_array_size();
    // First weight vector belonging to this output slice.
    uint weightBase = gid.z * kernelHXW * input_arr_size * 4;
    // Accumulate in float to limit half-precision rounding error.
    float4 output = float4(0.0);
    half4 input;
    for (uint i = 0; i < input_arr_size; ++i) {
        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
        // One dot product per output channel of this slice.
        half4 weight_x = weights[weightBase + 0 * kernelHXW * input_arr_size + i];
        output.x += dot(float4(input), float4(weight_x));
        half4 weight_y = weights[weightBase + 1 * kernelHXW * input_arr_size + i];
        output.y += dot(float4(input), float4(weight_y));
        half4 weight_z = weights[weightBase + 2 * kernelHXW * input_arr_size + i];
        output.z += dot(float4(input), float4(weight_z));
        half4 weight_w = weights[weightBase + 3 * kernelHXW * input_arr_size + i];
        output.w += dot(float4(input), float4(weight_w));
    }
    outTexture.write(half4(output), gid.xy, gid.z);
}
metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift
浏览文件 @
b53a20aa
...
...
@@ -325,7 +325,7 @@ public class PaddleMobileUnitTest {
let
fC
=
4
let
oC
=
4
let
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
0
,
strideX
:
UInt16
(
stride
.
0
),
strideY
:
UInt16
(
stride
.
1
),
dilationX
:
UInt16
(
1
),
dilationY
:
UInt16
(
1
),
groups
:
UInt16
(
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
))
let
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
0
,
strideX
:
UInt16
(
stride
.
0
),
strideY
:
UInt16
(
stride
.
1
),
dilationX
:
UInt16
(
1
),
dilationY
:
UInt16
(
1
),
groups
:
UInt16
(
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
)
,
hasAddOp
:
UInt16
(
0
),
hasReluOp
:
UInt16
(
0
)
)
let
param
=
ConvAddBatchNormReluTestParam
.
init
(
inInputTexture
:
inputeTexture
,
inOutputTexture
:
outputTexture
,
inMetalParam
:
metalParam
,
inFilterBuffer
:
filterBuffer
,
inBiaseBuffer
:
biaseBuffer
,
inNewScaleBuffer
:
newScalueBuffer
,
inNewBiaseBuffer
:
newBiaseBuffer
,
inFilterSize
:
filterSize
)
...
...
metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift
浏览文件 @
b53a20aa
...
...
@@ -14,36 +14,7 @@
import
Foundation
class
ConvAddParam
<
P
:
PrecisionProtocol
>
:
OpParam
{
//typealias ParamPrecisionType = P
required
init
(
opDesc
:
PMOpDesc
,
inScope
:
Scope
)
throws
{
do
{
filter
=
try
ConvAddParam
.
inputFilter
(
paraInputs
:
opDesc
.
paraInputs
,
from
:
inScope
)
input
=
try
ConvAddParam
.
input
(
inputs
:
opDesc
.
inputs
,
from
:
inScope
)
output
=
try
ConvAddParam
.
outputOut
(
outputs
:
opDesc
.
outputs
,
from
:
inScope
)
stride
=
try
ConvAddParam
.
getAttr
(
key
:
"strides"
,
attrs
:
opDesc
.
attrs
)
paddings
=
try
ConvAddParam
.
getAttr
(
key
:
"paddings"
,
attrs
:
opDesc
.
attrs
)
dilations
=
try
ConvAddParam
.
getAttr
(
key
:
"dilations"
,
attrs
:
opDesc
.
attrs
)
groups
=
try
ConvAddParam
.
getAttr
(
key
:
"groups"
,
attrs
:
opDesc
.
attrs
)
y
=
try
ConvAddParam
.
inputY
(
inputs
:
opDesc
.
paraInputs
,
from
:
inScope
)
}
catch
let
error
{
throw
error
}
}
let
input
:
Texture
let
y
:
Tensor
<
P
>
let
filter
:
Tensor
<
P
>
var
output
:
Texture
let
stride
:
[
Int32
]
let
paddings
:
[
Int32
]
let
dilations
:
[
Int32
]
let
groups
:
Int
}
class
ConvAddOp
<
P
:
PrecisionProtocol
>
:
Operator
<
ConvAddKernel
<
P
>
,
ConvAddParam
<
P
>>
,
Runable
,
Creator
,
InferShaperable
,
Fusion
{
class
ConvAddOp
<
P
:
PrecisionProtocol
>
:
Operator
<
ConvAddKernel
<
P
>
,
ConvAddReluParam
<
P
>>
,
Runable
,
Creator
,
InferShaperable
,
Fusion
{
typealias
OpType
=
ConvAddOp
<
P
>
static
func
fusionNode
()
->
Node
{
...
...
metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift
浏览文件 @
b53a20aa
...
...
@@ -14,7 +14,40 @@
import
Foundation
class
ConvAddReluOp
<
P
:
PrecisionProtocol
>
:
Operator
<
ConvAddReluKernel
<
P
>
,
ConvAddParam
<
P
>>
,
Runable
,
Creator
,
InferShaperable
,
Fusion
{
class
ConvAddReluParam
<
P
:
PrecisionProtocol
>
:
OpParam
{
required
init
(
opDesc
:
PMOpDesc
,
inScope
:
Scope
)
throws
{
do
{
filter
=
try
ConvAddReluParam
.
inputFilter
(
paraInputs
:
opDesc
.
paraInputs
,
from
:
inScope
)
input
=
try
ConvAddReluParam
.
input
(
inputs
:
opDesc
.
inputs
,
from
:
inScope
)
output
=
try
ConvAddReluParam
.
outputOut
(
outputs
:
opDesc
.
outputs
,
from
:
inScope
)
stride
=
try
ConvAddReluParam
.
getAttr
(
key
:
"strides"
,
attrs
:
opDesc
.
attrs
)
paddings
=
try
ConvAddReluParam
.
getAttr
(
key
:
"paddings"
,
attrs
:
opDesc
.
attrs
)
dilations
=
try
ConvAddReluParam
.
getAttr
(
key
:
"dilations"
,
attrs
:
opDesc
.
attrs
)
groups
=
try
ConvAddReluParam
.
getAttr
(
key
:
"groups"
,
attrs
:
opDesc
.
attrs
)
do
{
y
=
try
ConvAddReluParam
.
inputY
(
inputs
:
opDesc
.
paraInputs
,
from
:
inScope
)
}
catch
{
}
}
catch
let
error
{
throw
error
}
}
let
input
:
Texture
var
y
:
Tensor
<
P
>
?
let
filter
:
Tensor
<
P
>
var
output
:
Texture
let
stride
:
[
Int32
]
let
paddings
:
[
Int32
]
let
dilations
:
[
Int32
]
let
groups
:
Int
open
class
func
hasY
()
->
Bool
{
return
true
}
}
class
ConvAddReluOp
<
P
:
PrecisionProtocol
>
:
Operator
<
ConvAddReluKernel
<
P
>
,
ConvAddReluParam
<
P
>>
,
Runable
,
Creator
,
InferShaperable
,
Fusion
{
typealias
OpType
=
ConvAddReluOp
<
P
>
static
func
fusionNode
()
->
Node
{
...
...
@@ -69,4 +102,3 @@ class ConvAddReluOp<P: PrecisionProtocol>: Operator<ConvAddReluKernel<P>, ConvAd
print
(
para
.
output
.
metalTexture
.
toTensor
(
dim
:
(
n
:
para
.
output
.
tensorDim
[
0
],
c
:
para
.
output
.
tensorDim
[
1
],
h
:
para
.
output
.
tensorDim
[
2
],
w
:
para
.
output
.
tensorDim
[
3
]))
.
strideArray
())
}
}
metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift
浏览文件 @
b53a20aa
...
...
@@ -15,7 +15,6 @@
import
Foundation
class
ConvParam
<
P
:
PrecisionProtocol
>
:
OpParam
{
//typealias ParamPrecisionType = P
required
init
(
opDesc
:
PMOpDesc
,
inScope
:
Scope
)
throws
{
do
{
filter
=
try
ConvParam
.
inputFilter
(
paraInputs
:
opDesc
.
paraInputs
,
from
:
inScope
)
...
...
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift
浏览文件 @
b53a20aa
...
...
@@ -37,7 +37,7 @@ protocol KernelProtocol {
}
@objc
open
class
Kernel
:
NSObject
{
@objc
open
class
Kernel
:
NSObject
{
private
var
_pipline
:
MTLComputePipelineState
?
=
nil
...
...
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
浏览文件 @
b53a20aa
...
...
@@ -135,7 +135,7 @@ class ConvAddAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
let
iC
=
param
.
input
.
tensorDim
[
1
];
let
fC
=
param
.
filter
.
tensorDim
[
1
];
let
oC
=
param
.
output
.
tensorDim
[
1
];
let
inMetalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
))
let
inMetalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
)
,
hasAddOp
:
UInt16
(
0
),
hasReluOp
:
UInt16
(
0
)
)
// print("metal param: ")
// print(inMetalParam)
...
...
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
浏览文件 @
b53a20aa
...
...
@@ -98,7 +98,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionProtocol>: Kernel, Computable, Test
let
iC
=
param
.
input
.
tensorDim
[
1
];
let
fC
=
param
.
filter
.
tensorDim
[
1
];
let
oC
=
param
.
output
.
tensorDim
[
1
];
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
))
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
)
,
hasAddOp
:
UInt16
(
0
),
hasReluOp
:
UInt16
(
0
)
)
var
invs
:
[
P
]
=
[]
let
varianceContents
=
param
.
variance
.
buffer
.
contents
()
.
assumingMemoryBound
(
to
:
P
.
self
)
...
...
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
浏览文件 @
b53a20aa
...
...
@@ -15,234 +15,17 @@
import
Foundation
import
MetalPerformanceShaders
@available
(
iOS
11.0
,
*
)
class
ConvDataSource
<
P
:
PrecisionProtocol
>
:
NSObject
,
MPSCNNConvolutionDataSource
{
var
_descriptor
:
MPSCNNConvolutionDescriptor
var
_weightsTensor
:
Tensor
<
P
>
var
_biasTensor
:
Tensor
<
P
>
var
_biasTerms
:
UnsafeMutablePointer
<
Float
>
?
func
load
()
->
Bool
{
switch
P
.
precisionType
{
case
.
Float32
:
_biasTerms
=
_biasTensor
.
data
.
pointer
as?
UnsafeMutablePointer
<
Float
>
case
.
Float16
:
_biasTerms
=
UnsafeMutablePointer
<
Float
>.
allocate
(
capacity
:
_biasTensor
.
data
.
count
)
if
let
float16Point
=
_biasTensor
.
data
.
pointer
as?
UnsafeMutablePointer
<
Float16
>
{
float16to32
(
input
:
float16Point
,
output
:
_biasTerms
!
,
count
:
_biasTensor
.
data
.
count
)
}
}
class
ConvAddKernel
<
P
:
PrecisionProtocol
>
:
ConvAddReluKernel
<
P
>
{
override
func
hasAddOp
()
->
Bool
{
return
true
}
func
purge
()
{
switch
P
.
precisionType
{
case
.
Float32
:
return
case
.
Float16
:
_biasTerms
?
.
deinitialize
(
count
:
_biasTensor
.
data
.
count
)
_biasTerms
?
.
deallocate
()
}
}
func
label
()
->
String
?
{
return
"conv_add_label"
}
func
copy
(
with
zone
:
NSZone
?
=
nil
)
->
Any
{
return
self
}
init
(
inDesc
:
MPSCNNConvolutionDescriptor
,
inWeights
:
Tensor
<
P
>
,
inBiasTerms
:
Tensor
<
P
>
)
{
_descriptor
=
inDesc
_weightsTensor
=
inWeights
_biasTensor
=
inBiasTerms
super
.
init
()
}
func
descriptor
()
->
MPSCNNConvolutionDescriptor
{
return
_descriptor
}
func
dataType
()
->
MPSDataType
{
switch
P
.
precisionType
{
case
.
Float32
:
return
.
float32
case
.
Float16
:
return
.
float16
}
}
func
weights
()
->
UnsafeMutableRawPointer
{
return
UnsafeMutableRawPointer
.
init
(
_weightsTensor
.
data
.
pointer
)
}
func
biasTerms
()
->
UnsafeMutablePointer
<
Float
>
?
{
return
_biasTerms
}
}
class
ConvAddKernel
<
P
:
PrecisionProtocol
>
:
Kernel
,
Computable
{
var
metalParam
:
MetalConvParam
!
var
mpsConvOp
:
Any
?
required
init
(
device
:
MTLDevice
,
param
:
ConvAddParam
<
P
>
,
initContext
:
InitContext
)
throws
{
do
{
try
param
.
output
.
initTexture
(
device
:
device
,
inTranspose
:
[
0
,
2
,
3
,
1
],
computePrecision
:
GlobalConfig
.
shared
.
computePrecision
)
}
catch
let
error
{
throw
error
}
var
shouldUseMPS
=
false
let
functionName
=
type
(
of
:
self
)
.
kernelFunctionName
(
param
:
param
,
useAggressiveOptimization
:
initContext
.
useAggresiveOptimization
)
if
#available(iOS 11.0, *)
,
(
initContext
.
useMPS
||
initContext
.
useAggresiveOptimization
)
{
if
initContext
.
useAggresiveOptimization
{
if
(
param
.
input
.
tensorDim
[
1
]
==
1
||
param
.
input
.
tensorDim
[
1
]
>
4
)
&&
(
param
.
output
.
tensorDim
[
1
]
==
1
||
param
.
output
.
tensorDim
[
1
]
>
4
)
{
shouldUseMPS
=
true
}
}
else
{
if
param
.
input
.
tensorDim
[
1
]
>
4
&&
param
.
output
.
tensorDim
[
1
]
>
4
{
shouldUseMPS
=
true
}
}
}
if
type
(
of
:
self
)
.
isWinoGrad
(
functionName
:
functionName
)
{
shouldUseMPS
=
false
}
let
isDepthWise
=
param
.
filter
.
tensorDim
[
1
]
==
1
&&
param
.
filter
.
tensorDim
[
0
]
==
param
.
input
.
tensorDim
[
1
]
if
!
isDepthWise
&&
param
.
groups
>
1
{
shouldUseMPS
=
false
}
if
shouldUseMPS
{
super
.
init
(
device
:
device
,
inFunctionName
:
nil
,
initContext
:
initContext
)
setupWithMPS
(
device
:
device
,
param
:
param
)
}
else
{
if
functionName
==
nil
{
fatalError
(
" unsupport yet "
)
}
super
.
init
(
device
:
device
,
inFunctionName
:
functionName
,
initContext
:
initContext
)
setupWithoutMPS
(
device
:
device
,
param
:
param
)
}
}
func
compute
(
commandBuffer
:
MTLCommandBuffer
,
param
:
ConvAddParam
<
P
>
)
throws
{
if
#available(iOS 10.0, *)
{
if
let
conv
=
mpsConvOp
as?
MPSCNNConvolution
{
let
inputImage
=
MPSImage
.
init
(
texture
:
param
.
input
.
metalTexture
,
featureChannels
:
param
.
input
.
tensorDim
[
1
])
let
outputImage
=
MPSImage
.
init
(
texture
:
param
.
output
.
metalTexture
,
featureChannels
:
param
.
output
.
tensorDim
[
1
])
conv
.
encode
(
commandBuffer
:
commandBuffer
,
sourceImage
:
inputImage
,
destinationImage
:
outputImage
)
return
}
}
guard
let
encoder
=
commandBuffer
.
makeComputeCommandEncoder
()
else
{
throw
PaddleMobileError
.
predictError
(
message
:
" encode is nil"
)
}
encoder
.
setTexture
(
param
.
input
.
metalTexture
,
index
:
0
)
encoder
.
setTexture
(
param
.
output
.
metalTexture
,
index
:
1
)
encoder
.
setBytes
(
&
metalParam
,
length
:
MemoryLayout
<
MetalConvParam
>.
size
,
index
:
0
)
encoder
.
setBuffer
(
param
.
filter
.
buffer
,
offset
:
0
,
index
:
1
)
encoder
.
setBuffer
(
param
.
y
.
buffer
,
offset
:
0
,
index
:
2
)
encoder
.
dispatch
(
computePipline
:
pipline
,
outTexture
:
param
.
output
.
metalTexture
,
groupDepth
:
type
(
of
:
self
)
.
isWinoGrad
(
functionName
:
functionName
)
?
1
:
nil
)
encoder
.
endEncoding
()
}
func
setupWithMPS
(
device
:
MTLDevice
,
param
:
ConvAddParam
<
P
>
)
{
let
offsetX
=
(
Int
(
param
.
dilations
[
0
])
*
(
param
.
filter
.
tensorDim
[
3
]
-
1
)
+
1
)
/
2
-
Int
(
param
.
paddings
[
0
])
let
offsetY
=
(
Int
(
param
.
dilations
[
1
])
*
(
param
.
filter
.
tensorDim
[
2
]
-
1
)
+
1
)
/
2
-
Int
(
param
.
paddings
[
1
])
let
isDepthWise
=
param
.
filter
.
tensorDim
[
1
]
==
1
&&
param
.
filter
.
tensorDim
[
0
]
==
param
.
input
.
tensorDim
[
1
]
if
#available(iOS 11.0, *)
{
param
.
input
.
useMPS
=
true
param
.
output
.
useMPS
=
true
let
desc
:
MPSCNNConvolutionDescriptor
=
isDepthWise
?
MPSCNNDepthWiseConvolutionDescriptor
(
kernelWidth
:
param
.
filter
.
tensorDim
[
3
],
kernelHeight
:
param
.
filter
.
tensorDim
[
2
],
inputFeatureChannels
:
param
.
input
.
tensorDim
[
1
],
outputFeatureChannels
:
param
.
output
.
tensorDim
[
1
],
neuronFilter
:
neuronFilterForMPSLayer
(
device
:
device
)
as?
MPSCNNNeuron
)
:
MPSCNNConvolutionDescriptor
(
kernelWidth
:
param
.
filter
.
tensorDim
[
3
],
kernelHeight
:
param
.
filter
.
tensorDim
[
2
],
inputFeatureChannels
:
param
.
input
.
tensorDim
[
1
],
outputFeatureChannels
:
param
.
output
.
tensorDim
[
1
],
neuronFilter
:
neuronFilterForMPSLayer
(
device
:
device
)
as?
MPSCNNNeuron
)
desc
.
strideInPixelsX
=
Int
(
param
.
stride
[
0
])
desc
.
strideInPixelsY
=
Int
(
param
.
stride
[
1
])
let
_
=
param
.
filter
.
convert
(
converter
:
MPSPointerConverter
<
P
>.
init
())
let
dataSource
=
ConvDataSource
.
init
(
inDesc
:
desc
,
inWeights
:
param
.
filter
,
inBiasTerms
:
param
.
y
)
let
conv
=
MPSCNNConvolution
.
init
(
device
:
device
,
weights
:
dataSource
)
conv
.
offset
=
MPSOffset
.
init
(
x
:
offsetX
,
y
:
offsetY
,
z
:
0
)
conv
.
edgeMode
=
.
zero
mpsConvOp
=
conv
}
}
func
setupWithoutMPS
(
device
:
MTLDevice
,
param
:
ConvAddParam
<
P
>
)
{
let
offsetX
=
(
Int
(
param
.
dilations
[
0
])
*
(
param
.
filter
.
tensorDim
[
3
]
-
1
)
+
1
)
/
2
-
Int
(
param
.
paddings
[
0
])
let
offsetY
=
(
Int
(
param
.
dilations
[
1
])
*
(
param
.
filter
.
tensorDim
[
2
]
-
1
)
+
1
)
/
2
-
Int
(
param
.
paddings
[
1
])
let
offsetZ
=
0.0
let
iC
=
param
.
input
.
tensorDim
[
1
];
let
fC
=
param
.
filter
.
tensorDim
[
1
];
let
oC
=
param
.
output
.
tensorDim
[
1
];
let
inMetalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
))
metalParam
=
inMetalParam
if
type
(
of
:
self
)
.
isWinoGrad
(
functionName
:
functionName
)
{
let
_
=
param
.
filter
.
convert
(
converter
:
WinogradPointerConverter
<
P
>.
init
())
}
let
padWhenOneC
=
!
(
param
.
filter
.
channel
==
1
&&
param
.
filter
.
n
==
param
.
input
.
tensorDim
[
1
])
param
.
filter
.
initBuffer
(
device
:
device
,
precision
:
GlobalConfig
.
shared
.
computePrecision
,
padWhenOneC
:
padWhenOneC
)
param
.
y
.
initBuffer
(
device
:
device
,
precision
:
GlobalConfig
.
shared
.
computePrecision
)
}
open
class
func
kernelFunctionName
(
param
:
ConvAddParam
<
P
>
,
useAggressiveOptimization
:
Bool
=
false
)
->
String
?
{
if
GlobalConfig
.
shared
.
computePrecision
==
.
Float16
{
if
param
.
filter
.
width
==
1
&&
param
.
filter
.
height
==
1
{
return
"conv_add_1x1_half"
}
else
if
param
.
filter
.
channel
==
1
&&
param
.
filter
.
n
==
param
.
input
.
tensorDim
[
1
]
{
return
"depthwise_conv_add_3x3_half"
}
else
if
param
.
filter
.
width
==
3
&&
param
.
filter
.
height
==
3
{
return
"conv_add_3x3_half"
}
else
if
param
.
filter
.
width
==
1
&&
param
.
filter
.
height
==
5
{
return
"conv_add_5x1_half"
}
else
if
param
.
filter
.
width
==
5
&&
param
.
filter
.
height
==
1
{
return
"conv_add_1x5_half"
}
else
{
return
nil
}
}
else
if
GlobalConfig
.
shared
.
computePrecision
==
.
Float32
{
if
param
.
filter
.
width
==
1
&&
param
.
filter
.
height
==
1
{
return
"conv_add_1x1"
}
else
if
param
.
filter
.
channel
==
1
&&
param
.
filter
.
n
==
param
.
input
.
tensorDim
[
1
]
{
return
"depthwise_conv_add_3x3"
}
else
if
param
.
filter
.
width
==
1
&&
param
.
filter
.
height
==
5
{
return
"conv_add_5x1"
}
else
if
param
.
filter
.
width
==
5
&&
param
.
filter
.
height
==
1
{
return
"conv_add_1x5"
}
else
if
param
.
filter
.
width
==
3
&&
param
.
filter
.
height
==
3
{
return
"conv_add_3x3"
}
else
{
return
nil
}
}
else
{
return
nil
}
override
func
hasReluOp
()
->
Bool
{
return
false
}
func
neuronFilterForMPSLayer
(
device
:
MTLDevice
)
->
AnyObject
?
{
override
func
neuronFilterForMPSLayer
(
device
:
MTLDevice
)
->
AnyObject
?
{
return
nil
}
open
class
func
isWinoGrad
(
functionName
:
String
?)
->
Bool
{
if
let
functionName
=
functionName
{
return
functionName
.
hasSuffix
(
"winograd"
)
}
return
false
}
}
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
浏览文件 @
b53a20aa
...
...
@@ -135,7 +135,7 @@ class ConvAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
let
iC
=
param
.
input
.
tensorDim
[
1
];
let
fC
=
param
.
filter
.
tensorDim
[
1
];
let
oC
=
param
.
output
.
tensorDim
[
1
];
let
inMetalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
))
let
inMetalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
)
,
hasAddOp
:
UInt16
(
0
),
hasReluOp
:
UInt16
(
0
)
)
// print("metal param: ")
// print(inMetalParam)
...
...
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
浏览文件 @
b53a20aa
//
// ConvAddReluKernel.swift
// paddle-mobile
//
// Created by Yang,Yanzhan on 2019/4/29.
// Copyright © 2019 orange. All rights reserved.
//
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import
Foundation
import
MetalPerformanceShaders
class
ConvAddReluKernel
<
P
:
PrecisionProtocol
>
:
ConvAddKernel
<
P
>
{
override
class
func
kernelFunctionName
(
param
:
ConvAddParam
<
P
>
,
useAggressiveOptimization
:
Bool
=
false
)
->
String
?
{
public
struct
MetalConvParam
{
let
offsetX
:
Int16
let
offsetY
:
Int16
let
offsetZ
:
Int16
let
strideX
:
UInt16
let
strideY
:
UInt16
let
dilationX
:
UInt16
let
dilationY
:
UInt16
let
groups
:
UInt16
let
iC
:
UInt16
let
fC
:
UInt16
let
oC
:
UInt16
let
hasAddOp
:
UInt16
let
hasReluOp
:
UInt16
}
@available
(
iOS
11.0
,
*
)
class
ConvDataSource
<
P
:
PrecisionProtocol
>
:
NSObject
,
MPSCNNConvolutionDataSource
{
var
_descriptor
:
MPSCNNConvolutionDescriptor
var
_weightsTensor
:
Tensor
<
P
>
var
_biasTensor
:
Tensor
<
P
>
?
var
_biasTerms
:
UnsafeMutablePointer
<
Float
>
?
func
load
()
->
Bool
{
if
let
biasTensor
=
_biasTensor
{
switch
P
.
precisionType
{
case
.
Float32
:
_biasTerms
=
biasTensor
.
data
.
pointer
as?
UnsafeMutablePointer
<
Float
>
case
.
Float16
:
_biasTerms
=
UnsafeMutablePointer
<
Float
>.
allocate
(
capacity
:
biasTensor
.
data
.
count
)
if
let
float16Point
=
biasTensor
.
data
.
pointer
as?
UnsafeMutablePointer
<
Float16
>
{
float16to32
(
input
:
float16Point
,
output
:
_biasTerms
!
,
count
:
biasTensor
.
data
.
count
)
}
}
}
return
true
}
func
purge
()
{
switch
P
.
precisionType
{
case
.
Float32
:
return
case
.
Float16
:
if
let
biasTensor
=
_biasTensor
{
_biasTerms
?
.
deinitialize
(
count
:
biasTensor
.
data
.
count
)
_biasTerms
?
.
deallocate
()
}
}
}
func
label
()
->
String
?
{
return
"conv_add_label"
}
func
copy
(
with
zone
:
NSZone
?
=
nil
)
->
Any
{
return
self
}
init
(
inDesc
:
MPSCNNConvolutionDescriptor
,
inWeights
:
Tensor
<
P
>
,
inBiasTerms
:
Tensor
<
P
>
?)
{
_descriptor
=
inDesc
_weightsTensor
=
inWeights
_biasTensor
=
inBiasTerms
super
.
init
()
}
func
descriptor
()
->
MPSCNNConvolutionDescriptor
{
return
_descriptor
}
func
dataType
()
->
MPSDataType
{
switch
P
.
precisionType
{
case
.
Float32
:
return
.
float32
case
.
Float16
:
return
.
float16
}
}
func
weights
()
->
UnsafeMutableRawPointer
{
return
UnsafeMutableRawPointer
.
init
(
_weightsTensor
.
data
.
pointer
)
}
func
biasTerms
()
->
UnsafeMutablePointer
<
Float
>
?
{
return
_biasTerms
}
}
class
ConvAddReluKernel
<
P
:
PrecisionProtocol
>
:
Kernel
,
Computable
{
var
metalParam
:
MetalConvParam
!
var
mpsConvOp
:
Any
?
var
blankTensor
:
Tensor
<
P
>
?
required
init
(
device
:
MTLDevice
,
param
:
ConvAddReluParam
<
P
>
,
initContext
:
InitContext
)
throws
{
do
{
try
param
.
output
.
initTexture
(
device
:
device
,
inTranspose
:
[
0
,
2
,
3
,
1
],
computePrecision
:
GlobalConfig
.
shared
.
computePrecision
)
}
catch
let
error
{
throw
error
}
var
shouldUseMPS
=
false
let
functionName
=
type
(
of
:
self
)
.
kernelFunctionName
(
param
:
param
,
useAggressiveOptimization
:
initContext
.
useAggresiveOptimization
)
if
#available(iOS 11.0, *)
,
(
initContext
.
useMPS
||
initContext
.
useAggresiveOptimization
)
{
if
initContext
.
useAggresiveOptimization
{
if
(
param
.
input
.
tensorDim
[
1
]
==
1
||
param
.
input
.
tensorDim
[
1
]
>
4
)
&&
(
param
.
output
.
tensorDim
[
1
]
==
1
||
param
.
output
.
tensorDim
[
1
]
>
4
)
{
shouldUseMPS
=
true
}
}
else
{
if
param
.
input
.
tensorDim
[
1
]
>
4
&&
param
.
output
.
tensorDim
[
1
]
>
4
{
shouldUseMPS
=
true
}
}
}
if
type
(
of
:
self
)
.
isWinoGrad
(
functionName
:
functionName
)
{
shouldUseMPS
=
false
}
let
isDepthWise
=
param
.
filter
.
tensorDim
[
1
]
==
1
&&
param
.
filter
.
tensorDim
[
0
]
==
param
.
input
.
tensorDim
[
1
]
if
!
isDepthWise
&&
param
.
groups
>
1
{
shouldUseMPS
=
false
}
if
shouldUseMPS
{
super
.
init
(
device
:
device
,
inFunctionName
:
nil
,
initContext
:
initContext
)
setupWithMPS
(
device
:
device
,
param
:
param
)
}
else
{
if
functionName
==
nil
{
fatalError
(
" unsupport yet "
)
}
super
.
init
(
device
:
device
,
inFunctionName
:
functionName
,
initContext
:
initContext
)
setupWithoutMPS
(
device
:
device
,
param
:
param
)
}
}
func
compute
(
commandBuffer
:
MTLCommandBuffer
,
param
:
ConvAddReluParam
<
P
>
)
throws
{
if
#available(iOS 10.0, *)
{
if
let
conv
=
mpsConvOp
as?
MPSCNNConvolution
{
let
inputImage
=
MPSImage
.
init
(
texture
:
param
.
input
.
metalTexture
,
featureChannels
:
param
.
input
.
tensorDim
[
1
])
let
outputImage
=
MPSImage
.
init
(
texture
:
param
.
output
.
metalTexture
,
featureChannels
:
param
.
output
.
tensorDim
[
1
])
conv
.
encode
(
commandBuffer
:
commandBuffer
,
sourceImage
:
inputImage
,
destinationImage
:
outputImage
)
return
}
}
guard
let
encoder
=
commandBuffer
.
makeComputeCommandEncoder
()
else
{
throw
PaddleMobileError
.
predictError
(
message
:
" encode is nil"
)
}
encoder
.
setTexture
(
param
.
input
.
metalTexture
,
index
:
0
)
encoder
.
setTexture
(
param
.
output
.
metalTexture
,
index
:
1
)
encoder
.
setBytes
(
&
metalParam
,
length
:
MemoryLayout
<
MetalConvParam
>.
size
,
index
:
0
)
encoder
.
setBuffer
(
param
.
filter
.
buffer
,
offset
:
0
,
index
:
1
)
if
let
y
=
param
.
y
{
encoder
.
setBuffer
(
y
.
buffer
,
offset
:
0
,
index
:
2
)
}
else
{
encoder
.
setBuffer
(
blankTensor
?
.
buffer
,
offset
:
0
,
index
:
2
)
}
encoder
.
dispatch
(
computePipline
:
pipline
,
outTexture
:
param
.
output
.
metalTexture
,
groupDepth
:
type
(
of
:
self
)
.
isWinoGrad
(
functionName
:
functionName
)
?
1
:
nil
)
encoder
.
endEncoding
()
}
func
setupWithMPS
(
device
:
MTLDevice
,
param
:
ConvAddReluParam
<
P
>
)
{
let
offsetX
=
(
Int
(
param
.
dilations
[
0
])
*
(
param
.
filter
.
tensorDim
[
3
]
-
1
)
+
1
)
/
2
-
Int
(
param
.
paddings
[
0
])
let
offsetY
=
(
Int
(
param
.
dilations
[
1
])
*
(
param
.
filter
.
tensorDim
[
2
]
-
1
)
+
1
)
/
2
-
Int
(
param
.
paddings
[
1
])
let
isDepthWise
=
param
.
filter
.
tensorDim
[
1
]
==
1
&&
param
.
filter
.
tensorDim
[
0
]
==
param
.
input
.
tensorDim
[
1
]
if
#available(iOS 11.0, *)
{
param
.
input
.
useMPS
=
true
param
.
output
.
useMPS
=
true
let
desc
:
MPSCNNConvolutionDescriptor
=
isDepthWise
?
MPSCNNDepthWiseConvolutionDescriptor
(
kernelWidth
:
param
.
filter
.
tensorDim
[
3
],
kernelHeight
:
param
.
filter
.
tensorDim
[
2
],
inputFeatureChannels
:
param
.
input
.
tensorDim
[
1
],
outputFeatureChannels
:
param
.
output
.
tensorDim
[
1
],
neuronFilter
:
neuronFilterForMPSLayer
(
device
:
device
)
as?
MPSCNNNeuron
)
:
MPSCNNConvolutionDescriptor
(
kernelWidth
:
param
.
filter
.
tensorDim
[
3
],
kernelHeight
:
param
.
filter
.
tensorDim
[
2
],
inputFeatureChannels
:
param
.
input
.
tensorDim
[
1
],
outputFeatureChannels
:
param
.
output
.
tensorDim
[
1
],
neuronFilter
:
neuronFilterForMPSLayer
(
device
:
device
)
as?
MPSCNNNeuron
)
desc
.
strideInPixelsX
=
Int
(
param
.
stride
[
0
])
desc
.
strideInPixelsY
=
Int
(
param
.
stride
[
1
])
let
_
=
param
.
filter
.
convert
(
converter
:
MPSPointerConverter
<
P
>.
init
())
let
dataSource
=
ConvDataSource
.
init
(
inDesc
:
desc
,
inWeights
:
param
.
filter
,
inBiasTerms
:
param
.
y
)
let
conv
=
MPSCNNConvolution
.
init
(
device
:
device
,
weights
:
dataSource
)
conv
.
offset
=
MPSOffset
.
init
(
x
:
offsetX
,
y
:
offsetY
,
z
:
0
)
conv
.
edgeMode
=
.
zero
mpsConvOp
=
conv
}
}
func
setupWithoutMPS
(
device
:
MTLDevice
,
param
:
ConvAddReluParam
<
P
>
)
{
let
offsetX
=
(
Int
(
param
.
dilations
[
0
])
*
(
param
.
filter
.
tensorDim
[
3
]
-
1
)
+
1
)
/
2
-
Int
(
param
.
paddings
[
0
])
let
offsetY
=
(
Int
(
param
.
dilations
[
1
])
*
(
param
.
filter
.
tensorDim
[
2
]
-
1
)
+
1
)
/
2
-
Int
(
param
.
paddings
[
1
])
let
offsetZ
=
0.0
let
iC
=
param
.
input
.
tensorDim
[
1
];
let
fC
=
param
.
filter
.
tensorDim
[
1
];
let
oC
=
param
.
output
.
tensorDim
[
1
];
let
inMetalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
),
hasAddOp
:
UInt16
(
hasAddOp
()
?
1
:
0
),
hasReluOp
:
UInt16
(
hasReluOp
()
?
1
:
0
))
metalParam
=
inMetalParam
if
type
(
of
:
self
)
.
isWinoGrad
(
functionName
:
functionName
)
{
let
_
=
param
.
filter
.
convert
(
converter
:
WinogradPointerConverter
<
P
>.
init
())
}
let
padWhenOneC
=
!
(
param
.
filter
.
channel
==
1
&&
param
.
filter
.
n
==
param
.
input
.
tensorDim
[
1
])
param
.
filter
.
initBuffer
(
device
:
device
,
precision
:
GlobalConfig
.
shared
.
computePrecision
,
padWhenOneC
:
padWhenOneC
)
if
let
y
=
param
.
y
{
y
.
initBuffer
(
device
:
device
,
precision
:
GlobalConfig
.
shared
.
computePrecision
)
}
else
{
blankTensor
=
Tensor
<
P
>.
init
(
inDim
:
Dim
(
inDim
:
[
1
,
1
,
1
,
4
]),
inLayout
:
DataLayout
.
NHWC
(),
originDimsCount
:
4
)
blankTensor
?
.
initBuffer
(
device
:
device
,
precision
:
GlobalConfig
.
shared
.
computePrecision
)
}
}
class
func
kernelFunctionName
(
param
:
ConvAddReluParam
<
P
>
,
useAggressiveOptimization
:
Bool
=
false
)
->
String
?
{
if
GlobalConfig
.
shared
.
computePrecision
==
.
Float16
{
if
param
.
filter
.
width
==
1
&&
param
.
filter
.
height
==
1
{
return
"conv_add_relu_1x1_half"
...
...
@@ -60,10 +277,27 @@ class ConvAddReluKernel<P: PrecisionProtocol>: ConvAddKernel<P> {
}
}
override
func
neuronFilterForMPSLayer
(
device
:
MTLDevice
)
->
AnyObject
?
{
if
#available(iOS 10.0, *)
{
return
MPSCNNNeuronReLU
(
device
:
device
,
a
:
0
)
open
func
neuronFilterForMPSLayer
(
device
:
MTLDevice
)
->
AnyObject
?
{
if
hasReluOp
()
{
if
#available(iOS 10.0, *)
{
return
MPSCNNNeuronReLU
(
device
:
device
,
a
:
0
)
}
}
return
nil
}
open
func
hasAddOp
()
->
Bool
{
return
true
}
open
func
hasReluOp
()
->
Bool
{
return
true
}
open
class
func
isWinoGrad
(
functionName
:
String
?)
->
Bool
{
if
let
functionName
=
functionName
{
return
functionName
.
hasSuffix
(
"winograd"
)
}
return
false
}
}
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
浏览文件 @
b53a20aa
...
...
@@ -105,7 +105,7 @@ class ConvBNReluKernel<P: PrecisionProtocol>: Kernel, Computable, Testable {
let
iC
=
param
.
input
.
tensorDim
[
1
];
let
fC
=
param
.
filter
.
tensorDim
[
1
];
let
oC
=
param
.
output
.
tensorDim
[
1
];
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
))
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
)
,
hasAddOp
:
UInt16
(
0
),
hasReluOp
:
UInt16
(
0
)
)
var
invs
:
[
P
]
=
[]
let
varianceContents
=
param
.
variance
.
buffer
.
contents
()
.
assumingMemoryBound
(
to
:
P
.
self
)
...
...
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
浏览文件 @
b53a20aa
...
...
@@ -13,55 +13,188 @@
limitations under the License. */
import
Foundation
/// Parameter block handed to the conv Metal shaders via `setBytes`.
///
/// NOTE(review): field order and widths must match the `MetalConvParam`
/// struct on the Metal side byte-for-byte — confirm against Common.metal.
public struct MetalConvParam {
    let offsetX: Int16      // filter-center x offset minus left padding
    let offsetY: Int16      // filter-center y offset minus top padding
    let offsetZ: Int16
    let strideX: UInt16
    let strideY: UInt16
    let dilationX: UInt16
    let dilationY: UInt16
    let groups: UInt16
    let iC: UInt16          // input channels
    let fC: UInt16          // filter channels
    let oC: UInt16          // output channels
    // Fused-op flags for the unified conv/conv-add/conv-add-relu shader
    // (1 = fused, 0 = not). Required: the call sites in this commit —
    // setupWithoutMPS and ConvBNReluKernel — construct MetalConvParam with
    // hasAddOp:/hasReluOp: arguments, which this struct previously lacked.
    let hasAddOp: UInt16
    let hasReluOp: UInt16
}
import
MetalPerformanceShaders
class
ConvKernel
<
P
:
PrecisionProtocol
>
:
Kernel
,
Computable
{
var
metalParam
:
MetalConvParam
!
var
mpsConvOp
:
Any
?
var
blankTensor
:
Tensor
<
P
>
?
required
init
(
device
:
MTLDevice
,
param
:
ConvParam
<
P
>
,
initContext
:
InitContext
)
throws
{
param
.
filter
.
initBuffer
(
device
:
device
,
precision
:
Precision
.
Float32
)
if
param
.
filter
.
width
==
1
&&
param
.
filter
.
height
==
1
{
super
.
init
(
device
:
device
,
inFunctionName
:
"conv_1x1"
,
initContext
:
initContext
)
}
else
if
param
.
filter
.
channel
==
1
{
super
.
init
(
device
:
device
,
inFunctionName
:
"depthwise_conv_3x3"
,
initContext
:
initContext
)
}
else
if
param
.
filter
.
width
==
3
&&
param
.
filter
.
height
==
3
{
super
.
init
(
device
:
device
,
inFunctionName
:
"conv_3x3"
,
initContext
:
initContext
)
}
else
{
fatalError
(
" unsupport "
)
do
{
try
param
.
output
.
initTexture
(
device
:
device
,
inTranspose
:
[
0
,
2
,
3
,
1
],
computePrecision
:
GlobalConfig
.
shared
.
computePrecision
)
}
catch
let
error
{
throw
error
}
let
offsetX
=
param
.
filter
.
dim
[
2
]
/
2
-
Int
(
param
.
paddings
[
0
])
let
offsetY
=
param
.
filter
.
dim
[
1
]
/
2
-
Int
(
param
.
paddings
[
1
])
let
offsetZ
=
0.0
let
iC
=
param
.
input
.
tensorDim
[
1
];
let
fC
=
param
.
filter
.
tensorDim
[
1
];
let
oC
=
param
.
output
.
tensorDim
[
1
];
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
))
var
shouldUseMPS
=
false
let
functionName
=
type
(
of
:
self
)
.
kernelFunctionName
(
param
:
param
,
useAggressiveOptimization
:
initContext
.
useAggresiveOptimization
)
if
#available(iOS 11.0, *)
,
(
initContext
.
useMPS
||
initContext
.
useAggresiveOptimization
)
{
if
initContext
.
useAggresiveOptimization
{
if
(
param
.
input
.
tensorDim
[
1
]
==
1
||
param
.
input
.
tensorDim
[
1
]
>
4
)
&&
(
param
.
output
.
tensorDim
[
1
]
==
1
||
param
.
output
.
tensorDim
[
1
]
>
4
)
{
shouldUseMPS
=
true
}
}
else
{
if
param
.
input
.
tensorDim
[
1
]
>
4
&&
param
.
output
.
tensorDim
[
1
]
>
4
{
shouldUseMPS
=
true
}
}
}
if
type
(
of
:
self
)
.
isWinoGrad
(
functionName
:
functionName
)
{
shouldUseMPS
=
false
}
let
isDepthWise
=
param
.
filter
.
tensorDim
[
1
]
==
1
&&
param
.
filter
.
tensorDim
[
0
]
==
param
.
input
.
tensorDim
[
1
]
if
!
isDepthWise
&&
param
.
groups
>
1
{
shouldUseMPS
=
false
}
if
shouldUseMPS
{
super
.
init
(
device
:
device
,
inFunctionName
:
nil
,
initContext
:
initContext
)
setupWithMPS
(
device
:
device
,
param
:
param
)
}
else
{
if
functionName
==
nil
{
fatalError
(
" unsupport yet "
)
}
super
.
init
(
device
:
device
,
inFunctionName
:
functionName
,
initContext
:
initContext
)
setupWithoutMPS
(
device
:
device
,
param
:
param
)
}
}
/// Encodes this convolution into `commandBuffer`.
///
/// Fast path: when `mpsConvOp` was set up (see `setupWithMPS`), wrap the
/// input/output textures in `MPSImage`s and let MPSCNNConvolution encode.
/// Otherwise bind textures/buffers and dispatch our own compute pipeline.
/// - Throws: `PaddleMobileError.predictError` if no compute encoder can be made.
func compute(commandBuffer: MTLCommandBuffer, param: ConvParam<P>) throws {
    if #available(iOS 10.0, *) {
        if let conv = mpsConvOp as? MPSCNNConvolution {
            let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1])
            let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1])
            conv.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage)
            return
        }
    }
    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
        throw PaddleMobileError.predictError(message: " encode is nil")
    }
    // Binding layout consumed by the shader: texture 0 = input,
    // texture 1 = output; buffer 0 = MetalConvParam, buffer 1 = filter.
    encoder.setTexture(param.input.metalTexture, index: 0)
    encoder.setTexture(param.output.metalTexture, index: 1)
    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
    // NOTE(review): the kernel is dispatched twice below. The first dispatch
    // (without buffer index 2 bound) looks like a leftover of the replaced
    // implementation shown interleaved by the diff view — confirm against
    // the repository; most likely only the groupDepth dispatch belongs.
    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
    // blankTensor stands in for the bias when no real add op is fused,
    // so buffer index 2 is always bound for the unified shader.
    encoder.setBuffer(blankTensor?.buffer, offset: 0, index: 2)
    // WinoGrad variants are dispatched with a group depth of 1.
    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture, groupDepth: type(of: self).isWinoGrad(functionName: functionName) ? 1 : nil)
    encoder.endEncoding()
}
/// Builds an `MPSCNNConvolution` for this layer and stores it in `mpsConvOp`.
///
/// Only runs on iOS 11+; below that `mpsConvOp` stays nil and `compute`
/// falls back to the hand-written Metal kernel.
/// Filter tensorDim layout here is [n, c, h, w]: index 3 = kernel width,
/// index 2 = kernel height (as used in the descriptor constructors below).
func setupWithMPS(device: MTLDevice, param: ConvParam<P>) {
    // Effective half-kernel extent under dilation, minus the padding,
    // gives the MPS sampling offset.
    let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
    let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
    // Depthwise: single filter channel and one filter per input channel.
    let isDepthWise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]
    if #available(iOS 11.0, *) {
        param.input.useMPS = true
        param.output.useMPS = true
        // Fused activation (if any) comes from neuronFilterForMPSLayer,
        // which subclasses override.
        let desc: MPSCNNConvolutionDescriptor = isDepthWise ?
            MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
                                                 kernelHeight: param.filter.tensorDim[2],
                                                 inputFeatureChannels: param.input.tensorDim[1],
                                                 outputFeatureChannels: param.output.tensorDim[1],
                                                 neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) :
            MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
                                        kernelHeight: param.filter.tensorDim[2],
                                        inputFeatureChannels: param.input.tensorDim[1],
                                        outputFeatureChannels: param.output.tensorDim[1],
                                        neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron)
        desc.strideInPixelsX = Int(param.stride[0])
        desc.strideInPixelsY = Int(param.stride[1])
        // Reorder the filter weights into MPS layout BEFORE wrapping them
        // in the data source.
        let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
        let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: nil)
        let conv = MPSCNNConvolution.init(device: device, weights: dataSource)
        conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0)
        conv.edgeMode = .zero
        mpsConvOp = conv
    }
}
/// Prepares state for the hand-written Metal kernel path: fills in
/// `metalParam`, converts/uploads the filter weights, and allocates the
/// 1x1x1x4 `blankTensor` used as a stand-in bias buffer.
func setupWithoutMPS(device: MTLDevice, param: ConvParam<P>) {
    // Effective half-kernel extent under dilation, minus the padding.
    // Filter tensorDim layout is [n, c, h, w].
    let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
    let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
    let offsetZ = 0.0  // Double literal; narrowed to Int16 below
    let iC = param.input.tensorDim[1];
    let fC = param.filter.tensorDim[1];
    let oC = param.output.tensorDim[1];
    // hasAddOp/hasReluOp tell the unified shader which fused ops to apply;
    // subclasses override those methods to turn them on.
    let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 1 : 0))
    metalParam = inMetalParam
    // WinoGrad variants need the weights pre-transformed; this must happen
    // before initBuffer uploads them.
    if type(of: self).isWinoGrad(functionName: functionName) {
        let _ = param.filter.convert(converter: WinogradPointerConverter<P>.init())
    }
    // Pad single-channel filters unless this is a true depthwise conv
    // (channel == 1 and one filter per input channel).
    let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1])
    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC)
    // Zero-initialized? — buffer contents not set explicitly here; it is
    // bound at buffer index 2 in compute() as the bias when no add is fused.
    blankTensor = Tensor<P>.init(inDim: Dim(inDim: [1, 1, 1, 4]), inLayout: DataLayout.NHWC(), originDimsCount: 4)
    blankTensor?.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
}
/// Selects the Metal shader function name for the given conv configuration,
/// or nil when no shader variant covers it (callers treat nil as
/// unsupported unless the MPS path applies).
///
/// Depthwise is tested before the generic 3x3 case on purpose: a depthwise
/// 3x3 filter satisfies both conditions and must get the depthwise shader.
class func kernelFunctionName(param: ConvParam<P>, useAggressiveOptimization: Bool = false) -> String? {
    let filterW = param.filter.width
    let filterH = param.filter.height
    let isDepthWise = param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1]
    let precision = GlobalConfig.shared.computePrecision

    if precision == .Float16 {
        if filterW == 1 && filterH == 1 {
            return "conv_add_relu_1x1_half"
        }
        if isDepthWise {
            if useAggressiveOptimization {
                // WinoGrad only for the exact 3x3, 16-filter,
                // unit-stride, unit-dilation case.
                let winogradEligible = filterW == 3 && filterH == 3
                    && param.filter.n == 16
                    && param.stride[0] == 1 && param.stride[1] == 1
                    && param.dilations[0] == 1 && param.dilations[1] == 1
                if winogradEligible {
                    return "depthwise_conv_add_relu_3x3_half_winograd"
                }
            }
            return "depthwise_conv_add_relu_3x3_half"
        }
        if filterW == 3 && filterH == 3 {
            return param.groups == 1 ? "conv_add_relu_3x3_half" : "group_conv_add_relu_3x3_half"
        }
        if filterW == 1 && filterH == 5 {
            return "conv_add_relu_5x1_half"
        }
        if filterW == 5 && filterH == 1 {
            return "conv_add_relu_1x5_half"
        }
        return nil
    }

    if precision == .Float32 {
        if filterW == 1 && filterH == 1 {
            return "conv_add_relu_1x1"
        }
        if isDepthWise {
            return "depthwise_conv_add_relu_3x3"
        }
        if filterW == 1 && filterH == 5 {
            return "conv_add_relu_5x1"
        }
        if filterW == 5 && filterH == 1 {
            return "conv_add_relu_1x5"
        }
        if filterW == 3 && filterH == 3 {
            return param.groups == 1 ? "conv_add_relu_3x3" : "group_conv_add_relu_3x3"
        }
        return nil
    }

    return nil
}
/// Neuron (activation) filter to attach to the MPS convolution descriptor.
/// Base conv fuses no activation, so the default is nil; subclasses that
/// fuse a relu override this to return an MPSCNNNeuronReLU.
open func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? {
    return nil
}
/// Whether an elementwise add (bias) op is fused into this conv kernel.
/// Base conv: no. Feeds MetalConvParam.hasAddOp in setupWithoutMPS.
open func hasAddOp() -> Bool {
    return false
}
/// Whether a relu op is fused into this conv kernel.
/// Base conv: no. Feeds MetalConvParam.hasReluOp in setupWithoutMPS.
open func hasReluOp() -> Bool {
    return false
}
/// A shader function name denotes the WinoGrad variant iff it ends in
/// "winograd". A `nil` name is never WinoGrad.
open class func isWinoGrad(functionName: String?) -> Bool {
    return functionName?.hasSuffix("winograd") ?? false
}
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录