fuse Conv-Add-Relu into one op.

200be579 · yangyanzhan · 99cccf3b · 200be579 · 200be579 · 200be579
9 changed files
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj
@@ -7,6 +7,7 @@
 	objects = {
 /* Begin PBXBuildFile section */
+		165F38D72276F4C00088E29F /* ConvAddReluMetal.metal in Sources */ = {isa = PBXBuildFile; fileRef = 165F38D62276F4C00088E29F /* ConvAddReluMetal.metal */; };
 		5CCC0CF6759710BAFE999DB7 /* Pods_paddle_mobile_metallib.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 5D9D330A035906298947080B /* Pods_paddle_mobile_metallib.framework */; };
 		FCC15DE5221E69E100DC3CB2 /* ReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DBC221E69DD00DC3CB2 /* ReluKernel.metal */; };
 		FCC15DE6221E69E100DC3CB2 /* BoxCoder.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DBD221E69DD00DC3CB2 /* BoxCoder.metal */; };
@@ -52,6 +53,7 @@
 /* End PBXBuildFile section */
 /* Begin PBXFileReference section */
+		165F38D62276F4C00088E29F /* ConvAddReluMetal.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddReluMetal.metal; sourceTree = "<group>"; };
 		33511F4FF7FE78679BE12DC0 /* Pods-paddle-mobile-metallib.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile-metallib.release.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile-metallib/Pods-paddle-mobile-metallib.release.xcconfig"; sourceTree = "<group>"; };
 		5D9D330A035906298947080B /* Pods_paddle_mobile_metallib.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_paddle_mobile_metallib.framework; sourceTree = BUILT_PRODUCTS_DIR; };
 		C6D31B9F9533810DBCA6B28D /* Pods-paddle-mobile-metallib.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile-metallib.debug.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile-metallib/Pods-paddle-mobile-metallib.debug.xcconfig"; sourceTree = "<group>"; };
@@ -190,6 +192,7 @@
 				FCC15DBF221E69DD00DC3CB2 /* Split.metal */,
 				FCC15DC9221E69DE00DC3CB2 /* TransposeKernel.inc.metal */,
 				FCC15DDA221E69E000DC3CB2 /* TransposeKernel.metal */,
+				165F38D62276F4C00088E29F /* ConvAddReluMetal.metal */,
 			);
 			path = "paddle-mobile-metallib";
 			sourceTree = "<group>";
@@ -310,6 +313,7 @@
 				FCC15E08221E69E100DC3CB2 /* Split.inc.metal in Sources */,
 				FCC15DF4221E69E100DC3CB2 /* ResizeBilinear.metal in Sources */,
 				FCC15E05221E69E100DC3CB2 /* BatchNormKernel.metal in Sources */,
+				165F38D72276F4C00088E29F /* ConvAddReluMetal.metal in Sources */,
 				FCC15DE6221E69E100DC3CB2 /* BoxCoder.metal in Sources */,
 				FCC15DF6221E69E100DC3CB2 /* PoolKernel.metal in Sources */,
 				FCC15E09221E69E100DC3CB2 /* ConcatKernel.inc.metal in Sources */,

--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+#pragma mark - convAddRelu
+/// Fused 1x1 convolution + bias + ReLU on float textures.
+///
+/// One thread computes the output texel at `gid.xy` in slice `gid.z`. The sum
+/// starts from `biase[gid.z]`; for every input slice the sampled texel is
+/// dotted with four weight vectors (one per output component), and the result
+/// goes through `fmax(..., 0.0)` before it is written.
+///
+/// Weight index used below (in float4 units):
+///   (gid.z * 4 + component) * sliceCount + inputSlice
+///
+/// NOTE(review): the input position is computed in ushort, so a negative
+/// offset would wrap around -- presumably callers keep it in range; confirm.
+kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device float4 *weights [[buffer(1)]],
+                         const device float4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+    // Threads outside the output volume have nothing to write.
+    const bool insideOutput = gid.x < outTexture.get_width() &&
+                              gid.y < outTexture.get_height() &&
+                              gid.z < outTexture.get_array_size();
+    if (!insideOutput) {
+        return;
+    }
+
+    constexpr sampler texelSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+    // Input position that corresponds to this output texel.
+    const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY)
+                         + ushort2(param.offsetX, param.offsetY);
+    const float2 samplePos = float2(origin.x, origin.y);
+
+    const uint sliceCount = inTexture.get_array_size();
+    // First weight vector that belongs to this output slice.
+    const uint firstWeight = gid.z * sliceCount * 4;
+
+    float4 acc = biase[gid.z];
+    for (uint slice = 0; slice < sliceCount; ++slice) {
+        const float4 texel = inTexture.sample(texelSampler, samplePos, slice);
+        const uint w = firstWeight + slice;
+        acc.x += dot(texel, weights[w]);
+        acc.y += dot(texel, weights[w + sliceCount]);
+        acc.z += dot(texel, weights[w + 2 * sliceCount]);
+        acc.w += dot(texel, weights[w + 3 * sliceCount]);
+    }
+
+    // ReLU, then store.
+    outTexture.write(fmax(acc, 0.0), gid.xy, gid.z);
+}
+/// Fused 3x3 convolution + bias + ReLU on float textures, with dilation.
+///
+/// One thread computes the output texel at `gid.xy` in slice `gid.z`. For every
+/// input slice the nine taps are visited in row-major order, starting at
+/// (x - dilationX, y - dilationY), and each tap is dotted with four weight
+/// vectors, one per output component. The sum starts from `biase[gid.z]` and
+/// goes through `fmax(..., 0.0)` before it is written.
+///
+/// Weight index used below (in float4 units):
+///   ((gid.z * 4 + component) * 9 + tap) * sliceCount + inputSlice
+///
+/// NOTE(review): the centre position is computed in ushort, so a negative
+/// offset would wrap around -- presumably callers keep it in range; confirm.
+kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device float4 *weights [[buffer(1)]],
+                         const device float4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+    // Threads outside the output volume have nothing to write.
+    const bool insideOutput = gid.x < outTexture.get_width() &&
+                              gid.y < outTexture.get_height() &&
+                              gid.z < outTexture.get_array_size();
+    if (!insideOutput) {
+        return;
+    }
+
+    constexpr sampler texelSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+    // Centre of the 3x3 window in the input texture.
+    const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY)
+                         + ushort2(param.offsetX, param.offsetY);
+    const int centerX = origin.x;
+    const int centerY = origin.y;
+
+    // Distance between neighbouring taps.
+    const ushort dilationX = param.dilationX;
+    const ushort dilationY = param.dilationY;
+    const int stepX = dilationX;
+    const int stepY = dilationY;
+
+    const uint tapCount = 9;
+    const uint sliceCount = inTexture.get_array_size();
+    // Number of weight vectors per output component.
+    const uint componentStride = tapCount * sliceCount;
+    const uint firstWeight = gid.z * componentStride * 4;
+
+    float4 acc = biase[gid.z];
+    for (uint slice = 0; slice < sliceCount; ++slice) {
+        uint tap = 0;
+        for (int ky = -1; ky <= 1; ++ky) {
+            for (int kx = -1; kx <= 1; ++kx, ++tap) {
+                const float2 pos = float2(centerX + kx * stepX, centerY + ky * stepY);
+                const float4 texel = inTexture.sample(texelSampler, pos, slice);
+                const uint w = firstWeight + tap * sliceCount + slice;
+                acc.x += dot(texel, weights[w]);
+                acc.y += dot(texel, weights[w + componentStride]);
+                acc.z += dot(texel, weights[w + 2 * componentStride]);
+                acc.w += dot(texel, weights[w + 3 * componentStride]);
+            }
+        }
+    }
+
+    // ReLU, then store.
+    outTexture.write(fmax(acc, 0.0), gid.xy, gid.z);
+}
+/// Fused 5x1 convolution + bias + ReLU on float textures, with dilation.
+///
+/// One thread computes the output texel at `gid.xy` in slice `gid.z`. The five
+/// taps share the same x and are spaced `param.dilationY` apart along y, from
+/// (y - 2 * dilationY) to (y + 2 * dilationY). Each tap is dotted with four
+/// weight vectors, one per output component. The sum starts from
+/// `biase[gid.z]` and goes through `fmax(..., 0.0)` before it is written.
+///
+/// Weight index used below (in float4 units):
+///   ((gid.z * 4 + component) * 5 + tap) * sliceCount + inputSlice
+///
+/// NOTE(review): the centre position is computed in ushort, so a negative
+/// offset would wrap around -- presumably callers keep it in range; confirm.
+kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device float4 *weights [[buffer(1)]],
+                         const device float4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+    // Threads outside the output volume have nothing to write.
+    const bool insideOutput = gid.x < outTexture.get_width() &&
+                              gid.y < outTexture.get_height() &&
+                              gid.z < outTexture.get_array_size();
+    if (!insideOutput) {
+        return;
+    }
+
+    constexpr sampler texelSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+    // Centre of the vertical window in the input texture.
+    const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY)
+                         + ushort2(param.offsetX, param.offsetY);
+    const int centerX = origin.x;
+    const int centerY = origin.y;
+
+    // Distance between neighbouring taps along y.
+    const ushort dilationY = param.dilationY;
+    const int stepY = dilationY;
+
+    const uint tapCount = 5;
+    const uint sliceCount = inTexture.get_array_size();
+    // Number of weight vectors per output component.
+    const uint componentStride = tapCount * sliceCount;
+    const uint firstWeight = gid.z * componentStride * 4;
+
+    float4 acc = biase[gid.z];
+    for (uint slice = 0; slice < sliceCount; ++slice) {
+        for (int k = -2; k <= 2; ++k) {
+            const uint tap = uint(k + 2);
+            const float2 pos = float2(centerX, centerY + k * stepY);
+            const float4 texel = inTexture.sample(texelSampler, pos, slice);
+            const uint w = firstWeight + tap * sliceCount + slice;
+            acc.x += dot(texel, weights[w]);
+            acc.y += dot(texel, weights[w + componentStride]);
+            acc.z += dot(texel, weights[w + 2 * componentStride]);
+            acc.w += dot(texel, weights[w + 3 * componentStride]);
+        }
+    }
+
+    // ReLU, then store.
+    outTexture.write(fmax(acc, 0.0), gid.xy, gid.z);
+}
+/// Fused 1x5 convolution + bias + ReLU on float textures, with dilation.
+///
+/// One thread computes the output texel at `gid.xy` in slice `gid.z`. The five
+/// taps share the same y and are spaced `param.dilationX` apart along x, from
+/// (x - 2 * dilationX) to (x + 2 * dilationX). Each tap is dotted with four
+/// weight vectors, one per output component. The sum starts from
+/// `biase[gid.z]` and goes through `fmax(..., 0.0)` before it is written.
+///
+/// Weight index used below (in float4 units):
+///   ((gid.z * 4 + component) * 5 + tap) * sliceCount + inputSlice
+///
+/// NOTE(review): the centre position is computed in ushort, so a negative
+/// offset would wrap around -- presumably callers keep it in range; confirm.
+kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device float4 *weights [[buffer(1)]],
+                         const device float4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+    // Threads outside the output volume have nothing to write.
+    const bool insideOutput = gid.x < outTexture.get_width() &&
+                              gid.y < outTexture.get_height() &&
+                              gid.z < outTexture.get_array_size();
+    if (!insideOutput) {
+        return;
+    }
+
+    constexpr sampler texelSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+    // Centre of the horizontal window in the input texture.
+    const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY)
+                         + ushort2(param.offsetX, param.offsetY);
+    const int centerX = origin.x;
+    const int centerY = origin.y;
+
+    // Distance between neighbouring taps along x.
+    const ushort dilationX = param.dilationX;
+    const int stepX = dilationX;
+
+    const uint tapCount = 5;
+    const uint sliceCount = inTexture.get_array_size();
+    // Number of weight vectors per output component.
+    const uint componentStride = tapCount * sliceCount;
+    const uint firstWeight = gid.z * componentStride * 4;
+
+    float4 acc = biase[gid.z];
+    for (uint slice = 0; slice < sliceCount; ++slice) {
+        for (int k = -2; k <= 2; ++k) {
+            const uint tap = uint(k + 2);
+            const float2 pos = float2(centerX + k * stepX, centerY);
+            const float4 texel = inTexture.sample(texelSampler, pos, slice);
+            const uint w = firstWeight + tap * sliceCount + slice;
+            acc.x += dot(texel, weights[w]);
+            acc.y += dot(texel, weights[w + componentStride]);
+            acc.z += dot(texel, weights[w + 2 * componentStride]);
+            acc.w += dot(texel, weights[w + 3 * componentStride]);
+        }
+    }
+
+    // ReLU, then store.
+    outTexture.write(fmax(acc, 0.0), gid.xy, gid.z);
+}
+/// Fused depthwise 3x3 convolution + bias + ReLU on float textures.
+///
+/// One thread computes the output texel at `gid.xy` in slice `gid.z`, reading
+/// only input slice `gid.z`. The nine taps are visited in row-major order
+/// starting at (x - 1, y - 1); each component of a tap is multiplied by its
+/// own scalar weight. The sum starts from `biase[gid.z]` and goes through
+/// `fmax(..., 0.0)` before it is written.
+///
+/// Weight index used below (in scalar units):
+///   (gid.z * 4 + component) * 9 + tap
+///
+/// NOTE(review): taps are always one texel apart; `param.dilationX/Y` are not
+/// read here -- confirm that dilation is intentionally unsupported.
+/// NOTE(review): the centre position is computed in ushort, so a negative
+/// offset would wrap around -- presumably callers keep it in range; confirm.
+kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                   texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                   constant MetalConvParam &param [[buffer(0)]],
+                                   const device float *weights [[buffer(1)]],
+                                   const device float4 *biase [[buffer(2)]],
+                                   uint3 gid [[thread_position_in_grid]]) {
+    // Threads outside the output volume have nothing to write.
+    const bool insideOutput = gid.x < outTexture.get_width() &&
+                              gid.y < outTexture.get_height() &&
+                              gid.z < outTexture.get_array_size();
+    if (!insideOutput) {
+        return;
+    }
+
+    constexpr sampler texelSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+    // Centre of the 3x3 window in the input texture.
+    const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY)
+                         + ushort2(param.offsetX, param.offsetY);
+    const int centerX = origin.x;
+    const int centerY = origin.y;
+
+    const uint slice = gid.z;
+    const uint tapCount = 9;
+    // First scalar weight that belongs to this slice.
+    const uint firstWeight = gid.z * tapCount * 4;
+
+    float4 acc = biase[gid.z];
+    uint tap = 0;
+    for (int ky = -1; ky <= 1; ++ky) {
+        for (int kx = -1; kx <= 1; ++kx, ++tap) {
+            const float2 pos = float2(centerX + kx, centerY + ky);
+            const float4 texel = inTexture.sample(texelSampler, pos, slice);
+            const uint w = firstWeight + tap;
+            acc.x += texel.x * weights[w];
+            acc.y += texel.y * weights[w + tapCount];
+            acc.z += texel.z * weights[w + 2 * tapCount];
+            acc.w += texel.w * weights[w + 3 * tapCount];
+        }
+    }
+
+    // ReLU, then store.
+    outTexture.write(fmax(acc, 0.0), gid.xy, gid.z);
+}
+#pragma mark - half
+/// Fused 1x1 convolution + bias + ReLU on half textures.
+///
+/// One thread computes the output texel at `gid.xy` in slice `gid.z`. Texels,
+/// weights and the bias are stored as half but widened to float before any
+/// arithmetic; the sum is accumulated in float, passed through
+/// `fmax(..., 0.0)`, and narrowed to half4 only for the final write.
+///
+/// Weight index used below (in half4 units):
+///   (gid.z * 4 + component) * sliceCount + inputSlice
+///
+/// NOTE(review): the input position is computed in ushort, so a negative
+/// offset would wrap around -- presumably callers keep it in range; confirm.
+kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device half4 *weights [[buffer(1)]],
+                              const device half4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+    // Threads outside the output volume have nothing to write.
+    const bool insideOutput = gid.x < outTexture.get_width() &&
+                              gid.y < outTexture.get_height() &&
+                              gid.z < outTexture.get_array_size();
+    if (!insideOutput) {
+        return;
+    }
+
+    constexpr sampler texelSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+    // Input position that corresponds to this output texel.
+    const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY)
+                         + ushort2(param.offsetX, param.offsetY);
+    const float2 samplePos = float2(origin.x, origin.y);
+
+    const uint sliceCount = inTexture.get_array_size();
+    // First weight vector that belongs to this output slice.
+    const uint firstWeight = gid.z * sliceCount * 4;
+
+    float4 acc = float4(biase[gid.z]);
+    for (uint slice = 0; slice < sliceCount; ++slice) {
+        const float4 texel = float4(inTexture.sample(texelSampler, samplePos, slice));
+        const uint w = firstWeight + slice;
+        acc.x += dot(texel, float4(weights[w]));
+        acc.y += dot(texel, float4(weights[w + sliceCount]));
+        acc.z += dot(texel, float4(weights[w + 2 * sliceCount]));
+        acc.w += dot(texel, float4(weights[w + 3 * sliceCount]));
+    }
+
+    // ReLU in float, then narrow to half for the store.
+    outTexture.write(half4(fmax(acc, 0.0)), gid.xy, gid.z);
+}
+/// Fused 3x3 convolution + bias + ReLU on half textures, with dilation.
+///
+/// One thread computes the output texel at `gid.xy` in slice `gid.z`. For every
+/// input slice the nine taps are visited in row-major order, starting at
+/// (x - dilationX, y - dilationY). Texels, weights and the bias are stored as
+/// half but widened to float before the dot products; the float sum goes
+/// through `fmax(..., 0.0)` and is narrowed to half4 only for the write.
+///
+/// Weight index used below (in half4 units):
+///   ((gid.z * 4 + component) * 9 + tap) * sliceCount + inputSlice
+///
+/// NOTE(review): the centre position is computed in ushort, so a negative
+/// offset would wrap around -- presumably callers keep it in range; confirm.
+kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device half4 *weights [[buffer(1)]],
+                              const device half4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+    // Threads outside the output volume have nothing to write.
+    const bool insideOutput = gid.x < outTexture.get_width() &&
+                              gid.y < outTexture.get_height() &&
+                              gid.z < outTexture.get_array_size();
+    if (!insideOutput) {
+        return;
+    }
+
+    constexpr sampler texelSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+    // Centre of the 3x3 window in the input texture.
+    const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY)
+                         + ushort2(param.offsetX, param.offsetY);
+    const int centerX = origin.x;
+    const int centerY = origin.y;
+
+    // Distance between neighbouring taps.
+    const ushort dilationX = param.dilationX;
+    const ushort dilationY = param.dilationY;
+    const int stepX = dilationX;
+    const int stepY = dilationY;
+
+    const uint tapCount = 9;
+    const uint sliceCount = inTexture.get_array_size();
+    // Number of weight vectors per output component.
+    const uint componentStride = tapCount * sliceCount;
+    const uint firstWeight = gid.z * componentStride * 4;
+
+    float4 acc = float4(biase[gid.z]);
+    for (uint slice = 0; slice < sliceCount; ++slice) {
+        uint tap = 0;
+        for (int ky = -1; ky <= 1; ++ky) {
+            for (int kx = -1; kx <= 1; ++kx, ++tap) {
+                const float2 pos = float2(centerX + kx * stepX, centerY + ky * stepY);
+                const half4 texelHalf = inTexture.sample(texelSampler, pos, slice);
+                const float4 texel = float4(texelHalf);
+                const uint w = firstWeight + tap * sliceCount + slice;
+                acc.x += dot(texel, float4(weights[w]));
+                acc.y += dot(texel, float4(weights[w + componentStride]));
+                acc.z += dot(texel, float4(weights[w + 2 * componentStride]));
+                acc.w += dot(texel, float4(weights[w + 3 * componentStride]));
+            }
+        }
+    }
+
+    // ReLU in float, then narrow to half for the store.
+    outTexture.write(half4(fmax(acc, 0.0)), gid.xy, gid.z);
+}
+/// Fused depthwise 3x3 convolution + bias + ReLU on half textures.
+///
+/// One thread computes the output texel at `gid.xy` in slice `gid.z`, reading
+/// only input slice `gid.z`. The nine taps are visited in row-major order
+/// starting at (x - 1, y - 1); each component of a tap is multiplied by its
+/// own scalar weight. Texel components and weights are widened to float
+/// before multiplying; the float sum starts from `biase[gid.z]`, goes through
+/// `fmax(..., 0.0)`, and is narrowed to half4 only for the write.
+///
+/// Weight index used below (in scalar units):
+///   (gid.z * 4 + component) * 9 + tap
+///
+/// NOTE(review): taps are always one texel apart; `param.dilationX/Y` are not
+/// read here -- confirm that dilation is intentionally unsupported.
+/// NOTE(review): the centre position is computed in ushort, so a negative
+/// offset would wrap around -- presumably callers keep it in range; confirm.
+kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                        texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                        constant MetalConvParam &param [[buffer(0)]],
+                                        const device half *weights [[buffer(1)]],
+                                        const device half4 *biase [[buffer(2)]],
+                                        uint3 gid [[thread_position_in_grid]]) {
+    // Threads outside the output volume have nothing to write.
+    const bool insideOutput = gid.x < outTexture.get_width() &&
+                              gid.y < outTexture.get_height() &&
+                              gid.z < outTexture.get_array_size();
+    if (!insideOutput) {
+        return;
+    }
+
+    constexpr sampler texelSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+    // Centre of the 3x3 window in the input texture.
+    const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY)
+                         + ushort2(param.offsetX, param.offsetY);
+    const int centerX = origin.x;
+    const int centerY = origin.y;
+
+    const uint slice = gid.z;
+    const uint tapCount = 9;
+    // First scalar weight that belongs to this slice.
+    const uint firstWeight = gid.z * tapCount * 4;
+
+    float4 acc = float4(biase[gid.z]);
+    uint tap = 0;
+    for (int ky = -1; ky <= 1; ++ky) {
+        for (int kx = -1; kx <= 1; ++kx, ++tap) {
+            const float2 pos = float2(centerX + kx, centerY + ky);
+            const half4 texel = inTexture.sample(texelSampler, pos, slice);
+            const uint w = firstWeight + tap;
+            acc.x += float(texel.x) * float(weights[w]);
+            acc.y += float(texel.y) * float(weights[w + tapCount]);
+            acc.z += float(texel.z) * float(weights[w + 2 * tapCount]);
+            acc.w += float(texel.w) * float(weights[w + 3 * tapCount]);
+        }
+    }
+
+    // ReLU in float, then narrow to half for the store.
+    outTexture.write(half4(fmax(acc, 0.0)), gid.xy, gid.z);
+}
+/// Fused 5x1 convolution + bias + ReLU on half textures, with dilation.
+///
+/// One thread computes the output texel at `gid.xy` in slice `gid.z`. The five
+/// taps share the same x and are spaced `param.dilationY` apart along y, from
+/// (y - 2 * dilationY) to (y + 2 * dilationY). Texels, weights and the bias
+/// are stored as half but widened to float before the dot products; the float
+/// sum goes through `fmax(..., 0.0)` and is narrowed to half4 for the write.
+///
+/// Weight index used below (in half4 units):
+///   ((gid.z * 4 + component) * 5 + tap) * sliceCount + inputSlice
+///
+/// NOTE(review): the centre position is computed in ushort, so a negative
+/// offset would wrap around -- presumably callers keep it in range; confirm.
+kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device half4 *weights [[buffer(1)]],
+                              const device half4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+    // Threads outside the output volume have nothing to write.
+    const bool insideOutput = gid.x < outTexture.get_width() &&
+                              gid.y < outTexture.get_height() &&
+                              gid.z < outTexture.get_array_size();
+    if (!insideOutput) {
+        return;
+    }
+
+    constexpr sampler texelSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+    // Centre of the vertical window in the input texture.
+    const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY)
+                         + ushort2(param.offsetX, param.offsetY);
+    const int centerX = origin.x;
+    const int centerY = origin.y;
+
+    // Distance between neighbouring taps along y.
+    const ushort dilationY = param.dilationY;
+    const int stepY = dilationY;
+
+    const uint tapCount = 5;
+    const uint sliceCount = inTexture.get_array_size();
+    // Number of weight vectors per output component.
+    const uint componentStride = tapCount * sliceCount;
+    const uint firstWeight = gid.z * componentStride * 4;
+
+    float4 acc = float4(biase[gid.z]);
+    for (uint slice = 0; slice < sliceCount; ++slice) {
+        for (int k = -2; k <= 2; ++k) {
+            const uint tap = uint(k + 2);
+            const float2 pos = float2(centerX, centerY + k * stepY);
+            const half4 texelHalf = inTexture.sample(texelSampler, pos, slice);
+            const float4 texel = float4(texelHalf);
+            const uint w = firstWeight + tap * sliceCount + slice;
+            acc.x += dot(texel, float4(weights[w]));
+            acc.y += dot(texel, float4(weights[w + componentStride]));
+            acc.z += dot(texel, float4(weights[w + 2 * componentStride]));
+            acc.w += dot(texel, float4(weights[w + 3 * componentStride]));
+        }
+    }
+
+    // ReLU in float, then narrow to half for the store.
+    outTexture.write(half4(fmax(acc, 0.0)), gid.xy, gid.z);
+}
+/// Fused 1x5 convolution + bias + ReLU on half textures, with dilation.
+///
+/// One thread computes the output texel at `gid.xy` in slice `gid.z`. The five
+/// taps share the same y and are spaced `param.dilationX` apart along x, from
+/// (x - 2 * dilationX) to (x + 2 * dilationX). Texels, weights and the bias
+/// are stored as half but widened to float before the dot products; the float
+/// sum goes through `fmax(..., 0.0)` and is narrowed to half4 for the write.
+///
+/// Weight index used below (in half4 units):
+///   ((gid.z * 4 + component) * 5 + tap) * sliceCount + inputSlice
+///
+/// NOTE(review): the centre position is computed in ushort, so a negative
+/// offset would wrap around -- presumably callers keep it in range; confirm.
+kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device half4 *weights [[buffer(1)]],
+                              const device half4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+    // Threads outside the output volume have nothing to write.
+    const bool insideOutput = gid.x < outTexture.get_width() &&
+                              gid.y < outTexture.get_height() &&
+                              gid.z < outTexture.get_array_size();
+    if (!insideOutput) {
+        return;
+    }
+
+    constexpr sampler texelSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+    // Centre of the horizontal window in the input texture.
+    const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY)
+                         + ushort2(param.offsetX, param.offsetY);
+    const int centerX = origin.x;
+    const int centerY = origin.y;
+
+    // Distance between neighbouring taps along x.
+    const ushort dilationX = param.dilationX;
+    const int stepX = dilationX;
+
+    const uint tapCount = 5;
+    const uint sliceCount = inTexture.get_array_size();
+    // Number of weight vectors per output component.
+    const uint componentStride = tapCount * sliceCount;
+    const uint firstWeight = gid.z * componentStride * 4;
+
+    float4 acc = float4(biase[gid.z]);
+    for (uint slice = 0; slice < sliceCount; ++slice) {
+        for (int k = -2; k <= 2; ++k) {
+            const uint tap = uint(k + 2);
+            const float2 pos = float2(centerX + k * stepX, centerY);
+            const half4 texelHalf = inTexture.sample(texelSampler, pos, slice);
+            const float4 texel = float4(texelHalf);
+            const uint w = firstWeight + tap * sliceCount + slice;
+            acc.x += dot(texel, float4(weights[w]));
+            acc.y += dot(texel, float4(weights[w + componentStride]));
+            acc.z += dot(texel, float4(weights[w + 2 * componentStride]));
+            acc.w += dot(texel, float4(weights[w + 3 * componentStride]));
+        }
+    }
+
+    // ReLU in float, then narrow to half for the store.
+    outTexture.write(half4(fmax(acc, 0.0)), gid.xy, gid.z);
+}
--- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
@@ -7,6 +7,8 @@
 	objects = {
 /* Begin PBXBuildFile section */
+		165F38D32276CDEA0088E29F /* ConvAddReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 165F38D22276CDEA0088E29F /* ConvAddReluOp.swift */; };
+		165F38D52276CE7D0088E29F /* ConvAddReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 165F38D42276CE7D0088E29F /* ConvAddReluKernel.swift */; };
 		456BB7B421F5B356001474E2 /* Framework.pbobjc.m in Sources */ = {isa = PBXBuildFile; fileRef = 456BB7B221F5B356001474E2 /* Framework.pbobjc.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc"; }; };
 		456BB7B521F5B356001474E2 /* Framework.pbobjc.h in Headers */ = {isa = PBXBuildFile; fileRef = 456BB7B321F5B356001474E2 /* Framework.pbobjc.h */; settings = {ATTRIBUTES = (Public, ); }; };
 		4AA1EA862146625E00D0F791 /* BilinearInterpOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA852146625E00D0F791 /* BilinearInterpOp.swift */; };
@@ -101,6 +103,8 @@
 /* End PBXBuildFile section */
 /* Begin PBXFileReference section */
+		165F38D22276CDEA0088E29F /* ConvAddReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddReluOp.swift; sourceTree = "<group>"; };
+		165F38D42276CE7D0088E29F /* ConvAddReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddReluKernel.swift; sourceTree = "<group>"; };
 		456BB7B221F5B356001474E2 /* Framework.pbobjc.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = Framework.pbobjc.m; sourceTree = "<group>"; };
 		456BB7B321F5B356001474E2 /* Framework.pbobjc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Framework.pbobjc.h; sourceTree = "<group>"; };
 		4AA1EA852146625E00D0F791 /* BilinearInterpOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = BilinearInterpOp.swift; sourceTree = "<group>"; };
@@ -323,6 +327,7 @@
 				FC803BBE214CB65A0094B8E5 /* ConvAddPreluOp.swift */,
 				FCE3A1A82153DE5100C37CDE /* ConvAddAddPreluOp.swift */,
 				FCE3A1AC2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift */,
+				165F38D22276CDEA0088E29F /* ConvAddReluOp.swift */,
 			);
 			path = Operators;
 			sourceTree = "<group>";
@@ -377,6 +382,7 @@
 				FCE3A1AE2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift */,
 				FC2BFD4521DF685F00C262B2 /* Scale.swift */,
 				FCB40E5821E0DCAB0075EC91 /* FetchKernel.swift */,
+				165F38D42276CE7D0088E29F /* ConvAddReluKernel.swift */,
 			);
 			path = Kernels;
 			sourceTree = "<group>";
@@ -541,6 +547,7 @@
 				FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */,
 				4AA1EA8C2146640900D0F791 /* SplitOp.swift in Sources */,
 				FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */,
+				165F38D52276CE7D0088E29F /* ConvAddReluKernel.swift in Sources */,
 				FC803BBF214CB65A0094B8E5 /* ConvAddPreluOp.swift in Sources */,
 				FCEB684C212F093800D2448E /* PreluOp.swift in Sources */,
 				FC60DB8920E9AAA500FF203F /* MetalExtension.swift in Sources */,
@@ -592,6 +599,7 @@
 				FC9D038220E2312E000F735A /* FetchOp.swift in Sources */,
 				FC039BBD20E11CC20081E9F8 /* Program.swift in Sources */,
 				FC039BA220E11CB70081E9F8 /* Loader.swift in Sources */,
+				165F38D32276CDEA0088E29F /* ConvAddReluOp.swift in Sources */,
 				FCBCCC67212306B000D94F7E /* ConcatOp.swift in Sources */,
 				FCD04E6C20F31A280007374F /* SoftmaxKernel.swift in Sources */,
 				FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */,

--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift
@@ -68,7 +68,8 @@ class OpCreator<P: PrecisionProtocol> {
         gConvAddPreluType          :     ConvAddPreluOp<P>.creat,
         gConvAddAddPreluType       :     ConvAddAddPreluOp<P>.creat,
         gElementwiseAddPreluType   :     ElementwiseAddPreluOp<P>.creat,
-         gFusionConvAddType         :     ConvAddOp<P>.creat]
+         gFusionConvAddType         :     ConvAddOp<P>.creat,
+         gConvAddReluType           :     ConvAddReluOp<P>.creat]
    private init(){}
 }
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift
@@ -173,6 +173,7 @@ let gBilinearInterpType         = "bilinear_interp"
 let gSplit                      = "split"
 let gShape                      = "shape"
 let gFlatten                    = "flatten"
+let gConvAddReluType           = "conv_add_relu"
 let gConvAddPreluType           = "conv_add_prelu"
 let gConvAddAddPreluType        = "conv_add_add_prelu"
 let gElementwiseAddPreluType    = "elementwise_add_prelu"

--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+import Foundation
+/// Fused operator covering a convolution followed by an element-wise add and a ReLU.
+///
+/// Its fusion type is `gConvAddReluType`; it runs `ConvAddReluKernel` with a `ConvAddParam`.
+class ConvAddReluOp<P: PrecisionProtocol>: Operator<ConvAddReluKernel<P>, ConvAddParam<P>>, Runable, Creator, InferShaperable, Fusion {
+    typealias OpType = ConvAddReluOp<P>
+    /// Returns the head of the node chain this fusion stands for:
+    /// `gConvType` --> `gElementwiseAddType` --> `gReluType`.
+    static func fusionNode() -> Node {
+        let beginNode = Node.init(inType: gConvType)
+        _ = beginNode
+            --> Node.init(inType: gElementwiseAddType)
+            --> Node.init(inType: gReluType)
+        return beginNode
+    }
+    /// Returns an empty mapping; this fusion supplies no `from`/`to` entries.
+    static func change() -> [String : [(from: String, to: String)]] {
+        return [:]
+    }
+    /// The operator type string of the fused op (`gConvAddReluType`).
+    static func fusionType() -> String {
+        return gConvAddReluType
+    }
+    /// Computes the output dims from input, filter, stride, padding and dilation
+    /// and stores them in `para.output.dim`.
+    ///
+    /// The result is `[inDims[0], one size per stride entry..., filterDim[0]]`.
+    func inferShape() {
+        let inDims = para.input.dim
+        let filterDim = para.filter.dim
+        let strides = para.stride
+        let paddings = para.paddings
+        let dilations = para.dilations
+        var outDim = [inDims[0]]
+        for i in 0..<strides.count {
+            let dilation: Int = Int(dilations[i])
+            let filterSize: Int = filterDim[i + 1]
+            let inputSize: Int = inDims[i + 1]
+            let padding: Int = Int(paddings[i])
+            let stride: Int = Int(strides[i])
+            // Extent of the filter once dilation gaps are taken into account.
+            let dKernel = dilation * (filterSize - 1) + 1
+            let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
+            outDim.append(outputSize)
+        }
+        outDim.append(filterDim[0])
+        para.output.dim = Dim.init(inDim: outDim)
+    }
+    /// Runs the kernel on `buffer` with this op's parameters.
+    ///
+    /// - Throws: whatever `kernel.compute` throws, unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        // A `do`/`catch` that only rethrows the caught error is equivalent to a plain `try`.
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+    /// Debug helper: prints the op type, the output texture, and the texture converted to a
+    /// tensor (n/c/h/w taken from `para.output.tensorDim`) as a strided array.
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output.metalTexture)
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+    }
+}
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
@@ -103,95 +103,27 @@ class ConvAddKernel<P: PrecisionProtocol>: Kernel, Computable {
    let identifyingKey: String = getUniqueKey()
    required init(device: MTLDevice, param: ConvAddParam<P>, initContext: InitContext) {
        param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
-        let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1)/2 - Int(param.paddings[1])
+        var shouldUseMPS = false
-        let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1)/2 - Int(param.paddings[0])
+        if #available(iOS 11.0, *), initContext.useMPS {
+            // Input and output tensor channel counts must both be greater than 4
-        let key = identifyingKey
+            if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
+                shouldUseMPS = true
-        if initContext.useMPS {  // 使用 apple 的 MetalPerformanceShaders
-            if #available(iOS 11.0, *) {
-                var desc: MPSCNNConvolutionDescriptor?
-                // 如果不是 depth wise, 并且输入输出 tensor channel 都大于 4
-                let isDepthWise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]
-                if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
-                    if isDepthWise {
-                        desc = MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
-                                                                    kernelHeight: param.filter.tensorDim[2],
-                                                                    inputFeatureChannels: param.input.tensorDim[1],
-                                                                    outputFeatureChannels: param.output.tensorDim[1],
-                                                                    neuronFilter: nil)
-                    } else {
-                        desc = MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
-                                                           kernelHeight: param.filter.tensorDim[2],
-                                                           inputFeatureChannels: param.input.tensorDim[1],
-                                                           outputFeatureChannels: param.output.tensorDim[1],
-                                                           neuronFilter: nil)
-                    }
-                }
-                desc?.strideInPixelsX = Int(param.stride[0])
-                desc?.strideInPixelsY = Int(param.stride[1])
-                if let inDesc = desc {
-                    let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
-                    let dataSource = ConvDataSource.init(inDesc: inDesc, inWeights: param.filter, inBiasTerms: param.y)
-                    let conv = MPSCNNConvolution.init(device: device, weights: dataSource)
-                    conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0)
-                    conv.edgeMode = .zero
-                    convDic[key] = conv
-                    super.init(device: device, inFunctionName: nil, initContext: initContext)
-                    return
-                }
            }
        }
-        let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1])
+        if shouldUseMPS {
-        param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC)
+            super.init(device: device, inFunctionName: nil, initContext: initContext)
-        param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+            setupWithMPS(device: device, param: param)
+        } else {
-        if GlobalConfig.shared.computePrecision == .Float16 {
+            let functionName = type(of: self).kernelFunctionName(param: param)
-            if param.filter.width == 1 && param.filter.height == 1 {
+            if functionName == nil {
-                super.init(device: device, inFunctionName: "conv_add_1x1_half", initContext: initContext)
-            } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
-                super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half", initContext: initContext)
-            } else if param.filter.width == 3 && param.filter.height == 3 {
-                super.init(device: device, inFunctionName: "conv_add_3x3_half", initContext: initContext)
-            } else if param.filter.width == 1 && param.filter.height == 5 {
-                super.init(device: device, inFunctionName: "conv_add_5x1_half", initContext: initContext)
-            } else if param.filter.width == 5 && param.filter.height == 1 {
-                super.init(device: device, inFunctionName: "conv_add_1x5_half", initContext: initContext)
-            } else {
-                fatalError(" unsupport yet ")
-            }
-        } else if GlobalConfig.shared.computePrecision == .Float32 {
-            if param.filter.width == 1 && param.filter.height == 1 {
-                super.init(device: device, inFunctionName: "conv_add_1x1", initContext: initContext)
-            } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
-                super.init(device: device, inFunctionName: "depthwise_conv_add_3x3", initContext: initContext)
-            } else if param.filter.width == 1 && param.filter.height == 5 {
-                super.init(device: device, inFunctionName: "conv_add_5x1", initContext: initContext)
-            } else if param.filter.width == 5 && param.filter.height == 1 {
-                super.init(device: device, inFunctionName: "conv_add_1x5", initContext: initContext)
-            } else if param.filter.width == 3 && param.filter.height == 3 {
-                super.init(device: device, inFunctionName: "conv_add_3x3", initContext: initContext)
-            } else {
                fatalError(" unsupport yet ")
            }
-        } else {
+            super.init(device: device, inFunctionName: functionName, initContext: initContext)
-            fatalError()
+            setupWithoutMPS(device: device, param: param)
        }
-        //    print(" function: \(functionName)")
-        //    print("offset x: \(offsetX)")
-        //    print("offset y: \(offsetY)")
-        let offsetZ = 0.0
-        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
-        //    print("metal param: ")
-        //    print(inMetalParam)
-        metalParam = inMetalParam
    }
    func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam<P>) throws {
@@ -200,7 +132,7 @@ class ConvAddKernel<P: PrecisionProtocol>: Kernel, Computable {
                let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1])
                let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1])
                conv.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage)
-                return;
+                return
            }
        }
@@ -221,5 +153,85 @@ class ConvAddKernel<P: PrecisionProtocol>: Kernel, Computable {
            convDic.removeValue(forKey: identifyingKey)
        }
    }
+    /// Builds the `MPSCNNConvolution` for this kernel and stores it in `convDic`
+    /// under `identifyingKey`.
+    ///
+    /// The descriptor is a depthwise one when the filter's `tensorDim[1]` is 1 and its
+    /// `tensorDim[0]` equals the input channel count; otherwise a regular one. The fused
+    /// neuron comes from `neuronFilterForMPSLayer(device:)`. Below iOS 11.0 nothing is stored.
+    func setupWithMPS(device: MTLDevice, param: ConvAddParam<P>) {
+        let kernelWidth = param.filter.tensorDim[3]
+        let kernelHeight = param.filter.tensorDim[2]
+        let horizontalOffset = (Int(param.dilations[0]) * (kernelWidth - 1) + 1) / 2 - Int(param.paddings[0])
+        let verticalOffset = (Int(param.dilations[1]) * (kernelHeight - 1) + 1) / 2 - Int(param.paddings[1])
+        let usesDepthwise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]
+        if #available(iOS 11.0, *) {
+            let inputChannels = param.input.tensorDim[1]
+            let outputChannels = param.output.tensorDim[1]
+            let fusedNeuron = neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron
+            let descriptor: MPSCNNConvolutionDescriptor
+            if usesDepthwise {
+                descriptor = MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: kernelWidth,
+                                                                  kernelHeight: kernelHeight,
+                                                                  inputFeatureChannels: inputChannels,
+                                                                  outputFeatureChannels: outputChannels,
+                                                                  neuronFilter: fusedNeuron)
+            } else {
+                descriptor = MPSCNNConvolutionDescriptor(kernelWidth: kernelWidth,
+                                                         kernelHeight: kernelHeight,
+                                                         inputFeatureChannels: inputChannels,
+                                                         outputFeatureChannels: outputChannels,
+                                                         neuronFilter: fusedNeuron)
+            }
+            descriptor.strideInPixelsX = Int(param.stride[0])
+            descriptor.strideInPixelsY = Int(param.stride[1])
+            // The filter is converted before it is handed to the data source as weights.
+            let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
+            let weights = ConvDataSource.init(inDesc: descriptor, inWeights: param.filter, inBiasTerms: param.y)
+            let convolution = MPSCNNConvolution.init(device: device, weights: weights)
+            convolution.offset = MPSOffset.init(x: horizontalOffset, y: verticalOffset, z: 0)
+            convolution.edgeMode = .zero
+            convDic[identifyingKey] = convolution
+        }
+    }
+    /// Prepares the non-MPS path: stores the convolution parameters in `metalParam`
+    /// and initialises the filter and bias (`param.y`) buffers on `device`.
+    func setupWithoutMPS(device: MTLDevice, param: ConvAddParam<P>) {
+        let horizontalOffset = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
+        let verticalOffset = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
+        metalParam = MetalConvParam.init(offsetX: Int16(horizontalOffset),
+                                         offsetY: Int16(verticalOffset),
+                                         offsetZ: Int16(0),
+                                         strideX: UInt16(param.stride[0]),
+                                         strideY: UInt16(param.stride[1]),
+                                         dilationX: UInt16(param.dilations[0]),
+                                         dilationY: UInt16(param.dilations[1]))
+        // Pad unless the filter has a single channel and as many filters as input channels.
+        let shouldPad = param.filter.channel != 1 || param.filter.n != param.input.tensorDim[1]
+        param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: shouldPad)
+        param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+    }
+    /// Picks the Metal function name for the given filter shape and the global compute precision.
+    ///
+    /// - Returns: the function name (with a `_half` suffix for `.Float16`), or nil when the
+    ///   precision is neither `.Float16` nor `.Float32` or no branch matches the filter.
+    open class func kernelFunctionName(param: ConvAddParam<P>) -> String? {
+        let precision = GlobalConfig.shared.computePrecision
+        let isHalf = precision == .Float16
+        guard isHalf || precision == .Float32 else {
+            return nil
+        }
+        let filter = param.filter
+        if filter.width == 1 && filter.height == 1 {
+            return isHalf ? "conv_add_1x1_half" : "conv_add_1x1"
+        }
+        // Depthwise case: single-channel filter with as many filters as input channels.
+        // NOTE(review): this branch does not check that the filter is 3x3 — confirm with callers.
+        if filter.channel == 1 && filter.n == param.input.tensorDim[1] {
+            return isHalf ? "depthwise_conv_add_3x3_half" : "depthwise_conv_add_3x3"
+        }
+        if filter.width == 3 && filter.height == 3 {
+            return isHalf ? "conv_add_3x3_half" : "conv_add_3x3"
+        }
+        if filter.width == 1 && filter.height == 5 {
+            return isHalf ? "conv_add_5x1_half" : "conv_add_5x1"
+        }
+        if filter.width == 5 && filter.height == 1 {
+            return isHalf ? "conv_add_1x5_half" : "conv_add_1x5"
+        }
+        return nil
+    }
+    /// Hook for the neuron that `setupWithMPS` passes as `neuronFilter` to the MPS
+    /// convolution descriptor. This implementation supplies none (returns nil).
+    func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? {
+        return nil
+    }
 }
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
+//
+//  ConvAddReluKernel.swift
+//  paddle-mobile
+//
+//  Created by Yang,Yanzhan on 2019/4/29.
+//  Copyright © 2019 orange. All rights reserved.
+//
+import Foundation
+import MetalPerformanceShaders
+/// Kernel for the fused conv + add + relu operator. It inherits from `ConvAddKernel` and
+/// overrides only the Metal function names and the MPS neuron filter.
+class ConvAddReluKernel<P: PrecisionProtocol>: ConvAddKernel<P> {
+    /// Picks the `conv_add_relu` Metal function name for the given filter shape and the
+    /// global compute precision.
+    ///
+    /// - Returns: the function name (with a `_half` suffix for `.Float16`), or nil when the
+    ///   precision is neither `.Float16` nor `.Float32` or no branch matches the filter.
+    override class func kernelFunctionName(param: ConvAddParam<P>) -> String? {
+        let precision = GlobalConfig.shared.computePrecision
+        let isHalf = precision == .Float16
+        guard isHalf || precision == .Float32 else {
+            return nil
+        }
+        let filter = param.filter
+        if filter.width == 1 && filter.height == 1 {
+            return isHalf ? "conv_add_relu_1x1_half" : "conv_add_relu_1x1"
+        }
+        // Depthwise case: single-channel filter with as many filters as input channels.
+        // NOTE(review): this branch does not check that the filter is 3x3 — confirm with callers.
+        if filter.channel == 1 && filter.n == param.input.tensorDim[1] {
+            return isHalf ? "depthwise_conv_add_relu_3x3_half" : "depthwise_conv_add_relu_3x3"
+        }
+        if filter.width == 3 && filter.height == 3 {
+            return isHalf ? "conv_add_relu_3x3_half" : "conv_add_relu_3x3"
+        }
+        if filter.width == 1 && filter.height == 5 {
+            return isHalf ? "conv_add_relu_5x1_half" : "conv_add_relu_5x1"
+        }
+        if filter.width == 5 && filter.height == 1 {
+            return isHalf ? "conv_add_relu_1x5_half" : "conv_add_relu_1x5"
+        }
+        return nil
+    }
+    /// Supplies a ReLU neuron (`a: 0`) on iOS 10.0 and later; nil otherwise.
+    override func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? {
+        guard #available(iOS 10.0, *) else {
+            return nil
+        }
+        return MPSCNNNeuronReLU(device: device, a: 0)
+    }
+}
--- a/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift
@@ -184,6 +184,7 @@ extension Node: Equatable {
 class ProgramOptimize<P: PrecisionProtocol> {
    // register fusion
    let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp<P>.self,
+        ConvAddReluOp<P>.self,
                                    //                                  ConvAddAddPreluOp<P>.self,
        ConvAddPreluOp<P>.self,
        ConvAddOp<P>.self,