From 200be579e82079472e9bead50b6eef5703d5d0be Mon Sep 17 00:00:00 2001 From: yangyanzhan Date: Mon, 29 Apr 2019 17:34:55 +0800 Subject: [PATCH] fuse Conv-Add-Relu into one op. --- .../project.pbxproj | 4 + .../ConvAddReluMetal.metal | 542 ++++++++++++++++++ .../paddle-mobile.xcodeproj/project.pbxproj | 8 + .../Src/Operators/Base/OpCreator.swift | 3 +- .../Src/Operators/Base/Operator.swift | 1 + .../Src/Operators/ConvAddReluOp.swift | 72 +++ .../Src/Operators/Kernels/ConvAddKernel.swift | 176 +++--- .../Operators/Kernels/ConvAddReluKernel.swift | 53 ++ .../Src/Program/ProgramOptimize.swift | 1 + 9 files changed, 777 insertions(+), 83 deletions(-) create mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal create mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift create mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj b/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj index 9eef77e7f1..a8cdfbc293 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj @@ -7,6 +7,7 @@ objects = { /* Begin PBXBuildFile section */ + 165F38D72276F4C00088E29F /* ConvAddReluMetal.metal in Sources */ = {isa = PBXBuildFile; fileRef = 165F38D62276F4C00088E29F /* ConvAddReluMetal.metal */; }; 5CCC0CF6759710BAFE999DB7 /* Pods_paddle_mobile_metallib.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 5D9D330A035906298947080B /* Pods_paddle_mobile_metallib.framework */; }; FCC15DE5221E69E100DC3CB2 /* ReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DBC221E69DD00DC3CB2 /* ReluKernel.metal */; }; FCC15DE6221E69E100DC3CB2 /* BoxCoder.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DBD221E69DD00DC3CB2 /* BoxCoder.metal */; 
}; @@ -52,6 +53,7 @@ /* End PBXBuildFile section */ /* Begin PBXFileReference section */ + 165F38D62276F4C00088E29F /* ConvAddReluMetal.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddReluMetal.metal; sourceTree = ""; }; 33511F4FF7FE78679BE12DC0 /* Pods-paddle-mobile-metallib.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile-metallib.release.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile-metallib/Pods-paddle-mobile-metallib.release.xcconfig"; sourceTree = ""; }; 5D9D330A035906298947080B /* Pods_paddle_mobile_metallib.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_paddle_mobile_metallib.framework; sourceTree = BUILT_PRODUCTS_DIR; }; C6D31B9F9533810DBCA6B28D /* Pods-paddle-mobile-metallib.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile-metallib.debug.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile-metallib/Pods-paddle-mobile-metallib.debug.xcconfig"; sourceTree = ""; }; @@ -190,6 +192,7 @@ FCC15DBF221E69DD00DC3CB2 /* Split.metal */, FCC15DC9221E69DE00DC3CB2 /* TransposeKernel.inc.metal */, FCC15DDA221E69E000DC3CB2 /* TransposeKernel.metal */, + 165F38D62276F4C00088E29F /* ConvAddReluMetal.metal */, ); path = "paddle-mobile-metallib"; sourceTree = ""; @@ -310,6 +313,7 @@ FCC15E08221E69E100DC3CB2 /* Split.inc.metal in Sources */, FCC15DF4221E69E100DC3CB2 /* ResizeBilinear.metal in Sources */, FCC15E05221E69E100DC3CB2 /* BatchNormKernel.metal in Sources */, + 165F38D72276F4C00088E29F /* ConvAddReluMetal.metal in Sources */, FCC15DE6221E69E100DC3CB2 /* BoxCoder.metal in Sources */, FCC15DF6221E69E100DC3CB2 /* PoolKernel.metal in Sources */, FCC15E09221E69E100DC3CB2 /* ConcatKernel.inc.metal in Sources */, diff --git 
a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
new file mode 100644
index 0000000000..c638709f88
--- /dev/null
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
@@ -0,0 +1,566 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+#include "Common.metal"
+
+using namespace metal;
+
+// Fused convolution + bias-add + ReLU compute kernels.
+//
+// Each thread produces one 4-channel texel: gid.xy is the output pixel and
+// gid.z the output slice. The accumulator starts from biase[gid.z], the
+// convolution taps are added to it, and fmax(output, 0.0) is applied before
+// the texel is written to outTexture.
+//
+// Weight index for output lane c (0..3), tap j and input slice i:
+//   weithTo + c * kernelHXW * input_arr_size + j * input_arr_size + i
+// with weithTo = gid.z * kernelHXW * input_arr_size * 4. Taps that fall outside
+// the input are resolved by the sampler's address::clamp_to_zero mode.
+
+#pragma mark - convAdd
+// 1x1 convolution: a single tap at posInInput per input slice (float).
+kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                              texture2d_array<float, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device float4 *weights [[buffer(1)]],
+                              const device float4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 1;
+
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+    float4 output = biase[gid.z];
+
+    float4 input;
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
+        output.x += dot(input, weight_x);
+
+        float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
+        output.y += dot(input, weight_y);
+
+        float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
+        output.z += dot(input, weight_z);
+
+        float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(input, weight_w);
+    }
+    float4 relu = fmax(output, 0.0);
+    outTexture.write(relu, gid.xy, gid.z);
+}
+
+// 3x3 convolution: nine taps spaced by param.dilationX / param.dilationY (float).
+kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                              texture2d_array<float, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device float4 *weights [[buffer(1)]],
+                              const device float4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+    const uint kernelHXW = 9;
+
+    uint input_arr_size = inTexture.get_array_size();
+
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+    float4 output = biase[gid.z];
+
+    ushort dilation_x = param.dilationX;
+    ushort dilation_y = param.dilationY;
+
+    float4 input[9];
+
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
+
+        input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+
+        input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
+
+        input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+
+        input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+
+        input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+
+        input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i);
+
+        input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+
+        input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i);
+
+        for (int j = 0; j < 9; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+
+            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+
+            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+
+            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
+    }
+    float4 relu = fmax(output, 0.0);
+    outTexture.write(relu, gid.xy, gid.z);
+}
+
+// 5x1 convolution: five taps along y spaced by param.dilationY (float).
+kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                              texture2d_array<float, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device float4 *weights [[buffer(1)]],
+                              const device float4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+    const uint kernelHXW = 5;
+
+    uint input_arr_size = inTexture.get_array_size();
+
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+    float4 output = biase[gid.z];
+
+    ushort dilation_y = param.dilationY;
+    float4 input[5];
+
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
+
+        input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+
+        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+
+        input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+
+        input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
+
+        for (int j = 0; j < 5; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+
+            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+
+            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+
+            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
+    }
+    float4 relu = fmax(output, 0.0);
+    outTexture.write(relu, gid.xy, gid.z);
+}
+
+// 1x5 convolution: five taps along x spaced by param.dilationX (float).
+kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                              texture2d_array<float, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device float4 *weights [[buffer(1)]],
+                              const device float4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+    const uint kernelHXW = 5;
+
+    uint input_arr_size = inTexture.get_array_size();
+
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+    float4 output = biase[gid.z];
+
+    ushort dilation_x = param.dilationX;
+    float4 input[5];
+
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
+
+        input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+
+        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+
+        input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+
+        input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
+
+        for (int j = 0; j < 5; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+
+            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+
+            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+
+            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
+    }
+    float4 relu = fmax(output, 0.0);
+    outTexture.write(relu, gid.xy, gid.z);
+}
+
+// Depthwise 3x3 convolution (float): reads only input slice gid.z, taps are at
+// fixed +/-1 pixel offsets, and weights are scalars at weithTo + c * kernelHXW + j.
+kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                        texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                        constant MetalConvParam &param [[buffer(0)]],
+                                        const device float *weights [[buffer(1)]],
+                                        const device float4 *biase [[buffer(2)]],
+                                        uint3 gid [[thread_position_in_grid]]) {
+
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z;
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint weithTo = gid.z * kernelHXW * 4;
+    float4 output = biase[gid.z];
+    float4 inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) {
+        float4 input = inputs[j];
+        output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+    }
+    float4 relu = fmax(output, 0.0);
+    outTexture.write(relu, gid.xy, gid.z);
+}
+
+#pragma mark - half
+
+// Half-precision variant of conv_add_relu_1x1; accumulation is done in float4.
+kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                   texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                   constant MetalConvParam &param [[buffer(0)]],
+                                   const device half4 *weights [[buffer(1)]],
+                                   const device half4 *biase [[buffer(2)]],
+                                   uint3 gid [[thread_position_in_grid]]) {
+
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 1;
+
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+    float4 output = float4(biase[gid.z]);
+
+    float4 input;
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input = float4(inTexture.sample(sample, float2(posInInput.x, posInInput.y), i));
+        float4 weight_x = float4(weights[weithTo + 0 * kernelHXW * input_arr_size + i]);
+        output.x += dot(input, weight_x);
+
+        float4 weight_y = float4(weights[weithTo + 1 * kernelHXW * input_arr_size + i]);
+        output.y += dot(input, weight_y);
+
+        float4 weight_z = float4(weights[weithTo + 2 * kernelHXW * input_arr_size + i]);
+        output.z += dot(input, weight_z);
+
+        float4 weight_w = float4(weights[weithTo + 3 * kernelHXW * input_arr_size + i]);
+        output.w += dot(input, weight_w);
+    }
+    float4 relu = fmax(output, 0.0);
+    outTexture.write(half4(relu), gid.xy, gid.z);
+}
+
+// Half-precision variant of conv_add_relu_3x3; accumulation is done in float4.
+kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                   texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                   constant MetalConvParam &param [[buffer(0)]],
+                                   const device half4 *weights [[buffer(1)]],
+                                   const device half4 *biase [[buffer(2)]],
+                                   uint3 gid [[thread_position_in_grid]]) {
+
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+    float4 output = float4(biase[gid.z]);
+
+    ushort dilation_x = param.dilationX;
+    ushort dilation_y = param.dilationY;
+
+    half4 input[9];
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+        input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i);
+        input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+        input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i);
+        for (int j = 0; j < 9; ++j) {
+            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(float4(input[j]), float4(weight_x));
+
+            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(float4(input[j]), float4(weight_y));
+
+            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(float4(input[j]), float4(weight_z));
+
+            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(float4(input[j]), float4(weight_w));
+        }
+    }
+    float4 relu = fmax(output, 0.0);
+    outTexture.write(half4(relu), gid.xy, gid.z);
+}
+
+// Half-precision variant of depthwise_conv_add_relu_3x3; accumulation is done
+// in float4 and the result is converted to half4 on write.
+kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                             texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                             constant MetalConvParam &param [[buffer(0)]],
+                                             const device half *weights [[buffer(1)]],
+                                             const device half4 *biase [[buffer(2)]],
+                                             uint3 gid [[thread_position_in_grid]]) {
+
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z;
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint weithTo = gid.z * kernelHXW * 4;
+    float4 output = float4(biase[gid.z]);
+    half4 inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) {
+        half4 input = inputs[j];
+        output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]);
+        output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]);
+        output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
+        output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
+    }
+    float4 relu = fmax(output, 0.0);
+    outTexture.write(half4(relu), gid.xy, gid.z);
+}
+
+// Half-precision variant of conv_add_relu_5x1; accumulation is done in float4.
+kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                   texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                   constant MetalConvParam &param [[buffer(0)]],
+                                   const device half4 *weights [[buffer(1)]],
+                                   const device half4 *biase [[buffer(2)]],
+                                   uint3 gid [[thread_position_in_grid]]) {
+
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+    const uint kernelHXW = 5;
+
+    uint input_arr_size = inTexture.get_array_size();
+
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+    float4 output = float4(biase[gid.z]);
+
+    ushort dilation_y = param.dilationY;
+    half4 input[5];
+
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
+
+        input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+
+        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+
+        input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+
+        input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
+
+        for (int j = 0; j < 5; ++j) {
+            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(float4(input[j]), float4(weight_x));
+
+            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(float4(input[j]), float4(weight_y));
+
+            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(float4(input[j]), float4(weight_z));
+
+            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(float4(input[j]), float4(weight_w));
+        }
+    }
+    float4 relu = fmax(output, 0.0);
+    outTexture.write(half4(relu), gid.xy, gid.z);
+}
+
+// Half-precision variant of conv_add_relu_1x5; accumulation is done in float4.
+kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                   texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                   constant MetalConvParam &param [[buffer(0)]],
+                                   const device half4 *weights [[buffer(1)]],
+                                   const device half4 *biase [[buffer(2)]],
+                                   uint3 gid [[thread_position_in_grid]]) {
+
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+    const uint kernelHXW = 5;
+
+    uint input_arr_size = inTexture.get_array_size();
+
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+    float4 output = float4(biase[gid.z]);
+
+    ushort dilation_x = param.dilationX;
+    half4 input[5];
+
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
+
+        input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+
+        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+
+        input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+
+        input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
+
+        for (int j = 0; j < 5; ++j) {
+            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(float4(input[j]), float4(weight_x));
+
+            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(float4(input[j]), float4(weight_y));
+
+            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(float4(input[j]), float4(weight_z));
+
+            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(float4(input[j]), float4(weight_w));
+        }
+    }
+    float4 relu = fmax(output, 0.0);
+    outTexture.write(half4(relu), gid.xy, gid.z);
+}
diff --git a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj index 4b3dd397de..8f258aef45 100644 --- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj +++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj @@ -7,6 +7,8 @@ objects = { /* Begin PBXBuildFile section */ + 165F38D32276CDEA0088E29F /* ConvAddReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 165F38D22276CDEA0088E29F /* ConvAddReluOp.swift */; }; + 165F38D52276CE7D0088E29F /* ConvAddReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 165F38D42276CE7D0088E29F /* ConvAddReluKernel.swift */; }; 456BB7B421F5B356001474E2 /* Framework.pbobjc.m in Sources */ = {isa = PBXBuildFile; fileRef = 456BB7B221F5B356001474E2 /* Framework.pbobjc.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc"; }; }; 456BB7B521F5B356001474E2 /* Framework.pbobjc.h in Headers */ = {isa = PBXBuildFile; fileRef = 456BB7B321F5B356001474E2 /* Framework.pbobjc.h */; settings = {ATTRIBUTES = (Public, ); }; }; 4AA1EA862146625E00D0F791 /* BilinearInterpOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA852146625E00D0F791 /* BilinearInterpOp.swift */; }; @@ -101,6 +103,8 @@ /* End PBXBuildFile section */ /* Begin PBXFileReference section */ + 165F38D22276CDEA0088E29F /* ConvAddReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddReluOp.swift; sourceTree = ""; }; + 165F38D42276CE7D0088E29F /* ConvAddReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddReluKernel.swift; sourceTree = ""; }; 456BB7B221F5B356001474E2 /* Framework.pbobjc.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = Framework.pbobjc.m; sourceTree = ""; }; 456BB7B321F5B356001474E2 /* Framework.pbobjc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.c.h; path = Framework.pbobjc.h; sourceTree = ""; }; 4AA1EA852146625E00D0F791 /* BilinearInterpOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = BilinearInterpOp.swift; sourceTree = ""; }; @@ -323,6 +327,7 @@ FC803BBE214CB65A0094B8E5 /* ConvAddPreluOp.swift */, FCE3A1A82153DE5100C37CDE /* ConvAddAddPreluOp.swift */, FCE3A1AC2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift */, + 165F38D22276CDEA0088E29F /* ConvAddReluOp.swift */, ); path = Operators; sourceTree = ""; @@ -377,6 +382,7 @@ FCE3A1AE2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift */, FC2BFD4521DF685F00C262B2 /* Scale.swift */, FCB40E5821E0DCAB0075EC91 /* FetchKernel.swift */, + 165F38D42276CE7D0088E29F /* ConvAddReluKernel.swift */, ); path = Kernels; sourceTree = ""; @@ -541,6 +547,7 @@ FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */, 4AA1EA8C2146640900D0F791 /* SplitOp.swift in Sources */, FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */, + 165F38D52276CE7D0088E29F /* ConvAddReluKernel.swift in Sources */, FC803BBF214CB65A0094B8E5 /* ConvAddPreluOp.swift in Sources */, FCEB684C212F093800D2448E /* PreluOp.swift in Sources */, FC60DB8920E9AAA500FF203F /* MetalExtension.swift in Sources */, @@ -592,6 +599,7 @@ FC9D038220E2312E000F735A /* FetchOp.swift in Sources */, FC039BBD20E11CC20081E9F8 /* Program.swift in Sources */, FC039BA220E11CB70081E9F8 /* Loader.swift in Sources */, + 165F38D32276CDEA0088E29F /* ConvAddReluOp.swift in Sources */, FCBCCC67212306B000D94F7E /* ConcatOp.swift in Sources */, FCD04E6C20F31A280007374F /* SoftmaxKernel.swift in Sources */, FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */, diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift index a2f4104b9b..bbd726cc0c 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift +++ 
b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift @@ -68,7 +68,8 @@ class OpCreator { gConvAddPreluType : ConvAddPreluOp

.creat, gConvAddAddPreluType : ConvAddAddPreluOp

.creat, gElementwiseAddPreluType : ElementwiseAddPreluOp

.creat, - gFusionConvAddType : ConvAddOp

.creat] + gFusionConvAddType : ConvAddOp

.creat, + gConvAddReluType : ConvAddReluOp

.creat] private init(){} } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift index d6ba07add3..2e286b2ff2 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift @@ -173,6 +173,7 @@ let gBilinearInterpType = "bilinear_interp" let gSplit = "split" let gShape = "shape" let gFlatten = "flatten" +let gConvAddReluType = "conv_add_relu" let gConvAddPreluType = "conv_add_prelu" let gConvAddAddPreluType = "conv_add_add_prelu" let gElementwiseAddPreluType = "elementwise_add_prelu" diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift new file mode 100644 index 0000000000..91139cc1ce --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ConvAddReluOp: Operator, ConvAddParam

>, Runable, Creator, InferShaperable, Fusion { + typealias OpType = ConvAddReluOp

+ + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gElementwiseAddType) + --> Node.init(inType: gReluType) + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gConvAddReluType + } + + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..: Kernel, Computable { let identifyingKey: String = getUniqueKey() required init(device: MTLDevice, param: ConvAddParam

, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) - let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1)/2 - Int(param.paddings[1]) - let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1)/2 - Int(param.paddings[0]) - - let key = identifyingKey - - if initContext.useMPS { // 使用 apple 的 MetalPerformanceShaders - if #available(iOS 11.0, *) { - var desc: MPSCNNConvolutionDescriptor? - // 如果不是 depth wise, 并且输入输出 tensor channel 都大于 4 - let isDepthWise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1] - if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 { - if isDepthWise { - desc = MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3], - kernelHeight: param.filter.tensorDim[2], - inputFeatureChannels: param.input.tensorDim[1], - outputFeatureChannels: param.output.tensorDim[1], - neuronFilter: nil) - } else { - desc = MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3], - kernelHeight: param.filter.tensorDim[2], - inputFeatureChannels: param.input.tensorDim[1], - outputFeatureChannels: param.output.tensorDim[1], - neuronFilter: nil) - } - } - desc?.strideInPixelsX = Int(param.stride[0]) - desc?.strideInPixelsY = Int(param.stride[1]) - if let inDesc = desc { - let _ = param.filter.convert(converter: MPSPointerConverter

.init()) - let dataSource = ConvDataSource.init(inDesc: inDesc, inWeights: param.filter, inBiasTerms: param.y) - let conv = MPSCNNConvolution.init(device: device, weights: dataSource) - conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0) - conv.edgeMode = .zero - convDic[key] = conv - super.init(device: device, inFunctionName: nil, initContext: initContext) - return - } + var shouldUseMPS = false + if #available(iOS 11.0, *), initContext.useMPS { + // 输入输出 tensor channel 必须都大于 4 + if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 { + shouldUseMPS = true } } - let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1]) - param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC) - param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - - if GlobalConfig.shared.computePrecision == .Float16 { - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_1x1_half", initContext: initContext) - } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half", initContext: initContext) - } else if param.filter.width == 3 && param.filter.height == 3 { - super.init(device: device, inFunctionName: "conv_add_3x3_half", initContext: initContext) - } else if param.filter.width == 1 && param.filter.height == 5 { - super.init(device: device, inFunctionName: "conv_add_5x1_half", initContext: initContext) - } else if param.filter.width == 5 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_1x5_half", initContext: initContext) - } else { - fatalError(" unsupport yet ") - } - } else if GlobalConfig.shared.computePrecision == .Float32 { - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_1x1", initContext: 
initContext) - } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3", initContext: initContext) - } else if param.filter.width == 1 && param.filter.height == 5 { - super.init(device: device, inFunctionName: "conv_add_5x1", initContext: initContext) - } else if param.filter.width == 5 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_1x5", initContext: initContext) - } else if param.filter.width == 3 && param.filter.height == 3 { - super.init(device: device, inFunctionName: "conv_add_3x3", initContext: initContext) - } else { + if shouldUseMPS { + super.init(device: device, inFunctionName: nil, initContext: initContext) + setupWithMPS(device: device, param: param) + } else { + let functionName = type(of: self).kernelFunctionName(param: param) + if functionName == nil { fatalError(" unsupport yet ") } - } else { - fatalError() + super.init(device: device, inFunctionName: functionName, initContext: initContext) + setupWithoutMPS(device: device, param: param) } - - // print(" function: \(functionName)") - // print("offset x: \(offsetX)") - // print("offset y: \(offsetY)") - - let offsetZ = 0.0 - let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) - // print("metal param: ") - // print(inMetalParam) - - metalParam = inMetalParam } func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam

) throws { @@ -200,7 +132,7 @@ class ConvAddKernel: Kernel, Computable { let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1]) let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1]) conv.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage) - return; + return } } @@ -221,5 +153,85 @@ class ConvAddKernel: Kernel, Computable { convDic.removeValue(forKey: identifyingKey) } } + + func setupWithMPS(device: MTLDevice, param: ConvAddParam

) { + let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0]) + let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1]) + + let key = identifyingKey + + let isDepthWise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1] + if #available(iOS 11.0, *) { + let desc: MPSCNNConvolutionDescriptor = isDepthWise ? + MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3], + kernelHeight: param.filter.tensorDim[2], + inputFeatureChannels: param.input.tensorDim[1], + outputFeatureChannels: param.output.tensorDim[1], + neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) : + MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3], + kernelHeight: param.filter.tensorDim[2], + inputFeatureChannels: param.input.tensorDim[1], + outputFeatureChannels: param.output.tensorDim[1], + neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) + desc.strideInPixelsX = Int(param.stride[0]) + desc.strideInPixelsY = Int(param.stride[1]) + let _ = param.filter.convert(converter: MPSPointerConverter

.init()) + let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: param.y) + let conv = MPSCNNConvolution.init(device: device, weights: dataSource) + conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0) + conv.edgeMode = .zero + convDic[key] = conv + } + } + + func setupWithoutMPS(device: MTLDevice, param: ConvAddParam

) { + let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0]) + let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1]) + let offsetZ = 0.0 + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + metalParam = inMetalParam + + let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1]) + param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC) + param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + } + + open class func kernelFunctionName(param: ConvAddParam

) -> String? { + if GlobalConfig.shared.computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + return "conv_add_1x1_half" + } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] { + return "depthwise_conv_add_3x3_half" + } else if param.filter.width == 3 && param.filter.height == 3 { + return "conv_add_3x3_half" + } else if param.filter.width == 1 && param.filter.height == 5 { + return "conv_add_5x1_half" + } else if param.filter.width == 5 && param.filter.height == 1 { + return "conv_add_1x5_half" + } else { + return nil + } + } else if GlobalConfig.shared.computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + return "conv_add_1x1" + } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] { + return "depthwise_conv_add_3x3" + } else if param.filter.width == 1 && param.filter.height == 5 { + return "conv_add_5x1" + } else if param.filter.width == 5 && param.filter.height == 1 { + return "conv_add_1x5" + } else if param.filter.width == 3 && param.filter.height == 3 { + return "conv_add_3x3" + } else { + return nil + } + } else { + return nil + } + } + + func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? { + return nil + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift new file mode 100644 index 0000000000..9582f9c4a4 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift @@ -0,0 +1,53 @@ +// +// ConvAddReluKernel.swift +// paddle-mobile +// +// Created by Yang,Yanzhan on 2019/4/29. +// Copyright © 2019 orange. All rights reserved. +// + +import Foundation +import MetalPerformanceShaders + +class ConvAddReluKernel: ConvAddKernel

{ + override class func kernelFunctionName(param: ConvAddParam

) -> String? { + if GlobalConfig.shared.computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + return "conv_add_relu_1x1_half" + } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] { + return "depthwise_conv_add_relu_3x3_half" + } else if param.filter.width == 3 && param.filter.height == 3 { + return "conv_add_relu_3x3_half" + } else if param.filter.width == 1 && param.filter.height == 5 { + return "conv_add_relu_5x1_half" + } else if param.filter.width == 5 && param.filter.height == 1 { + return "conv_add_relu_1x5_half" + } else { + return nil + } + } else if GlobalConfig.shared.computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + return "conv_add_relu_1x1" + } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] { + return "depthwise_conv_add_relu_3x3" + } else if param.filter.width == 1 && param.filter.height == 5 { + return "conv_add_relu_5x1" + } else if param.filter.width == 5 && param.filter.height == 1 { + return "conv_add_relu_1x5" + } else if param.filter.width == 3 && param.filter.height == 3 { + return "conv_add_relu_3x3" + } else { + return nil + } + } else { + return nil + } + } + + override func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? { + if #available(iOS 10.0, *) { + return MPSCNNNeuronReLU(device: device, a: 0) + } + return nil + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift index 8637f7683c..afdf435a21 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift @@ -184,6 +184,7 @@ extension Node: Equatable { class ProgramOptimize { // register fusion let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp

.self, + ConvAddReluOp

.self, // ConvAddAddPreluOp

.self, ConvAddPreluOp

.self, ConvAddOp

.self, -- GitLab