提交 200be579 编写于 作者: Y yangyanzhan

fuse Conv-Add-Relu into one op.

上级 99cccf3b
......@@ -7,6 +7,7 @@
objects = {
/* Begin PBXBuildFile section */
165F38D72276F4C00088E29F /* ConvAddReluMetal.metal in Sources */ = {isa = PBXBuildFile; fileRef = 165F38D62276F4C00088E29F /* ConvAddReluMetal.metal */; };
5CCC0CF6759710BAFE999DB7 /* Pods_paddle_mobile_metallib.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 5D9D330A035906298947080B /* Pods_paddle_mobile_metallib.framework */; };
FCC15DE5221E69E100DC3CB2 /* ReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DBC221E69DD00DC3CB2 /* ReluKernel.metal */; };
FCC15DE6221E69E100DC3CB2 /* BoxCoder.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DBD221E69DD00DC3CB2 /* BoxCoder.metal */; };
......@@ -52,6 +53,7 @@
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
165F38D62276F4C00088E29F /* ConvAddReluMetal.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddReluMetal.metal; sourceTree = "<group>"; };
33511F4FF7FE78679BE12DC0 /* Pods-paddle-mobile-metallib.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile-metallib.release.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile-metallib/Pods-paddle-mobile-metallib.release.xcconfig"; sourceTree = "<group>"; };
5D9D330A035906298947080B /* Pods_paddle_mobile_metallib.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_paddle_mobile_metallib.framework; sourceTree = BUILT_PRODUCTS_DIR; };
C6D31B9F9533810DBCA6B28D /* Pods-paddle-mobile-metallib.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile-metallib.debug.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile-metallib/Pods-paddle-mobile-metallib.debug.xcconfig"; sourceTree = "<group>"; };
......@@ -190,6 +192,7 @@
FCC15DBF221E69DD00DC3CB2 /* Split.metal */,
FCC15DC9221E69DE00DC3CB2 /* TransposeKernel.inc.metal */,
FCC15DDA221E69E000DC3CB2 /* TransposeKernel.metal */,
165F38D62276F4C00088E29F /* ConvAddReluMetal.metal */,
);
path = "paddle-mobile-metallib";
sourceTree = "<group>";
......@@ -310,6 +313,7 @@
FCC15E08221E69E100DC3CB2 /* Split.inc.metal in Sources */,
FCC15DF4221E69E100DC3CB2 /* ResizeBilinear.metal in Sources */,
FCC15E05221E69E100DC3CB2 /* BatchNormKernel.metal in Sources */,
165F38D72276F4C00088E29F /* ConvAddReluMetal.metal in Sources */,
FCC15DE6221E69E100DC3CB2 /* BoxCoder.metal in Sources */,
FCC15DF6221E69E100DC3CB2 /* PoolKernel.metal in Sources */,
FCC15E09221E69E100DC3CB2 /* ConcatKernel.inc.metal in Sources */,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
#pragma mark - convAdd
kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = biase[gid.z];
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
float4 relu = fmax(output, 0.0);
outTexture.write(relu, gid.xy, gid.z);
}
kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = biase[gid.z];
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
float4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i);
for (int j = 0; j < 9; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
float4 relu = fmax(output, 0.0);
outTexture.write(relu, gid.xy, gid.z);
}
kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 5;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = biase[gid.z];
ushort dilation_y = param.dilationY;
float4 input[5];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
for (int j = 0; j < 5; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
float4 relu = fmax(output, 0.0);
outTexture.write(relu, gid.xy, gid.z);
}
kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 5;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = biase[gid.z];
ushort dilation_x = param.dilationX;
float4 input[5];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
for (int j = 0; j < 5; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
float4 relu = fmax(output, 0.0);
outTexture.write(relu, gid.xy, gid.z);
}
kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = biase[gid.z];
float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
float4 input = inputs[j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
float4 relu = fmax(output, 0.0);
outTexture.write(relu, gid.xy, gid.z);
}
#pragma mark - half
kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(biase[gid.z]);
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = float4(inTexture.sample(sample, float2(posInInput.x, posInInput.y), i));
float4 weight_x = float4(weights[weithTo + 0 * kernelHXW * input_arr_size + i]);
output.x += dot(input, weight_x);
float4 weight_y = float4(weights[weithTo + 1 * kernelHXW * input_arr_size + i]);
output.y += dot(input, weight_y);
float4 weight_z = float4(weights[weithTo + 2 * kernelHXW * input_arr_size + i]);
output.z += dot(input, weight_z);
float4 weight_w = float4(weights[weithTo + 3 * kernelHXW * input_arr_size + i]);
output.w += dot(input, weight_w);
}
float4 relu = fmax(output, 0.0);
outTexture.write(half4(relu), gid.xy, gid.z);
}
kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(biase[gid.z]);
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
half4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i);
for (int j = 0; j < 9; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(float4(input[j]), float4(weight_x));
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(float4(input[j]), float4(weight_y));
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(float4(input[j]), float4(weight_z));
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(float4(input[j]), float4(weight_w));
}
}
float4 relu = fmax(output, 0.0);
outTexture.write(half4(relu), gid.xy, gid.z);
}
kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(biase[gid.z]);
half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
half4 input = inputs[j];
output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]);
output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]);
output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
}
float4 relu = fmax(output, 0.0);
outTexture.write(half4(relu), gid.xy, gid.z);
}
kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 5;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(biase[gid.z]);
ushort dilation_y = param.dilationY;
half4 input[5];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
for (int j = 0; j < 5; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(float4(input[j]), float4(weight_x));
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(float4(input[j]), float4(weight_y));
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(float4(input[j]), float4(weight_z));
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(float4(input[j]), float4(weight_w));
}
}
float4 relu = fmax(output, 0.0);
outTexture.write(half4(relu), gid.xy, gid.z);
}
kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 5;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(biase[gid.z]);
ushort dilation_x = param.dilationX;
half4 input[5];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
for (int j = 0; j < 5; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(float4(input[j]), float4(weight_x));
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(float4(input[j]), float4(weight_y));
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(float4(input[j]), float4(weight_z));
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(float4(input[j]), float4(weight_w));
}
}
float4 relu = fmax(output, 0.0);
outTexture.write(half4(relu), gid.xy, gid.z);
}
......@@ -7,6 +7,8 @@
objects = {
/* Begin PBXBuildFile section */
165F38D32276CDEA0088E29F /* ConvAddReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 165F38D22276CDEA0088E29F /* ConvAddReluOp.swift */; };
165F38D52276CE7D0088E29F /* ConvAddReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 165F38D42276CE7D0088E29F /* ConvAddReluKernel.swift */; };
456BB7B421F5B356001474E2 /* Framework.pbobjc.m in Sources */ = {isa = PBXBuildFile; fileRef = 456BB7B221F5B356001474E2 /* Framework.pbobjc.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc"; }; };
456BB7B521F5B356001474E2 /* Framework.pbobjc.h in Headers */ = {isa = PBXBuildFile; fileRef = 456BB7B321F5B356001474E2 /* Framework.pbobjc.h */; settings = {ATTRIBUTES = (Public, ); }; };
4AA1EA862146625E00D0F791 /* BilinearInterpOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA852146625E00D0F791 /* BilinearInterpOp.swift */; };
......@@ -101,6 +103,8 @@
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
165F38D22276CDEA0088E29F /* ConvAddReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddReluOp.swift; sourceTree = "<group>"; };
165F38D42276CE7D0088E29F /* ConvAddReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddReluKernel.swift; sourceTree = "<group>"; };
456BB7B221F5B356001474E2 /* Framework.pbobjc.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = Framework.pbobjc.m; sourceTree = "<group>"; };
456BB7B321F5B356001474E2 /* Framework.pbobjc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Framework.pbobjc.h; sourceTree = "<group>"; };
4AA1EA852146625E00D0F791 /* BilinearInterpOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = BilinearInterpOp.swift; sourceTree = "<group>"; };
......@@ -323,6 +327,7 @@
FC803BBE214CB65A0094B8E5 /* ConvAddPreluOp.swift */,
FCE3A1A82153DE5100C37CDE /* ConvAddAddPreluOp.swift */,
FCE3A1AC2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift */,
165F38D22276CDEA0088E29F /* ConvAddReluOp.swift */,
);
path = Operators;
sourceTree = "<group>";
......@@ -377,6 +382,7 @@
FCE3A1AE2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift */,
FC2BFD4521DF685F00C262B2 /* Scale.swift */,
FCB40E5821E0DCAB0075EC91 /* FetchKernel.swift */,
165F38D42276CE7D0088E29F /* ConvAddReluKernel.swift */,
);
path = Kernels;
sourceTree = "<group>";
......@@ -541,6 +547,7 @@
FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */,
4AA1EA8C2146640900D0F791 /* SplitOp.swift in Sources */,
FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */,
165F38D52276CE7D0088E29F /* ConvAddReluKernel.swift in Sources */,
FC803BBF214CB65A0094B8E5 /* ConvAddPreluOp.swift in Sources */,
FCEB684C212F093800D2448E /* PreluOp.swift in Sources */,
FC60DB8920E9AAA500FF203F /* MetalExtension.swift in Sources */,
......@@ -592,6 +599,7 @@
FC9D038220E2312E000F735A /* FetchOp.swift in Sources */,
FC039BBD20E11CC20081E9F8 /* Program.swift in Sources */,
FC039BA220E11CB70081E9F8 /* Loader.swift in Sources */,
165F38D32276CDEA0088E29F /* ConvAddReluOp.swift in Sources */,
FCBCCC67212306B000D94F7E /* ConcatOp.swift in Sources */,
FCD04E6C20F31A280007374F /* SoftmaxKernel.swift in Sources */,
FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */,
......
......@@ -68,7 +68,8 @@ class OpCreator<P: PrecisionProtocol> {
gConvAddPreluType : ConvAddPreluOp<P>.creat,
gConvAddAddPreluType : ConvAddAddPreluOp<P>.creat,
gElementwiseAddPreluType : ElementwiseAddPreluOp<P>.creat,
gFusionConvAddType : ConvAddOp<P>.creat]
gFusionConvAddType : ConvAddOp<P>.creat,
gConvAddReluType : ConvAddReluOp<P>.creat]
private init(){}
}
......@@ -173,6 +173,7 @@ let gBilinearInterpType = "bilinear_interp"
let gSplit = "split"
let gShape = "shape"
let gFlatten = "flatten"
let gConvAddReluType = "conv_add_relu"
let gConvAddPreluType = "conv_add_prelu"
let gConvAddAddPreluType = "conv_add_add_prelu"
let gElementwiseAddPreluType = "elementwise_add_prelu"
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class ConvAddReluOp<P: PrecisionProtocol>: Operator<ConvAddReluKernel<P>, ConvAddParam<P>>, Runable, Creator, InferShaperable, Fusion {
typealias OpType = ConvAddReluOp<P>
static func fusionNode() -> Node {
let beginNode = Node.init(inType: gConvType)
_ = beginNode
--> Node.init(inType: gElementwiseAddType)
--> Node.init(inType: gReluType)
return beginNode
}
static func change() -> [String : [(from: String, to: String)]] {
return [:]
}
static func fusionType() -> String {
return gConvAddReluType
}
func inferShape() {
let inDims = para.input.dim
let filterDim = para.filter.dim
let strides = para.stride
let paddings = para.paddings
let dilations = para.dilations
var outDim = [inDims[0]]
for i in 0..<strides.count {
let dilation: Int = Int(dilations[i])
let filterSize: Int = filterDim[i + 1]
let inputSize: Int = inDims[i + 1]
let padding: Int = Int(paddings[i])
let stride: Int = Int(strides[i])
let dKernel = dilation * (filterSize - 1) + 1
let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
outDim.append(outputSize)
}
outDim.append(filterDim[0])
para.output.dim = Dim.init(inDim: outDim)
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
print(para.output.metalTexture)
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
}
}
......@@ -103,95 +103,27 @@ class ConvAddKernel<P: PrecisionProtocol>: Kernel, Computable {
let identifyingKey: String = getUniqueKey()
required init(device: MTLDevice, param: ConvAddParam<P>, initContext: InitContext) {
param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1)/2 - Int(param.paddings[1])
let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1)/2 - Int(param.paddings[0])
let key = identifyingKey
if initContext.useMPS { // 使用 apple 的 MetalPerformanceShaders
if #available(iOS 11.0, *) {
var desc: MPSCNNConvolutionDescriptor?
// 如果不是 depth wise, 并且输入输出 tensor channel 都大于 4
let isDepthWise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]
if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
if isDepthWise {
desc = MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: nil)
} else {
desc = MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: nil)
}
}
desc?.strideInPixelsX = Int(param.stride[0])
desc?.strideInPixelsY = Int(param.stride[1])
if let inDesc = desc {
let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
let dataSource = ConvDataSource.init(inDesc: inDesc, inWeights: param.filter, inBiasTerms: param.y)
let conv = MPSCNNConvolution.init(device: device, weights: dataSource)
conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0)
conv.edgeMode = .zero
convDic[key] = conv
super.init(device: device, inFunctionName: nil, initContext: initContext)
return
}
var shouldUseMPS = false
if #available(iOS 11.0, *), initContext.useMPS {
// 输入输出 tensor channel 必须都大于 4
if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
shouldUseMPS = true
}
}
let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1])
param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC)
param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
if GlobalConfig.shared.computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_1x1_half", initContext: initContext)
} else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half", initContext: initContext)
} else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_add_3x3_half", initContext: initContext)
} else if param.filter.width == 1 && param.filter.height == 5 {
super.init(device: device, inFunctionName: "conv_add_5x1_half", initContext: initContext)
} else if param.filter.width == 5 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_1x5_half", initContext: initContext)
} else {
fatalError(" unsupport yet ")
}
} else if GlobalConfig.shared.computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_1x1", initContext: initContext)
} else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3", initContext: initContext)
} else if param.filter.width == 1 && param.filter.height == 5 {
super.init(device: device, inFunctionName: "conv_add_5x1", initContext: initContext)
} else if param.filter.width == 5 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_1x5", initContext: initContext)
} else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_add_3x3", initContext: initContext)
} else {
if shouldUseMPS {
super.init(device: device, inFunctionName: nil, initContext: initContext)
setupWithMPS(device: device, param: param)
} else {
let functionName = type(of: self).kernelFunctionName(param: param)
if functionName == nil {
fatalError(" unsupport yet ")
}
} else {
fatalError()
super.init(device: device, inFunctionName: functionName, initContext: initContext)
setupWithoutMPS(device: device, param: param)
}
// print(" function: \(functionName)")
// print("offset x: \(offsetX)")
// print("offset y: \(offsetY)")
let offsetZ = 0.0
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
// print("metal param: ")
// print(inMetalParam)
metalParam = inMetalParam
}
func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam<P>) throws {
......@@ -200,7 +132,7 @@ class ConvAddKernel<P: PrecisionProtocol>: Kernel, Computable {
let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1])
let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1])
conv.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage)
return;
return
}
}
......@@ -221,5 +153,85 @@ class ConvAddKernel<P: PrecisionProtocol>: Kernel, Computable {
convDic.removeValue(forKey: identifyingKey)
}
}
func setupWithMPS(device: MTLDevice, param: ConvAddParam<P>) {
let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
let key = identifyingKey
let isDepthWise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]
if #available(iOS 11.0, *) {
let desc: MPSCNNConvolutionDescriptor = isDepthWise ?
MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) :
MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron)
desc.strideInPixelsX = Int(param.stride[0])
desc.strideInPixelsY = Int(param.stride[1])
let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: param.y)
let conv = MPSCNNConvolution.init(device: device, weights: dataSource)
conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0)
conv.edgeMode = .zero
convDic[key] = conv
}
}
func setupWithoutMPS(device: MTLDevice, param: ConvAddParam<P>) {
let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
let offsetZ = 0.0
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
metalParam = inMetalParam
let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1])
param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC)
param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
}
open class func kernelFunctionName(param: ConvAddParam<P>) -> String? {
if GlobalConfig.shared.computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 {
return "conv_add_1x1_half"
} else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
return "depthwise_conv_add_3x3_half"
} else if param.filter.width == 3 && param.filter.height == 3 {
return "conv_add_3x3_half"
} else if param.filter.width == 1 && param.filter.height == 5 {
return "conv_add_5x1_half"
} else if param.filter.width == 5 && param.filter.height == 1 {
return "conv_add_1x5_half"
} else {
return nil
}
} else if GlobalConfig.shared.computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 {
return "conv_add_1x1"
} else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
return "depthwise_conv_add_3x3"
} else if param.filter.width == 1 && param.filter.height == 5 {
return "conv_add_5x1"
} else if param.filter.width == 5 && param.filter.height == 1 {
return "conv_add_1x5"
} else if param.filter.width == 3 && param.filter.height == 3 {
return "conv_add_3x3"
} else {
return nil
}
} else {
return nil
}
}
func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? {
return nil
}
}
//
// ConvAddReluKernel.swift
// paddle-mobile
//
// Created by Yang,Yanzhan on 2019/4/29.
// Copyright © 2019 orange. All rights reserved.
//
import Foundation
import MetalPerformanceShaders
class ConvAddReluKernel<P: PrecisionProtocol>: ConvAddKernel<P> {
override class func kernelFunctionName(param: ConvAddParam<P>) -> String? {
if GlobalConfig.shared.computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 {
return "conv_add_relu_1x1_half"
} else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
return "depthwise_conv_add_relu_3x3_half"
} else if param.filter.width == 3 && param.filter.height == 3 {
return "conv_add_relu_3x3_half"
} else if param.filter.width == 1 && param.filter.height == 5 {
return "conv_add_relu_5x1_half"
} else if param.filter.width == 5 && param.filter.height == 1 {
return "conv_add_relu_1x5_half"
} else {
return nil
}
} else if GlobalConfig.shared.computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 {
return "conv_add_relu_1x1"
} else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
return "depthwise_conv_add_relu_3x3"
} else if param.filter.width == 1 && param.filter.height == 5 {
return "conv_add_relu_5x1"
} else if param.filter.width == 5 && param.filter.height == 1 {
return "conv_add_relu_1x5"
} else if param.filter.width == 3 && param.filter.height == 3 {
return "conv_add_relu_3x3"
} else {
return nil
}
} else {
return nil
}
}
override func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? {
if #available(iOS 10.0, *) {
return MPSCNNNeuronReLU(device: device, a: 0)
}
return nil
}
}
......@@ -184,6 +184,7 @@ extension Node: Equatable {
class ProgramOptimize<P: PrecisionProtocol> {
// register fusion
let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp<P>.self,
ConvAddReluOp<P>.self,
// ConvAddAddPreluOp<P>.self,
ConvAddPreluOp<P>.self,
ConvAddOp<P>.self,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册