未验证 提交 b53a20aa 编写于 作者: Y Yanzhan Yang 提交者: GitHub

1.merge conv, conv add, conv add relu into one implementation. (#1661)

上级 881dcb2e
......@@ -40,7 +40,6 @@
FCC15DFD221E69E100DC3CB2 /* Elementwise.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD4221E69DF00DC3CB2 /* Elementwise.metal */; };
FCC15DFE221E69E100DC3CB2 /* ReshapeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD5221E69DF00DC3CB2 /* ReshapeKernel.metal */; };
FCC15DFF221E69E100DC3CB2 /* Scale.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD6221E69DF00DC3CB2 /* Scale.metal */; };
FCC15E00221E69E100DC3CB2 /* ConvKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD7221E69DF00DC3CB2 /* ConvKernel.metal */; };
FCC15E01221E69E100DC3CB2 /* PriorBoxKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD8221E69DF00DC3CB2 /* PriorBoxKernel.metal */; };
FCC15E02221E69E100DC3CB2 /* BatchNormRelu.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD9221E69E000DC3CB2 /* BatchNormRelu.metal */; };
FCC15E03221E69E100DC3CB2 /* TransposeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DDA221E69E000DC3CB2 /* TransposeKernel.metal */; };
......@@ -53,7 +52,6 @@
FCC15E0A221E69E100DC3CB2 /* ElementwiseAddPreluKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DE1221E69E100DC3CB2 /* ElementwiseAddPreluKernel.inc.metal */; };
FCC15E0B221E69E100DC3CB2 /* FetchKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DE2221E69E100DC3CB2 /* FetchKernel.inc.metal */; };
FCC15E0C221E69E100DC3CB2 /* BufferToTexture.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DE3221E69E100DC3CB2 /* BufferToTexture.metal */; };
FCC15E0D221E69E100DC3CB2 /* ConvAddMetal.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DE4221E69E100DC3CB2 /* ConvAddMetal.metal */; };
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
......@@ -93,7 +91,6 @@
FCC15DD4221E69DF00DC3CB2 /* Elementwise.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Elementwise.metal; sourceTree = "<group>"; };
FCC15DD5221E69DF00DC3CB2 /* ReshapeKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ReshapeKernel.metal; sourceTree = "<group>"; };
FCC15DD6221E69DF00DC3CB2 /* Scale.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Scale.metal; sourceTree = "<group>"; };
FCC15DD7221E69DF00DC3CB2 /* ConvKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ConvKernel.metal; sourceTree = "<group>"; };
FCC15DD8221E69DF00DC3CB2 /* PriorBoxKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = PriorBoxKernel.metal; sourceTree = "<group>"; };
FCC15DD9221E69E000DC3CB2 /* BatchNormRelu.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BatchNormRelu.metal; sourceTree = "<group>"; };
FCC15DDA221E69E000DC3CB2 /* TransposeKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = TransposeKernel.metal; sourceTree = "<group>"; };
......@@ -106,7 +103,6 @@
FCC15DE1221E69E100DC3CB2 /* ElementwiseAddPreluKernel.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ElementwiseAddPreluKernel.inc.metal; sourceTree = "<group>"; };
FCC15DE2221E69E100DC3CB2 /* FetchKernel.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = FetchKernel.inc.metal; sourceTree = "<group>"; };
FCC15DE3221E69E100DC3CB2 /* BufferToTexture.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BufferToTexture.metal; sourceTree = "<group>"; };
FCC15DE4221E69E100DC3CB2 /* ConvAddMetal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ConvAddMetal.metal; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
......@@ -170,11 +166,9 @@
FCC15DE0221E69E100DC3CB2 /* ConcatKernel.inc.metal */,
FCC15DCA221E69DE00DC3CB2 /* ConcatKernel.metal */,
FCC15DBE221E69DD00DC3CB2 /* ConvAddBNReluKernel.metal */,
FCC15DE4221E69E100DC3CB2 /* ConvAddMetal.metal */,
FCC15DDB221E69E000DC3CB2 /* ConvAddPrelu.inc.metal */,
FCC15DD3221E69DF00DC3CB2 /* ConvAddPreluKernel.metal */,
FCC15DCF221E69DE00DC3CB2 /* ConvBNReluKernel.metal */,
FCC15DD7221E69DF00DC3CB2 /* ConvKernel.metal */,
FCC15DC5221E69DE00DC3CB2 /* ConvTransposeKernel.metal */,
FCC15DD4221E69DF00DC3CB2 /* Elementwise.metal */,
FCC15DE1221E69E100DC3CB2 /* ElementwiseAddPreluKernel.inc.metal */,
......@@ -309,12 +303,10 @@
FCC15E03221E69E100DC3CB2 /* TransposeKernel.metal in Sources */,
FCC15DFE221E69E100DC3CB2 /* ReshapeKernel.metal in Sources */,
16FBFB3E22925D040025B406 /* ActivationKernel.metal in Sources */,
FCC15E0D221E69E100DC3CB2 /* ConvAddMetal.metal in Sources */,
FCC15DF7221E69E100DC3CB2 /* ReshapeKernel.inc.metal in Sources */,
FCC15DE5221E69E100DC3CB2 /* ReluKernel.metal in Sources */,
FCC15DEF221E69E100DC3CB2 /* Macro.metal in Sources */,
FCC15E02221E69E100DC3CB2 /* BatchNormRelu.metal in Sources */,
FCC15E00221E69E100DC3CB2 /* ConvKernel.metal in Sources */,
FCC15E01221E69E100DC3CB2 /* PriorBoxKernel.metal in Sources */,
FCC15DEA221E69E100DC3CB2 /* ElementwiseAddPreluKernel.metal in Sources */,
FCC15DED221E69E100DC3CB2 /* PoolKernel.inc.metal in Sources */,
......
......@@ -13,7 +13,6 @@
limitations under the License. */
#include <metal_stdlib>
#include <metal_math>
using namespace metal;
kernel void exp(texture2d_array<float, access::sample> inTexture [[texture(0)]],
......
......@@ -120,4 +120,6 @@ struct MetalConvParam {
ushort iC;
ushort fC;
ushort oC;
ushort hasAddOp;
ushort hasReluOp;
};
......@@ -40,7 +40,7 @@ kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = biase[gid.z];
float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
......@@ -57,7 +57,7 @@ kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
float4 relu = fmax(output, 0.0);
float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(relu, gid.xy, gid.z);
}
......@@ -85,7 +85,7 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = biase[gid.z];
float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
......@@ -125,7 +125,7 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [
output.w += dot(input[j], weight_w);
}
}
float4 relu = fmax(output, 0.0);
float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(relu, gid.xy, gid.z);
}
......@@ -148,7 +148,7 @@ kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTex
const uint kernelHXW = 9;
float4 output = biase[gid.z];
float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
......@@ -180,7 +180,7 @@ kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTex
}
}
float4 relu = fmax(output, 0.0);
float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(relu, gid.xy, gid.z);
}
......@@ -208,7 +208,7 @@ kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = biase[gid.z];
float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_y = param.dilationY;
float4 input[5];
......@@ -238,7 +238,7 @@ kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [
output.w += dot(input[j], weight_w);
}
}
float4 relu = fmax(output, 0.0);
float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(relu, gid.xy, gid.z);
}
......@@ -266,7 +266,7 @@ kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = biase[gid.z];
float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_x = param.dilationX;
float4 input[5];
......@@ -296,7 +296,7 @@ kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [
output.w += dot(input[j], weight_w);
}
}
float4 relu = fmax(output, 0.0);
float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(relu, gid.xy, gid.z);
}
......@@ -318,7 +318,7 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> i
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = biase[gid.z];
float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
......@@ -336,7 +336,7 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> i
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
float4 relu = fmax(output, 0.0);
float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(relu, gid.xy, gid.z);
}
......@@ -364,7 +364,7 @@ kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTextu
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(biase[gid.z]);
float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
......@@ -381,7 +381,7 @@ kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTextu
float4 weight_w = float4(weights[weithTo + 3 * kernelHXW * input_arr_size + i]);
output.w += dot(input, weight_w);
}
float4 relu = fmax(output, 0.0);
float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(half4(relu), gid.xy, gid.z);
}
......@@ -406,7 +406,7 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(biase[gid.z]);
float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
......@@ -436,7 +436,7 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
output.w += dot(float4(input[j]), float4(weight_w));
}
}
float4 relu = fmax(output, 0.0);
float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(half4(relu), gid.xy, gid.z);
}
......@@ -459,7 +459,7 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> i
const uint kernelHXW = 9;
half4 output = biase[gid.z];
float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
......@@ -486,13 +486,13 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> i
input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), input_array_index)[input_array_item_index];
for (int j = 0; j < 9; ++j) {
half weight = weights[(output_c * kernelHXW + j) * filter_array_size * 4 + i];
output[c] += input[j] * weight;
output[c] += float(input[j]) * float(weight);
}
}
}
half4 relu = fmax(output, 0.0);
outTexture.write(relu, gid.xy, gid.z);
float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(half4(relu), gid.xy, gid.z);
}
kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
......@@ -512,7 +512,7 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sampl
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(biase[gid.z]);
float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
......@@ -530,8 +530,9 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sampl
output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
}
output = fmax(output, 0.0);
outTexture.write(half4(output), gid.xy, gid.z);
float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(half4(relu), gid.xy, gid.z);
}
kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, access::sample> inTexture [[texture(0)]],
......@@ -640,11 +641,18 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce
res[3][c] = T[7] - T[11] + T[15] + tmp1 - tmp2;
}
half4 base = biase[tc];
outTexture.write(fmax(res[0] + base, 0.0), uint2(tx, ty), tc);
outTexture.write(fmax(res[1] + base, 0.0), uint2(tx + 1, ty), tc);
outTexture.write(fmax(res[2] + base, 0.0), uint2(tx, ty + 1), tc);
outTexture.write(fmax(res[3] + base, 0.0), uint2(tx + 1, ty + 1), tc);
if (param.hasReluOp == 1) {
half4 base = biase[tc];
outTexture.write(fmax(res[0] + base, 0.0), uint2(tx, ty), tc);
outTexture.write(fmax(res[1] + base, 0.0), uint2(tx + 1, ty), tc);
outTexture.write(fmax(res[2] + base, 0.0), uint2(tx, ty + 1), tc);
outTexture.write(fmax(res[3] + base, 0.0), uint2(tx + 1, ty + 1), tc);
} else {
outTexture.write(res[0], uint2(tx, ty), tc);
outTexture.write(res[1], uint2(tx + 1, ty), tc);
outTexture.write(res[2], uint2(tx, ty + 1), tc);
outTexture.write(res[3], uint2(tx + 1, ty + 1), tc);
}
}
kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
......@@ -653,7 +661,6 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
......@@ -671,7 +678,7 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(biase[gid.z]);
float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_y = param.dilationY;
half4 input[5];
......@@ -701,7 +708,7 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu
output.w += dot(float4(input[j]), float4(weight_w));
}
}
float4 relu = fmax(output, 0.0);
float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(half4(relu), gid.xy, gid.z);
}
......@@ -729,7 +736,7 @@ kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTextu
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(biase[gid.z]);
float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_x = param.dilationX;
half4 input[5];
......@@ -759,6 +766,6 @@ kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTextu
output.w += dot(float4(input[j]), float4(weight_w));
}
}
float4 relu = fmax(output, 0.0);
float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(half4(relu), gid.xy, gid.z);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
// conv
#pragma mark -- conv
// 3x3 convolution, float precision. One thread produces one RGBA texel of the
// output: gid.xy is the spatial position, gid.z the output channel slice
// (4 output channels per slice). Input channels are spread across the array
// slices of inTexture; the kernel accumulates over all of them.
// NOTE(review): the 3x3 taps are hard-coded at +/-1, i.e. dilation 1 —
// param.dilationX/dilationY are not read by this kernel.
kernel void conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
// Guard threads dispatched beyond the output texture bounds.
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
// Top-left-relative source position for this output texel.
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
// clamp_to_zero gives implicit zero padding outside the input.
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
// Base offset of this output slice's weights; layout (sic "weithTo") is
// [outSlice][lane 0..3][tap 0..8][inSlice], each entry a float4 over 4 input channels.
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
// Sample the 3x3 neighborhood (row-major, top-left first) from input slice i.
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
// Each dot() contracts the 4 packed input channels of one tap with one
// output-channel lane's weights.
for (int j = 0; j < 9; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
outTexture.write(output, gid.xy, gid.z);
}
// Depthwise 3x3 convolution, float precision. Each input slice convolves with
// its own 3x3 filter and writes to the same output slice; there is no
// accumulation across slices. Taps are fixed at +/-1 (dilation 1).
kernel void depthwise_conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
// Drop out-of-bounds threads.
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
const uint slice = gid.z;
const ushort2 strides = ushort2(param.strideX, param.strideY);
const ushort2 center = ushort2(gid.xy) * strides + ushort2(param.offsetX, param.offsetY);
// clamp_to_zero provides implicit zero padding.
constexpr sampler smp(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint tapCount = 9;
// Weight layout: [slice][lane 0..3][tap 0..8], scalar floats.
const uint weightBase = slice * tapCount * 4;
float4 acc = float4(0.0);
// Walk the 3x3 neighborhood row-major, top-left first (tap 0..8).
uint tap = 0;
for (short dy = -1; dy <= 1; ++dy) {
for (short dx = -1; dx <= 1; ++dx) {
const float4 pixel = inTexture.sample(smp, float2(center.x + dx, center.y + dy), slice);
acc.x += pixel.x * weights[weightBase + 0 * tapCount + tap];
acc.y += pixel.y * weights[weightBase + 1 * tapCount + tap];
acc.z += pixel.z * weights[weightBase + 2 * tapCount + tap];
acc.w += pixel.w * weights[weightBase + 3 * tapCount + tap];
tap += 1;
}
}
outTexture.write(acc, gid.xy, gid.z);
}
// Pointwise (1x1) convolution, float precision. One thread computes one RGBA
// output texel by contracting a single input pixel across every input slice.
kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
// Drop out-of-bounds threads.
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
const ushort2 strides = ushort2(param.strideX, param.strideY);
const ushort2 srcPos = ushort2(gid.xy) * strides + ushort2(param.offsetX, param.offsetY);
constexpr sampler smp(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint sliceCount = inTexture.get_array_size();
// Weight layout: [outSlice][lane 0..3][inSlice]; kernelHXW == 1 folds away.
const uint weightBase = gid.z * sliceCount * 4;
float4 acc = float4(0.0);
for (uint slice = 0; slice < sliceCount; ++slice) {
const float4 pixel = inTexture.sample(smp, float2(srcPos.x, srcPos.y), slice);
acc.x += dot(pixel, weights[weightBase + 0 * sliceCount + slice]);
acc.y += dot(pixel, weights[weightBase + 1 * sliceCount + slice]);
acc.z += dot(pixel, weights[weightBase + 2 * sliceCount + slice]);
acc.w += dot(pixel, weights[weightBase + 3 * sliceCount + slice]);
}
outTexture.write(acc, gid.xy, gid.z);
}
// 3x3 convolution, half-precision textures/weights with float accumulation
// (inputs and weights are widened to float4 before each dot to limit
// precision loss). Same weight layout and tap order as conv_3x3.
// NOTE(review): taps are hard-coded at +/-1; param.dilationX/Y are unused here.
kernel void conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
// Guard threads dispatched beyond the output bounds.
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
// clamp_to_zero gives implicit zero padding outside the input.
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
// Weight base for this output slice; layout [outSlice][lane][tap][inSlice].
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
// Accumulate in float even though I/O is half.
float4 output = float4(0.0);
half4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
// 3x3 neighborhood, row-major from top-left.
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(float4(input[j]), float4(weight_x));
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(float4(input[j]), float4(weight_y));
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(float4(input[j]), float4(weight_z));
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(float4(input[j]), float4(weight_w));
}
}
// Narrow back to half for the output texture.
outTexture.write(half4(output), gid.xy, gid.z);
}
// Depthwise 3x3 convolution, half-precision I/O with float accumulation.
// Each slice convolves with its own filter; no cross-slice accumulation.
// Taps are fixed at +/-1 (dilation 1).
kernel void depthwise_conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
// Guard threads beyond the output bounds.
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
// Depthwise: input slice == output slice.
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
// clamp_to_zero provides implicit zero padding.
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
// Scalar weight layout: [slice][lane 0..3][tap 0..8].
uint weithTo = gid.z * kernelHXW * 4;
// Accumulate in float even though I/O is half.
float4 output = float4(0.0);
half4 inputs[9];
// 3x3 neighborhood, row-major from top-left.
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
half4 input = inputs[j];
// Widen each operand to float before the multiply-accumulate.
output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]);
output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]);
output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
}
// Narrow back to half for the output texture.
outTexture.write(half4(output), gid.xy, gid.z);
}
// Pointwise (1x1) convolution, half-precision textures/weights with float
// accumulation. One thread computes one RGBA output texel by contracting a
// single input pixel across every input slice.
kernel void conv_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
// Drop out-of-bounds threads.
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
const ushort2 strides = ushort2(param.strideX, param.strideY);
const ushort2 srcPos = ushort2(gid.xy) * strides + ushort2(param.offsetX, param.offsetY);
constexpr sampler smp(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint sliceCount = inTexture.get_array_size();
// Weight layout: [outSlice][lane 0..3][inSlice]; kernelHXW == 1 folds away.
const uint weightBase = gid.z * sliceCount * 4;
float4 acc = float4(0.0);
for (uint slice = 0; slice < sliceCount; ++slice) {
// Widen the sampled half4 once, then contract against each lane's weights.
const float4 pixel = float4(inTexture.sample(smp, float2(srcPos.x, srcPos.y), slice));
acc.x += dot(pixel, float4(weights[weightBase + 0 * sliceCount + slice]));
acc.y += dot(pixel, float4(weights[weightBase + 1 * sliceCount + slice]));
acc.z += dot(pixel, float4(weights[weightBase + 2 * sliceCount + slice]));
acc.w += dot(pixel, float4(weights[weightBase + 3 * sliceCount + slice]));
}
// Narrow back to half for the output texture.
outTexture.write(half4(acc), gid.xy, gid.z);
}
......@@ -325,7 +325,7 @@ public class PaddleMobileUnitTest {
let fC = 4
let oC = 4
let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1), groups: UInt16(groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1), groups: UInt16(groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0))
let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize)
......
......@@ -14,36 +14,7 @@
import Foundation
/// Parameters for the fused conv + elementwise-add operator.
/// All inputs and attributes are resolved from the op description and scope
/// at init time; a failed lookup aborts construction by rethrowing.
class ConvAddParam<P: PrecisionProtocol>: OpParam {
//typealias ParamPrecisionType = P
required init(opDesc: PMOpDesc, inScope: Scope) throws {
do {
filter = try ConvAddParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
input = try ConvAddParam.input(inputs: opDesc.inputs, from: inScope)
output = try ConvAddParam.outputOut(outputs: opDesc.outputs, from: inScope)
stride = try ConvAddParam.getAttr(key: "strides", attrs: opDesc.attrs)
paddings = try ConvAddParam.getAttr(key: "paddings", attrs: opDesc.attrs)
dilations = try ConvAddParam.getAttr(key: "dilations", attrs: opDesc.attrs)
groups = try ConvAddParam.getAttr(key: "groups", attrs: opDesc.attrs)
y = try ConvAddParam.inputY(inputs: opDesc.paraInputs, from: inScope)
} catch let error {
throw error
}
}
// Input feature map.
let input: Texture
// "Y" operand of the fused elementwise add (presumably the bias tensor —
// TODO(review): confirm against the kernel that consumes it).
let y: Tensor<P>
// Convolution filter weights.
let filter: Tensor<P>
// Output feature map (written by the kernel).
var output: Texture
// Convolution attributes as read from the op description.
let stride: [Int32]
let paddings: [Int32]
let dilations: [Int32]
let groups: Int
}
class ConvAddOp<P: PrecisionProtocol>: Operator<ConvAddKernel<P>, ConvAddParam<P>>, Runable, Creator, InferShaperable, Fusion{
class ConvAddOp<P: PrecisionProtocol>: Operator<ConvAddKernel<P>, ConvAddReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
typealias OpType = ConvAddOp<P>
static func fusionNode() -> Node {
......
......@@ -14,7 +14,40 @@
import Foundation
class ConvAddReluOp<P: PrecisionProtocol>: Operator<ConvAddReluKernel<P>, ConvAddParam<P>>, Runable, Creator, InferShaperable, Fusion {
/// Parameters for the fused conv + add (+ relu) family of operators.
///
/// `y` — the elementwise-add operand — is optional: when the fused graph has
/// no "Y" input, the lookup fails and `y` stays `nil`, which the kernels treat
/// as "no add op".
class ConvAddReluParam<P: PrecisionProtocol>: OpParam {
    required init(opDesc: PMOpDesc, inScope: Scope) throws {
        // A failed lookup of any required input/attribute propagates to the
        // caller directly; the previous `do { … } catch { throw error }`
        // wrapper was a no-op rethrow and has been removed.
        filter = try ConvAddReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
        input = try ConvAddReluParam.input(inputs: opDesc.inputs, from: inScope)
        output = try ConvAddReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
        stride = try ConvAddReluParam.getAttr(key: "strides", attrs: opDesc.attrs)
        paddings = try ConvAddReluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
        dilations = try ConvAddReluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
        groups = try ConvAddReluParam.getAttr(key: "groups", attrs: opDesc.attrs)
        // Missing "Y" is not an error for this fused op: leave `y` nil.
        // `try?` replaces the earlier empty `catch {}` with the same semantics.
        y = try? ConvAddReluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
    }
    // Input feature map.
    let input: Texture
    // Optional elementwise-add operand; nil when the graph has no "Y" input.
    var y: Tensor<P>?
    // Convolution filter weights.
    let filter: Tensor<P>
    // Output feature map (written by the kernel).
    var output: Texture
    // Convolution attributes as read from the op description.
    let stride: [Int32]
    let paddings: [Int32]
    let dilations: [Int32]
    let groups: Int
    // Whether this param type carries a "Y" operand; subclasses may override.
    open class func hasY() -> Bool {
        return true
    }
}
class ConvAddReluOp<P: PrecisionProtocol>: Operator<ConvAddReluKernel<P>, ConvAddReluParam<P>>, Runable, Creator, InferShaperable, Fusion {
typealias OpType = ConvAddReluOp<P>
static func fusionNode() -> Node {
......@@ -69,4 +102,3 @@ class ConvAddReluOp<P: PrecisionProtocol>: Operator<ConvAddReluKernel<P>, ConvAd
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
}
}
......@@ -15,7 +15,6 @@
import Foundation
class ConvParam<P: PrecisionProtocol>: OpParam {
//typealias ParamPrecisionType = P
required init(opDesc: PMOpDesc, inScope: Scope) throws {
do {
filter = try ConvParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
......
......@@ -37,7 +37,7 @@ protocol KernelProtocol {
}
@objc open class Kernel: NSObject{
@objc open class Kernel: NSObject {
private var _pipline: MTLComputePipelineState? = nil
......
......@@ -135,7 +135,7 @@ class ConvAddAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0))
// print("metal param: ")
// print(inMetalParam)
......
......@@ -98,7 +98,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionProtocol>: Kernel, Computable, Test
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0))
var invs: [P] = []
let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
......
......@@ -15,234 +15,17 @@
import Foundation
import MetalPerformanceShaders
@available(iOS 11.0, *)
class ConvDataSource<P: PrecisionProtocol>: NSObject, MPSCNNConvolutionDataSource {
var _descriptor: MPSCNNConvolutionDescriptor
var _weightsTensor: Tensor<P>
var _biasTensor: Tensor<P>
var _biasTerms: UnsafeMutablePointer<Float>?
func load() -> Bool {
switch P.precisionType {
case .Float32:
_biasTerms = _biasTensor.data.pointer as? UnsafeMutablePointer<Float>
case .Float16:
_biasTerms = UnsafeMutablePointer<Float>.allocate(capacity: _biasTensor.data.count)
if let float16Point = _biasTensor.data.pointer as? UnsafeMutablePointer<Float16> {
float16to32(input: float16Point, output: _biasTerms!, count: _biasTensor.data.count)
}
}
class ConvAddKernel<P: PrecisionProtocol>: ConvAddReluKernel<P> {
override func hasAddOp() -> Bool {
return true
}
func purge() {
switch P.precisionType {
case .Float32:
return
case .Float16:
_biasTerms?.deinitialize(count: _biasTensor.data.count)
_biasTerms?.deallocate()
}
}
func label() -> String? {
return "conv_add_label"
}
func copy(with zone: NSZone? = nil) -> Any {
return self
}
init(inDesc: MPSCNNConvolutionDescriptor,
inWeights: Tensor<P>,
inBiasTerms: Tensor<P>) {
_descriptor = inDesc
_weightsTensor = inWeights
_biasTensor = inBiasTerms
super.init()
}
func descriptor() -> MPSCNNConvolutionDescriptor {
return _descriptor
}
func dataType() -> MPSDataType {
switch P.precisionType {
case .Float32:
return .float32
case .Float16:
return .float16
}
}
func weights() -> UnsafeMutableRawPointer {
return UnsafeMutableRawPointer.init(_weightsTensor.data.pointer)
}
func biasTerms() -> UnsafeMutablePointer<Float>? {
return _biasTerms
}
}
class ConvAddKernel<P: PrecisionProtocol>: Kernel, Computable {
var metalParam: MetalConvParam!
var mpsConvOp: Any?
required init(device: MTLDevice, param: ConvAddParam<P>, initContext: InitContext) throws {
do {
try param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
} catch let error {
throw error
}
var shouldUseMPS = false
let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggresiveOptimization)
if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggresiveOptimization) {
if initContext.useAggresiveOptimization {
if (param.input.tensorDim[1] == 1 || param.input.tensorDim[1] > 4) && (param.output.tensorDim[1] == 1 || param.output.tensorDim[1] > 4) {
shouldUseMPS = true
}
} else {
if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
shouldUseMPS = true
}
}
}
if type(of: self).isWinoGrad(functionName: functionName) {
shouldUseMPS = false
}
let isDepthWise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]
if !isDepthWise && param.groups > 1 {
shouldUseMPS = false
}
if shouldUseMPS {
super.init(device: device, inFunctionName: nil, initContext: initContext)
setupWithMPS(device: device, param: param)
} else {
if functionName == nil {
fatalError(" unsupport yet ")
}
super.init(device: device, inFunctionName: functionName, initContext: initContext)
setupWithoutMPS(device: device, param: param)
}
}
func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam<P>) throws {
if #available(iOS 10.0, *) {
if let conv = mpsConvOp as? MPSCNNConvolution {
let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1])
let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1])
conv.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage)
return
}
}
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture, groupDepth: type(of: self).isWinoGrad(functionName: functionName) ? 1 : nil)
encoder.endEncoding()
}
func setupWithMPS(device: MTLDevice, param: ConvAddParam<P>) {
let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
let isDepthWise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]
if #available(iOS 11.0, *) {
param.input.useMPS = true
param.output.useMPS = true
let desc: MPSCNNConvolutionDescriptor = isDepthWise ?
MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) :
MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron)
desc.strideInPixelsX = Int(param.stride[0])
desc.strideInPixelsY = Int(param.stride[1])
let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: param.y)
let conv = MPSCNNConvolution.init(device: device, weights: dataSource)
conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0)
conv.edgeMode = .zero
mpsConvOp = conv
}
}
func setupWithoutMPS(device: MTLDevice, param: ConvAddParam<P>) {
let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
let offsetZ = 0.0
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
metalParam = inMetalParam
if type(of: self).isWinoGrad(functionName: functionName) {
let _ = param.filter.convert(converter: WinogradPointerConverter<P>.init())
}
let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1])
param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC)
param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
}
open class func kernelFunctionName(param: ConvAddParam<P>, useAggressiveOptimization: Bool = false) -> String? {
if GlobalConfig.shared.computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 {
return "conv_add_1x1_half"
} else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
return "depthwise_conv_add_3x3_half"
} else if param.filter.width == 3 && param.filter.height == 3 {
return "conv_add_3x3_half"
} else if param.filter.width == 1 && param.filter.height == 5 {
return "conv_add_5x1_half"
} else if param.filter.width == 5 && param.filter.height == 1 {
return "conv_add_1x5_half"
} else {
return nil
}
} else if GlobalConfig.shared.computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 {
return "conv_add_1x1"
} else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
return "depthwise_conv_add_3x3"
} else if param.filter.width == 1 && param.filter.height == 5 {
return "conv_add_5x1"
} else if param.filter.width == 5 && param.filter.height == 1 {
return "conv_add_1x5"
} else if param.filter.width == 3 && param.filter.height == 3 {
return "conv_add_3x3"
} else {
return nil
}
} else {
return nil
}
override func hasReluOp() -> Bool {
return false
}
func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? {
override func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? {
return nil
}
open class func isWinoGrad(functionName: String?) -> Bool {
if let functionName = functionName {
return functionName.hasSuffix("winograd")
}
return false
}
}
......@@ -135,7 +135,7 @@ class ConvAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0))
// print("metal param: ")
// print(inMetalParam)
......
//
// ConvAddReluKernel.swift
// paddle-mobile
//
// Created by Yang,Yanzhan on 2019/4/29.
// Copyright © 2019 orange. All rights reserved.
//
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
import MetalPerformanceShaders
class ConvAddReluKernel<P: PrecisionProtocol>: ConvAddKernel<P> {
override class func kernelFunctionName(param: ConvAddParam<P>, useAggressiveOptimization: Bool = false) -> String? {
/// Host-side mirror of the `MetalConvParam` struct consumed by the conv shaders.
/// NOTE(review): field order and widths must match the Metal-side struct
/// byte-for-byte (it is copied with `setBytes`) — confirm against the .metal
/// sources before changing anything here.
public struct MetalConvParam {
// Filter-window offset applied to each output coordinate.
let offsetX: Int16
let offsetY: Int16
let offsetZ: Int16
// Convolution strides and dilations per axis.
let strideX: UInt16
let strideY: UInt16
let dilationX: UInt16
let dilationY: UInt16
// Number of convolution groups.
let groups: UInt16
// Channel counts: input (iC), filter (fC), output (oC).
let iC: UInt16
let fC: UInt16
let oC: UInt16
// 0/1 flags telling the unified shader whether to add bias / apply relu.
let hasAddOp: UInt16
let hasReluOp: UInt16
}
/// Supplies convolution weights and an optional bias to `MPSCNNConvolution`.
/// MPS calls `load()` before it needs the data and `purge()` when it is done;
/// bias terms must be Float32, so Float16 tensors are converted into an owned
/// buffer that this object must release in `purge()`.
@available(iOS 11.0, *)
class ConvDataSource<P: PrecisionProtocol>: NSObject, MPSCNNConvolutionDataSource {
    var _descriptor: MPSCNNConvolutionDescriptor
    var _weightsTensor: Tensor<P>
    var _biasTensor: Tensor<P>?
    var _biasTerms: UnsafeMutablePointer<Float>?

    func load() -> Bool {
        if let biasTensor = _biasTensor {
            switch P.precisionType {
            case .Float32:
                // Float32 bias is handed to MPS directly; the pointer aliases
                // the tensor's own storage and must never be freed here.
                _biasTerms = biasTensor.data.pointer as? UnsafeMutablePointer<Float>
            case .Float16:
                // Convert half-precision bias into an owned Float32 buffer.
                _biasTerms = UnsafeMutablePointer<Float>.allocate(capacity: biasTensor.data.count)
                if let float16Point = biasTensor.data.pointer as? UnsafeMutablePointer<Float16> {
                    float16to32(input: float16Point, output: _biasTerms!, count: biasTensor.data.count)
                }
            }
        }
        return true
    }

    func purge() {
        switch P.precisionType {
        case .Float32:
            // Pointer aliases tensor storage; nothing was allocated.
            return
        case .Float16:
            if let biasTensor = _biasTensor {
                _biasTerms?.deinitialize(count: biasTensor.data.count)
                _biasTerms?.deallocate()
                // Fix: clear the pointer so a repeated purge (or a later
                // load/purge cycle) cannot double-free or read a dangling
                // pointer through biasTerms().
                _biasTerms = nil
            }
        }
    }

    func label() -> String? {
        return "conv_add_label"
    }

    func copy(with zone: NSZone? = nil) -> Any {
        return self
    }

    /// - Parameter inBiasTerms: nil when the convolution has no fused add.
    init(inDesc: MPSCNNConvolutionDescriptor,
         inWeights: Tensor<P>,
         inBiasTerms: Tensor<P>?) {
        _descriptor = inDesc
        _weightsTensor = inWeights
        _biasTensor = inBiasTerms
        super.init()
    }

    func descriptor() -> MPSCNNConvolutionDescriptor {
        return _descriptor
    }

    func dataType() -> MPSDataType {
        switch P.precisionType {
        case .Float32:
            return .float32
        case .Float16:
            return .float16
        }
    }

    func weights() -> UnsafeMutableRawPointer {
        return UnsafeMutableRawPointer.init(_weightsTensor.data.pointer)
    }

    func biasTerms() -> UnsafeMutablePointer<Float>? {
        return _biasTerms
    }
}
class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
var metalParam: MetalConvParam!
var mpsConvOp: Any?
var blankTensor: Tensor<P>?
/// Initializes the fused conv(+add)(+relu) kernel, choosing between an MPS
/// convolution and a hand-written Metal compute shader.
required init(device: MTLDevice, param: ConvAddReluParam<P>, initContext: InitContext) throws {
do {
try param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
} catch let error {
throw error
}
var shouldUseMPS = false
let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggresiveOptimization)
// MPS is only considered on iOS 11+ and when the caller opted in.
if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggresiveOptimization) {
if initContext.useAggresiveOptimization {
// Aggressive mode additionally allows single-channel input/output.
if (param.input.tensorDim[1] == 1 || param.input.tensorDim[1] > 4) && (param.output.tensorDim[1] == 1 || param.output.tensorDim[1] > 4) {
shouldUseMPS = true
}
} else {
if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
shouldUseMPS = true
}
}
}
// Winograd shaders use a custom weight layout; keep them off the MPS path.
if type(of: self).isWinoGrad(functionName: functionName) {
shouldUseMPS = false
}
let isDepthWise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]
// Grouped (non-depthwise) convolution is not routed through MPS here.
if !isDepthWise && param.groups > 1 {
shouldUseMPS = false
}
if shouldUseMPS {
// No shader function needed: MPS does the encoding.
super.init(device: device, inFunctionName: nil, initContext: initContext)
setupWithMPS(device: device, param: param)
} else {
if functionName == nil {
fatalError(" unsupport yet ")
}
super.init(device: device, inFunctionName: functionName, initContext: initContext)
setupWithoutMPS(device: device, param: param)
}
}
/// Encodes this convolution into `commandBuffer`: via MPS when an MPS op was
/// configured at init time, otherwise via the custom compute pipeline.
func compute(commandBuffer: MTLCommandBuffer, param: ConvAddReluParam<P>) throws {
if #available(iOS 10.0, *) {
if let conv = mpsConvOp as? MPSCNNConvolution {
let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1])
let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1])
conv.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage)
return
}
}
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
// Texture/buffer indices below are a contract with the shader; keep in sync.
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
// Bind the bias when present, otherwise a small placeholder so buffer index 2
// is always valid — presumably gated by metalParam.hasAddOp in the shader;
// confirm against the .metal source.
if let y = param.y {
encoder.setBuffer(y.buffer, offset: 0, index: 2)
} else {
encoder.setBuffer(blankTensor?.buffer, offset: 0, index: 2)
}
// Winograd kernels dispatch with a fixed group depth of 1.
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture, groupDepth: type(of: self).isWinoGrad(functionName: functionName) ? 1 : nil)
encoder.endEncoding()
}
/// Builds the MPSCNNConvolution (depthwise or regular) used on the MPS path.
func setupWithMPS(device: MTLDevice, param: ConvAddReluParam<P>) {
// Offset = half the effective (dilated) kernel extent, minus padding.
let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
let isDepthWise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]
if #available(iOS 11.0, *) {
param.input.useMPS = true
param.output.useMPS = true
// The neuron filter (relu or nil) is fused into the descriptor.
let desc: MPSCNNConvolutionDescriptor = isDepthWise ?
MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) :
MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron)
desc.strideInPixelsX = Int(param.stride[0])
desc.strideInPixelsY = Int(param.stride[1])
// Weights are rearranged in place into the layout MPS expects.
let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
// param.y may be nil; ConvDataSource then reports no bias terms.
let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: param.y)
let conv = MPSCNNConvolution.init(device: device, weights: dataSource)
conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0)
conv.edgeMode = .zero
mpsConvOp = conv
}
}
/// Prepares `metalParam` and GPU buffers for the custom-shader path.
func setupWithoutMPS(device: MTLDevice, param: ConvAddReluParam<P>) {
// Offset = half the effective (dilated) kernel extent, minus padding.
let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
let offsetZ = 0.0
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
// hasAddOp/hasReluOp let one unified shader cover conv, conv+add, conv+add+relu.
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 1 : 0))
metalParam = inMetalParam
if type(of: self).isWinoGrad(functionName: functionName) {
// Winograd kernels expect pre-transformed weights.
let _ = param.filter.convert(converter: WinogradPointerConverter<P>.init())
}
let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1])
param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC)
if let y = param.y {
y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
} else {
// No bias input: allocate a tiny placeholder so buffer index 2 can be bound.
blankTensor = Tensor<P>.init(inDim: Dim(inDim: [1, 1, 1, 4]), inLayout: DataLayout.NHWC(), originDimsCount: 4)
blankTensor?.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
}
}
class func kernelFunctionName(param: ConvAddReluParam<P>, useAggressiveOptimization: Bool = false) -> String? {
if GlobalConfig.shared.computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 {
return "conv_add_relu_1x1_half"
......@@ -60,10 +277,27 @@ class ConvAddReluKernel<P: PrecisionProtocol>: ConvAddKernel<P> {
}
}
override func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? {
if #available(iOS 10.0, *) {
return MPSCNNNeuronReLU(device: device, a: 0)
/// Neuron to fuse into the MPS convolution descriptor: a ReLU when this
/// kernel applies relu, otherwise nil (also nil before iOS 10).
open func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? {
    guard hasReluOp() else { return nil }
    if #available(iOS 10.0, *) {
        return MPSCNNNeuronReLU(device: device, a: 0)
    }
    return nil
}
/// Whether the fused op includes the elementwise add (bias). Subclasses
/// override to reuse this kernel for plain conv / conv+add variants.
open func hasAddOp() -> Bool {
return true
}
/// Whether the fused op applies relu after the convolution. Subclasses
/// override to reuse this kernel for non-relu variants.
open func hasReluOp() -> Bool {
return true
}
/// A shader function is a Winograd kernel iff its name ends in "winograd";
/// nil names are never Winograd.
open class func isWinoGrad(functionName: String?) -> Bool {
    return functionName?.hasSuffix("winograd") ?? false
}
}
......@@ -105,7 +105,7 @@ class ConvBNReluKernel<P: PrecisionProtocol>: Kernel, Computable, Testable {
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0))
var invs: [P] = []
let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
......
......@@ -13,55 +13,188 @@
limitations under the License. */
import Foundation
/// Host-side mirror of the Metal shader's conv parameter struct.
/// NOTE(review): this copy lacks the hasAddOp/hasReluOp flags that the other
/// MetalConvParam definition in this file carries — it appears to be the
/// stale, pre-merge duplicate from the diff; confirm which one is current.
public struct MetalConvParam {
// Filter-window offset applied to each output coordinate.
let offsetX: Int16
let offsetY: Int16
let offsetZ: Int16
// Convolution strides and dilations per axis.
let strideX: UInt16
let strideY: UInt16
let dilationX: UInt16
let dilationY: UInt16
// Number of convolution groups.
let groups: UInt16
// Channel counts: input (iC), filter (fC), output (oC).
let iC: UInt16
let fC: UInt16
let oC: UInt16
}
import MetalPerformanceShaders
class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
var metalParam: MetalConvParam!
var mpsConvOp: Any?
var blankTensor: Tensor<P>?
required init(device: MTLDevice, param: ConvParam<P>, initContext: InitContext) throws {
param.filter.initBuffer(device: device, precision: Precision.Float32)
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_1x1", initContext: initContext)
} else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_3x3", initContext: initContext)
} else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_3x3", initContext: initContext)
} else {
fatalError(" unsupport ")
do {
try param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
} catch let error {
throw error
}
let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0])
let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1])
let offsetZ = 0.0
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
var shouldUseMPS = false
let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggresiveOptimization)
if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggresiveOptimization) {
if initContext.useAggresiveOptimization {
if (param.input.tensorDim[1] == 1 || param.input.tensorDim[1] > 4) && (param.output.tensorDim[1] == 1 || param.output.tensorDim[1] > 4) {
shouldUseMPS = true
}
} else {
if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
shouldUseMPS = true
}
}
}
if type(of: self).isWinoGrad(functionName: functionName) {
shouldUseMPS = false
}
let isDepthWise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]
if !isDepthWise && param.groups > 1 {
shouldUseMPS = false
}
if shouldUseMPS {
super.init(device: device, inFunctionName: nil, initContext: initContext)
setupWithMPS(device: device, param: param)
} else {
if functionName == nil {
fatalError(" unsupport yet ")
}
super.init(device: device, inFunctionName: functionName, initContext: initContext)
setupWithoutMPS(device: device, param: param)
}
}
/// Encodes the convolution: via MPS when configured, else via the custom shader.
func compute(commandBuffer: MTLCommandBuffer, param: ConvParam<P>) throws {
if #available(iOS 10.0, *) {
if let conv = mpsConvOp as? MPSCNNConvolution {
let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1])
let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1])
conv.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage)
return
}
}
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
// NOTE(review): two dispatch calls appear below, and buffer index 2 is bound
// only after the first one — this looks like a merge artifact (old dispatch
// left in place next to the new setBuffer + dispatch pair). Confirm only the
// second dispatch (with groupDepth) is intended and remove the first.
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.setBuffer(blankTensor?.buffer, offset: 0, index: 2)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture, groupDepth: type(of: self).isWinoGrad(functionName: functionName) ? 1 : nil)
encoder.endEncoding()
}
/// Builds the MPSCNNConvolution (depthwise or regular) for the MPS path.
/// Plain conv has no bias, so the data source gets nil bias terms.
func setupWithMPS(device: MTLDevice, param: ConvParam<P>) {
// Offset = half the effective (dilated) kernel extent, minus padding.
let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
let isDepthWise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]
if #available(iOS 11.0, *) {
param.input.useMPS = true
param.output.useMPS = true
let desc: MPSCNNConvolutionDescriptor = isDepthWise ?
MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) :
MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron)
desc.strideInPixelsX = Int(param.stride[0])
desc.strideInPixelsY = Int(param.stride[1])
// Weights are rearranged in place into the layout MPS expects.
let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: nil)
let conv = MPSCNNConvolution.init(device: device, weights: dataSource)
conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0)
conv.edgeMode = .zero
mpsConvOp = conv
}
}
/// Prepares `metalParam` and GPU buffers for the custom-shader path.
/// hasAddOp()/hasReluOp() are both false here, so the unified shader runs
/// as a plain convolution.
func setupWithoutMPS(device: MTLDevice, param: ConvParam<P>) {
// Offset = half the effective (dilated) kernel extent, minus padding.
let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
let offsetZ = 0.0
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 1 : 0))
metalParam = inMetalParam
if type(of: self).isWinoGrad(functionName: functionName) {
// Winograd kernels expect pre-transformed weights.
let _ = param.filter.convert(converter: WinogradPointerConverter<P>.init())
}
let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1])
param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC)
// Plain conv never has a bias input; always bind a placeholder at index 2.
blankTensor = Tensor<P>.init(inDim: Dim(inDim: [1, 1, 1, 4]), inLayout: DataLayout.NHWC(), originDimsCount: 4)
blankTensor?.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
}
/// Selects the Metal shader function for the given conv configuration, or
/// nil when no hand-written kernel matches (shader names are shared with the
/// fused conv_add_relu implementation; behavior is gated by MetalConvParam).
class func kernelFunctionName(param: ConvParam<P>, useAggressiveOptimization: Bool = false) -> String? {
    let filterW = param.filter.width
    let filterH = param.filter.height
    if GlobalConfig.shared.computePrecision == .Float16 {
        if filterW == 1 && filterH == 1 {
            return "conv_add_relu_1x1_half"
        }
        if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
            // Depthwise path; aggressive optimization may pick a Winograd variant
            // for unit-stride, unit-dilation 3x3 filters with 16 output channels.
            if useAggressiveOptimization {
                let couldUseWinograd = filterW == 3 && filterH == 3
                    && param.filter.n == 16 && param.stride[0] == 1 && param.stride[1] == 1
                    && param.dilations[0] == 1 && param.dilations[1] == 1
                if couldUseWinograd {
                    return "depthwise_conv_add_relu_3x3_half_winograd"
                }
            }
            return "depthwise_conv_add_relu_3x3_half"
        }
        if filterW == 3 && filterH == 3 {
            return param.groups == 1 ? "conv_add_relu_3x3_half" : "group_conv_add_relu_3x3_half"
        }
        if filterW == 1 && filterH == 5 {
            return "conv_add_relu_5x1_half"
        }
        if filterW == 5 && filterH == 1 {
            return "conv_add_relu_1x5_half"
        }
        return nil
    }
    if GlobalConfig.shared.computePrecision == .Float32 {
        if filterW == 1 && filterH == 1 {
            return "conv_add_relu_1x1"
        }
        if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
            return "depthwise_conv_add_relu_3x3"
        }
        if filterW == 1 && filterH == 5 {
            return "conv_add_relu_5x1"
        }
        if filterW == 5 && filterH == 1 {
            return "conv_add_relu_1x5"
        }
        if filterW == 3 && filterH == 3 {
            return param.groups == 1 ? "conv_add_relu_3x3" : "group_conv_add_relu_3x3"
        }
        return nil
    }
    return nil
}
/// Plain conv fuses no activation into the MPS descriptor.
open func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? {
return nil
}
/// Plain conv has no fused elementwise add (bias).
open func hasAddOp() -> Bool {
return false
}
/// Plain conv has no fused relu.
open func hasReluOp() -> Bool {
return false
}
/// A shader function is a Winograd kernel iff its name ends in "winograd";
/// nil names are never Winograd.
open class func isWinoGrad(functionName: String?) -> Bool {
    return functionName?.hasSuffix("winograd") ?? false
}
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册