提交 349857ce 编写于 作者: Y Yanzhan Yang 提交者: GitHub

1.merge conv, conv add, conv add relu into one implementation. (#1661)

上级 d6c620b9
...@@ -40,7 +40,6 @@ ...@@ -40,7 +40,6 @@
FCC15DFD221E69E100DC3CB2 /* Elementwise.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD4221E69DF00DC3CB2 /* Elementwise.metal */; }; FCC15DFD221E69E100DC3CB2 /* Elementwise.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD4221E69DF00DC3CB2 /* Elementwise.metal */; };
FCC15DFE221E69E100DC3CB2 /* ReshapeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD5221E69DF00DC3CB2 /* ReshapeKernel.metal */; }; FCC15DFE221E69E100DC3CB2 /* ReshapeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD5221E69DF00DC3CB2 /* ReshapeKernel.metal */; };
FCC15DFF221E69E100DC3CB2 /* Scale.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD6221E69DF00DC3CB2 /* Scale.metal */; }; FCC15DFF221E69E100DC3CB2 /* Scale.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD6221E69DF00DC3CB2 /* Scale.metal */; };
FCC15E00221E69E100DC3CB2 /* ConvKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD7221E69DF00DC3CB2 /* ConvKernel.metal */; };
FCC15E01221E69E100DC3CB2 /* PriorBoxKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD8221E69DF00DC3CB2 /* PriorBoxKernel.metal */; }; FCC15E01221E69E100DC3CB2 /* PriorBoxKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD8221E69DF00DC3CB2 /* PriorBoxKernel.metal */; };
FCC15E02221E69E100DC3CB2 /* BatchNormRelu.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD9221E69E000DC3CB2 /* BatchNormRelu.metal */; }; FCC15E02221E69E100DC3CB2 /* BatchNormRelu.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD9221E69E000DC3CB2 /* BatchNormRelu.metal */; };
FCC15E03221E69E100DC3CB2 /* TransposeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DDA221E69E000DC3CB2 /* TransposeKernel.metal */; }; FCC15E03221E69E100DC3CB2 /* TransposeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DDA221E69E000DC3CB2 /* TransposeKernel.metal */; };
...@@ -53,7 +52,6 @@ ...@@ -53,7 +52,6 @@
FCC15E0A221E69E100DC3CB2 /* ElementwiseAddPreluKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DE1221E69E100DC3CB2 /* ElementwiseAddPreluKernel.inc.metal */; }; FCC15E0A221E69E100DC3CB2 /* ElementwiseAddPreluKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DE1221E69E100DC3CB2 /* ElementwiseAddPreluKernel.inc.metal */; };
FCC15E0B221E69E100DC3CB2 /* FetchKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DE2221E69E100DC3CB2 /* FetchKernel.inc.metal */; }; FCC15E0B221E69E100DC3CB2 /* FetchKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DE2221E69E100DC3CB2 /* FetchKernel.inc.metal */; };
FCC15E0C221E69E100DC3CB2 /* BufferToTexture.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DE3221E69E100DC3CB2 /* BufferToTexture.metal */; }; FCC15E0C221E69E100DC3CB2 /* BufferToTexture.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DE3221E69E100DC3CB2 /* BufferToTexture.metal */; };
FCC15E0D221E69E100DC3CB2 /* ConvAddMetal.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DE4221E69E100DC3CB2 /* ConvAddMetal.metal */; };
/* End PBXBuildFile section */ /* End PBXBuildFile section */
/* Begin PBXFileReference section */ /* Begin PBXFileReference section */
...@@ -93,7 +91,6 @@ ...@@ -93,7 +91,6 @@
FCC15DD4221E69DF00DC3CB2 /* Elementwise.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Elementwise.metal; sourceTree = "<group>"; }; FCC15DD4221E69DF00DC3CB2 /* Elementwise.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Elementwise.metal; sourceTree = "<group>"; };
FCC15DD5221E69DF00DC3CB2 /* ReshapeKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ReshapeKernel.metal; sourceTree = "<group>"; }; FCC15DD5221E69DF00DC3CB2 /* ReshapeKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ReshapeKernel.metal; sourceTree = "<group>"; };
FCC15DD6221E69DF00DC3CB2 /* Scale.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Scale.metal; sourceTree = "<group>"; }; FCC15DD6221E69DF00DC3CB2 /* Scale.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Scale.metal; sourceTree = "<group>"; };
FCC15DD7221E69DF00DC3CB2 /* ConvKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ConvKernel.metal; sourceTree = "<group>"; };
FCC15DD8221E69DF00DC3CB2 /* PriorBoxKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = PriorBoxKernel.metal; sourceTree = "<group>"; }; FCC15DD8221E69DF00DC3CB2 /* PriorBoxKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = PriorBoxKernel.metal; sourceTree = "<group>"; };
FCC15DD9221E69E000DC3CB2 /* BatchNormRelu.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BatchNormRelu.metal; sourceTree = "<group>"; }; FCC15DD9221E69E000DC3CB2 /* BatchNormRelu.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BatchNormRelu.metal; sourceTree = "<group>"; };
FCC15DDA221E69E000DC3CB2 /* TransposeKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = TransposeKernel.metal; sourceTree = "<group>"; }; FCC15DDA221E69E000DC3CB2 /* TransposeKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = TransposeKernel.metal; sourceTree = "<group>"; };
...@@ -106,7 +103,6 @@ ...@@ -106,7 +103,6 @@
FCC15DE1221E69E100DC3CB2 /* ElementwiseAddPreluKernel.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ElementwiseAddPreluKernel.inc.metal; sourceTree = "<group>"; }; FCC15DE1221E69E100DC3CB2 /* ElementwiseAddPreluKernel.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ElementwiseAddPreluKernel.inc.metal; sourceTree = "<group>"; };
FCC15DE2221E69E100DC3CB2 /* FetchKernel.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = FetchKernel.inc.metal; sourceTree = "<group>"; }; FCC15DE2221E69E100DC3CB2 /* FetchKernel.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = FetchKernel.inc.metal; sourceTree = "<group>"; };
FCC15DE3221E69E100DC3CB2 /* BufferToTexture.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BufferToTexture.metal; sourceTree = "<group>"; }; FCC15DE3221E69E100DC3CB2 /* BufferToTexture.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BufferToTexture.metal; sourceTree = "<group>"; };
FCC15DE4221E69E100DC3CB2 /* ConvAddMetal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ConvAddMetal.metal; sourceTree = "<group>"; };
/* End PBXFileReference section */ /* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */ /* Begin PBXFrameworksBuildPhase section */
...@@ -170,11 +166,9 @@ ...@@ -170,11 +166,9 @@
FCC15DE0221E69E100DC3CB2 /* ConcatKernel.inc.metal */, FCC15DE0221E69E100DC3CB2 /* ConcatKernel.inc.metal */,
FCC15DCA221E69DE00DC3CB2 /* ConcatKernel.metal */, FCC15DCA221E69DE00DC3CB2 /* ConcatKernel.metal */,
FCC15DBE221E69DD00DC3CB2 /* ConvAddBNReluKernel.metal */, FCC15DBE221E69DD00DC3CB2 /* ConvAddBNReluKernel.metal */,
FCC15DE4221E69E100DC3CB2 /* ConvAddMetal.metal */,
FCC15DDB221E69E000DC3CB2 /* ConvAddPrelu.inc.metal */, FCC15DDB221E69E000DC3CB2 /* ConvAddPrelu.inc.metal */,
FCC15DD3221E69DF00DC3CB2 /* ConvAddPreluKernel.metal */, FCC15DD3221E69DF00DC3CB2 /* ConvAddPreluKernel.metal */,
FCC15DCF221E69DE00DC3CB2 /* ConvBNReluKernel.metal */, FCC15DCF221E69DE00DC3CB2 /* ConvBNReluKernel.metal */,
FCC15DD7221E69DF00DC3CB2 /* ConvKernel.metal */,
FCC15DC5221E69DE00DC3CB2 /* ConvTransposeKernel.metal */, FCC15DC5221E69DE00DC3CB2 /* ConvTransposeKernel.metal */,
FCC15DD4221E69DF00DC3CB2 /* Elementwise.metal */, FCC15DD4221E69DF00DC3CB2 /* Elementwise.metal */,
FCC15DE1221E69E100DC3CB2 /* ElementwiseAddPreluKernel.inc.metal */, FCC15DE1221E69E100DC3CB2 /* ElementwiseAddPreluKernel.inc.metal */,
...@@ -309,12 +303,10 @@ ...@@ -309,12 +303,10 @@
FCC15E03221E69E100DC3CB2 /* TransposeKernel.metal in Sources */, FCC15E03221E69E100DC3CB2 /* TransposeKernel.metal in Sources */,
FCC15DFE221E69E100DC3CB2 /* ReshapeKernel.metal in Sources */, FCC15DFE221E69E100DC3CB2 /* ReshapeKernel.metal in Sources */,
16FBFB3E22925D040025B406 /* ActivationKernel.metal in Sources */, 16FBFB3E22925D040025B406 /* ActivationKernel.metal in Sources */,
FCC15E0D221E69E100DC3CB2 /* ConvAddMetal.metal in Sources */,
FCC15DF7221E69E100DC3CB2 /* ReshapeKernel.inc.metal in Sources */, FCC15DF7221E69E100DC3CB2 /* ReshapeKernel.inc.metal in Sources */,
FCC15DE5221E69E100DC3CB2 /* ReluKernel.metal in Sources */, FCC15DE5221E69E100DC3CB2 /* ReluKernel.metal in Sources */,
FCC15DEF221E69E100DC3CB2 /* Macro.metal in Sources */, FCC15DEF221E69E100DC3CB2 /* Macro.metal in Sources */,
FCC15E02221E69E100DC3CB2 /* BatchNormRelu.metal in Sources */, FCC15E02221E69E100DC3CB2 /* BatchNormRelu.metal in Sources */,
FCC15E00221E69E100DC3CB2 /* ConvKernel.metal in Sources */,
FCC15E01221E69E100DC3CB2 /* PriorBoxKernel.metal in Sources */, FCC15E01221E69E100DC3CB2 /* PriorBoxKernel.metal in Sources */,
FCC15DEA221E69E100DC3CB2 /* ElementwiseAddPreluKernel.metal in Sources */, FCC15DEA221E69E100DC3CB2 /* ElementwiseAddPreluKernel.metal in Sources */,
FCC15DED221E69E100DC3CB2 /* PoolKernel.inc.metal in Sources */, FCC15DED221E69E100DC3CB2 /* PoolKernel.inc.metal in Sources */,
......
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
limitations under the License. */ limitations under the License. */
#include <metal_stdlib> #include <metal_stdlib>
#include <metal_math>
using namespace metal; using namespace metal;
kernel void exp(texture2d_array<float, access::sample> inTexture [[texture(0)]], kernel void exp(texture2d_array<float, access::sample> inTexture [[texture(0)]],
......
...@@ -120,4 +120,6 @@ struct MetalConvParam { ...@@ -120,4 +120,6 @@ struct MetalConvParam {
ushort iC; ushort iC;
ushort fC; ushort fC;
ushort oC; ushort oC;
ushort hasAddOp;
ushort hasReluOp;
}; };
...@@ -40,7 +40,7 @@ kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [ ...@@ -40,7 +40,7 @@ kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [
uint input_arr_size = inTexture.get_array_size(); uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = biase[gid.z]; float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
float4 input; float4 input;
for (uint i = 0; i < input_arr_size; ++i) { for (uint i = 0; i < input_arr_size; ++i) {
...@@ -57,7 +57,7 @@ kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [ ...@@ -57,7 +57,7 @@ kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w); output.w += dot(input, weight_w);
} }
float4 relu = fmax(output, 0.0); float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(relu, gid.xy, gid.z); outTexture.write(relu, gid.xy, gid.z);
} }
...@@ -85,7 +85,7 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [ ...@@ -85,7 +85,7 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = biase[gid.z]; float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_x = param.dilationX; ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY; ushort dilation_y = param.dilationY;
...@@ -125,7 +125,7 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [ ...@@ -125,7 +125,7 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [
output.w += dot(input[j], weight_w); output.w += dot(input[j], weight_w);
} }
} }
float4 relu = fmax(output, 0.0); float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(relu, gid.xy, gid.z); outTexture.write(relu, gid.xy, gid.z);
} }
...@@ -148,7 +148,7 @@ kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTex ...@@ -148,7 +148,7 @@ kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTex
const uint kernelHXW = 9; const uint kernelHXW = 9;
float4 output = biase[gid.z]; float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_x = param.dilationX; ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY; ushort dilation_y = param.dilationY;
...@@ -180,7 +180,7 @@ kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTex ...@@ -180,7 +180,7 @@ kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTex
} }
} }
float4 relu = fmax(output, 0.0); float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(relu, gid.xy, gid.z); outTexture.write(relu, gid.xy, gid.z);
} }
...@@ -208,7 +208,7 @@ kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [ ...@@ -208,7 +208,7 @@ kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = biase[gid.z]; float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_y = param.dilationY; ushort dilation_y = param.dilationY;
float4 input[5]; float4 input[5];
...@@ -238,7 +238,7 @@ kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [ ...@@ -238,7 +238,7 @@ kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [
output.w += dot(input[j], weight_w); output.w += dot(input[j], weight_w);
} }
} }
float4 relu = fmax(output, 0.0); float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(relu, gid.xy, gid.z); outTexture.write(relu, gid.xy, gid.z);
} }
...@@ -266,7 +266,7 @@ kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [ ...@@ -266,7 +266,7 @@ kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = biase[gid.z]; float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_x = param.dilationX; ushort dilation_x = param.dilationX;
float4 input[5]; float4 input[5];
...@@ -296,7 +296,7 @@ kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [ ...@@ -296,7 +296,7 @@ kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [
output.w += dot(input[j], weight_w); output.w += dot(input[j], weight_w);
} }
} }
float4 relu = fmax(output, 0.0); float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(relu, gid.xy, gid.z); outTexture.write(relu, gid.xy, gid.z);
} }
...@@ -318,7 +318,7 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> i ...@@ -318,7 +318,7 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> i
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9; const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4; uint weithTo = gid.z * kernelHXW * 4;
float4 output = biase[gid.z]; float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
float4 inputs[9]; float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
...@@ -336,7 +336,7 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> i ...@@ -336,7 +336,7 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> i
output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
} }
float4 relu = fmax(output, 0.0); float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(relu, gid.xy, gid.z); outTexture.write(relu, gid.xy, gid.z);
} }
...@@ -364,7 +364,7 @@ kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTextu ...@@ -364,7 +364,7 @@ kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTextu
uint input_arr_size = inTexture.get_array_size(); uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(biase[gid.z]); float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
float4 input; float4 input;
for (uint i = 0; i < input_arr_size; ++i) { for (uint i = 0; i < input_arr_size; ++i) {
...@@ -381,7 +381,7 @@ kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTextu ...@@ -381,7 +381,7 @@ kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTextu
float4 weight_w = float4(weights[weithTo + 3 * kernelHXW * input_arr_size + i]); float4 weight_w = float4(weights[weithTo + 3 * kernelHXW * input_arr_size + i]);
output.w += dot(input, weight_w); output.w += dot(input, weight_w);
} }
float4 relu = fmax(output, 0.0); float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(half4(relu), gid.xy, gid.z); outTexture.write(half4(relu), gid.xy, gid.z);
} }
...@@ -406,7 +406,7 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu ...@@ -406,7 +406,7 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
uint input_arr_size = inTexture.get_array_size(); uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(biase[gid.z]); float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_x = param.dilationX; ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY; ushort dilation_y = param.dilationY;
...@@ -436,7 +436,7 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu ...@@ -436,7 +436,7 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
output.w += dot(float4(input[j]), float4(weight_w)); output.w += dot(float4(input[j]), float4(weight_w));
} }
} }
float4 relu = fmax(output, 0.0); float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(half4(relu), gid.xy, gid.z); outTexture.write(half4(relu), gid.xy, gid.z);
} }
...@@ -459,7 +459,7 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> i ...@@ -459,7 +459,7 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> i
const uint kernelHXW = 9; const uint kernelHXW = 9;
half4 output = biase[gid.z]; float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_x = param.dilationX; ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY; ushort dilation_y = param.dilationY;
...@@ -486,13 +486,13 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> i ...@@ -486,13 +486,13 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> i
input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), input_array_index)[input_array_item_index]; input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), input_array_index)[input_array_item_index];
for (int j = 0; j < 9; ++j) { for (int j = 0; j < 9; ++j) {
half weight = weights[(output_c * kernelHXW + j) * filter_array_size * 4 + i]; half weight = weights[(output_c * kernelHXW + j) * filter_array_size * 4 + i];
output[c] += input[j] * weight; output[c] += float(input[j]) * float(weight);
} }
} }
} }
half4 relu = fmax(output, 0.0); float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(relu, gid.xy, gid.z); outTexture.write(half4(relu), gid.xy, gid.z);
} }
kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]], kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
...@@ -512,7 +512,7 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sampl ...@@ -512,7 +512,7 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sampl
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9; const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4; uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(biase[gid.z]); float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
half4 inputs[9]; half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
...@@ -530,8 +530,9 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sampl ...@@ -530,8 +530,9 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sampl
output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]); output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]); output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
} }
output = fmax(output, 0.0);
outTexture.write(half4(output), gid.xy, gid.z); float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(half4(relu), gid.xy, gid.z);
} }
kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, access::sample> inTexture [[texture(0)]], kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, access::sample> inTexture [[texture(0)]],
...@@ -640,11 +641,18 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce ...@@ -640,11 +641,18 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce
res[3][c] = T[7] - T[11] + T[15] + tmp1 - tmp2; res[3][c] = T[7] - T[11] + T[15] + tmp1 - tmp2;
} }
half4 base = biase[tc]; if (param.hasReluOp == 1) {
outTexture.write(fmax(res[0] + base, 0.0), uint2(tx, ty), tc); half4 base = biase[tc];
outTexture.write(fmax(res[1] + base, 0.0), uint2(tx + 1, ty), tc); outTexture.write(fmax(res[0] + base, 0.0), uint2(tx, ty), tc);
outTexture.write(fmax(res[2] + base, 0.0), uint2(tx, ty + 1), tc); outTexture.write(fmax(res[1] + base, 0.0), uint2(tx + 1, ty), tc);
outTexture.write(fmax(res[3] + base, 0.0), uint2(tx + 1, ty + 1), tc); outTexture.write(fmax(res[2] + base, 0.0), uint2(tx, ty + 1), tc);
outTexture.write(fmax(res[3] + base, 0.0), uint2(tx + 1, ty + 1), tc);
} else {
outTexture.write(res[0], uint2(tx, ty), tc);
outTexture.write(res[1], uint2(tx + 1, ty), tc);
outTexture.write(res[2], uint2(tx, ty + 1), tc);
outTexture.write(res[3], uint2(tx + 1, ty + 1), tc);
}
} }
kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]], kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
...@@ -653,7 +661,6 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu ...@@ -653,7 +661,6 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu
const device half4 *weights [[buffer(1)]], const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]], const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) { gid.z >= outTexture.get_array_size()) {
...@@ -671,7 +678,7 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu ...@@ -671,7 +678,7 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(biase[gid.z]); float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_y = param.dilationY; ushort dilation_y = param.dilationY;
half4 input[5]; half4 input[5];
...@@ -701,7 +708,7 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu ...@@ -701,7 +708,7 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu
output.w += dot(float4(input[j]), float4(weight_w)); output.w += dot(float4(input[j]), float4(weight_w));
} }
} }
float4 relu = fmax(output, 0.0); float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(half4(relu), gid.xy, gid.z); outTexture.write(half4(relu), gid.xy, gid.z);
} }
...@@ -729,7 +736,7 @@ kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTextu ...@@ -729,7 +736,7 @@ kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTextu
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(biase[gid.z]); float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_x = param.dilationX; ushort dilation_x = param.dilationX;
half4 input[5]; half4 input[5];
...@@ -759,6 +766,6 @@ kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTextu ...@@ -759,6 +766,6 @@ kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTextu
output.w += dot(float4(input[j]), float4(weight_w)); output.w += dot(float4(input[j]), float4(weight_w));
} }
} }
float4 relu = fmax(output, 0.0); float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(half4(relu), gid.xy, gid.z); outTexture.write(half4(relu), gid.xy, gid.z);
} }
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
// conv
#pragma mark -- conv
kernel void conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
outTexture.write(output, gid.xy, gid.z);
}
kernel void depthwise_conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(0.0);
float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
float4 input = inputs[j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
half4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(float4(input[j]), float4(weight_x));
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(float4(input[j]), float4(weight_y));
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(float4(input[j]), float4(weight_z));
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(float4(input[j]), float4(weight_w));
}
}
outTexture.write(half4(output), gid.xy, gid.z);
}
kernel void depthwise_conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(0.0);
half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
half4 input = inputs[j];
output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]);
output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]);
output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
}
outTexture.write(half4(output), gid.xy, gid.z);
}
kernel void conv_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
half4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(float4(input), float4(weight_x));
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(float4(input), float4(weight_y));
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(float4(input), float4(weight_z));
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(float4(input), float4(weight_w));
}
outTexture.write(half4(output), gid.xy, gid.z);
}
...@@ -325,7 +325,7 @@ public class PaddleMobileUnitTest { ...@@ -325,7 +325,7 @@ public class PaddleMobileUnitTest {
let fC = 4 let fC = 4
let oC = 4 let oC = 4
let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1), groups: UInt16(groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC)) let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1), groups: UInt16(groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0))
let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize) let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize)
......
...@@ -14,36 +14,7 @@ ...@@ -14,36 +14,7 @@
import Foundation import Foundation
class ConvAddParam<P: PrecisionProtocol>: OpParam { class ConvAddOp<P: PrecisionProtocol>: Operator<ConvAddKernel<P>, ConvAddReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
//typealias ParamPrecisionType = P
required init(opDesc: PMOpDesc, inScope: Scope) throws {
do {
filter = try ConvAddParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
input = try ConvAddParam.input(inputs: opDesc.inputs, from: inScope)
output = try ConvAddParam.outputOut(outputs: opDesc.outputs, from: inScope)
stride = try ConvAddParam.getAttr(key: "strides", attrs: opDesc.attrs)
paddings = try ConvAddParam.getAttr(key: "paddings", attrs: opDesc.attrs)
dilations = try ConvAddParam.getAttr(key: "dilations", attrs: opDesc.attrs)
groups = try ConvAddParam.getAttr(key: "groups", attrs: opDesc.attrs)
y = try ConvAddParam.inputY(inputs: opDesc.paraInputs, from: inScope)
} catch let error {
throw error
}
}
let input: Texture
let y: Tensor<P>
let filter: Tensor<P>
var output: Texture
let stride: [Int32]
let paddings: [Int32]
let dilations: [Int32]
let groups: Int
}
class ConvAddOp<P: PrecisionProtocol>: Operator<ConvAddKernel<P>, ConvAddParam<P>>, Runable, Creator, InferShaperable, Fusion{
typealias OpType = ConvAddOp<P> typealias OpType = ConvAddOp<P>
static func fusionNode() -> Node { static func fusionNode() -> Node {
......
...@@ -14,7 +14,40 @@ ...@@ -14,7 +14,40 @@
import Foundation import Foundation
class ConvAddReluOp<P: PrecisionProtocol>: Operator<ConvAddReluKernel<P>, ConvAddParam<P>>, Runable, Creator, InferShaperable, Fusion { class ConvAddReluParam<P: PrecisionProtocol>: OpParam {
required init(opDesc: PMOpDesc, inScope: Scope) throws {
do {
filter = try ConvAddReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
input = try ConvAddReluParam.input(inputs: opDesc.inputs, from: inScope)
output = try ConvAddReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
stride = try ConvAddReluParam.getAttr(key: "strides", attrs: opDesc.attrs)
paddings = try ConvAddReluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
dilations = try ConvAddReluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
groups = try ConvAddReluParam.getAttr(key: "groups", attrs: opDesc.attrs)
do {
y = try ConvAddReluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
} catch {
}
} catch let error {
throw error
}
}
let input: Texture
var y: Tensor<P>?
let filter: Tensor<P>
var output: Texture
let stride: [Int32]
let paddings: [Int32]
let dilations: [Int32]
let groups: Int
open class func hasY() -> Bool {
return true
}
}
class ConvAddReluOp<P: PrecisionProtocol>: Operator<ConvAddReluKernel<P>, ConvAddReluParam<P>>, Runable, Creator, InferShaperable, Fusion {
typealias OpType = ConvAddReluOp<P> typealias OpType = ConvAddReluOp<P>
static func fusionNode() -> Node { static func fusionNode() -> Node {
...@@ -69,4 +102,3 @@ class ConvAddReluOp<P: PrecisionProtocol>: Operator<ConvAddReluKernel<P>, ConvAd ...@@ -69,4 +102,3 @@ class ConvAddReluOp<P: PrecisionProtocol>: Operator<ConvAddReluKernel<P>, ConvAd
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
} }
} }
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
import Foundation import Foundation
class ConvParam<P: PrecisionProtocol>: OpParam { class ConvParam<P: PrecisionProtocol>: OpParam {
//typealias ParamPrecisionType = P
required init(opDesc: PMOpDesc, inScope: Scope) throws { required init(opDesc: PMOpDesc, inScope: Scope) throws {
do { do {
filter = try ConvParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) filter = try ConvParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
......
...@@ -37,7 +37,7 @@ protocol KernelProtocol { ...@@ -37,7 +37,7 @@ protocol KernelProtocol {
} }
@objc open class Kernel: NSObject{ @objc open class Kernel: NSObject {
private var _pipline: MTLComputePipelineState? = nil private var _pipline: MTLComputePipelineState? = nil
......
...@@ -135,7 +135,7 @@ class ConvAddAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -135,7 +135,7 @@ class ConvAddAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
let iC = param.input.tensorDim[1]; let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1]; let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1]; let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC)) let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0))
// print("metal param: ") // print("metal param: ")
// print(inMetalParam) // print(inMetalParam)
......
...@@ -98,7 +98,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionProtocol>: Kernel, Computable, Test ...@@ -98,7 +98,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionProtocol>: Kernel, Computable, Test
let iC = param.input.tensorDim[1]; let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1]; let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1]; let oC = param.output.tensorDim[1];
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC)) metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0))
var invs: [P] = [] var invs: [P] = []
let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
......
...@@ -15,234 +15,17 @@ ...@@ -15,234 +15,17 @@
import Foundation import Foundation
import MetalPerformanceShaders import MetalPerformanceShaders
@available(iOS 11.0, *) class ConvAddKernel<P: PrecisionProtocol>: ConvAddReluKernel<P> {
class ConvDataSource<P: PrecisionProtocol>: NSObject, MPSCNNConvolutionDataSource { override func hasAddOp() -> Bool {
var _descriptor: MPSCNNConvolutionDescriptor
var _weightsTensor: Tensor<P>
var _biasTensor: Tensor<P>
var _biasTerms: UnsafeMutablePointer<Float>?
func load() -> Bool {
switch P.precisionType {
case .Float32:
_biasTerms = _biasTensor.data.pointer as? UnsafeMutablePointer<Float>
case .Float16:
_biasTerms = UnsafeMutablePointer<Float>.allocate(capacity: _biasTensor.data.count)
if let float16Point = _biasTensor.data.pointer as? UnsafeMutablePointer<Float16> {
float16to32(input: float16Point, output: _biasTerms!, count: _biasTensor.data.count)
}
}
return true return true
} }
func purge() {
switch P.precisionType {
case .Float32:
return
case .Float16:
_biasTerms?.deinitialize(count: _biasTensor.data.count)
_biasTerms?.deallocate()
}
}
func label() -> String? {
return "conv_add_label"
}
func copy(with zone: NSZone? = nil) -> Any {
return self
}
init(inDesc: MPSCNNConvolutionDescriptor,
inWeights: Tensor<P>,
inBiasTerms: Tensor<P>) {
_descriptor = inDesc
_weightsTensor = inWeights
_biasTensor = inBiasTerms
super.init()
}
func descriptor() -> MPSCNNConvolutionDescriptor {
return _descriptor
}
func dataType() -> MPSDataType {
switch P.precisionType {
case .Float32:
return .float32
case .Float16:
return .float16
}
}
func weights() -> UnsafeMutableRawPointer {
return UnsafeMutableRawPointer.init(_weightsTensor.data.pointer)
}
func biasTerms() -> UnsafeMutablePointer<Float>? {
return _biasTerms
}
}
class ConvAddKernel<P: PrecisionProtocol>: Kernel, Computable {
var metalParam: MetalConvParam!
var mpsConvOp: Any?
required init(device: MTLDevice, param: ConvAddParam<P>, initContext: InitContext) throws {
do {
try param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
} catch let error {
throw error
}
var shouldUseMPS = false
let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggresiveOptimization)
if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggresiveOptimization) {
if initContext.useAggresiveOptimization {
if (param.input.tensorDim[1] == 1 || param.input.tensorDim[1] > 4) && (param.output.tensorDim[1] == 1 || param.output.tensorDim[1] > 4) {
shouldUseMPS = true
}
} else {
if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
shouldUseMPS = true
}
}
}
if type(of: self).isWinoGrad(functionName: functionName) {
shouldUseMPS = false
}
let isDepthWise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]
if !isDepthWise && param.groups > 1 {
shouldUseMPS = false
}
if shouldUseMPS {
super.init(device: device, inFunctionName: nil, initContext: initContext)
setupWithMPS(device: device, param: param)
} else {
if functionName == nil {
fatalError(" unsupport yet ")
}
super.init(device: device, inFunctionName: functionName, initContext: initContext)
setupWithoutMPS(device: device, param: param)
}
}
func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam<P>) throws {
if #available(iOS 10.0, *) {
if let conv = mpsConvOp as? MPSCNNConvolution {
let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1])
let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1])
conv.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage)
return
}
}
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture, groupDepth: type(of: self).isWinoGrad(functionName: functionName) ? 1 : nil)
encoder.endEncoding()
}
func setupWithMPS(device: MTLDevice, param: ConvAddParam<P>) { override func hasReluOp() -> Bool {
let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0]) return false
let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
let isDepthWise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]
if #available(iOS 11.0, *) {
param.input.useMPS = true
param.output.useMPS = true
let desc: MPSCNNConvolutionDescriptor = isDepthWise ?
MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) :
MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron)
desc.strideInPixelsX = Int(param.stride[0])
desc.strideInPixelsY = Int(param.stride[1])
let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: param.y)
let conv = MPSCNNConvolution.init(device: device, weights: dataSource)
conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0)
conv.edgeMode = .zero
mpsConvOp = conv
}
}
func setupWithoutMPS(device: MTLDevice, param: ConvAddParam<P>) {
let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
let offsetZ = 0.0
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
metalParam = inMetalParam
if type(of: self).isWinoGrad(functionName: functionName) {
let _ = param.filter.convert(converter: WinogradPointerConverter<P>.init())
}
let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1])
param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC)
param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
}
open class func kernelFunctionName(param: ConvAddParam<P>, useAggressiveOptimization: Bool = false) -> String? {
if GlobalConfig.shared.computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 {
return "conv_add_1x1_half"
} else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
return "depthwise_conv_add_3x3_half"
} else if param.filter.width == 3 && param.filter.height == 3 {
return "conv_add_3x3_half"
} else if param.filter.width == 1 && param.filter.height == 5 {
return "conv_add_5x1_half"
} else if param.filter.width == 5 && param.filter.height == 1 {
return "conv_add_1x5_half"
} else {
return nil
}
} else if GlobalConfig.shared.computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 {
return "conv_add_1x1"
} else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
return "depthwise_conv_add_3x3"
} else if param.filter.width == 1 && param.filter.height == 5 {
return "conv_add_5x1"
} else if param.filter.width == 5 && param.filter.height == 1 {
return "conv_add_1x5"
} else if param.filter.width == 3 && param.filter.height == 3 {
return "conv_add_3x3"
} else {
return nil
}
} else {
return nil
}
} }
func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? { override func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? {
return nil return nil
} }
open class func isWinoGrad(functionName: String?) -> Bool {
if let functionName = functionName {
return functionName.hasSuffix("winograd")
}
return false
}
} }
...@@ -135,7 +135,7 @@ class ConvAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -135,7 +135,7 @@ class ConvAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
let iC = param.input.tensorDim[1]; let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1]; let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1]; let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC)) let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0))
// print("metal param: ") // print("metal param: ")
// print(inMetalParam) // print(inMetalParam)
......
// /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// ConvAddReluKernel.swift
// paddle-mobile Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// Created by Yang,Yanzhan on 2019/4/29. You may obtain a copy of the License at
// Copyright © 2019 orange. All rights reserved.
// http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation import Foundation
import MetalPerformanceShaders import MetalPerformanceShaders
class ConvAddReluKernel<P: PrecisionProtocol>: ConvAddKernel<P> { public struct MetalConvParam {
override class func kernelFunctionName(param: ConvAddParam<P>, useAggressiveOptimization: Bool = false) -> String? { let offsetX: Int16
let offsetY: Int16
let offsetZ: Int16
let strideX: UInt16
let strideY: UInt16
let dilationX: UInt16
let dilationY: UInt16
let groups: UInt16
let iC: UInt16
let fC: UInt16
let oC: UInt16
let hasAddOp: UInt16
let hasReluOp: UInt16
}
@available(iOS 11.0, *)
class ConvDataSource<P: PrecisionProtocol>: NSObject, MPSCNNConvolutionDataSource {
var _descriptor: MPSCNNConvolutionDescriptor
var _weightsTensor: Tensor<P>
var _biasTensor: Tensor<P>?
var _biasTerms: UnsafeMutablePointer<Float>?
func load() -> Bool {
if let biasTensor = _biasTensor {
switch P.precisionType {
case .Float32:
_biasTerms = biasTensor.data.pointer as? UnsafeMutablePointer<Float>
case .Float16:
_biasTerms = UnsafeMutablePointer<Float>.allocate(capacity: biasTensor.data.count)
if let float16Point = biasTensor.data.pointer as? UnsafeMutablePointer<Float16> {
float16to32(input: float16Point, output: _biasTerms!, count: biasTensor.data.count)
}
}
}
return true
}
func purge() {
switch P.precisionType {
case .Float32:
return
case .Float16:
if let biasTensor = _biasTensor {
_biasTerms?.deinitialize(count: biasTensor.data.count)
_biasTerms?.deallocate()
}
}
}
func label() -> String? {
return "conv_add_label"
}
func copy(with zone: NSZone? = nil) -> Any {
return self
}
init(inDesc: MPSCNNConvolutionDescriptor,
inWeights: Tensor<P>,
inBiasTerms: Tensor<P>?) {
_descriptor = inDesc
_weightsTensor = inWeights
_biasTensor = inBiasTerms
super.init()
}
func descriptor() -> MPSCNNConvolutionDescriptor {
return _descriptor
}
func dataType() -> MPSDataType {
switch P.precisionType {
case .Float32:
return .float32
case .Float16:
return .float16
}
}
func weights() -> UnsafeMutableRawPointer {
return UnsafeMutableRawPointer.init(_weightsTensor.data.pointer)
}
func biasTerms() -> UnsafeMutablePointer<Float>? {
return _biasTerms
}
}
class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
var metalParam: MetalConvParam!
var mpsConvOp: Any?
var blankTensor: Tensor<P>?
required init(device: MTLDevice, param: ConvAddReluParam<P>, initContext: InitContext) throws {
do {
try param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
} catch let error {
throw error
}
var shouldUseMPS = false
let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggresiveOptimization)
if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggresiveOptimization) {
if initContext.useAggresiveOptimization {
if (param.input.tensorDim[1] == 1 || param.input.tensorDim[1] > 4) && (param.output.tensorDim[1] == 1 || param.output.tensorDim[1] > 4) {
shouldUseMPS = true
}
} else {
if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
shouldUseMPS = true
}
}
}
if type(of: self).isWinoGrad(functionName: functionName) {
shouldUseMPS = false
}
let isDepthWise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]
if !isDepthWise && param.groups > 1 {
shouldUseMPS = false
}
if shouldUseMPS {
super.init(device: device, inFunctionName: nil, initContext: initContext)
setupWithMPS(device: device, param: param)
} else {
if functionName == nil {
fatalError(" unsupport yet ")
}
super.init(device: device, inFunctionName: functionName, initContext: initContext)
setupWithoutMPS(device: device, param: param)
}
}
func compute(commandBuffer: MTLCommandBuffer, param: ConvAddReluParam<P>) throws {
if #available(iOS 10.0, *) {
if let conv = mpsConvOp as? MPSCNNConvolution {
let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1])
let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1])
conv.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage)
return
}
}
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
if let y = param.y {
encoder.setBuffer(y.buffer, offset: 0, index: 2)
} else {
encoder.setBuffer(blankTensor?.buffer, offset: 0, index: 2)
}
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture, groupDepth: type(of: self).isWinoGrad(functionName: functionName) ? 1 : nil)
encoder.endEncoding()
}
func setupWithMPS(device: MTLDevice, param: ConvAddReluParam<P>) {
let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
let isDepthWise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]
if #available(iOS 11.0, *) {
param.input.useMPS = true
param.output.useMPS = true
let desc: MPSCNNConvolutionDescriptor = isDepthWise ?
MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) :
MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron)
desc.strideInPixelsX = Int(param.stride[0])
desc.strideInPixelsY = Int(param.stride[1])
let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: param.y)
let conv = MPSCNNConvolution.init(device: device, weights: dataSource)
conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0)
conv.edgeMode = .zero
mpsConvOp = conv
}
}
func setupWithoutMPS(device: MTLDevice, param: ConvAddReluParam<P>) {
let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
let offsetZ = 0.0
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 1 : 0))
metalParam = inMetalParam
if type(of: self).isWinoGrad(functionName: functionName) {
let _ = param.filter.convert(converter: WinogradPointerConverter<P>.init())
}
let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1])
param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC)
if let y = param.y {
y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
} else {
blankTensor = Tensor<P>.init(inDim: Dim(inDim: [1, 1, 1, 4]), inLayout: DataLayout.NHWC(), originDimsCount: 4)
blankTensor?.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
}
}
class func kernelFunctionName(param: ConvAddReluParam<P>, useAggressiveOptimization: Bool = false) -> String? {
if GlobalConfig.shared.computePrecision == .Float16 { if GlobalConfig.shared.computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 { if param.filter.width == 1 && param.filter.height == 1 {
return "conv_add_relu_1x1_half" return "conv_add_relu_1x1_half"
...@@ -60,10 +277,27 @@ class ConvAddReluKernel<P: PrecisionProtocol>: ConvAddKernel<P> { ...@@ -60,10 +277,27 @@ class ConvAddReluKernel<P: PrecisionProtocol>: ConvAddKernel<P> {
} }
} }
override func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? { open func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? {
if #available(iOS 10.0, *) { if hasReluOp() {
return MPSCNNNeuronReLU(device: device, a: 0) if #available(iOS 10.0, *) {
return MPSCNNNeuronReLU(device: device, a: 0)
}
} }
return nil return nil
} }
open func hasAddOp() -> Bool {
return true
}
open func hasReluOp() -> Bool {
return true
}
open class func isWinoGrad(functionName: String?) -> Bool {
if let functionName = functionName {
return functionName.hasSuffix("winograd")
}
return false
}
} }
...@@ -105,7 +105,7 @@ class ConvBNReluKernel<P: PrecisionProtocol>: Kernel, Computable, Testable { ...@@ -105,7 +105,7 @@ class ConvBNReluKernel<P: PrecisionProtocol>: Kernel, Computable, Testable {
let iC = param.input.tensorDim[1]; let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1]; let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1]; let oC = param.output.tensorDim[1];
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC)) metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0))
var invs: [P] = [] var invs: [P] = []
let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
......
...@@ -13,55 +13,188 @@ ...@@ -13,55 +13,188 @@
limitations under the License. */ limitations under the License. */
import Foundation import Foundation
import MetalPerformanceShaders
public struct MetalConvParam {
let offsetX: Int16
let offsetY: Int16
let offsetZ: Int16
let strideX: UInt16
let strideY: UInt16
let dilationX: UInt16
let dilationY: UInt16
let groups: UInt16
let iC: UInt16
let fC: UInt16
let oC: UInt16
}
class ConvKernel<P: PrecisionProtocol>: Kernel, Computable { class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
var metalParam: MetalConvParam! var metalParam: MetalConvParam!
var mpsConvOp: Any?
var blankTensor: Tensor<P>?
required init(device: MTLDevice, param: ConvParam<P>, initContext: InitContext) throws { required init(device: MTLDevice, param: ConvParam<P>, initContext: InitContext) throws {
param.filter.initBuffer(device: device, precision: Precision.Float32) do {
if param.filter.width == 1 && param.filter.height == 1 { try param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
super.init(device: device, inFunctionName: "conv_1x1", initContext: initContext) } catch let error {
} else if param.filter.channel == 1 { throw error
super.init(device: device, inFunctionName: "depthwise_conv_3x3", initContext: initContext)
} else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_3x3", initContext: initContext)
} else {
fatalError(" unsupport ")
} }
let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0]) var shouldUseMPS = false
let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1]) let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggresiveOptimization)
let offsetZ = 0.0 if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggresiveOptimization) {
let iC = param.input.tensorDim[1]; if initContext.useAggresiveOptimization {
let fC = param.filter.tensorDim[1]; if (param.input.tensorDim[1] == 1 || param.input.tensorDim[1] > 4) && (param.output.tensorDim[1] == 1 || param.output.tensorDim[1] > 4) {
let oC = param.output.tensorDim[1]; shouldUseMPS = true
}
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC)) } else {
if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
shouldUseMPS = true
}
}
}
if type(of: self).isWinoGrad(functionName: functionName) {
shouldUseMPS = false
}
let isDepthWise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]
if !isDepthWise && param.groups > 1 {
shouldUseMPS = false
}
if shouldUseMPS {
super.init(device: device, inFunctionName: nil, initContext: initContext)
setupWithMPS(device: device, param: param)
} else {
if functionName == nil {
fatalError(" unsupport yet ")
}
super.init(device: device, inFunctionName: functionName, initContext: initContext)
setupWithoutMPS(device: device, param: param)
}
} }
func compute(commandBuffer: MTLCommandBuffer, param: ConvParam<P>) throws { func compute(commandBuffer: MTLCommandBuffer, param: ConvParam<P>) throws {
if #available(iOS 10.0, *) {
if let conv = mpsConvOp as? MPSCNNConvolution {
let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1])
let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1])
conv.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage)
return
}
}
guard let encoder = commandBuffer.makeComputeCommandEncoder() else { guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil") throw PaddleMobileError.predictError(message: " encode is nil")
} }
encoder.setTexture(param.input.metalTexture, index: 0) encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1) encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0) encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) encoder.setBuffer(blankTensor?.buffer, offset: 0, index: 2)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture, groupDepth: type(of: self).isWinoGrad(functionName: functionName) ? 1 : nil)
encoder.endEncoding() encoder.endEncoding()
} }
func setupWithMPS(device: MTLDevice, param: ConvParam<P>) {
let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
let isDepthWise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]
if #available(iOS 11.0, *) {
param.input.useMPS = true
param.output.useMPS = true
let desc: MPSCNNConvolutionDescriptor = isDepthWise ?
MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) :
MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron)
desc.strideInPixelsX = Int(param.stride[0])
desc.strideInPixelsY = Int(param.stride[1])
let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: nil)
let conv = MPSCNNConvolution.init(device: device, weights: dataSource)
conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0)
conv.edgeMode = .zero
mpsConvOp = conv
}
}
func setupWithoutMPS(device: MTLDevice, param: ConvParam<P>) {
let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
let offsetZ = 0.0
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 1 : 0))
metalParam = inMetalParam
if type(of: self).isWinoGrad(functionName: functionName) {
let _ = param.filter.convert(converter: WinogradPointerConverter<P>.init())
}
let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1])
param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC)
blankTensor = Tensor<P>.init(inDim: Dim(inDim: [1, 1, 1, 4]), inLayout: DataLayout.NHWC(), originDimsCount: 4)
blankTensor?.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
}
class func kernelFunctionName(param: ConvParam<P>, useAggressiveOptimization: Bool = false) -> String? {
if GlobalConfig.shared.computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 {
return "conv_add_relu_1x1_half"
} else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
if useAggressiveOptimization {
let couldUseWinograd = param.filter.width == 3 && param.filter.height == 3
&& param.filter.n == 16 && param.stride[0] == 1 && param.stride[1] == 1
&& param.dilations[0] == 1 && param.dilations[1] == 1
if couldUseWinograd {
return "depthwise_conv_add_relu_3x3_half_winograd"
}
}
return "depthwise_conv_add_relu_3x3_half"
} else if param.filter.width == 3 && param.filter.height == 3 {
if param.groups == 1 {
return "conv_add_relu_3x3_half"
} else {
return "group_conv_add_relu_3x3_half"
}
} else if param.filter.width == 1 && param.filter.height == 5 {
return "conv_add_relu_5x1_half"
} else if param.filter.width == 5 && param.filter.height == 1 {
return "conv_add_relu_1x5_half"
} else {
return nil
}
} else if GlobalConfig.shared.computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 {
return "conv_add_relu_1x1"
} else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
return "depthwise_conv_add_relu_3x3"
} else if param.filter.width == 1 && param.filter.height == 5 {
return "conv_add_relu_5x1"
} else if param.filter.width == 5 && param.filter.height == 1 {
return "conv_add_relu_1x5"
} else if param.filter.width == 3 && param.filter.height == 3 {
if param.groups == 1 {
return "conv_add_relu_3x3"
} else {
return "group_conv_add_relu_3x3"
}
} else {
return nil
}
} else {
return nil
}
}
open func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? {
return nil
}
open func hasAddOp() -> Bool {
return false
}
open func hasReluOp() -> Bool {
return false
}
open class func isWinoGrad(functionName: String?) -> Bool {
if let functionName = functionName {
return functionName.hasSuffix("winograd")
}
return false
}
} }
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册