Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
b53a20aa
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
b53a20aa
编写于
5月 23, 2019
作者:
Y
Yanzhan Yang
提交者:
GitHub
5月 23, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
1.merge conv, conv add, conv add relu into one implementation. (#1661)
上级
881dcb2e
变更
18
展开全部
隐藏空白更改
内联
并排
Showing
18 changed file
with
500 addition
and
1250 deletion
+500
-1250
metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj
...metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj
+0
-8
metal/paddle-mobile-metallib/paddle-mobile-metallib/ActivationKernel.metal
...le-metallib/paddle-mobile-metallib/ActivationKernel.metal
+0
-1
metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
...addle-mobile-metallib/paddle-mobile-metallib/Common.metal
+2
-0
metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal
...mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal
+0
-622
metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
...le-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
+40
-33
metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal
...e-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal
+0
-280
metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift
...obile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift
+1
-1
metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift
...paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift
+1
-30
metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift
...le-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift
+34
-2
metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift
metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift
+0
-1
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift
...ile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift
+1
-1
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
...-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
+1
-1
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
...le/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
+1
-1
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
...e/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
+5
-222
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
...dle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
+1
-1
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
...ddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
+246
-12
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
...addle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
+1
-1
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
...bile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
+166
-33
未找到文件。
metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj
浏览文件 @
b53a20aa
...
...
@@ -40,7 +40,6 @@
FCC15DFD221E69E100DC3CB2
/* Elementwise.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DD4221E69DF00DC3CB2
/* Elementwise.metal */
;
};
FCC15DFE221E69E100DC3CB2
/* ReshapeKernel.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DD5221E69DF00DC3CB2
/* ReshapeKernel.metal */
;
};
FCC15DFF221E69E100DC3CB2
/* Scale.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DD6221E69DF00DC3CB2
/* Scale.metal */
;
};
FCC15E00221E69E100DC3CB2
/* ConvKernel.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DD7221E69DF00DC3CB2
/* ConvKernel.metal */
;
};
FCC15E01221E69E100DC3CB2
/* PriorBoxKernel.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DD8221E69DF00DC3CB2
/* PriorBoxKernel.metal */
;
};
FCC15E02221E69E100DC3CB2
/* BatchNormRelu.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DD9221E69E000DC3CB2
/* BatchNormRelu.metal */
;
};
FCC15E03221E69E100DC3CB2
/* TransposeKernel.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DDA221E69E000DC3CB2
/* TransposeKernel.metal */
;
};
...
...
@@ -53,7 +52,6 @@
FCC15E0A221E69E100DC3CB2
/* ElementwiseAddPreluKernel.inc.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DE1221E69E100DC3CB2
/* ElementwiseAddPreluKernel.inc.metal */
;
};
FCC15E0B221E69E100DC3CB2
/* FetchKernel.inc.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DE2221E69E100DC3CB2
/* FetchKernel.inc.metal */
;
};
FCC15E0C221E69E100DC3CB2
/* BufferToTexture.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DE3221E69E100DC3CB2
/* BufferToTexture.metal */
;
};
FCC15E0D221E69E100DC3CB2
/* ConvAddMetal.metal in Sources */
=
{
isa
=
PBXBuildFile
;
fileRef
=
FCC15DE4221E69E100DC3CB2
/* ConvAddMetal.metal */
;
};
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
...
...
@@ -93,7 +91,6 @@
FCC15DD4221E69DF00DC3CB2
/* Elementwise.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
Elementwise.metal
;
sourceTree
=
"<group>"
;
};
FCC15DD5221E69DF00DC3CB2
/* ReshapeKernel.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
ReshapeKernel.metal
;
sourceTree
=
"<group>"
;
};
FCC15DD6221E69DF00DC3CB2
/* Scale.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
Scale.metal
;
sourceTree
=
"<group>"
;
};
FCC15DD7221E69DF00DC3CB2
/* ConvKernel.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
ConvKernel.metal
;
sourceTree
=
"<group>"
;
};
FCC15DD8221E69DF00DC3CB2
/* PriorBoxKernel.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
PriorBoxKernel.metal
;
sourceTree
=
"<group>"
;
};
FCC15DD9221E69E000DC3CB2
/* BatchNormRelu.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
BatchNormRelu.metal
;
sourceTree
=
"<group>"
;
};
FCC15DDA221E69E000DC3CB2
/* TransposeKernel.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
TransposeKernel.metal
;
sourceTree
=
"<group>"
;
};
...
...
@@ -106,7 +103,6 @@
FCC15DE1221E69E100DC3CB2
/* ElementwiseAddPreluKernel.inc.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
ElementwiseAddPreluKernel.inc.metal
;
sourceTree
=
"<group>"
;
};
FCC15DE2221E69E100DC3CB2
/* FetchKernel.inc.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
FetchKernel.inc.metal
;
sourceTree
=
"<group>"
;
};
FCC15DE3221E69E100DC3CB2
/* BufferToTexture.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
BufferToTexture.metal
;
sourceTree
=
"<group>"
;
};
FCC15DE4221E69E100DC3CB2
/* ConvAddMetal.metal */
=
{
isa
=
PBXFileReference
;
fileEncoding
=
4
;
lastKnownFileType
=
sourcecode.metal
;
path
=
ConvAddMetal.metal
;
sourceTree
=
"<group>"
;
};
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
...
...
@@ -170,11 +166,9 @@
FCC15DE0221E69E100DC3CB2
/* ConcatKernel.inc.metal */
,
FCC15DCA221E69DE00DC3CB2
/* ConcatKernel.metal */
,
FCC15DBE221E69DD00DC3CB2
/* ConvAddBNReluKernel.metal */
,
FCC15DE4221E69E100DC3CB2
/* ConvAddMetal.metal */
,
FCC15DDB221E69E000DC3CB2
/* ConvAddPrelu.inc.metal */
,
FCC15DD3221E69DF00DC3CB2
/* ConvAddPreluKernel.metal */
,
FCC15DCF221E69DE00DC3CB2
/* ConvBNReluKernel.metal */
,
FCC15DD7221E69DF00DC3CB2
/* ConvKernel.metal */
,
FCC15DC5221E69DE00DC3CB2
/* ConvTransposeKernel.metal */
,
FCC15DD4221E69DF00DC3CB2
/* Elementwise.metal */
,
FCC15DE1221E69E100DC3CB2
/* ElementwiseAddPreluKernel.inc.metal */
,
...
...
@@ -309,12 +303,10 @@
FCC15E03221E69E100DC3CB2
/* TransposeKernel.metal in Sources */
,
FCC15DFE221E69E100DC3CB2
/* ReshapeKernel.metal in Sources */
,
16FBFB3E22925D040025B406
/* ActivationKernel.metal in Sources */
,
FCC15E0D221E69E100DC3CB2
/* ConvAddMetal.metal in Sources */
,
FCC15DF7221E69E100DC3CB2
/* ReshapeKernel.inc.metal in Sources */
,
FCC15DE5221E69E100DC3CB2
/* ReluKernel.metal in Sources */
,
FCC15DEF221E69E100DC3CB2
/* Macro.metal in Sources */
,
FCC15E02221E69E100DC3CB2
/* BatchNormRelu.metal in Sources */
,
FCC15E00221E69E100DC3CB2
/* ConvKernel.metal in Sources */
,
FCC15E01221E69E100DC3CB2
/* PriorBoxKernel.metal in Sources */
,
FCC15DEA221E69E100DC3CB2
/* ElementwiseAddPreluKernel.metal in Sources */
,
FCC15DED221E69E100DC3CB2
/* PoolKernel.inc.metal in Sources */
,
...
...
metal/paddle-mobile-metallib/paddle-mobile-metallib/ActivationKernel.metal
浏览文件 @
b53a20aa
...
...
@@ -13,7 +13,6 @@
limitations under the License. */
#include <metal_stdlib>
#include <metal_math>
using namespace metal;
kernel void exp(texture2d_array<float, access::sample> inTexture [[texture(0)]],
...
...
metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
浏览文件 @
b53a20aa
...
...
@@ -120,4 +120,6 @@ struct MetalConvParam {
ushort iC;
ushort fC;
ushort oC;
ushort hasAddOp;
ushort hasReluOp;
};
metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal
已删除
100644 → 0
浏览文件 @
881dcb2e
此差异已折叠。
点击以展开。
metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
浏览文件 @
b53a20aa
...
...
@@ -40,7 +40,7 @@ kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output =
biase[gid.z]
;
float4 output =
param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0)
;
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
...
...
@@ -57,7 +57,7 @@ kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(relu, gid.xy, gid.z);
}
...
...
@@ -85,7 +85,7 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output =
biase[gid.z]
;
float4 output =
param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0)
;
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
...
...
@@ -125,7 +125,7 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [
output.w += dot(input[j], weight_w);
}
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(relu, gid.xy, gid.z);
}
...
...
@@ -148,7 +148,7 @@ kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTex
const uint kernelHXW = 9;
float4 output =
biase[gid.z]
;
float4 output =
param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0)
;
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
...
...
@@ -180,7 +180,7 @@ kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTex
}
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(relu, gid.xy, gid.z);
}
...
...
@@ -208,7 +208,7 @@ kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output =
biase[gid.z]
;
float4 output =
param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0)
;
ushort dilation_y = param.dilationY;
float4 input[5];
...
...
@@ -238,7 +238,7 @@ kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [
output.w += dot(input[j], weight_w);
}
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(relu, gid.xy, gid.z);
}
...
...
@@ -266,7 +266,7 @@ kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output =
biase[gid.z]
;
float4 output =
param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0)
;
ushort dilation_x = param.dilationX;
float4 input[5];
...
...
@@ -296,7 +296,7 @@ kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [
output.w += dot(input[j], weight_w);
}
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(relu, gid.xy, gid.z);
}
...
...
@@ -318,7 +318,7 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> i
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output =
biase[gid.z]
;
float4 output =
param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0)
;
float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
...
...
@@ -336,7 +336,7 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> i
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(relu, gid.xy, gid.z);
}
...
...
@@ -364,7 +364,7 @@ kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTextu
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output =
float4(biase[gid.z]
);
float4 output =
param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0
);
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
...
...
@@ -381,7 +381,7 @@ kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTextu
float4 weight_w = float4(weights[weithTo + 3 * kernelHXW * input_arr_size + i]);
output.w += dot(input, weight_w);
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(half4(relu), gid.xy, gid.z);
}
...
...
@@ -406,7 +406,7 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output =
float4(biase[gid.z]
);
float4 output =
param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0
);
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
...
...
@@ -436,7 +436,7 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
output.w += dot(float4(input[j]), float4(weight_w));
}
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(half4(relu), gid.xy, gid.z);
}
...
...
@@ -459,7 +459,7 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> i
const uint kernelHXW = 9;
half4 output = biase[gid.z]
;
float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0)
;
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
...
...
@@ -486,13 +486,13 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> i
input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), input_array_index)[input_array_item_index];
for (int j = 0; j < 9; ++j) {
half weight = weights[(output_c * kernelHXW + j) * filter_array_size * 4 + i];
output[c] +=
input[j] * weight
;
output[c] +=
float(input[j]) * float(weight)
;
}
}
}
half4 relu = fmax(output, 0.0)
;
outTexture.write(
relu
, gid.xy, gid.z);
float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(
half4(relu)
, gid.xy, gid.z);
}
kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
...
...
@@ -512,7 +512,7 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sampl
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output =
float4(biase[gid.z]
);
float4 output =
param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0
);
half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
...
...
@@ -530,8 +530,9 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sampl
output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
}
output = fmax(output, 0.0);
outTexture.write(half4(output), gid.xy, gid.z);
float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(half4(relu), gid.xy, gid.z);
}
kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, access::sample> inTexture [[texture(0)]],
...
...
@@ -640,11 +641,18 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce
res[3][c] = T[7] - T[11] + T[15] + tmp1 - tmp2;
}
half4 base = biase[tc];
outTexture.write(fmax(res[0] + base, 0.0), uint2(tx, ty), tc);
outTexture.write(fmax(res[1] + base, 0.0), uint2(tx + 1, ty), tc);
outTexture.write(fmax(res[2] + base, 0.0), uint2(tx, ty + 1), tc);
outTexture.write(fmax(res[3] + base, 0.0), uint2(tx + 1, ty + 1), tc);
if (param.hasReluOp == 1) {
half4 base = biase[tc];
outTexture.write(fmax(res[0] + base, 0.0), uint2(tx, ty), tc);
outTexture.write(fmax(res[1] + base, 0.0), uint2(tx + 1, ty), tc);
outTexture.write(fmax(res[2] + base, 0.0), uint2(tx, ty + 1), tc);
outTexture.write(fmax(res[3] + base, 0.0), uint2(tx + 1, ty + 1), tc);
} else {
outTexture.write(res[0], uint2(tx, ty), tc);
outTexture.write(res[1], uint2(tx + 1, ty), tc);
outTexture.write(res[2], uint2(tx, ty + 1), tc);
outTexture.write(res[3], uint2(tx + 1, ty + 1), tc);
}
}
kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
...
...
@@ -653,7 +661,6 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
...
...
@@ -671,7 +678,7 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output =
float4(biase[gid.z]
);
float4 output =
param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0
);
ushort dilation_y = param.dilationY;
half4 input[5];
...
...
@@ -701,7 +708,7 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu
output.w += dot(float4(input[j]), float4(weight_w));
}
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(half4(relu), gid.xy, gid.z);
}
...
...
@@ -729,7 +736,7 @@ kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTextu
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output =
float4(biase[gid.z]
);
float4 output =
param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0
);
ushort dilation_x = param.dilationX;
half4 input[5];
...
...
@@ -759,6 +766,6 @@ kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTextu
output.w += dot(float4(input[j]), float4(weight_w));
}
}
float4 relu =
fmax(output, 0.0)
;
float4 relu =
param.hasReluOp == 1 ? fmax(output, 0.0) : output
;
outTexture.write(half4(relu), gid.xy, gid.z);
}
metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal
已删除
100644 → 0
浏览文件 @
881dcb2e
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
// conv
#pragma mark -- conv
kernel void conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
outTexture.write(output, gid.xy, gid.z);
}
kernel void depthwise_conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device float *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(0.0);
float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
float4 input = inputs[j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
half4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(float4(input[j]), float4(weight_x));
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(float4(input[j]), float4(weight_y));
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(float4(input[j]), float4(weight_z));
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(float4(input[j]), float4(weight_w));
}
}
outTexture.write(half4(output), gid.xy, gid.z);
}
kernel void depthwise_conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam ¶m [[buffer(0)]],
const device half *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(0.0);
half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
half4 input = inputs[j];
output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]);
output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]);
output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
}
outTexture.write(half4(output), gid.xy, gid.z);
}
// 1x1 convolution over an RGBA-sliced feature map (half precision).
// Each thread produces one half4 texel of the output: gid.xy is the spatial
// position, gid.z selects the output channel slice (4 channels per slice).
// Weights layout: for output slice z, the buffer holds 4 * input_arr_size
// consecutive half4 vectors — one group of input_arr_size vectors per output
// channel within the slice.
kernel void conv_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
                          texture2d_array<half, access::write> outTexture [[texture(1)]],
                          constant MetalConvParam &param [[buffer(0)]],
                          const device half4 *weights [[buffer(1)]],
                          uint3 gid [[thread_position_in_grid]]) {
    // Threads dispatched past the output extent do nothing.
    if (gid.x >= outTexture.get_width() ||
        gid.y >= outTexture.get_height() ||
        gid.z >= outTexture.get_array_size()) {
        return;
    }
    // Map the output coordinate back to the input coordinate; clamp_to_zero
    // makes out-of-range taps read as zero (implicit zero padding).
    ushort2 stride = ushort2(param.strideX, param.strideY);
    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
    const uint kernelHXW = 1;  // 1x1 kernel: a single spatial tap.
    uint input_arr_size = inTexture.get_array_size();
    // First weight vector belonging to this output slice.
    uint weightBase = gid.z * kernelHXW * input_arr_size * 4;
    // Accumulate in float to limit half-precision rounding error.
    float4 output = float4(0.0);
    half4 input;
    for (uint i = 0; i < input_arr_size; ++i) {
        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
        // One dot product per output channel of this slice.
        half4 weight_x = weights[weightBase + 0 * kernelHXW * input_arr_size + i];
        output.x += dot(float4(input), float4(weight_x));
        half4 weight_y = weights[weightBase + 1 * kernelHXW * input_arr_size + i];
        output.y += dot(float4(input), float4(weight_y));
        half4 weight_z = weights[weightBase + 2 * kernelHXW * input_arr_size + i];
        output.z += dot(float4(input), float4(weight_z));
        half4 weight_w = weights[weightBase + 3 * kernelHXW * input_arr_size + i];
        output.w += dot(float4(input), float4(weight_w));
    }
    outTexture.write(half4(output), gid.xy, gid.z);
}
metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift
浏览文件 @
b53a20aa
...
...
@@ -325,7 +325,7 @@ public class PaddleMobileUnitTest {
let
fC
=
4
let
oC
=
4
let
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
0
,
strideX
:
UInt16
(
stride
.
0
),
strideY
:
UInt16
(
stride
.
1
),
dilationX
:
UInt16
(
1
),
dilationY
:
UInt16
(
1
),
groups
:
UInt16
(
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
))
let
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
0
,
strideX
:
UInt16
(
stride
.
0
),
strideY
:
UInt16
(
stride
.
1
),
dilationX
:
UInt16
(
1
),
dilationY
:
UInt16
(
1
),
groups
:
UInt16
(
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
)
,
hasAddOp
:
UInt16
(
0
),
hasReluOp
:
UInt16
(
0
)
)
let
param
=
ConvAddBatchNormReluTestParam
.
init
(
inInputTexture
:
inputeTexture
,
inOutputTexture
:
outputTexture
,
inMetalParam
:
metalParam
,
inFilterBuffer
:
filterBuffer
,
inBiaseBuffer
:
biaseBuffer
,
inNewScaleBuffer
:
newScalueBuffer
,
inNewBiaseBuffer
:
newBiaseBuffer
,
inFilterSize
:
filterSize
)
...
...
metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift
浏览文件 @
b53a20aa
...
...
@@ -14,36 +14,7 @@
import
Foundation
class
ConvAddParam
<
P
:
PrecisionProtocol
>
:
OpParam
{
//typealias ParamPrecisionType = P
required
init
(
opDesc
:
PMOpDesc
,
inScope
:
Scope
)
throws
{
do
{
filter
=
try
ConvAddParam
.
inputFilter
(
paraInputs
:
opDesc
.
paraInputs
,
from
:
inScope
)
input
=
try
ConvAddParam
.
input
(
inputs
:
opDesc
.
inputs
,
from
:
inScope
)
output
=
try
ConvAddParam
.
outputOut
(
outputs
:
opDesc
.
outputs
,
from
:
inScope
)
stride
=
try
ConvAddParam
.
getAttr
(
key
:
"strides"
,
attrs
:
opDesc
.
attrs
)
paddings
=
try
ConvAddParam
.
getAttr
(
key
:
"paddings"
,
attrs
:
opDesc
.
attrs
)
dilations
=
try
ConvAddParam
.
getAttr
(
key
:
"dilations"
,
attrs
:
opDesc
.
attrs
)
groups
=
try
ConvAddParam
.
getAttr
(
key
:
"groups"
,
attrs
:
opDesc
.
attrs
)
y
=
try
ConvAddParam
.
inputY
(
inputs
:
opDesc
.
paraInputs
,
from
:
inScope
)
}
catch
let
error
{
throw
error
}
}
let
input
:
Texture
let
y
:
Tensor
<
P
>
let
filter
:
Tensor
<
P
>
var
output
:
Texture
let
stride
:
[
Int32
]
let
paddings
:
[
Int32
]
let
dilations
:
[
Int32
]
let
groups
:
Int
}
class
ConvAddOp
<
P
:
PrecisionProtocol
>
:
Operator
<
ConvAddKernel
<
P
>
,
ConvAddParam
<
P
>>
,
Runable
,
Creator
,
InferShaperable
,
Fusion
{
class
ConvAddOp
<
P
:
PrecisionProtocol
>
:
Operator
<
ConvAddKernel
<
P
>
,
ConvAddReluParam
<
P
>>
,
Runable
,
Creator
,
InferShaperable
,
Fusion
{
typealias
OpType
=
ConvAddOp
<
P
>
static
func
fusionNode
()
->
Node
{
...
...
metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift
浏览文件 @
b53a20aa
...
...
@@ -14,7 +14,40 @@
import
Foundation
class
ConvAddReluOp
<
P
:
PrecisionProtocol
>
:
Operator
<
ConvAddReluKernel
<
P
>
,
ConvAddParam
<
P
>>
,
Runable
,
Creator
,
InferShaperable
,
Fusion
{
class
ConvAddReluParam
<
P
:
PrecisionProtocol
>
:
OpParam
{
required
init
(
opDesc
:
PMOpDesc
,
inScope
:
Scope
)
throws
{
do
{
filter
=
try
ConvAddReluParam
.
inputFilter
(
paraInputs
:
opDesc
.
paraInputs
,
from
:
inScope
)
input
=
try
ConvAddReluParam
.
input
(
inputs
:
opDesc
.
inputs
,
from
:
inScope
)
output
=
try
ConvAddReluParam
.
outputOut
(
outputs
:
opDesc
.
outputs
,
from
:
inScope
)
stride
=
try
ConvAddReluParam
.
getAttr
(
key
:
"strides"
,
attrs
:
opDesc
.
attrs
)
paddings
=
try
ConvAddReluParam
.
getAttr
(
key
:
"paddings"
,
attrs
:
opDesc
.
attrs
)
dilations
=
try
ConvAddReluParam
.
getAttr
(
key
:
"dilations"
,
attrs
:
opDesc
.
attrs
)
groups
=
try
ConvAddReluParam
.
getAttr
(
key
:
"groups"
,
attrs
:
opDesc
.
attrs
)
do
{
y
=
try
ConvAddReluParam
.
inputY
(
inputs
:
opDesc
.
paraInputs
,
from
:
inScope
)
}
catch
{
}
}
catch
let
error
{
throw
error
}
}
let
input
:
Texture
var
y
:
Tensor
<
P
>
?
let
filter
:
Tensor
<
P
>
var
output
:
Texture
let
stride
:
[
Int32
]
let
paddings
:
[
Int32
]
let
dilations
:
[
Int32
]
let
groups
:
Int
open
class
func
hasY
()
->
Bool
{
return
true
}
}
class
ConvAddReluOp
<
P
:
PrecisionProtocol
>
:
Operator
<
ConvAddReluKernel
<
P
>
,
ConvAddReluParam
<
P
>>
,
Runable
,
Creator
,
InferShaperable
,
Fusion
{
typealias
OpType
=
ConvAddReluOp
<
P
>
static
func
fusionNode
()
->
Node
{
...
...
@@ -69,4 +102,3 @@ class ConvAddReluOp<P: PrecisionProtocol>: Operator<ConvAddReluKernel<P>, ConvAd
print
(
para
.
output
.
metalTexture
.
toTensor
(
dim
:
(
n
:
para
.
output
.
tensorDim
[
0
],
c
:
para
.
output
.
tensorDim
[
1
],
h
:
para
.
output
.
tensorDim
[
2
],
w
:
para
.
output
.
tensorDim
[
3
]))
.
strideArray
())
}
}
metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift
浏览文件 @
b53a20aa
...
...
@@ -15,7 +15,6 @@
import
Foundation
class
ConvParam
<
P
:
PrecisionProtocol
>
:
OpParam
{
//typealias ParamPrecisionType = P
required
init
(
opDesc
:
PMOpDesc
,
inScope
:
Scope
)
throws
{
do
{
filter
=
try
ConvParam
.
inputFilter
(
paraInputs
:
opDesc
.
paraInputs
,
from
:
inScope
)
...
...
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift
浏览文件 @
b53a20aa
...
...
@@ -37,7 +37,7 @@ protocol KernelProtocol {
}
@objc
open
class
Kernel
:
NSObject
{
@objc
open
class
Kernel
:
NSObject
{
private
var
_pipline
:
MTLComputePipelineState
?
=
nil
...
...
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
浏览文件 @
b53a20aa
...
...
@@ -135,7 +135,7 @@ class ConvAddAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
let
iC
=
param
.
input
.
tensorDim
[
1
];
let
fC
=
param
.
filter
.
tensorDim
[
1
];
let
oC
=
param
.
output
.
tensorDim
[
1
];
let
inMetalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
))
let
inMetalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
)
,
hasAddOp
:
UInt16
(
0
),
hasReluOp
:
UInt16
(
0
)
)
// print("metal param: ")
// print(inMetalParam)
...
...
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
浏览文件 @
b53a20aa
...
...
@@ -98,7 +98,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionProtocol>: Kernel, Computable, Test
let
iC
=
param
.
input
.
tensorDim
[
1
];
let
fC
=
param
.
filter
.
tensorDim
[
1
];
let
oC
=
param
.
output
.
tensorDim
[
1
];
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
))
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
)
,
hasAddOp
:
UInt16
(
0
),
hasReluOp
:
UInt16
(
0
)
)
var
invs
:
[
P
]
=
[]
let
varianceContents
=
param
.
variance
.
buffer
.
contents
()
.
assumingMemoryBound
(
to
:
P
.
self
)
...
...
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
浏览文件 @
b53a20aa
...
...
@@ -15,234 +15,17 @@
import
Foundation
import
MetalPerformanceShaders
@available
(
iOS
11.0
,
*
)
class
ConvDataSource
<
P
:
PrecisionProtocol
>
:
NSObject
,
MPSCNNConvolutionDataSource
{
var
_descriptor
:
MPSCNNConvolutionDescriptor
var
_weightsTensor
:
Tensor
<
P
>
var
_biasTensor
:
Tensor
<
P
>
var
_biasTerms
:
UnsafeMutablePointer
<
Float
>
?
func
load
()
->
Bool
{
switch
P
.
precisionType
{
case
.
Float32
:
_biasTerms
=
_biasTensor
.
data
.
pointer
as?
UnsafeMutablePointer
<
Float
>
case
.
Float16
:
_biasTerms
=
UnsafeMutablePointer
<
Float
>.
allocate
(
capacity
:
_biasTensor
.
data
.
count
)
if
let
float16Point
=
_biasTensor
.
data
.
pointer
as?
UnsafeMutablePointer
<
Float16
>
{
float16to32
(
input
:
float16Point
,
output
:
_biasTerms
!
,
count
:
_biasTensor
.
data
.
count
)
}
}
class
ConvAddKernel
<
P
:
PrecisionProtocol
>
:
ConvAddReluKernel
<
P
>
{
override
func
hasAddOp
()
->
Bool
{
return
true
}
func
purge
()
{
switch
P
.
precisionType
{
case
.
Float32
:
return
case
.
Float16
:
_biasTerms
?
.
deinitialize
(
count
:
_biasTensor
.
data
.
count
)
_biasTerms
?
.
deallocate
()
}
}
func
label
()
->
String
?
{
return
"conv_add_label"
}
func
copy
(
with
zone
:
NSZone
?
=
nil
)
->
Any
{
return
self
}
init
(
inDesc
:
MPSCNNConvolutionDescriptor
,
inWeights
:
Tensor
<
P
>
,
inBiasTerms
:
Tensor
<
P
>
)
{
_descriptor
=
inDesc
_weightsTensor
=
inWeights
_biasTensor
=
inBiasTerms
super
.
init
()
}
func
descriptor
()
->
MPSCNNConvolutionDescriptor
{
return
_descriptor
}
func
dataType
()
->
MPSDataType
{
switch
P
.
precisionType
{
case
.
Float32
:
return
.
float32
case
.
Float16
:
return
.
float16
}
}
func
weights
()
->
UnsafeMutableRawPointer
{
return
UnsafeMutableRawPointer
.
init
(
_weightsTensor
.
data
.
pointer
)
}
func
biasTerms
()
->
UnsafeMutablePointer
<
Float
>
?
{
return
_biasTerms
}
}
class
ConvAddKernel
<
P
:
PrecisionProtocol
>
:
Kernel
,
Computable
{
var
metalParam
:
MetalConvParam
!
var
mpsConvOp
:
Any
?
required
init
(
device
:
MTLDevice
,
param
:
ConvAddParam
<
P
>
,
initContext
:
InitContext
)
throws
{
do
{
try
param
.
output
.
initTexture
(
device
:
device
,
inTranspose
:
[
0
,
2
,
3
,
1
],
computePrecision
:
GlobalConfig
.
shared
.
computePrecision
)
}
catch
let
error
{
throw
error
}
var
shouldUseMPS
=
false
let
functionName
=
type
(
of
:
self
)
.
kernelFunctionName
(
param
:
param
,
useAggressiveOptimization
:
initContext
.
useAggresiveOptimization
)
if
#available(iOS 11.0, *)
,
(
initContext
.
useMPS
||
initContext
.
useAggresiveOptimization
)
{
if
initContext
.
useAggresiveOptimization
{
if
(
param
.
input
.
tensorDim
[
1
]
==
1
||
param
.
input
.
tensorDim
[
1
]
>
4
)
&&
(
param
.
output
.
tensorDim
[
1
]
==
1
||
param
.
output
.
tensorDim
[
1
]
>
4
)
{
shouldUseMPS
=
true
}
}
else
{
if
param
.
input
.
tensorDim
[
1
]
>
4
&&
param
.
output
.
tensorDim
[
1
]
>
4
{
shouldUseMPS
=
true
}
}
}
if
type
(
of
:
self
)
.
isWinoGrad
(
functionName
:
functionName
)
{
shouldUseMPS
=
false
}
let
isDepthWise
=
param
.
filter
.
tensorDim
[
1
]
==
1
&&
param
.
filter
.
tensorDim
[
0
]
==
param
.
input
.
tensorDim
[
1
]
if
!
isDepthWise
&&
param
.
groups
>
1
{
shouldUseMPS
=
false
}
if
shouldUseMPS
{
super
.
init
(
device
:
device
,
inFunctionName
:
nil
,
initContext
:
initContext
)
setupWithMPS
(
device
:
device
,
param
:
param
)
}
else
{
if
functionName
==
nil
{
fatalError
(
" unsupport yet "
)
}
super
.
init
(
device
:
device
,
inFunctionName
:
functionName
,
initContext
:
initContext
)
setupWithoutMPS
(
device
:
device
,
param
:
param
)
}
}
func
compute
(
commandBuffer
:
MTLCommandBuffer
,
param
:
ConvAddParam
<
P
>
)
throws
{
if
#available(iOS 10.0, *)
{
if
let
conv
=
mpsConvOp
as?
MPSCNNConvolution
{
let
inputImage
=
MPSImage
.
init
(
texture
:
param
.
input
.
metalTexture
,
featureChannels
:
param
.
input
.
tensorDim
[
1
])
let
outputImage
=
MPSImage
.
init
(
texture
:
param
.
output
.
metalTexture
,
featureChannels
:
param
.
output
.
tensorDim
[
1
])
conv
.
encode
(
commandBuffer
:
commandBuffer
,
sourceImage
:
inputImage
,
destinationImage
:
outputImage
)
return
}
}
guard
let
encoder
=
commandBuffer
.
makeComputeCommandEncoder
()
else
{
throw
PaddleMobileError
.
predictError
(
message
:
" encode is nil"
)
}
encoder
.
setTexture
(
param
.
input
.
metalTexture
,
index
:
0
)
encoder
.
setTexture
(
param
.
output
.
metalTexture
,
index
:
1
)
encoder
.
setBytes
(
&
metalParam
,
length
:
MemoryLayout
<
MetalConvParam
>.
size
,
index
:
0
)
encoder
.
setBuffer
(
param
.
filter
.
buffer
,
offset
:
0
,
index
:
1
)
encoder
.
setBuffer
(
param
.
y
.
buffer
,
offset
:
0
,
index
:
2
)
encoder
.
dispatch
(
computePipline
:
pipline
,
outTexture
:
param
.
output
.
metalTexture
,
groupDepth
:
type
(
of
:
self
)
.
isWinoGrad
(
functionName
:
functionName
)
?
1
:
nil
)
encoder
.
endEncoding
()
}
func
setupWithMPS
(
device
:
MTLDevice
,
param
:
ConvAddParam
<
P
>
)
{
let
offsetX
=
(
Int
(
param
.
dilations
[
0
])
*
(
param
.
filter
.
tensorDim
[
3
]
-
1
)
+
1
)
/
2
-
Int
(
param
.
paddings
[
0
])
let
offsetY
=
(
Int
(
param
.
dilations
[
1
])
*
(
param
.
filter
.
tensorDim
[
2
]
-
1
)
+
1
)
/
2
-
Int
(
param
.
paddings
[
1
])
let
isDepthWise
=
param
.
filter
.
tensorDim
[
1
]
==
1
&&
param
.
filter
.
tensorDim
[
0
]
==
param
.
input
.
tensorDim
[
1
]
if
#available(iOS 11.0, *)
{
param
.
input
.
useMPS
=
true
param
.
output
.
useMPS
=
true
let
desc
:
MPSCNNConvolutionDescriptor
=
isDepthWise
?
MPSCNNDepthWiseConvolutionDescriptor
(
kernelWidth
:
param
.
filter
.
tensorDim
[
3
],
kernelHeight
:
param
.
filter
.
tensorDim
[
2
],
inputFeatureChannels
:
param
.
input
.
tensorDim
[
1
],
outputFeatureChannels
:
param
.
output
.
tensorDim
[
1
],
neuronFilter
:
neuronFilterForMPSLayer
(
device
:
device
)
as?
MPSCNNNeuron
)
:
MPSCNNConvolutionDescriptor
(
kernelWidth
:
param
.
filter
.
tensorDim
[
3
],
kernelHeight
:
param
.
filter
.
tensorDim
[
2
],
inputFeatureChannels
:
param
.
input
.
tensorDim
[
1
],
outputFeatureChannels
:
param
.
output
.
tensorDim
[
1
],
neuronFilter
:
neuronFilterForMPSLayer
(
device
:
device
)
as?
MPSCNNNeuron
)
desc
.
strideInPixelsX
=
Int
(
param
.
stride
[
0
])
desc
.
strideInPixelsY
=
Int
(
param
.
stride
[
1
])
let
_
=
param
.
filter
.
convert
(
converter
:
MPSPointerConverter
<
P
>.
init
())
let
dataSource
=
ConvDataSource
.
init
(
inDesc
:
desc
,
inWeights
:
param
.
filter
,
inBiasTerms
:
param
.
y
)
let
conv
=
MPSCNNConvolution
.
init
(
device
:
device
,
weights
:
dataSource
)
conv
.
offset
=
MPSOffset
.
init
(
x
:
offsetX
,
y
:
offsetY
,
z
:
0
)
conv
.
edgeMode
=
.
zero
mpsConvOp
=
conv
}
}
func
setupWithoutMPS
(
device
:
MTLDevice
,
param
:
ConvAddParam
<
P
>
)
{
let
offsetX
=
(
Int
(
param
.
dilations
[
0
])
*
(
param
.
filter
.
tensorDim
[
3
]
-
1
)
+
1
)
/
2
-
Int
(
param
.
paddings
[
0
])
let
offsetY
=
(
Int
(
param
.
dilations
[
1
])
*
(
param
.
filter
.
tensorDim
[
2
]
-
1
)
+
1
)
/
2
-
Int
(
param
.
paddings
[
1
])
let
offsetZ
=
0.0
let
iC
=
param
.
input
.
tensorDim
[
1
];
let
fC
=
param
.
filter
.
tensorDim
[
1
];
let
oC
=
param
.
output
.
tensorDim
[
1
];
let
inMetalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
))
metalParam
=
inMetalParam
if
type
(
of
:
self
)
.
isWinoGrad
(
functionName
:
functionName
)
{
let
_
=
param
.
filter
.
convert
(
converter
:
WinogradPointerConverter
<
P
>.
init
())
}
let
padWhenOneC
=
!
(
param
.
filter
.
channel
==
1
&&
param
.
filter
.
n
==
param
.
input
.
tensorDim
[
1
])
param
.
filter
.
initBuffer
(
device
:
device
,
precision
:
GlobalConfig
.
shared
.
computePrecision
,
padWhenOneC
:
padWhenOneC
)
param
.
y
.
initBuffer
(
device
:
device
,
precision
:
GlobalConfig
.
shared
.
computePrecision
)
}
open
class
func
kernelFunctionName
(
param
:
ConvAddParam
<
P
>
,
useAggressiveOptimization
:
Bool
=
false
)
->
String
?
{
if
GlobalConfig
.
shared
.
computePrecision
==
.
Float16
{
if
param
.
filter
.
width
==
1
&&
param
.
filter
.
height
==
1
{
return
"conv_add_1x1_half"
}
else
if
param
.
filter
.
channel
==
1
&&
param
.
filter
.
n
==
param
.
input
.
tensorDim
[
1
]
{
return
"depthwise_conv_add_3x3_half"
}
else
if
param
.
filter
.
width
==
3
&&
param
.
filter
.
height
==
3
{
return
"conv_add_3x3_half"
}
else
if
param
.
filter
.
width
==
1
&&
param
.
filter
.
height
==
5
{
return
"conv_add_5x1_half"
}
else
if
param
.
filter
.
width
==
5
&&
param
.
filter
.
height
==
1
{
return
"conv_add_1x5_half"
}
else
{
return
nil
}
}
else
if
GlobalConfig
.
shared
.
computePrecision
==
.
Float32
{
if
param
.
filter
.
width
==
1
&&
param
.
filter
.
height
==
1
{
return
"conv_add_1x1"
}
else
if
param
.
filter
.
channel
==
1
&&
param
.
filter
.
n
==
param
.
input
.
tensorDim
[
1
]
{
return
"depthwise_conv_add_3x3"
}
else
if
param
.
filter
.
width
==
1
&&
param
.
filter
.
height
==
5
{
return
"conv_add_5x1"
}
else
if
param
.
filter
.
width
==
5
&&
param
.
filter
.
height
==
1
{
return
"conv_add_1x5"
}
else
if
param
.
filter
.
width
==
3
&&
param
.
filter
.
height
==
3
{
return
"conv_add_3x3"
}
else
{
return
nil
}
}
else
{
return
nil
}
override
func
hasReluOp
()
->
Bool
{
return
false
}
func
neuronFilterForMPSLayer
(
device
:
MTLDevice
)
->
AnyObject
?
{
override
func
neuronFilterForMPSLayer
(
device
:
MTLDevice
)
->
AnyObject
?
{
return
nil
}
open
class
func
isWinoGrad
(
functionName
:
String
?)
->
Bool
{
if
let
functionName
=
functionName
{
return
functionName
.
hasSuffix
(
"winograd"
)
}
return
false
}
}
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
浏览文件 @
b53a20aa
...
...
@@ -135,7 +135,7 @@ class ConvAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
let
iC
=
param
.
input
.
tensorDim
[
1
];
let
fC
=
param
.
filter
.
tensorDim
[
1
];
let
oC
=
param
.
output
.
tensorDim
[
1
];
let
inMetalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
))
let
inMetalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
)
,
hasAddOp
:
UInt16
(
0
),
hasReluOp
:
UInt16
(
0
)
)
// print("metal param: ")
// print(inMetalParam)
...
...
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
浏览文件 @
b53a20aa
//
// ConvAddReluKernel.swift
// paddle-mobile
//
// Created by Yang,Yanzhan on 2019/4/29.
// Copyright © 2019 orange. All rights reserved.
//
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import
Foundation
import
MetalPerformanceShaders
class
ConvAddReluKernel
<
P
:
PrecisionProtocol
>
:
ConvAddKernel
<
P
>
{
override
class
func
kernelFunctionName
(
param
:
ConvAddParam
<
P
>
,
useAggressiveOptimization
:
Bool
=
false
)
->
String
?
{
public
struct
MetalConvParam
{
let
offsetX
:
Int16
let
offsetY
:
Int16
let
offsetZ
:
Int16
let
strideX
:
UInt16
let
strideY
:
UInt16
let
dilationX
:
UInt16
let
dilationY
:
UInt16
let
groups
:
UInt16
let
iC
:
UInt16
let
fC
:
UInt16
let
oC
:
UInt16
let
hasAddOp
:
UInt16
let
hasReluOp
:
UInt16
}
@available
(
iOS
11.0
,
*
)
class
ConvDataSource
<
P
:
PrecisionProtocol
>
:
NSObject
,
MPSCNNConvolutionDataSource
{
var
_descriptor
:
MPSCNNConvolutionDescriptor
var
_weightsTensor
:
Tensor
<
P
>
var
_biasTensor
:
Tensor
<
P
>
?
var
_biasTerms
:
UnsafeMutablePointer
<
Float
>
?
func
load
()
->
Bool
{
if
let
biasTensor
=
_biasTensor
{
switch
P
.
precisionType
{
case
.
Float32
:
_biasTerms
=
biasTensor
.
data
.
pointer
as?
UnsafeMutablePointer
<
Float
>
case
.
Float16
:
_biasTerms
=
UnsafeMutablePointer
<
Float
>.
allocate
(
capacity
:
biasTensor
.
data
.
count
)
if
let
float16Point
=
biasTensor
.
data
.
pointer
as?
UnsafeMutablePointer
<
Float16
>
{
float16to32
(
input
:
float16Point
,
output
:
_biasTerms
!
,
count
:
biasTensor
.
data
.
count
)
}
}
}
return
true
}
func
purge
()
{
switch
P
.
precisionType
{
case
.
Float32
:
return
case
.
Float16
:
if
let
biasTensor
=
_biasTensor
{
_biasTerms
?
.
deinitialize
(
count
:
biasTensor
.
data
.
count
)
_biasTerms
?
.
deallocate
()
}
}
}
func
label
()
->
String
?
{
return
"conv_add_label"
}
func
copy
(
with
zone
:
NSZone
?
=
nil
)
->
Any
{
return
self
}
init
(
inDesc
:
MPSCNNConvolutionDescriptor
,
inWeights
:
Tensor
<
P
>
,
inBiasTerms
:
Tensor
<
P
>
?)
{
_descriptor
=
inDesc
_weightsTensor
=
inWeights
_biasTensor
=
inBiasTerms
super
.
init
()
}
func
descriptor
()
->
MPSCNNConvolutionDescriptor
{
return
_descriptor
}
func
dataType
()
->
MPSDataType
{
switch
P
.
precisionType
{
case
.
Float32
:
return
.
float32
case
.
Float16
:
return
.
float16
}
}
func
weights
()
->
UnsafeMutableRawPointer
{
return
UnsafeMutableRawPointer
.
init
(
_weightsTensor
.
data
.
pointer
)
}
func
biasTerms
()
->
UnsafeMutablePointer
<
Float
>
?
{
return
_biasTerms
}
}
class
ConvAddReluKernel
<
P
:
PrecisionProtocol
>
:
Kernel
,
Computable
{
var
metalParam
:
MetalConvParam
!
var
mpsConvOp
:
Any
?
var
blankTensor
:
Tensor
<
P
>
?
required
init
(
device
:
MTLDevice
,
param
:
ConvAddReluParam
<
P
>
,
initContext
:
InitContext
)
throws
{
do
{
try
param
.
output
.
initTexture
(
device
:
device
,
inTranspose
:
[
0
,
2
,
3
,
1
],
computePrecision
:
GlobalConfig
.
shared
.
computePrecision
)
}
catch
let
error
{
throw
error
}
var
shouldUseMPS
=
false
let
functionName
=
type
(
of
:
self
)
.
kernelFunctionName
(
param
:
param
,
useAggressiveOptimization
:
initContext
.
useAggresiveOptimization
)
if
#available(iOS 11.0, *)
,
(
initContext
.
useMPS
||
initContext
.
useAggresiveOptimization
)
{
if
initContext
.
useAggresiveOptimization
{
if
(
param
.
input
.
tensorDim
[
1
]
==
1
||
param
.
input
.
tensorDim
[
1
]
>
4
)
&&
(
param
.
output
.
tensorDim
[
1
]
==
1
||
param
.
output
.
tensorDim
[
1
]
>
4
)
{
shouldUseMPS
=
true
}
}
else
{
if
param
.
input
.
tensorDim
[
1
]
>
4
&&
param
.
output
.
tensorDim
[
1
]
>
4
{
shouldUseMPS
=
true
}
}
}
if
type
(
of
:
self
)
.
isWinoGrad
(
functionName
:
functionName
)
{
shouldUseMPS
=
false
}
let
isDepthWise
=
param
.
filter
.
tensorDim
[
1
]
==
1
&&
param
.
filter
.
tensorDim
[
0
]
==
param
.
input
.
tensorDim
[
1
]
if
!
isDepthWise
&&
param
.
groups
>
1
{
shouldUseMPS
=
false
}
if
shouldUseMPS
{
super
.
init
(
device
:
device
,
inFunctionName
:
nil
,
initContext
:
initContext
)
setupWithMPS
(
device
:
device
,
param
:
param
)
}
else
{
if
functionName
==
nil
{
fatalError
(
" unsupport yet "
)
}
super
.
init
(
device
:
device
,
inFunctionName
:
functionName
,
initContext
:
initContext
)
setupWithoutMPS
(
device
:
device
,
param
:
param
)
}
}
func
compute
(
commandBuffer
:
MTLCommandBuffer
,
param
:
ConvAddReluParam
<
P
>
)
throws
{
if
#available(iOS 10.0, *)
{
if
let
conv
=
mpsConvOp
as?
MPSCNNConvolution
{
let
inputImage
=
MPSImage
.
init
(
texture
:
param
.
input
.
metalTexture
,
featureChannels
:
param
.
input
.
tensorDim
[
1
])
let
outputImage
=
MPSImage
.
init
(
texture
:
param
.
output
.
metalTexture
,
featureChannels
:
param
.
output
.
tensorDim
[
1
])
conv
.
encode
(
commandBuffer
:
commandBuffer
,
sourceImage
:
inputImage
,
destinationImage
:
outputImage
)
return
}
}
guard
let
encoder
=
commandBuffer
.
makeComputeCommandEncoder
()
else
{
throw
PaddleMobileError
.
predictError
(
message
:
" encode is nil"
)
}
encoder
.
setTexture
(
param
.
input
.
metalTexture
,
index
:
0
)
encoder
.
setTexture
(
param
.
output
.
metalTexture
,
index
:
1
)
encoder
.
setBytes
(
&
metalParam
,
length
:
MemoryLayout
<
MetalConvParam
>.
size
,
index
:
0
)
encoder
.
setBuffer
(
param
.
filter
.
buffer
,
offset
:
0
,
index
:
1
)
if
let
y
=
param
.
y
{
encoder
.
setBuffer
(
y
.
buffer
,
offset
:
0
,
index
:
2
)
}
else
{
encoder
.
setBuffer
(
blankTensor
?
.
buffer
,
offset
:
0
,
index
:
2
)
}
encoder
.
dispatch
(
computePipline
:
pipline
,
outTexture
:
param
.
output
.
metalTexture
,
groupDepth
:
type
(
of
:
self
)
.
isWinoGrad
(
functionName
:
functionName
)
?
1
:
nil
)
encoder
.
endEncoding
()
}
func
setupWithMPS
(
device
:
MTLDevice
,
param
:
ConvAddReluParam
<
P
>
)
{
let
offsetX
=
(
Int
(
param
.
dilations
[
0
])
*
(
param
.
filter
.
tensorDim
[
3
]
-
1
)
+
1
)
/
2
-
Int
(
param
.
paddings
[
0
])
let
offsetY
=
(
Int
(
param
.
dilations
[
1
])
*
(
param
.
filter
.
tensorDim
[
2
]
-
1
)
+
1
)
/
2
-
Int
(
param
.
paddings
[
1
])
let
isDepthWise
=
param
.
filter
.
tensorDim
[
1
]
==
1
&&
param
.
filter
.
tensorDim
[
0
]
==
param
.
input
.
tensorDim
[
1
]
if
#available(iOS 11.0, *)
{
param
.
input
.
useMPS
=
true
param
.
output
.
useMPS
=
true
let
desc
:
MPSCNNConvolutionDescriptor
=
isDepthWise
?
MPSCNNDepthWiseConvolutionDescriptor
(
kernelWidth
:
param
.
filter
.
tensorDim
[
3
],
kernelHeight
:
param
.
filter
.
tensorDim
[
2
],
inputFeatureChannels
:
param
.
input
.
tensorDim
[
1
],
outputFeatureChannels
:
param
.
output
.
tensorDim
[
1
],
neuronFilter
:
neuronFilterForMPSLayer
(
device
:
device
)
as?
MPSCNNNeuron
)
:
MPSCNNConvolutionDescriptor
(
kernelWidth
:
param
.
filter
.
tensorDim
[
3
],
kernelHeight
:
param
.
filter
.
tensorDim
[
2
],
inputFeatureChannels
:
param
.
input
.
tensorDim
[
1
],
outputFeatureChannels
:
param
.
output
.
tensorDim
[
1
],
neuronFilter
:
neuronFilterForMPSLayer
(
device
:
device
)
as?
MPSCNNNeuron
)
desc
.
strideInPixelsX
=
Int
(
param
.
stride
[
0
])
desc
.
strideInPixelsY
=
Int
(
param
.
stride
[
1
])
let
_
=
param
.
filter
.
convert
(
converter
:
MPSPointerConverter
<
P
>.
init
())
let
dataSource
=
ConvDataSource
.
init
(
inDesc
:
desc
,
inWeights
:
param
.
filter
,
inBiasTerms
:
param
.
y
)
let
conv
=
MPSCNNConvolution
.
init
(
device
:
device
,
weights
:
dataSource
)
conv
.
offset
=
MPSOffset
.
init
(
x
:
offsetX
,
y
:
offsetY
,
z
:
0
)
conv
.
edgeMode
=
.
zero
mpsConvOp
=
conv
}
}
func
setupWithoutMPS
(
device
:
MTLDevice
,
param
:
ConvAddReluParam
<
P
>
)
{
let
offsetX
=
(
Int
(
param
.
dilations
[
0
])
*
(
param
.
filter
.
tensorDim
[
3
]
-
1
)
+
1
)
/
2
-
Int
(
param
.
paddings
[
0
])
let
offsetY
=
(
Int
(
param
.
dilations
[
1
])
*
(
param
.
filter
.
tensorDim
[
2
]
-
1
)
+
1
)
/
2
-
Int
(
param
.
paddings
[
1
])
let
offsetZ
=
0.0
let
iC
=
param
.
input
.
tensorDim
[
1
];
let
fC
=
param
.
filter
.
tensorDim
[
1
];
let
oC
=
param
.
output
.
tensorDim
[
1
];
let
inMetalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
),
hasAddOp
:
UInt16
(
hasAddOp
()
?
1
:
0
),
hasReluOp
:
UInt16
(
hasReluOp
()
?
1
:
0
))
metalParam
=
inMetalParam
if
type
(
of
:
self
)
.
isWinoGrad
(
functionName
:
functionName
)
{
let
_
=
param
.
filter
.
convert
(
converter
:
WinogradPointerConverter
<
P
>.
init
())
}
let
padWhenOneC
=
!
(
param
.
filter
.
channel
==
1
&&
param
.
filter
.
n
==
param
.
input
.
tensorDim
[
1
])
param
.
filter
.
initBuffer
(
device
:
device
,
precision
:
GlobalConfig
.
shared
.
computePrecision
,
padWhenOneC
:
padWhenOneC
)
if
let
y
=
param
.
y
{
y
.
initBuffer
(
device
:
device
,
precision
:
GlobalConfig
.
shared
.
computePrecision
)
}
else
{
blankTensor
=
Tensor
<
P
>.
init
(
inDim
:
Dim
(
inDim
:
[
1
,
1
,
1
,
4
]),
inLayout
:
DataLayout
.
NHWC
(),
originDimsCount
:
4
)
blankTensor
?
.
initBuffer
(
device
:
device
,
precision
:
GlobalConfig
.
shared
.
computePrecision
)
}
}
class
func
kernelFunctionName
(
param
:
ConvAddReluParam
<
P
>
,
useAggressiveOptimization
:
Bool
=
false
)
->
String
?
{
if
GlobalConfig
.
shared
.
computePrecision
==
.
Float16
{
if
param
.
filter
.
width
==
1
&&
param
.
filter
.
height
==
1
{
return
"conv_add_relu_1x1_half"
...
...
@@ -60,10 +277,27 @@ class ConvAddReluKernel<P: PrecisionProtocol>: ConvAddKernel<P> {
}
}
override
func
neuronFilterForMPSLayer
(
device
:
MTLDevice
)
->
AnyObject
?
{
if
#available(iOS 10.0, *)
{
return
MPSCNNNeuronReLU
(
device
:
device
,
a
:
0
)
open
func
neuronFilterForMPSLayer
(
device
:
MTLDevice
)
->
AnyObject
?
{
if
hasReluOp
()
{
if
#available(iOS 10.0, *)
{
return
MPSCNNNeuronReLU
(
device
:
device
,
a
:
0
)
}
}
return
nil
}
open
func
hasAddOp
()
->
Bool
{
return
true
}
open
func
hasReluOp
()
->
Bool
{
return
true
}
open
class
func
isWinoGrad
(
functionName
:
String
?)
->
Bool
{
if
let
functionName
=
functionName
{
return
functionName
.
hasSuffix
(
"winograd"
)
}
return
false
}
}
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
浏览文件 @
b53a20aa
...
...
@@ -105,7 +105,7 @@ class ConvBNReluKernel<P: PrecisionProtocol>: Kernel, Computable, Testable {
let
iC
=
param
.
input
.
tensorDim
[
1
];
let
fC
=
param
.
filter
.
tensorDim
[
1
];
let
oC
=
param
.
output
.
tensorDim
[
1
];
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
))
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
)
,
hasAddOp
:
UInt16
(
0
),
hasReluOp
:
UInt16
(
0
)
)
var
invs
:
[
P
]
=
[]
let
varianceContents
=
param
.
variance
.
buffer
.
contents
()
.
assumingMemoryBound
(
to
:
P
.
self
)
...
...
metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
浏览文件 @
b53a20aa
...
...
@@ -13,55 +13,188 @@
limitations under the License. */
import
Foundation
/// Parameter block handed to the conv Metal shaders via `setBytes`.
///
/// NOTE(review): field order and widths must match the `MetalConvParam`
/// struct on the Metal side byte-for-byte — confirm against Common.metal.
public struct MetalConvParam {
    let offsetX: Int16      // filter-center x offset minus left padding
    let offsetY: Int16      // filter-center y offset minus top padding
    let offsetZ: Int16
    let strideX: UInt16
    let strideY: UInt16
    let dilationX: UInt16
    let dilationY: UInt16
    let groups: UInt16
    let iC: UInt16          // input channels
    let fC: UInt16          // filter channels
    let oC: UInt16          // output channels
    // Fused-op flags for the unified conv/conv-add/conv-add-relu shader
    // (1 = fused, 0 = not). Required: the call sites in this commit —
    // setupWithoutMPS and ConvBNReluKernel — construct MetalConvParam with
    // hasAddOp:/hasReluOp: arguments, which this struct previously lacked.
    let hasAddOp: UInt16
    let hasReluOp: UInt16
}
import
MetalPerformanceShaders
class
ConvKernel
<
P
:
PrecisionProtocol
>
:
Kernel
,
Computable
{
var
metalParam
:
MetalConvParam
!
var
mpsConvOp
:
Any
?
var
blankTensor
:
Tensor
<
P
>
?
required
init
(
device
:
MTLDevice
,
param
:
ConvParam
<
P
>
,
initContext
:
InitContext
)
throws
{
param
.
filter
.
initBuffer
(
device
:
device
,
precision
:
Precision
.
Float32
)
if
param
.
filter
.
width
==
1
&&
param
.
filter
.
height
==
1
{
super
.
init
(
device
:
device
,
inFunctionName
:
"conv_1x1"
,
initContext
:
initContext
)
}
else
if
param
.
filter
.
channel
==
1
{
super
.
init
(
device
:
device
,
inFunctionName
:
"depthwise_conv_3x3"
,
initContext
:
initContext
)
}
else
if
param
.
filter
.
width
==
3
&&
param
.
filter
.
height
==
3
{
super
.
init
(
device
:
device
,
inFunctionName
:
"conv_3x3"
,
initContext
:
initContext
)
}
else
{
fatalError
(
" unsupport "
)
do
{
try
param
.
output
.
initTexture
(
device
:
device
,
inTranspose
:
[
0
,
2
,
3
,
1
],
computePrecision
:
GlobalConfig
.
shared
.
computePrecision
)
}
catch
let
error
{
throw
error
}
let
offsetX
=
param
.
filter
.
dim
[
2
]
/
2
-
Int
(
param
.
paddings
[
0
])
let
offsetY
=
param
.
filter
.
dim
[
1
]
/
2
-
Int
(
param
.
paddings
[
1
])
let
offsetZ
=
0.0
let
iC
=
param
.
input
.
tensorDim
[
1
];
let
fC
=
param
.
filter
.
tensorDim
[
1
];
let
oC
=
param
.
output
.
tensorDim
[
1
];
metalParam
=
MetalConvParam
.
init
(
offsetX
:
Int16
(
offsetX
),
offsetY
:
Int16
(
offsetY
),
offsetZ
:
Int16
(
offsetZ
),
strideX
:
UInt16
(
param
.
stride
[
0
]),
strideY
:
UInt16
(
param
.
stride
[
1
]),
dilationX
:
UInt16
(
param
.
dilations
[
0
]),
dilationY
:
UInt16
(
param
.
dilations
[
1
]),
groups
:
UInt16
(
param
.
groups
),
iC
:
UInt16
(
iC
),
fC
:
UInt16
(
fC
),
oC
:
UInt16
(
oC
))
var
shouldUseMPS
=
false
let
functionName
=
type
(
of
:
self
)
.
kernelFunctionName
(
param
:
param
,
useAggressiveOptimization
:
initContext
.
useAggresiveOptimization
)
if
#available(iOS 11.0, *)
,
(
initContext
.
useMPS
||
initContext
.
useAggresiveOptimization
)
{
if
initContext
.
useAggresiveOptimization
{
if
(
param
.
input
.
tensorDim
[
1
]
==
1
||
param
.
input
.
tensorDim
[
1
]
>
4
)
&&
(
param
.
output
.
tensorDim
[
1
]
==
1
||
param
.
output
.
tensorDim
[
1
]
>
4
)
{
shouldUseMPS
=
true
}
}
else
{
if
param
.
input
.
tensorDim
[
1
]
>
4
&&
param
.
output
.
tensorDim
[
1
]
>
4
{
shouldUseMPS
=
true
}
}
}
if
type
(
of
:
self
)
.
isWinoGrad
(
functionName
:
functionName
)
{
shouldUseMPS
=
false
}
let
isDepthWise
=
param
.
filter
.
tensorDim
[
1
]
==
1
&&
param
.
filter
.
tensorDim
[
0
]
==
param
.
input
.
tensorDim
[
1
]
if
!
isDepthWise
&&
param
.
groups
>
1
{
shouldUseMPS
=
false
}
if
shouldUseMPS
{
super
.
init
(
device
:
device
,
inFunctionName
:
nil
,
initContext
:
initContext
)
setupWithMPS
(
device
:
device
,
param
:
param
)
}
else
{
if
functionName
==
nil
{
fatalError
(
" unsupport yet "
)
}
super
.
init
(
device
:
device
,
inFunctionName
:
functionName
,
initContext
:
initContext
)
setupWithoutMPS
(
device
:
device
,
param
:
param
)
}
}
/// Encodes this convolution into `commandBuffer`.
///
/// Fast path: when `mpsConvOp` was set up (see `setupWithMPS`), wrap the
/// input/output textures in `MPSImage`s and let MPSCNNConvolution encode.
/// Otherwise bind textures/buffers and dispatch our own compute pipeline.
/// - Throws: `PaddleMobileError.predictError` if no compute encoder can be made.
func compute(commandBuffer: MTLCommandBuffer, param: ConvParam<P>) throws {
    if #available(iOS 10.0, *) {
        if let conv = mpsConvOp as? MPSCNNConvolution {
            let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1])
            let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1])
            conv.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage)
            return
        }
    }
    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
        throw PaddleMobileError.predictError(message: " encode is nil")
    }
    // Binding layout consumed by the shader: texture 0 = input,
    // texture 1 = output; buffer 0 = MetalConvParam, buffer 1 = filter.
    encoder.setTexture(param.input.metalTexture, index: 0)
    encoder.setTexture(param.output.metalTexture, index: 1)
    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
    // NOTE(review): the kernel is dispatched twice below. The first dispatch
    // (without buffer index 2 bound) looks like a leftover of the replaced
    // implementation shown interleaved by the diff view — confirm against
    // the repository; most likely only the groupDepth dispatch belongs.
    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
    // blankTensor stands in for the bias when no real add op is fused,
    // so buffer index 2 is always bound for the unified shader.
    encoder.setBuffer(blankTensor?.buffer, offset: 0, index: 2)
    // WinoGrad variants are dispatched with a group depth of 1.
    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture, groupDepth: type(of: self).isWinoGrad(functionName: functionName) ? 1 : nil)
    encoder.endEncoding()
}
/// Builds an `MPSCNNConvolution` for this layer and stores it in `mpsConvOp`.
///
/// Only runs on iOS 11+; below that `mpsConvOp` stays nil and `compute`
/// falls back to the hand-written Metal kernel.
/// Filter tensorDim layout here is [n, c, h, w]: index 3 = kernel width,
/// index 2 = kernel height (as used in the descriptor constructors below).
func setupWithMPS(device: MTLDevice, param: ConvParam<P>) {
    // Effective half-kernel extent under dilation, minus the padding,
    // gives the MPS sampling offset.
    let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
    let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
    // Depthwise: single filter channel and one filter per input channel.
    let isDepthWise = param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]
    if #available(iOS 11.0, *) {
        param.input.useMPS = true
        param.output.useMPS = true
        // Fused activation (if any) comes from neuronFilterForMPSLayer,
        // which subclasses override.
        let desc: MPSCNNConvolutionDescriptor = isDepthWise ?
            MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
                                                 kernelHeight: param.filter.tensorDim[2],
                                                 inputFeatureChannels: param.input.tensorDim[1],
                                                 outputFeatureChannels: param.output.tensorDim[1],
                                                 neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) :
            MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
                                        kernelHeight: param.filter.tensorDim[2],
                                        inputFeatureChannels: param.input.tensorDim[1],
                                        outputFeatureChannels: param.output.tensorDim[1],
                                        neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron)
        desc.strideInPixelsX = Int(param.stride[0])
        desc.strideInPixelsY = Int(param.stride[1])
        // Reorder the filter weights into MPS layout BEFORE wrapping them
        // in the data source.
        let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
        let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: nil)
        let conv = MPSCNNConvolution.init(device: device, weights: dataSource)
        conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0)
        conv.edgeMode = .zero
        mpsConvOp = conv
    }
}
/// Prepares state for the hand-written Metal kernel path: fills in
/// `metalParam`, converts/uploads the filter weights, and allocates the
/// 1x1x1x4 `blankTensor` used as a stand-in bias buffer.
func setupWithoutMPS(device: MTLDevice, param: ConvParam<P>) {
    // Effective half-kernel extent under dilation, minus the padding.
    // Filter tensorDim layout is [n, c, h, w].
    let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
    let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
    let offsetZ = 0.0  // Double literal; narrowed to Int16 below
    let iC = param.input.tensorDim[1];
    let fC = param.filter.tensorDim[1];
    let oC = param.output.tensorDim[1];
    // hasAddOp/hasReluOp tell the unified shader which fused ops to apply;
    // subclasses override those methods to turn them on.
    let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 1 : 0))
    metalParam = inMetalParam
    // WinoGrad variants need the weights pre-transformed; this must happen
    // before initBuffer uploads them.
    if type(of: self).isWinoGrad(functionName: functionName) {
        let _ = param.filter.convert(converter: WinogradPointerConverter<P>.init())
    }
    // Pad single-channel filters unless this is a true depthwise conv
    // (channel == 1 and one filter per input channel).
    let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1])
    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC)
    // Zero-initialized? — buffer contents not set explicitly here; it is
    // bound at buffer index 2 in compute() as the bias when no add is fused.
    blankTensor = Tensor<P>.init(inDim: Dim(inDim: [1, 1, 1, 4]), inLayout: DataLayout.NHWC(), originDimsCount: 4)
    blankTensor?.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
}
/// Selects the Metal shader function name for the given conv configuration,
/// or nil when no shader variant covers it (callers treat nil as
/// unsupported unless the MPS path applies).
///
/// Depthwise is tested before the generic 3x3 case on purpose: a depthwise
/// 3x3 filter satisfies both conditions and must get the depthwise shader.
class func kernelFunctionName(param: ConvParam<P>, useAggressiveOptimization: Bool = false) -> String? {
    let filterW = param.filter.width
    let filterH = param.filter.height
    let isDepthWise = param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1]
    let precision = GlobalConfig.shared.computePrecision

    if precision == .Float16 {
        if filterW == 1 && filterH == 1 {
            return "conv_add_relu_1x1_half"
        }
        if isDepthWise {
            if useAggressiveOptimization {
                // WinoGrad only for the exact 3x3, 16-filter,
                // unit-stride, unit-dilation case.
                let winogradEligible = filterW == 3 && filterH == 3
                    && param.filter.n == 16
                    && param.stride[0] == 1 && param.stride[1] == 1
                    && param.dilations[0] == 1 && param.dilations[1] == 1
                if winogradEligible {
                    return "depthwise_conv_add_relu_3x3_half_winograd"
                }
            }
            return "depthwise_conv_add_relu_3x3_half"
        }
        if filterW == 3 && filterH == 3 {
            return param.groups == 1 ? "conv_add_relu_3x3_half" : "group_conv_add_relu_3x3_half"
        }
        if filterW == 1 && filterH == 5 {
            return "conv_add_relu_5x1_half"
        }
        if filterW == 5 && filterH == 1 {
            return "conv_add_relu_1x5_half"
        }
        return nil
    }

    if precision == .Float32 {
        if filterW == 1 && filterH == 1 {
            return "conv_add_relu_1x1"
        }
        if isDepthWise {
            return "depthwise_conv_add_relu_3x3"
        }
        if filterW == 1 && filterH == 5 {
            return "conv_add_relu_5x1"
        }
        if filterW == 5 && filterH == 1 {
            return "conv_add_relu_1x5"
        }
        if filterW == 3 && filterH == 3 {
            return param.groups == 1 ? "conv_add_relu_3x3" : "group_conv_add_relu_3x3"
        }
        return nil
    }

    return nil
}
/// Neuron (activation) filter to attach to the MPS convolution descriptor.
/// Base conv fuses no activation, so the default is nil; subclasses that
/// fuse a relu override this to return an MPSCNNNeuronReLU.
open func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? {
    return nil
}
/// Whether an elementwise add (bias) op is fused into this conv kernel.
/// Base conv: no. Feeds MetalConvParam.hasAddOp in setupWithoutMPS.
open func hasAddOp() -> Bool {
    return false
}
/// Whether a relu op is fused into this conv kernel.
/// Base conv: no. Feeds MetalConvParam.hasReluOp in setupWithoutMPS.
open func hasReluOp() -> Bool {
    return false
}
/// A shader function name denotes the WinoGrad variant iff it ends in
/// "winograd". A `nil` name is never WinoGrad.
open class func isWinoGrad(functionName: String?) -> Bool {
    return functionName?.hasSuffix("winograd") ?? false
}
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录