Commit c1ffeb30 authored by Yanzhan Yang, committed by GitHub

1. enable both tensor and texture as bias of conv_add_relu. 2. add conv_relu fusion op. (#1662)

* 1. enable both tensor and texture as bias of conv_add_relu. 2. add conv_relu fusion op.

* winograd could be used when out channel is less than or equal to 16
Parent b56e720e
@@ -19,12 +19,11 @@ using namespace metal;
#pragma mark - convAdd
kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
texture2d_array<float, access::sample> biasTexture [[texture(1)]],
texture2d_array<float, access::write> outTexture [[texture(2)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
@@ -40,7 +39,7 @@ kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
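// When hasAddOp == 1 the bias is now sampled per pixel from biasTexture, so
// both a tensor bias (converted to a texture host-side) and a texture bias
// are supported.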
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
@@ -62,12 +61,11 @@ kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [
}
kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
texture2d_array<float, access::sample> biasTexture [[texture(1)]],
texture2d_array<float, access::write> outTexture [[texture(2)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
@@ -85,7 +83,7 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
@@ -130,10 +128,10 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [
}
kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
texture2d_array<float, access::sample> biasTexture [[texture(1)]],
texture2d_array<float, access::write> outTexture [[texture(2)]],
constant MetalConvParam &param [[buffer(0)]],
const device float *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
@@ -148,7 +146,7 @@ kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTex
const uint kernelHXW = 9;
float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
@@ -185,12 +183,11 @@ kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTex
}
kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
texture2d_array<float, access::sample> biasTexture [[texture(1)]],
texture2d_array<float, access::write> outTexture [[texture(2)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
@@ -208,7 +205,7 @@ kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_y = param.dilationY;
float4 input[5];
@@ -243,12 +240,11 @@ kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [
}
kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
texture2d_array<float, access::sample> biasTexture [[texture(1)]],
texture2d_array<float, access::write> outTexture [[texture(2)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
@@ -266,7 +262,7 @@ kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_x = param.dilationX;
float4 input[5];
@@ -301,12 +297,11 @@ kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [
}
kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
texture2d_array<float, access::sample> biasTexture [[texture(1)]],
texture2d_array<float, access::write> outTexture [[texture(2)]],
constant MetalConvParam &param [[buffer(0)]],
const device float *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
@@ -318,7 +313,7 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> i
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
@@ -343,12 +338,11 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> i
#pragma mark - half
kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
texture2d_array<half, access::sample> biasTexture [[texture(1)]],
texture2d_array<half, access::write> outTexture [[texture(2)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
@@ -364,7 +358,7 @@ kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTextu
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
@@ -386,12 +380,11 @@ kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTextu
}
kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
texture2d_array<half, access::sample> biasTexture [[texture(1)]],
texture2d_array<half, access::write> outTexture [[texture(2)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
@@ -406,7 +399,7 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
@@ -441,10 +434,10 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
}
kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
texture2d_array<half, access::sample> biasTexture [[texture(1)]],
texture2d_array<half, access::write> outTexture [[texture(2)]],
constant MetalConvParam &param [[buffer(0)]],
const device half *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
@@ -459,7 +452,7 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> i
const uint kernelHXW = 9;
float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
@@ -496,10 +489,10 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> i
}
kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
texture2d_array<half, access::sample> biasTexture [[texture(1)]],
texture2d_array<half, access::write> outTexture [[texture(2)]],
constant MetalConvParam &param [[buffer(0)]],
const device half *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
@@ -512,7 +505,7 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sampl
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
@@ -536,10 +529,10 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sampl
}
kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
texture2d_array<half, access::sample> biasTexture [[texture(1)]],
texture2d_array<half, access::write> outTexture [[texture(2)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4x4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
uint x = gid.x, y = gid.y;
uint ow = outTexture.get_width();
@@ -557,6 +550,12 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce
uint tx = (x >> 1) << 1;
uint ty = (y >> 1) << 1;
uint tc = ((x % 2) << 1) + y % 2;
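// Tile layout: threads are dispatched with group depth 1, so the parity bits
// of (x, y) pick one of four array slices (tc = 0..3) while (tx, ty) is the
// origin of the shared 2x2 output tile. Four slices of four channels each is
// why this winograd path handles at most 16 output channels.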
int hasComputedC = 4 * tc;
if (hasComputedC >= param.oC) {
return;
}
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
half4 inputs[16];
@@ -584,6 +583,9 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce
half4 res[4];
for (int c = 0; c < 4; ++c) {
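// Channel slices are filled in order, so once hasComputedC + c reaches
// param.oC the remaining lanes are padding and the thread can return early.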
if (hasComputedC + c >= param.oC) {
return;
}
half I[16];
for (int i = 0; i < 16; ++i) {
I[i] = inputs[i][c];
@@ -640,13 +642,23 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce
res[2][c] = T[4] - T[8] + T[12] + tmp1 + tmp2;
res[3][c] = T[7] - T[11] + T[15] + tmp1 - tmp2;
}
if (param.hasAddOp == 1) {
half4 base = biasTexture.sample(sample, uint2(tx, ty), tc);
res[0] += base;
base = biasTexture.sample(sample, uint2(tx + 1, ty), tc);
res[1] += base;
base = biasTexture.sample(sample, uint2(tx, ty + 1), tc);
res[2] += base;
base = biasTexture.sample(sample, uint2(tx + 1, ty + 1), tc);
res[3] += base;
}
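// The bias has already been added from biasTexture above, so the branches
// below only apply (or skip) the ReLU before writing the results out.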
if (param.hasReluOp == 1) {
half4 base = biase[tc];
outTexture.write(fmax(res[0] + base, 0.0), uint2(tx, ty), tc);
outTexture.write(fmax(res[1] + base, 0.0), uint2(tx + 1, ty), tc);
outTexture.write(fmax(res[2] + base, 0.0), uint2(tx, ty + 1), tc);
outTexture.write(fmax(res[3] + base, 0.0), uint2(tx + 1, ty + 1), tc);
outTexture.write(fmax(res[0], 0.0), uint2(tx, ty), tc);
outTexture.write(fmax(res[1], 0.0), uint2(tx + 1, ty), tc);
outTexture.write(fmax(res[2], 0.0), uint2(tx, ty + 1), tc);
outTexture.write(fmax(res[3], 0.0), uint2(tx + 1, ty + 1), tc);
} else {
outTexture.write(res[0], uint2(tx, ty), tc);
outTexture.write(res[1], uint2(tx + 1, ty), tc);
@@ -656,10 +668,10 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce
}
kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
texture2d_array<half, access::sample> biasTexture [[texture(1)]],
texture2d_array<half, access::write> outTexture [[texture(2)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
@@ -678,7 +690,7 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_y = param.dilationY;
half4 input[5];
@@ -713,12 +725,11 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu
}
kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
texture2d_array<half, access::sample> biasTexture [[texture(1)]],
texture2d_array<half, access::write> outTexture [[texture(2)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
@@ -736,7 +747,7 @@ kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTextu
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
ushort dilation_x = param.dilationX;
half4 input[5];
......
@@ -13,6 +13,8 @@
16D3F3B522929C390067C45D /* SliceOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16D3F3B422929C390067C45D /* SliceOp.swift */; };
16D3F3B722929C660067C45D /* NearestInterpOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16D3F3B622929C660067C45D /* NearestInterpOp.swift */; };
16D3F3B922929D070067C45D /* SliceKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16D3F3B822929D070067C45D /* SliceKernel.swift */; };
16F70989229BF975000755B0 /* ConvReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16F70988229BF975000755B0 /* ConvReluOp.swift */; };
16F7098B229BFA46000755B0 /* ConvReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16F7098A229BFA46000755B0 /* ConvReluKernel.swift */; };
16FBFB36229259C60025B406 /* ExpOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16FBFB35229259C60025B406 /* ExpOp.swift */; };
16FBFB3822925B030025B406 /* ExpKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16FBFB3722925B030025B406 /* ExpKernel.swift */; };
16FBFB3A22925C3E0025B406 /* SigmoidKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16FBFB3922925C3E0025B406 /* SigmoidKernel.swift */; };
@@ -123,6 +125,8 @@
16D3F3B422929C390067C45D /* SliceOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SliceOp.swift; sourceTree = "<group>"; };
16D3F3B622929C660067C45D /* NearestInterpOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = NearestInterpOp.swift; sourceTree = "<group>"; };
16D3F3B822929D070067C45D /* SliceKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SliceKernel.swift; sourceTree = "<group>"; };
16F70988229BF975000755B0 /* ConvReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvReluOp.swift; sourceTree = "<group>"; };
16F7098A229BFA46000755B0 /* ConvReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvReluKernel.swift; sourceTree = "<group>"; };
16FBFB35229259C60025B406 /* ExpOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ExpOp.swift; sourceTree = "<group>"; };
16FBFB3722925B030025B406 /* ExpKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ExpKernel.swift; sourceTree = "<group>"; };
16FBFB3922925C3E0025B406 /* SigmoidKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SigmoidKernel.swift; sourceTree = "<group>"; };
@@ -363,6 +367,7 @@
16FBFB3F229266FE0025B406 /* LeakyReluOp.swift */,
16D3F3B422929C390067C45D /* SliceOp.swift */,
16D3F3B622929C660067C45D /* NearestInterpOp.swift */,
16F70988229BF975000755B0 /* ConvReluOp.swift */,
);
path = Operators;
sourceTree = "<group>";
@@ -425,6 +430,7 @@
16FBFB412292684E0025B406 /* LeakyReluKernel.swift */,
16D3F3B822929D070067C45D /* SliceKernel.swift */,
16324D832292ABDB0047277D /* NearestInterpKernel.swift */,
16F7098A229BFA46000755B0 /* ConvReluKernel.swift */,
);
path = Kernels;
sourceTree = "<group>";
@@ -640,6 +646,7 @@
FC803BC1214CB77A0094B8E5 /* ConvAddPreluKernel.swift in Sources */,
FCBCCC5D2122F8A100D94F7E /* DepthwiseConvOp.swift in Sources */,
FCE3A1AF2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift in Sources */,
16F70989229BF975000755B0 /* ConvReluOp.swift in Sources */,
16D3F3B922929D070067C45D /* SliceKernel.swift in Sources */,
FCE9D7B7214F869000B520C3 /* Net.swift in Sources */,
FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */,
@@ -660,6 +667,7 @@
FC5163F620EF556E00636C28 /* Texture2DTo2DArrayKernel.swift in Sources */,
FCE3A1AD2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift in Sources */,
FC039BC020E11CC20081E9F8 /* PMBlockDesc.swift in Sources */,
16F7098B229BFA46000755B0 /* ConvReluKernel.swift in Sources */,
A7F26FDC2284301500365D47 /* Relu6Kernel.swift in Sources */,
FCD04E6820F315020007374F /* PoolKernel.swift in Sources */,
FC039BAD20E11CBC0081E9F8 /* ReluOp.swift in Sources */,
......
@@ -80,6 +80,7 @@ class OpCreator<P: PrecisionProtocol> {
gFlatten2Type : Flatten2Op<P>.creat,
gSliceType : SliceOp<P>.creat,
gNearestInterpType : NearestInterpOp<P>.creat,
gConvReluType : ConvReluOp<P>.creat,
]
......
@@ -162,6 +162,7 @@ let gPooType = "pool2d"
let gSoftmaxType = "softmax"
let gReshapeType = "reshape"
let gConvAddType = "conv_add"
let gConvReluType = "conv_relu"
let gDepthConvType = "depthwise_conv2d"
let gPriorBoxType = "prior_box"
let gTransposeType = "transpose"
......
@@ -14,7 +14,7 @@
import Foundation
class ConvAddOp<P: PrecisionProtocol>: Operator<ConvAddKernel<P>, ConvAddReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
class ConvAddOp<P: PrecisionProtocol>: Operator<ConvAddKernel<P>, ConvAddReluParam<P>>, Runable, Creator, InferShaperable, Fusion {
typealias OpType = ConvAddOp<P>
static func fusionNode() -> Node {
@@ -33,7 +33,6 @@ class ConvAddOp<P: PrecisionProtocol>: Operator<ConvAddKernel<P>, ConvAddReluPar
}
func inferShape() {
let inDims = para.input.dim
let filterDim = para.filter.dim
let strides = para.stride
@@ -64,23 +63,6 @@ class ConvAddOp<P: PrecisionProtocol>: Operator<ConvAddKernel<P>, ConvAddReluPar
}
func delogOutput() {
// print("op \(type): ")
// print(" padding: ")
// print(para.paddings)
// print("stride: ")
// print(para.stride)
// print("dilations: ")
// print(para.dilations)
// print(" para input dim: ")
// print(para.input.dim)
// print(" para filter dim: ")
// print(para.filter.dim)
// print(" para output dim: ")
// print(para.output.dim)
// print(" biase: ")
// let biase: [Float32] = para.y.buffer.array()
// print(biase)
print(" \(type) output: ")
print(para.output.metalTexture)
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
......
@@ -27,6 +27,15 @@ class ConvAddReluParam<P: PrecisionProtocol>: OpParam {
do {
y = try ConvAddReluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
} catch {
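// Fallback: Y was supplied as a Tensor rather than a Texture, so convert it
// to an NHWC texture for the shader and keep the original tensor for the MPS
// path (inBiasTerms).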
do {
let yTensor: Tensor<P> = try ConvAddReluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
let device = input.metalTexture!.device
y = Texture.init(device: device, inDim: yTensor.dim)
let value: [P] = Array(UnsafeBufferPointer(start: yTensor.data.pointer, count: yTensor.dim.numel()))
y?.metalTexture = device.tensor2texture(value: value, dim: yTensor.dim.dims, transpose: [0, 2, 3, 1], inComputePrecision: GlobalConfig.shared.computePrecision)
self.yTensor = yTensor
} catch {
}
}
} catch let error {
throw error
@@ -34,7 +43,6 @@ class ConvAddReluParam<P: PrecisionProtocol>: OpParam {
}
let input: Texture
var y: Tensor<P>?
let filter: Tensor<P>
var output: Texture
let stride: [Int32]
@@ -42,6 +50,9 @@ class ConvAddReluParam<P: PrecisionProtocol>: OpParam {
let dilations: [Int32]
let groups: Int
var y: Texture?
var yTensor: Tensor<P>?
open class func hasY() -> Bool {
return true
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class ConvReluOp<P: PrecisionProtocol>: Operator<ConvReluKernel<P>, ConvAddReluParam<P>>, Runable, Creator, InferShaperable, Fusion {
typealias OpType = ConvReluOp<P>
static func fusionNode() -> Node {
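// Fusion pattern: a conv2d node immediately followed by relu.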
let beginNode = Node.init(inType: gConvType)
_ = beginNode
--> Node.init(inType: gReluType)
return beginNode
}
static func change() -> [String : [(from: String, to: String)]] {
return [:]
}
static func fusionType() -> String {
return gConvReluType
}
func inferShape() {
let inDims = para.input.dim
let filterDim = para.filter.dim
let strides = para.stride
let paddings = para.paddings
let dilations = para.dilations
var outDim = [inDims[0]]
for i in 0..<strides.count {
let dilation: Int = Int(dilations[i])
let filterSize: Int = filterDim[i + 1]
let inputSize: Int = inDims[i + 1]
let padding: Int = Int(paddings[i])
let stride: Int = Int(strides[i])
let dKernel = dilation * (filterSize - 1) + 1
let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
outDim.append(outputSize)
}
outDim.append(filterDim[0])
para.output.dim = Dim.init(inDim: outDim)
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
print(para.output.metalTexture)
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
}
}
@@ -23,9 +23,5 @@ class ConvAddKernel<P: PrecisionProtocol>: ConvAddReluKernel<P> {
override func hasReluOp() -> Bool {
return false
}
override func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? {
return nil
}
}
@@ -66,7 +66,7 @@ class ConvDataSource<P: PrecisionProtocol>: NSObject, MPSCNNConvolutionDataSourc
}
func label() -> String? {
return "conv_add_label"
return "conv_add_relu_label"
}
func copy(with zone: NSZone? = nil) -> Any {
@@ -108,7 +108,7 @@ class ConvDataSource<P: PrecisionProtocol>: NSObject, MPSCNNConvolutionDataSourc
class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
var metalParam: MetalConvParam!
var mpsConvOp: Any?
var blankTensor: Tensor<P>?
var blankTexture: Texture?
required init(device: MTLDevice, param: ConvAddReluParam<P>, initContext: InitContext) throws {
do {
@@ -120,14 +120,8 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
var shouldUseMPS = false
let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggresiveOptimization)
if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggresiveOptimization) {
if initContext.useAggresiveOptimization {
if (param.input.tensorDim[1] == 1 || param.input.tensorDim[1] > 4) && (param.output.tensorDim[1] == 1 || param.output.tensorDim[1] > 4) {
shouldUseMPS = true
}
} else {
if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
shouldUseMPS = true
}
if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
shouldUseMPS = true
}
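// MPS is used only when both input and output have more than four channels.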
}
if type(of: self).isWinoGrad(functionName: functionName) {
@@ -162,14 +156,10 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setTexture(param.y?.metalTexture, index: 1)
encoder.setTexture(param.output.metalTexture, index: 2)
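// New binding layout: bias texture at index 1, output texture at index 2,
// matching the updated kernel signatures.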
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
if let y = param.y {
encoder.setBuffer(y.buffer, offset: 0, index: 2)
} else {
encoder.setBuffer(blankTensor?.buffer, offset: 0, index: 2)
}
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture, groupDepth: type(of: self).isWinoGrad(functionName: functionName) ? 1 : nil)
encoder.endEncoding()
}
@@ -196,7 +186,8 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
desc.strideInPixelsX = Int(param.stride[0])
desc.strideInPixelsY = Int(param.stride[1])
let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: param.y)
let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: param.yTensor)
let conv = MPSCNNConvolution.init(device: device, weights: dataSource)
conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0)
conv.edgeMode = .zero
@@ -219,11 +210,12 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
}
let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1])
param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC)
if let y = param.y {
y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
} else {
blankTensor = Tensor<P>.init(inDim: Dim(inDim: [1, 1, 1, 4]), inLayout: DataLayout.NHWC(), originDimsCount: 4)
blankTensor?.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
if param.y == nil {
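// No bias input: create a small placeholder texture so the bias slot can
// still be bound; the shaders only read it when param.hasAddOp == 1.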
let blankTensor = Tensor<P>.init(inDim: Dim(inDim: [1, 1, 1, 4]), inLayout: DataLayout.NHWC(), originDimsCount: 4)
blankTexture = Texture.init(device: device, inDim: blankTensor.dim)
let value:[P] = [P(Float32(1.0)), P(Float32(1.0)), P(Float32(1.0)), P(Float32(1.0)),]
blankTexture?.metalTexture = device.tensor2texture(value: value, dim: blankTensor.dim.dims, transpose: [0, 2, 3, 1], inComputePrecision: GlobalConfig.shared.computePrecision)
}
}
@@ -231,29 +223,32 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
if GlobalConfig.shared.computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 {
return "conv_add_relu_1x1_half"
} else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
}
if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
if useAggressiveOptimization {
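// Winograd now also covers depthwise convs whose output channel count is at
// most 16 (filter.n <= 16), matching the shader's 16-channel tile limit.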
let couldUseWinograd = param.filter.width == 3 && param.filter.height == 3
&& param.filter.n == 16 && param.stride[0] == 1 && param.stride[1] == 1
&& param.filter.n <= 16 && param.stride[0] == 1 && param.stride[1] == 1
&& param.dilations[0] == 1 && param.dilations[1] == 1
if couldUseWinograd {
return "depthwise_conv_add_relu_3x3_half_winograd"
}
}
return "depthwise_conv_add_relu_3x3_half"
} else if param.filter.width == 3 && param.filter.height == 3 {
}
if param.filter.width == 3 && param.filter.height == 3 {
if param.groups == 1 {
return "conv_add_relu_3x3_half"
} else {
return "group_conv_add_relu_3x3_half"
}
} else if param.filter.width == 1 && param.filter.height == 5 {
}
if param.filter.width == 1 && param.filter.height == 5 {
return "conv_add_relu_5x1_half"
} else if param.filter.width == 5 && param.filter.height == 1 {
}
if param.filter.width == 5 && param.filter.height == 1 {
return "conv_add_relu_1x5_half"
} else {
return nil
}
return nil
} else if GlobalConfig.shared.computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 {
return "conv_add_relu_1x1"
......
@@ -30,14 +30,8 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
var shouldUseMPS = false
let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggresiveOptimization)
if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggresiveOptimization) {
if initContext.useAggresiveOptimization {
if (param.input.tensorDim[1] == 1 || param.input.tensorDim[1] > 4) && (param.output.tensorDim[1] == 1 || param.output.tensorDim[1] > 4) {
shouldUseMPS = true
}
} else {
if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
shouldUseMPS = true
}
if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
shouldUseMPS = true
}
}
if type(of: self).isWinoGrad(functionName: functionName) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
import MetalPerformanceShaders
class ConvReluKernel<P: PrecisionProtocol>: ConvAddReluKernel<P> {
override func hasAddOp() -> Bool {
return false
}
override func hasReluOp() -> Bool {
return true
}
}
@@ -185,6 +185,7 @@ class ProgramOptimize<P: PrecisionProtocol> {
// register fusion
let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp<P>.self,
ConvAddReluOp<P>.self,
ConvReluOp<P>.self,
// ConvAddAddPreluOp<P>.self,
ConvAddPreluOp<P>.self,
ConvAddOp<P>.self,
......