未验证 提交 7c216695 编写于 作者: N NazgulLee 提交者: GitHub

1. fix add bias logic; 2. fix several typo (#1687)

上级 5b197f4b
...@@ -107,6 +107,15 @@ inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) { ...@@ -107,6 +107,15 @@ inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) {
} }
} }
// Parameters describing how a bias (elementwise-add Y operand) maps onto the
// conv output (X operand). Layout must stay in sync with the Swift-side
// ElementwiseAddMetalParam struct that fills this buffer — do not reorder fields.
struct ElementwiseAddParam {
    int32_t fast;       // non-zero: Y has the same texture layout as X, read directly (see getBias fast path)
    int32_t axis;       // axis in X's dims where Y's dims align (broadcast start axis)
    int32_t ylen;       // number of dimensions in Y
    int32_t xdim[4];    // X's 4-D dims; only xdim[3] is read by getBias/getBiasHalf
    int32_t xtrans[4];  // transpose permutation applied to X's dims
    int32_t ydim[4];    // Y's 4-D dims; only ydim[3] is read by getBias/getBiasHalf
    int32_t ytrans[4];  // transpose permutation applied to Y's dims
};
struct MetalConvParam { struct MetalConvParam {
short offsetX; short offsetX;
...@@ -122,4 +131,5 @@ struct MetalConvParam { ...@@ -122,4 +131,5 @@ struct MetalConvParam {
ushort oC; ushort oC;
ushort hasAddOp; ushort hasAddOp;
ushort hasReluOp; ushort hasReluOp;
ElementwiseAddParam addParam;
}; };
...@@ -204,3 +204,16 @@ struct ConcatParam { ...@@ -204,3 +204,16 @@ struct ConcatParam {
#undef N #undef N
#undef R #undef R
#undef V #undef V
// Instantiate ConcatKernel.inc.metal for the V=VY, R=4, N=3 configuration,
// once per precision: the include is expanded twice, first with P=float and
// then with P=half, producing both kernel variants from the same template.
#define V VY
#define R 4
#define N 3
#define P float
#include "ConcatKernel.inc.metal"
#undef P
#define P half
#include "ConcatKernel.inc.metal"
#undef P
#undef N
#undef R
#undef V
...@@ -17,6 +17,56 @@ ...@@ -17,6 +17,56 @@
using namespace metal; using namespace metal;
// Fetch the bias (elementwise-add Y operand) texel for output position `gid`,
// half-precision variant. When addParam.fast is set the bias texture has the
// same layout as the output, so the texel is read directly at gid. Otherwise
// each of the 4 channels of the output texel is mapped individually: the
// output (x,y,slice,n) coordinate is converted to a logical 4-D index of X,
// un-transposed, re-aligned into Y's dimension space via axis/ylen, transposed
// by Y's layout, and converted back to a (x,y,slice,n) texture coordinate for
// the bias texture. Relies on xyzn2abcd/invtrans/trans/abcd2xyzn declared
// elsewhere in this header — presumably inverse coordinate transforms; the
// exact conventions are defined there.
half4 getBiasHalf(uint3 gid, constant ElementwiseAddParam &addParam, texture2d_array<half, access::sample> biasTexture) {
    half4 output;
    if (addParam.fast) {
        // Fast path: identical layouts, one texel read covers all 4 channels.
        output = biasTexture.read(gid.xy, gid.z);
    } else {
        // x_xyzn: output texture coordinate (x, y, slice, channel-within-texel).
        int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
        int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
        // Copy transpose permutations out of constant address space for the helpers.
        int32_t xtrans[4] = {addParam.xtrans[0], addParam.xtrans[1], addParam.xtrans[2], addParam.xtrans[3]};
        int32_t ytrans[4] = {addParam.ytrans[0], addParam.ytrans[1], addParam.ytrans[2], addParam.ytrans[3]};
        // Offset so Y's ylen trailing dims land at the end of the 4-D index
        // (dims before axis broadcast; y_abcd stays 0 there).
        int32_t yshift = 4 - addParam.ylen - addParam.axis;
        for (int n = 0; n < 4; n++) {
            // Resolve each channel of the output texel separately.
            x_xyzn[3] = n;
            xyzn2abcd(addParam.xdim[3], x_xyzn, x_abcd);
            invtrans(xtrans, x_abcd, t_abcd);
            // Project X's logical index onto Y's dims over [axis, axis+ylen).
            for (int k = addParam.axis; k < (addParam.axis + addParam.ylen); k++) {
                y_abcd[yshift+k] = t_abcd[k];
            }
            trans(ytrans, y_abcd, t_abcd);
            abcd2xyzn(addParam.ydim[3], t_abcd, y_xyzn);
            output[n] = biasTexture.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
        }
    }
    return output;
}
// Float-precision twin of getBiasHalf: fetch the bias (elementwise-add Y
// operand) texel for output position `gid`. Fast path reads the bias texture
// directly when its layout matches the output; otherwise each channel's
// coordinate is mapped through X's inverse transpose, re-based into Y's
// dimension space via axis/ylen, and transposed into Y's texture layout.
// Any change here should be mirrored in getBiasHalf above.
float4 getBias(uint3 gid, constant ElementwiseAddParam &addParam, texture2d_array<float, access::sample> biasTexture) {
    float4 output;
    if (addParam.fast) {
        // Fast path: identical layouts, one texel read covers all 4 channels.
        output = float4(biasTexture.read(gid.xy, gid.z));
    } else {
        // x_xyzn: output texture coordinate (x, y, slice, channel-within-texel).
        int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
        int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
        // Copy transpose permutations out of constant address space for the helpers.
        int32_t xtrans[4] = {addParam.xtrans[0], addParam.xtrans[1], addParam.xtrans[2], addParam.xtrans[3]};
        int32_t ytrans[4] = {addParam.ytrans[0], addParam.ytrans[1], addParam.ytrans[2], addParam.ytrans[3]};
        // Offset so Y's ylen trailing dims land at the end of the 4-D index
        // (dims before axis broadcast; y_abcd stays 0 there).
        int32_t yshift = 4 - addParam.ylen - addParam.axis;
        for (int n = 0; n < 4; n++) {
            // Resolve each channel of the output texel separately.
            x_xyzn[3] = n;
            xyzn2abcd(addParam.xdim[3], x_xyzn, x_abcd);
            invtrans(xtrans, x_abcd, t_abcd);
            // Project X's logical index onto Y's dims over [axis, axis+ylen).
            for (int k = addParam.axis; k < (addParam.axis + addParam.ylen); k++) {
                y_abcd[yshift+k] = t_abcd[k];
            }
            trans(ytrans, y_abcd, t_abcd);
            abcd2xyzn(addParam.ydim[3], t_abcd, y_xyzn);
            output[n] = biasTexture.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
        }
    }
    return output;
}
#pragma mark - convAdd #pragma mark - convAdd
kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]], kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::sample> biasTexture [[texture(1)]], texture2d_array<float, access::sample> biasTexture [[texture(1)]],
...@@ -39,7 +89,11 @@ kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [ ...@@ -39,7 +89,11 @@ kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [
uint input_arr_size = inTexture.get_array_size(); uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0); float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = getBias(gid, addParam, biasTexture);
}
float4 input; float4 input;
for (uint i = 0; i < input_arr_size; ++i) { for (uint i = 0; i < input_arr_size; ++i) {
...@@ -83,7 +137,11 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [ ...@@ -83,7 +137,11 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0); float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = getBias(gid, addParam, biasTexture);
}
ushort dilation_x = param.dilationX; ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY; ushort dilation_y = param.dilationY;
...@@ -146,7 +204,11 @@ kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTex ...@@ -146,7 +204,11 @@ kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTex
const uint kernelHXW = 9; const uint kernelHXW = 9;
float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0); float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = getBias(gid, addParam, biasTexture);
}
ushort dilation_x = param.dilationX; ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY; ushort dilation_y = param.dilationY;
...@@ -205,7 +267,11 @@ kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [ ...@@ -205,7 +267,11 @@ kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0); float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = getBias(gid, addParam, biasTexture);
}
ushort dilation_y = param.dilationY; ushort dilation_y = param.dilationY;
float4 input[5]; float4 input[5];
...@@ -262,7 +328,11 @@ kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [ ...@@ -262,7 +328,11 @@ kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0); float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = getBias(gid, addParam, biasTexture);
}
ushort dilation_x = param.dilationX; ushort dilation_x = param.dilationX;
float4 input[5]; float4 input[5];
...@@ -313,7 +383,13 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> i ...@@ -313,7 +383,13 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> i
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9; const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4; uint weithTo = gid.z * kernelHXW * 4;
float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = getBias(gid, addParam, biasTexture);
}
float4 inputs[9]; float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
...@@ -358,7 +434,11 @@ kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTextu ...@@ -358,7 +434,11 @@ kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTextu
uint input_arr_size = inTexture.get_array_size(); uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0); float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = float4(getBiasHalf(gid, addParam, biasTexture));
}
float4 input; float4 input;
for (uint i = 0; i < input_arr_size; ++i) { for (uint i = 0; i < input_arr_size; ++i) {
...@@ -399,11 +479,15 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu ...@@ -399,11 +479,15 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
uint input_arr_size = inTexture.get_array_size(); uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0); float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = float4(getBiasHalf(gid, addParam, biasTexture));
}
ushort dilation_x = param.dilationX; ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY; ushort dilation_y = param.dilationY;
half4 input[9]; half4 input[9];
for (uint i = 0; i < input_arr_size; ++i) { for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
...@@ -418,13 +502,13 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu ...@@ -418,13 +502,13 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
for (int j = 0; j < 9; ++j) { for (int j = 0; j < 9; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(float4(input[j]), float4(weight_x)); output.x += dot(float4(input[j]), float4(weight_x));
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(float4(input[j]), float4(weight_y)); output.y += dot(float4(input[j]), float4(weight_y));
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(float4(input[j]), float4(weight_z)); output.z += dot(float4(input[j]), float4(weight_z));
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(float4(input[j]), float4(weight_w)); output.w += dot(float4(input[j]), float4(weight_w));
} }
...@@ -452,7 +536,11 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> i ...@@ -452,7 +536,11 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> i
const uint kernelHXW = 9; const uint kernelHXW = 9;
float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0); float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = float4(getBiasHalf(gid, addParam, biasTexture));
}
ushort dilation_x = param.dilationX; ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY; ushort dilation_y = param.dilationY;
...@@ -505,7 +593,13 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sampl ...@@ -505,7 +593,13 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sampl
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9; const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4; uint weithTo = gid.z * kernelHXW * 4;
float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = float4(getBiasHalf(gid, addParam, biasTexture));
}
half4 inputs[9]; half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
...@@ -523,7 +617,7 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sampl ...@@ -523,7 +617,7 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sampl
output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]); output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]); output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
} }
float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output; float4 relu = param.hasReluOp == 1 ? fmax(output, 0.0) : output;
outTexture.write(half4(relu), gid.xy, gid.z); outTexture.write(half4(relu), gid.xy, gid.z);
} }
...@@ -584,7 +678,7 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce ...@@ -584,7 +678,7 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce
for (int c = 0; c < 4; ++c) { for (int c = 0; c < 4; ++c) {
if (hasComputedC + c >= param.oC) { if (hasComputedC + c >= param.oC) {
return; break;
} }
half I[16]; half I[16];
for (int i = 0; i < 16; ++i) { for (int i = 0; i < 16; ++i) {
...@@ -644,13 +738,14 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce ...@@ -644,13 +738,14 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce
} }
if (param.hasAddOp == 1) { if (param.hasAddOp == 1) {
half4 base = biasTexture.sample(sample, float2(tx, ty), tc); constant ElementwiseAddParam &addParam = param.addParam;
half4 base = getBiasHalf(uint3(tx, ty, tc), addParam, biasTexture);
res[0] += base; res[0] += base;
base = biasTexture.sample(sample, float2(tx + 1, ty), tc); base = getBiasHalf(uint3(tx + 1, ty, tc), addParam, biasTexture);
res[1] += base; res[1] += base;
base = biasTexture.sample(sample, float2(tx, ty + 1), tc); base = getBiasHalf(uint3(tx, ty + 1, tc), addParam, biasTexture);
res[2] += base; res[2] += base;
base = biasTexture.sample(sample, float2(tx + 1, ty + 1), tc); base = getBiasHalf(uint3(tx + 1, ty + 1, tc), addParam, biasTexture);
res[3] += base; res[3] += base;
} }
...@@ -690,8 +785,12 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu ...@@ -690,8 +785,12 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0); float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = float4(getBiasHalf(gid, addParam, biasTexture));
}
ushort dilation_y = param.dilationY; ushort dilation_y = param.dilationY;
half4 input[5]; half4 input[5];
...@@ -747,8 +846,12 @@ kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTextu ...@@ -747,8 +846,12 @@ kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTextu
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0); float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = float4(getBiasHalf(gid, addParam, biasTexture));
}
ushort dilation_x = param.dilationX; ushort dilation_x = param.dilationX;
half4 input[5]; half4 input[5];
......
...@@ -17,16 +17,6 @@ ...@@ -17,16 +17,6 @@
using namespace metal; using namespace metal;
struct ElementwiseAddParam {
int32_t fast;
int32_t axis;
int32_t ylen;
int32_t xdim[4];
int32_t xtrans[4];
int32_t ydim[4];
int32_t ytrans[4];
};
kernel void elementwise_add(texture2d_array<float, access::read> inputX [[texture(0)]], kernel void elementwise_add(texture2d_array<float, access::read> inputX [[texture(0)]],
texture2d_array<float, access::read> inputY [[texture(1)]], texture2d_array<float, access::read> inputY [[texture(1)]],
texture2d_array<float, access::write> outTexture [[texture(2)]], texture2d_array<float, access::write> outTexture [[texture(2)]],
......
...@@ -16,16 +16,6 @@ ...@@ -16,16 +16,6 @@
#include "Common.metal" #include "Common.metal"
using namespace metal; using namespace metal;
struct ElementwiseAddParam {
int32_t fast;
int32_t axis;
int32_t ylen;
int32_t xdim[4];
int32_t xtrans[4];
int32_t ydim[4];
int32_t ytrans[4];
};
#define P float #define P float
#define PRELU_CHANNEL prelu_channel #define PRELU_CHANNEL prelu_channel
......
...@@ -287,7 +287,13 @@ extension MTLDevice { ...@@ -287,7 +287,13 @@ extension MTLDevice {
var rcount: Int = (ndim[0] * ndim[3] + 3) / 4 var rcount: Int = (ndim[0] * ndim[3] + 3) / 4
rcount = rcount * 4 * ndim[1] * ndim[2] rcount = rcount * 4 * ndim[1] * ndim[2]
var nvalue: [Float32] = .init(repeating: 0.0, count: rcount) var nvalue: [Float32] = .init(repeating: 0.0, count: rcount)
var value32: [Float32]?
if value is [Float16] {
var value16 = value as! [Float16]
value32 = float16To32(input: &value16, count: value.count)
} else {
value32 = value as? [Float32]
}
for i0 in 0..<tdim[0] { for i0 in 0..<tdim[0] {
for i1 in 0..<tdim[1] { for i1 in 0..<tdim[1] {
for i2 in 0..<tdim[2] { for i2 in 0..<tdim[2] {
...@@ -298,8 +304,11 @@ extension MTLDevice { ...@@ -298,8 +304,11 @@ extension MTLDevice {
let jg = transpose.map { ig[$0] } let jg = transpose.map { ig[$0] }
let k = jg[0] * ndim[3] + jg[3] let k = jg[0] * ndim[3] + jg[3]
let jx = ((k / 4) * ndim[1] * ndim[2] * 4) + (jg[1] * ndim[2] * 4) + (jg[2] * 4) + (k % 4) let jx = ((k / 4) * ndim[1] * ndim[2] * 4) + (jg[1] * ndim[2] * 4) + (jg[2] * 4) + (k % 4)
if let value32 = value32 {
nvalue[jx] = value[ix] as! Float32 nvalue[jx] = value32[ix]
} else {
fatalError("tensor2texture tensor value type not support")
}
} }
} }
} }
......
...@@ -325,7 +325,7 @@ public class PaddleMobileUnitTest { ...@@ -325,7 +325,7 @@ public class PaddleMobileUnitTest {
let fC = 4 let fC = 4
let oC = 4 let oC = 4
let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1), groups: UInt16(groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0)) let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1), groups: UInt16(groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam())
let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize) let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize)
......
...@@ -105,8 +105,8 @@ public class Loader<P: PrecisionProtocol>: Loaderable { ...@@ -105,8 +105,8 @@ public class Loader<P: PrecisionProtocol>: Loaderable {
} while (false) } while (false)
} else { } else {
fseek(file, MemoryLayout<CChar>.size * tensorDescSize, SEEK_CUR) fseek(file, MemoryLayout<CChar>.size * tensorDescSize, SEEK_CUR)
nowIndex += MemoryLayout<CChar>.size * tensorDescSize
} }
nowIndex += MemoryLayout<CChar>.size * tensorDescSize
/* /*
这里没有根据 Data Type 去判断, 而是从外部泛型直接指定了精度 这里没有根据 Data Type 去判断, 而是从外部泛型直接指定了精度
......
...@@ -24,6 +24,11 @@ class ConvAddReluParam<P: PrecisionProtocol>: OpParam { ...@@ -24,6 +24,11 @@ class ConvAddReluParam<P: PrecisionProtocol>: OpParam {
paddings = try ConvAddReluParam.getAttr(key: "paddings", attrs: opDesc.attrs) paddings = try ConvAddReluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
dilations = try ConvAddReluParam.getAttr(key: "dilations", attrs: opDesc.attrs) dilations = try ConvAddReluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
groups = try ConvAddReluParam.getAttr(key: "groups", attrs: opDesc.attrs) groups = try ConvAddReluParam.getAttr(key: "groups", attrs: opDesc.attrs)
do {
axis = try ConvAddReluParam.getAttr(key: "axis", attrs: opDesc.attrs)
} catch {
axis = -1
}
do { do {
y = try ConvAddReluParam.inputY(inputs: opDesc.paraInputs, from: inScope) y = try ConvAddReluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
} catch { } catch {
...@@ -32,7 +37,7 @@ class ConvAddReluParam<P: PrecisionProtocol>: OpParam { ...@@ -32,7 +37,7 @@ class ConvAddReluParam<P: PrecisionProtocol>: OpParam {
let device = input.metalTexture!.device let device = input.metalTexture!.device
y = Texture.init(device: device, inDim: yTensor.dim) y = Texture.init(device: device, inDim: yTensor.dim)
let value: [P] = Array(UnsafeBufferPointer(start: yTensor.data.pointer, count: yTensor.dim.numel())) let value: [P] = Array(UnsafeBufferPointer(start: yTensor.data.pointer, count: yTensor.dim.numel()))
y?.metalTexture = device.tensor2texture(value: value, dim: yTensor.dim.dims, transpose: [0, 2, 3, 1], inComputePrecision: GlobalConfig.shared.computePrecision) y?.metalTexture = device.tensor2texture(value: value, dim: yTensor.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision)
self.yTensor = yTensor self.yTensor = yTensor
} catch { } catch {
} }
...@@ -49,6 +54,7 @@ class ConvAddReluParam<P: PrecisionProtocol>: OpParam { ...@@ -49,6 +54,7 @@ class ConvAddReluParam<P: PrecisionProtocol>: OpParam {
let paddings: [Int32] let paddings: [Int32]
let dilations: [Int32] let dilations: [Int32]
let groups: Int let groups: Int
let axis: Int
var y: Texture? var y: Texture?
var yTensor: Tensor<P>? var yTensor: Tensor<P>?
......
...@@ -64,7 +64,7 @@ class FeedOp<P: PrecisionProtocol>: Operator<Texture2DTo2DArrayKernel<P>, FeedPa ...@@ -64,7 +64,7 @@ class FeedOp<P: PrecisionProtocol>: Operator<Texture2DTo2DArrayKernel<P>, FeedPa
func delogOutput() { func delogOutput() {
print(" \(type) output: ") print(" \(type) output: ")
print(para.output.metalTexture) print(para.output.metalTexture)
print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[3], h: para.output.padToFourDim[2], w: para.output.padToFourDim[1])).strideArray()) print(para.output.toTensor().strideArray())
} }
} }
...@@ -135,7 +135,7 @@ class ConvAddAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -135,7 +135,7 @@ class ConvAddAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
let iC = param.input.tensorDim[1]; let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1]; let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1]; let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0)) let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam())
// print("metal param: ") // print("metal param: ")
// print(inMetalParam) // print(inMetalParam)
......
...@@ -98,7 +98,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionProtocol>: Kernel, Computable, Test ...@@ -98,7 +98,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionProtocol>: Kernel, Computable, Test
let iC = param.input.tensorDim[1]; let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1]; let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1]; let oC = param.output.tensorDim[1];
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0)) metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam())
var invs: [P] = [] var invs: [P] = []
let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
......
...@@ -16,11 +16,11 @@ import Foundation ...@@ -16,11 +16,11 @@ import Foundation
import MetalPerformanceShaders import MetalPerformanceShaders
class ConvAddKernel<P: PrecisionProtocol>: ConvAddReluKernel<P> { class ConvAddKernel<P: PrecisionProtocol>: ConvAddReluKernel<P> {
override func hasAddOp() -> Bool { override class func hasAddOp() -> Bool {
return true return true
} }
override func hasReluOp() -> Bool { override class func hasReluOp() -> Bool {
return false return false
} }
} }
......
...@@ -135,7 +135,7 @@ class ConvAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -135,7 +135,7 @@ class ConvAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
let iC = param.input.tensorDim[1]; let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1]; let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1]; let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0)) let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam())
// print("metal param: ") // print("metal param: ")
// print(inMetalParam) // print(inMetalParam)
......
...@@ -29,6 +29,7 @@ public struct MetalConvParam { ...@@ -29,6 +29,7 @@ public struct MetalConvParam {
let oC: UInt16 let oC: UInt16
let hasAddOp: UInt16 let hasAddOp: UInt16
let hasReluOp: UInt16 let hasReluOp: UInt16
let addParam: ElementwiseAddMetalParam
} }
@available(iOS 11.0, *) @available(iOS 11.0, *)
...@@ -124,7 +125,7 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -124,7 +125,7 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggressiveOptimization) { if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggressiveOptimization) {
let inputChannel = param.input.tensorDim[1] let inputChannel = param.input.tensorDim[1]
let outputChannel = param.output.tensorDim[1] let outputChannel = param.output.tensorDim[1]
if (inputChannel == 1 || inputChannel > 4) && (outputChannel == 1 || outputChannel > 4) { if inputChannel > 4 && outputChannel > 4 {
shouldUseMPS = true shouldUseMPS = true
} }
} }
...@@ -135,6 +136,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -135,6 +136,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
if !isDepthWise && param.groups > 1 { if !isDepthWise && param.groups > 1 {
shouldUseMPS = false shouldUseMPS = false
} }
if type(of: self).hasAddOp() {
if !(type(of: self).canAddUseMPS(param: param)) {
shouldUseMPS = false
}
}
if shouldUseMPS { if shouldUseMPS {
super.init(device: device, inFunctionName: nil, initContext: initContext) super.init(device: device, inFunctionName: nil, initContext: initContext)
setupWithMPS(device: device, param: param) setupWithMPS(device: device, param: param)
...@@ -195,11 +201,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -195,11 +201,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
param.input.useMPS = true param.input.useMPS = true
param.output.useMPS = true param.output.useMPS = true
if #available(iOS 11.3, *) { if #available(iOS 11.3, *) {
if param.y != nil { if type(of: self).hasAddOp() && type(of: self).canMPSAddByElement(param: param) && !type(of: self).canMPSAddByChannel(param: param) {
mpsAddOp = MPSCNNAdd(device: device) mpsAddOp = MPSCNNAdd(device: device)
if hasReluOp() { }
mpsReluOp = MPSCNNNeuronReLU(device: device, a: 0.0) if type(of: self).hasReluOp() {
} mpsReluOp = MPSCNNNeuronReLU(device: device, a: 0.0)
} }
} }
let neuronFilter: MPSCNNNeuron? = param.y != nil ? nil : (neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) let neuronFilter: MPSCNNNeuron? = param.y != nil ? nil : (neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron)
...@@ -217,7 +223,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -217,7 +223,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
desc.strideInPixelsX = Int(param.stride[0]) desc.strideInPixelsX = Int(param.stride[0])
desc.strideInPixelsY = Int(param.stride[1]) desc.strideInPixelsY = Int(param.stride[1])
let _ = param.filter.convert(converter: MPSPointerConverter<P>.init()) let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: param.yTensor) var biasTerms: Tensor<P>? = nil
if type(of: self).hasAddOp() && type(of: self).canMPSAddByChannel(param: param) {
biasTerms = param.yTensor
}
let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: biasTerms)
let conv = MPSCNNConvolution.init(device: device, weights: dataSource) let conv = MPSCNNConvolution.init(device: device, weights: dataSource)
conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0) conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0)
...@@ -233,7 +243,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -233,7 +243,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
let iC = param.input.tensorDim[1]; let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1]; let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1]; let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 1 : 0)) var addParam = ElementwiseAddMetalParam()
if let inputY = param.y {
addParam = ElementwiseAddKernel<P>.metalParamFrom(inputX: param.output, inputY: inputY, axis: param.axis)
}
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(type(of: self).hasAddOp() ? 1 : 0), hasReluOp: UInt16(type(of: self).hasReluOp() ? 1 : 0), addParam: addParam)
metalParam = inMetalParam metalParam = inMetalParam
if type(of: self).isWinoGrad(functionName: functionName) { if type(of: self).isWinoGrad(functionName: functionName) {
...@@ -304,7 +318,7 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -304,7 +318,7 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
} }
open func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? { open func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? {
if hasReluOp() { if type(of: self).hasReluOp() {
if #available(iOS 10.0, *) { if #available(iOS 10.0, *) {
return MPSCNNNeuronReLU(device: device, a: 0) return MPSCNNNeuronReLU(device: device, a: 0)
} }
...@@ -312,11 +326,29 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -312,11 +326,29 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
return nil return nil
} }
open func hasAddOp() -> Bool { open class func canAddUseMPS(param: ConvAddReluParam<P>) -> Bool {
return canMPSAddByChannel(param: param) || canMPSAddByElement(param: param)
}
private class func canMPSAddByChannel(param: ConvAddReluParam<P>) -> Bool {
if let yTensor = param.yTensor, yTensor.dim.cout() == 1 {
return true
}
return false
}
private class func canMPSAddByElement(param: ConvAddReluParam<P>) -> Bool {
if let y = param.y, y.dim.dims == param.input.dim.dims {
return true
}
return false
}
open class func hasAddOp() -> Bool {
return true return true
} }
open func hasReluOp() -> Bool { open class func hasReluOp() -> Bool {
return true return true
} }
......
...@@ -105,7 +105,7 @@ class ConvBNReluKernel<P: PrecisionProtocol>: Kernel, Computable, Testable { ...@@ -105,7 +105,7 @@ class ConvBNReluKernel<P: PrecisionProtocol>: Kernel, Computable, Testable {
let iC = param.input.tensorDim[1]; let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1]; let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1]; let oC = param.output.tensorDim[1];
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0)) metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam())
var invs: [P] = [] var invs: [P] = []
let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
......
...@@ -66,7 +66,7 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -66,7 +66,7 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
throw PaddleMobileError.predictError(message: " encode is nil") throw PaddleMobileError.predictError(message: " encode is nil")
} }
encoder.setTexture(param.input.metalTexture, index: 0) encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1) encoder.setTexture(param.output.metalTexture, index: 2)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0) encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
encoder.setBuffer(blankTensor?.buffer, offset: 0, index: 2) encoder.setBuffer(blankTensor?.buffer, offset: 0, index: 2)
...@@ -111,7 +111,7 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -111,7 +111,7 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
let iC = param.input.tensorDim[1]; let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1]; let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1]; let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 1 : 0)) let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 1 : 0), addParam: ElementwiseAddMetalParam())
metalParam = inMetalParam metalParam = inMetalParam
if type(of: self).isWinoGrad(functionName: functionName) { if type(of: self).isWinoGrad(functionName: functionName) {
...@@ -130,7 +130,7 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -130,7 +130,7 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
} else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] { } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
if useAggressiveOptimization { if useAggressiveOptimization {
let couldUseWinograd = param.filter.width == 3 && param.filter.height == 3 let couldUseWinograd = param.filter.width == 3 && param.filter.height == 3
&& param.filter.n == 16 && param.stride[0] == 1 && param.stride[1] == 1 && param.filter.n <= 16 && param.stride[0] == 1 && param.stride[1] == 1
&& param.dilations[0] == 1 && param.dilations[1] == 1 && param.dilations[0] == 1 && param.dilations[1] == 1
if couldUseWinograd { if couldUseWinograd {
return "depthwise_conv_add_relu_3x3_half_winograd" return "depthwise_conv_add_relu_3x3_half_winograd"
......
...@@ -16,11 +16,11 @@ import Foundation ...@@ -16,11 +16,11 @@ import Foundation
import MetalPerformanceShaders import MetalPerformanceShaders
class ConvReluKernel<P: PrecisionProtocol>: ConvAddReluKernel<P> { class ConvReluKernel<P: PrecisionProtocol>: ConvAddReluKernel<P> {
override func hasAddOp() -> Bool { override class func hasAddOp() -> Bool {
return false return false
} }
override func hasReluOp() -> Bool { override class func hasReluOp() -> Bool {
return true return true
} }
} }
......
...@@ -34,27 +34,8 @@ class ElementwiseAddKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -34,27 +34,8 @@ class ElementwiseAddKernel<P: PrecisionProtocol>: Kernel, Computable {
throw error throw error
} }
metalParam = ElementwiseAddMetalParam.init() metalParam = ElementwiseAddKernel.metalParamFrom(inputX: param.inputX, inputY: param.inputY, axis: param.axis)
let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) }
let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) }
let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) }
let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) }
metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
if param.axis == -1 {
metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout())
} else {
metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis)
}
metalParam.ylen = Int32(param.inputY.tensorDim.cout())
if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
// print("===> elementwise_add fast!!!")
metalParam.fast = 1
}
if GlobalConfig.shared.computePrecision == .Float32 { if GlobalConfig.shared.computePrecision == .Float32 {
super.init(device: device, inFunctionName: "elementwise_add", initContext: initContext) super.init(device: device, inFunctionName: "elementwise_add", initContext: initContext)
} else if GlobalConfig.shared.computePrecision == .Float16 { } else if GlobalConfig.shared.computePrecision == .Float16 {
...@@ -75,4 +56,29 @@ class ElementwiseAddKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -75,4 +56,29 @@ class ElementwiseAddKernel<P: PrecisionProtocol>: Kernel, Computable {
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding() encoder.endEncoding()
} }
static func metalParamFrom(inputX: Texture, inputY: Texture, axis: Int) -> ElementwiseAddMetalParam {
var metalParam = ElementwiseAddMetalParam.init()
let xdim: [Int32] = (0..<4).map { Int32(inputX.dim[$0]) }
let ydim: [Int32] = (0..<4).map { Int32(inputY.dim[$0]) }
let xtrans: [Int32] = (0..<4).map { Int32(inputX.transpose[$0]) }
let ytrans: [Int32] = (0..<4).map { Int32(inputY.transpose[$0]) }
metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
if axis == -1 {
metalParam.axis = 4 - Int32(inputY.tensorDim.cout())
} else {
metalParam.axis = 4 - Int32(inputX.tensorDim.cout()) + Int32(axis)
}
metalParam.ylen = Int32(inputY.tensorDim.cout())
if (inputX.dim == inputY.dim) && (inputX.transpose == inputY.transpose) {
// print("===> elementwise_add fast!!!")
metalParam.fast = 1
}
return metalParam
}
} }
...@@ -26,6 +26,11 @@ class ReluKernel<P: PrecisionProtocol>: Kernel, Computable{ ...@@ -26,6 +26,11 @@ class ReluKernel<P: PrecisionProtocol>: Kernel, Computable{
} }
required init(device: MTLDevice, param: ReluParam<P>, initContext: InitContext) throws { required init(device: MTLDevice, param: ReluParam<P>, initContext: InitContext) throws {
do {
try param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
} catch let error {
throw error
}
if GlobalConfig.shared.computePrecision == .Float32 { if GlobalConfig.shared.computePrecision == .Float32 {
super.init(device: device, inFunctionName: "relu", initContext: initContext) super.init(device: device, inFunctionName: "relu", initContext: initContext)
} else if GlobalConfig.shared.computePrecision == .Float16 { } else if GlobalConfig.shared.computePrecision == .Float16 {
......
...@@ -34,10 +34,10 @@ class ScaleOpKernel<P: PrecisionProtocol>: Kernel, Computable{ ...@@ -34,10 +34,10 @@ class ScaleOpKernel<P: PrecisionProtocol>: Kernel, Computable{
} }
var shouldUseMPS = false var shouldUseMPS = false
if initContext.useMPS && param.biasAfterScale { if initContext.useMPS && param.biasAfterScale && param.input.tensorDim.cout() == 4 && param.output.tensorDim.cout() == 4 {
let inputChannel = param.input.tensorDim[1] let inputChannel = param.input.tensorDim[1]
let outputChannel = param.output.tensorDim[1] let outputChannel = param.output.tensorDim[1]
if (inputChannel == 1 || inputChannel > 4) && (outputChannel == 1 || outputChannel > 4) { if (inputChannel > 4) && (outputChannel > 4) {
shouldUseMPS = true shouldUseMPS = true
} }
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册