Commit c1ffeb30, authored by Yanzhan Yang, committed by GitHub

1. Enable both tensor and texture as bias of conv_add_relu. 2. Add con… (#1662)

* 1. Enable both tensor and texture as bias of conv_add_relu. 2. Add a conv_relu fusion op.

* Winograd can now be used when the output channel count is less than or equal to 16.
Parent b56e720e
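For orientation, below is a minimal Swift sketch of the Winograd eligibility rule mentioned in the notes above: this commit relaxes the previous `filter.n == 16` requirement to `filter.n <= 16`. The function and parameter names here are illustrative, not the framework API; the authoritative condition is the `couldUseWinograd` expression inside `ConvAddReluKernel.kernelFunctionName` further down in this diff.

// Sketch only: mirrors the relaxed condition, assuming stride/dilation arrays of length 2.
func couldUseWinograd(filterWidth: Int, filterHeight: Int, filterN: Int,
                      strides: [Int], dilations: [Int],
                      useAggressiveOptimization: Bool) -> Bool {
    guard useAggressiveOptimization else { return false }
    return filterWidth == 3 && filterHeight == 3
        && filterN <= 16              // output channels; this was == 16 before this commit
        && strides == [1, 1]
        && dilations == [1, 1]
}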
@@ -19,12 +19,11 @@ using namespace metal;
 #pragma mark - convAdd
 kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-        texture2d_array<float, access::write> outTexture [[texture(1)]],
+        texture2d_array<float, access::sample> biasTexture [[texture(1)]],
+        texture2d_array<float, access::write> outTexture [[texture(2)]],
         constant MetalConvParam &param [[buffer(0)]],
         const device float4 *weights [[buffer(1)]],
-        const device float4 *biase [[buffer(2)]],
         uint3 gid [[thread_position_in_grid]]) {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height() ||
         gid.z >= outTexture.get_array_size()) {
@@ -40,7 +39,7 @@ kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [
     uint input_arr_size = inTexture.get_array_size();
     uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-    float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
+    float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
     float4 input;
     for (uint i = 0; i < input_arr_size; ++i) {
@@ -62,12 +61,11 @@ kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [
 }
 kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-        texture2d_array<float, access::write> outTexture [[texture(1)]],
+        texture2d_array<float, access::sample> biasTexture [[texture(1)]],
+        texture2d_array<float, access::write> outTexture [[texture(2)]],
         constant MetalConvParam &param [[buffer(0)]],
         const device float4 *weights [[buffer(1)]],
-        const device float4 *biase [[buffer(2)]],
         uint3 gid [[thread_position_in_grid]]) {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height() ||
         gid.z >= outTexture.get_array_size()) {
@@ -85,7 +83,7 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [
     uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-    float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
+    float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
     ushort dilation_x = param.dilationX;
     ushort dilation_y = param.dilationY;
@@ -130,10 +128,10 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [
 }
 kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-        texture2d_array<float, access::write> outTexture [[texture(1)]],
+        texture2d_array<float, access::sample> biasTexture [[texture(1)]],
+        texture2d_array<float, access::write> outTexture [[texture(2)]],
         constant MetalConvParam &param [[buffer(0)]],
         const device float *weights [[buffer(1)]],
-        const device float4 *biase [[buffer(2)]],
         uint3 gid [[thread_position_in_grid]]) {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height() ||
@@ -148,7 +146,7 @@ kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTex
     const uint kernelHXW = 9;
-    float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
+    float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
     ushort dilation_x = param.dilationX;
     ushort dilation_y = param.dilationY;
@@ -185,12 +183,11 @@ kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTex
 }
 kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-        texture2d_array<float, access::write> outTexture [[texture(1)]],
+        texture2d_array<float, access::sample> biasTexture [[texture(1)]],
+        texture2d_array<float, access::write> outTexture [[texture(2)]],
         constant MetalConvParam &param [[buffer(0)]],
         const device float4 *weights [[buffer(1)]],
-        const device float4 *biase [[buffer(2)]],
         uint3 gid [[thread_position_in_grid]]) {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height() ||
         gid.z >= outTexture.get_array_size()) {
@@ -208,7 +205,7 @@ kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [
     uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-    float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
+    float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
     ushort dilation_y = param.dilationY;
     float4 input[5];
@@ -243,12 +240,11 @@ kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [
 }
 kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-        texture2d_array<float, access::write> outTexture [[texture(1)]],
+        texture2d_array<float, access::sample> biasTexture [[texture(1)]],
+        texture2d_array<float, access::write> outTexture [[texture(2)]],
         constant MetalConvParam &param [[buffer(0)]],
         const device float4 *weights [[buffer(1)]],
-        const device float4 *biase [[buffer(2)]],
         uint3 gid [[thread_position_in_grid]]) {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height() ||
         gid.z >= outTexture.get_array_size()) {
@@ -266,7 +262,7 @@ kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [
     uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-    float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
+    float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
     ushort dilation_x = param.dilationX;
     float4 input[5];
@@ -301,12 +297,11 @@ kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [
 }
 kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-        texture2d_array<float, access::write> outTexture [[texture(1)]],
+        texture2d_array<float, access::sample> biasTexture [[texture(1)]],
+        texture2d_array<float, access::write> outTexture [[texture(2)]],
         constant MetalConvParam &param [[buffer(0)]],
         const device float *weights [[buffer(1)]],
-        const device float4 *biase [[buffer(2)]],
         uint3 gid [[thread_position_in_grid]]) {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height() ||
         gid.z >= outTexture.get_array_size()) {
@@ -318,7 +313,7 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> i
     constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
     const uint kernelHXW = 9;
     uint weithTo = gid.z * kernelHXW * 4;
-    float4 output = param.hasAddOp == 1 ? biase[gid.z] : float4(0.0, 0.0, 0.0, 0.0);
+    float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
     float4 inputs[9];
     inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
     inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
@@ -343,12 +338,11 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> i
 #pragma mark - half
 kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-        texture2d_array<half, access::write> outTexture [[texture(1)]],
+        texture2d_array<half, access::sample> biasTexture [[texture(1)]],
+        texture2d_array<half, access::write> outTexture [[texture(2)]],
         constant MetalConvParam &param [[buffer(0)]],
         const device half4 *weights [[buffer(1)]],
-        const device half4 *biase [[buffer(2)]],
         uint3 gid [[thread_position_in_grid]]) {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height() ||
         gid.z >= outTexture.get_array_size()) {
@@ -364,7 +358,7 @@ kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTextu
     uint input_arr_size = inTexture.get_array_size();
     uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-    float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
+    float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
     float4 input;
     for (uint i = 0; i < input_arr_size; ++i) {
@@ -386,12 +380,11 @@ kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTextu
 }
 kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-        texture2d_array<half, access::write> outTexture [[texture(1)]],
+        texture2d_array<half, access::sample> biasTexture [[texture(1)]],
+        texture2d_array<half, access::write> outTexture [[texture(2)]],
         constant MetalConvParam &param [[buffer(0)]],
         const device half4 *weights [[buffer(1)]],
-        const device half4 *biase [[buffer(2)]],
         uint3 gid [[thread_position_in_grid]]) {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height() ||
         gid.z >= outTexture.get_array_size()) {
@@ -406,7 +399,7 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
     uint input_arr_size = inTexture.get_array_size();
     uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-    float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
+    float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
     ushort dilation_x = param.dilationX;
     ushort dilation_y = param.dilationY;
@@ -441,10 +434,10 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
 }
 kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-        texture2d_array<half, access::write> outTexture [[texture(1)]],
+        texture2d_array<half, access::sample> biasTexture [[texture(1)]],
+        texture2d_array<half, access::write> outTexture [[texture(2)]],
         constant MetalConvParam &param [[buffer(0)]],
         const device half *weights [[buffer(1)]],
-        const device half4 *biase [[buffer(2)]],
         uint3 gid [[thread_position_in_grid]]) {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height() ||
@@ -459,7 +452,7 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> i
     const uint kernelHXW = 9;
-    float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
+    float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
     ushort dilation_x = param.dilationX;
     ushort dilation_y = param.dilationY;
@@ -496,10 +489,10 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> i
 }
 kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-        texture2d_array<half, access::write> outTexture [[texture(1)]],
+        texture2d_array<half, access::sample> biasTexture [[texture(1)]],
+        texture2d_array<half, access::write> outTexture [[texture(2)]],
         constant MetalConvParam &param [[buffer(0)]],
         const device half *weights [[buffer(1)]],
-        const device half4 *biase [[buffer(2)]],
         uint3 gid [[thread_position_in_grid]]) {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height() ||
@@ -512,7 +505,7 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sampl
     constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
     const uint kernelHXW = 9;
     uint weithTo = gid.z * kernelHXW * 4;
-    float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
+    float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
     half4 inputs[9];
     inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
     inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
@@ -536,10 +529,10 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sampl
 }
 kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-        texture2d_array<half, access::write> outTexture [[texture(1)]],
+        texture2d_array<half, access::sample> biasTexture [[texture(1)]],
+        texture2d_array<half, access::write> outTexture [[texture(2)]],
         constant MetalConvParam &param [[buffer(0)]],
         const device half4x4 *weights [[buffer(1)]],
-        const device half4 *biase [[buffer(2)]],
         uint3 gid [[thread_position_in_grid]]) {
     uint x = gid.x, y = gid.y;
     uint ow = outTexture.get_width();
@@ -558,6 +551,12 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce
     uint ty = (y >> 1) << 1;
     uint tc = ((x % 2) << 1) + y % 2;
+    int hasComputedC = 4 * tc;
+    if (hasComputedC >= param.oC) {
+        return;
+    }
     constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
     half4 inputs[16];
     inputs[0] = inTexture.sample(sample, float2(tx - 1, ty - 1), tc);
@@ -584,6 +583,9 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce
     half4 res[4];
     for (int c = 0; c < 4; ++c) {
+        if (hasComputedC + c >= param.oC) {
+            return;
+        }
         half I[16];
         for (int i = 0; i < 16; ++i) {
             I[i] = inputs[i][c];
@@ -641,12 +643,22 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce
         res[3][c] = T[7] - T[11] + T[15] + tmp1 - tmp2;
     }
+    if (param.hasAddOp == 1) {
+        half4 base = biasTexture.sample(sample, uint2(tx, ty), tc);
+        res[0] += base;
+        base = biasTexture.sample(sample, uint2(tx + 1, ty), tc);
+        res[1] += base;
+        base = biasTexture.sample(sample, uint2(tx, ty + 1), tc);
+        res[2] += base;
+        base = biasTexture.sample(sample, uint2(tx + 1, ty + 1), tc);
+        res[3] += base;
+    }
     if (param.hasReluOp == 1) {
-        half4 base = biase[tc];
-        outTexture.write(fmax(res[0] + base, 0.0), uint2(tx, ty), tc);
-        outTexture.write(fmax(res[1] + base, 0.0), uint2(tx + 1, ty), tc);
-        outTexture.write(fmax(res[2] + base, 0.0), uint2(tx, ty + 1), tc);
-        outTexture.write(fmax(res[3] + base, 0.0), uint2(tx + 1, ty + 1), tc);
+        outTexture.write(fmax(res[0], 0.0), uint2(tx, ty), tc);
+        outTexture.write(fmax(res[1], 0.0), uint2(tx + 1, ty), tc);
+        outTexture.write(fmax(res[2], 0.0), uint2(tx, ty + 1), tc);
+        outTexture.write(fmax(res[3], 0.0), uint2(tx + 1, ty + 1), tc);
     } else {
         outTexture.write(res[0], uint2(tx, ty), tc);
         outTexture.write(res[1], uint2(tx + 1, ty), tc);
@@ -656,10 +668,10 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce
 }
 kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-        texture2d_array<half, access::write> outTexture [[texture(1)]],
+        texture2d_array<half, access::sample> biasTexture [[texture(1)]],
+        texture2d_array<half, access::write> outTexture [[texture(2)]],
         constant MetalConvParam &param [[buffer(0)]],
         const device half4 *weights [[buffer(1)]],
-        const device half4 *biase [[buffer(2)]],
         uint3 gid [[thread_position_in_grid]]) {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height() ||
@@ -678,7 +690,7 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu
     uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-    float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
+    float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
     ushort dilation_y = param.dilationY;
     half4 input[5];
@@ -713,12 +725,11 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu
 }
 kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-        texture2d_array<half, access::write> outTexture [[texture(1)]],
+        texture2d_array<half, access::sample> biasTexture [[texture(1)]],
+        texture2d_array<half, access::write> outTexture [[texture(2)]],
         constant MetalConvParam &param [[buffer(0)]],
         const device half4 *weights [[buffer(1)]],
-        const device half4 *biase [[buffer(2)]],
         uint3 gid [[thread_position_in_grid]]) {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height() ||
         gid.z >= outTexture.get_array_size()) {
@@ -736,7 +747,7 @@ kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTextu
     uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-    float4 output = param.hasAddOp == 1 ? float4(biase[gid.z]) : float4(0.0, 0.0, 0.0, 0.0);
+    float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
     ushort dilation_x = param.dilationX;
     half4 input[5];
......
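The shader changes above move the bias from a `device` buffer argument to `biasTexture` at texture slot 1 and shift the output texture to slot 2. As a bridge to the Swift changes that follow, here is a minimal host-side sketch of that argument layout, using only standard Metal APIs; the function and variable names are illustrative, and the real encoding lives in `ConvAddReluKernel.compute` later in this diff.

import Metal

// Sketch only: binds arguments in the layout the updated conv_add_relu kernels expect.
func encodeConvAddRelu(encoder: MTLComputeCommandEncoder,
                       input: MTLTexture,
                       bias: MTLTexture?,
                       output: MTLTexture,
                       convParam: UnsafeRawPointer,
                       convParamLength: Int,
                       weights: MTLBuffer) {
    encoder.setTexture(input, index: 0)    // inTexture   [[texture(0)]]
    encoder.setTexture(bias, index: 1)     // biasTexture [[texture(1)]]; a small blank texture is bound when there is no add op
    encoder.setTexture(output, index: 2)   // outTexture  [[texture(2)]]
    encoder.setBytes(convParam, length: convParamLength, index: 0)   // MetalConvParam [[buffer(0)]]
    encoder.setBuffer(weights, offset: 0, index: 1)                  // weights        [[buffer(1)]]
}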
@@ -13,6 +13,8 @@
 16D3F3B522929C390067C45D /* SliceOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16D3F3B422929C390067C45D /* SliceOp.swift */; };
 16D3F3B722929C660067C45D /* NearestInterpOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16D3F3B622929C660067C45D /* NearestInterpOp.swift */; };
 16D3F3B922929D070067C45D /* SliceKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16D3F3B822929D070067C45D /* SliceKernel.swift */; };
+16F70989229BF975000755B0 /* ConvReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16F70988229BF975000755B0 /* ConvReluOp.swift */; };
+16F7098B229BFA46000755B0 /* ConvReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16F7098A229BFA46000755B0 /* ConvReluKernel.swift */; };
 16FBFB36229259C60025B406 /* ExpOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16FBFB35229259C60025B406 /* ExpOp.swift */; };
 16FBFB3822925B030025B406 /* ExpKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16FBFB3722925B030025B406 /* ExpKernel.swift */; };
 16FBFB3A22925C3E0025B406 /* SigmoidKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 16FBFB3922925C3E0025B406 /* SigmoidKernel.swift */; };
@@ -123,6 +125,8 @@
 16D3F3B422929C390067C45D /* SliceOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SliceOp.swift; sourceTree = "<group>"; };
 16D3F3B622929C660067C45D /* NearestInterpOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = NearestInterpOp.swift; sourceTree = "<group>"; };
 16D3F3B822929D070067C45D /* SliceKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SliceKernel.swift; sourceTree = "<group>"; };
+16F70988229BF975000755B0 /* ConvReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvReluOp.swift; sourceTree = "<group>"; };
+16F7098A229BFA46000755B0 /* ConvReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvReluKernel.swift; sourceTree = "<group>"; };
 16FBFB35229259C60025B406 /* ExpOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ExpOp.swift; sourceTree = "<group>"; };
 16FBFB3722925B030025B406 /* ExpKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ExpKernel.swift; sourceTree = "<group>"; };
 16FBFB3922925C3E0025B406 /* SigmoidKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SigmoidKernel.swift; sourceTree = "<group>"; };
@@ -363,6 +367,7 @@
 16FBFB3F229266FE0025B406 /* LeakyReluOp.swift */,
 16D3F3B422929C390067C45D /* SliceOp.swift */,
 16D3F3B622929C660067C45D /* NearestInterpOp.swift */,
+16F70988229BF975000755B0 /* ConvReluOp.swift */,
 );
 path = Operators;
 sourceTree = "<group>";
@@ -425,6 +430,7 @@
 16FBFB412292684E0025B406 /* LeakyReluKernel.swift */,
 16D3F3B822929D070067C45D /* SliceKernel.swift */,
 16324D832292ABDB0047277D /* NearestInterpKernel.swift */,
+16F7098A229BFA46000755B0 /* ConvReluKernel.swift */,
 );
 path = Kernels;
 sourceTree = "<group>";
@@ -640,6 +646,7 @@
 FC803BC1214CB77A0094B8E5 /* ConvAddPreluKernel.swift in Sources */,
 FCBCCC5D2122F8A100D94F7E /* DepthwiseConvOp.swift in Sources */,
 FCE3A1AF2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift in Sources */,
+16F70989229BF975000755B0 /* ConvReluOp.swift in Sources */,
 16D3F3B922929D070067C45D /* SliceKernel.swift in Sources */,
 FCE9D7B7214F869000B520C3 /* Net.swift in Sources */,
 FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */,
@@ -660,6 +667,7 @@
 FC5163F620EF556E00636C28 /* Texture2DTo2DArrayKernel.swift in Sources */,
 FCE3A1AD2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift in Sources */,
 FC039BC020E11CC20081E9F8 /* PMBlockDesc.swift in Sources */,
+16F7098B229BFA46000755B0 /* ConvReluKernel.swift in Sources */,
 A7F26FDC2284301500365D47 /* Relu6Kernel.swift in Sources */,
 FCD04E6820F315020007374F /* PoolKernel.swift in Sources */,
 FC039BAD20E11CBC0081E9F8 /* ReluOp.swift in Sources */,
......
@@ -80,6 +80,7 @@ class OpCreator<P: PrecisionProtocol> {
     gFlatten2Type : Flatten2Op<P>.creat,
     gSliceType : SliceOp<P>.creat,
     gNearestInterpType : NearestInterpOp<P>.creat,
+    gConvReluType : ConvReluOp<P>.creat,
 ]
......
@@ -162,6 +162,7 @@ let gPooType = "pool2d"
 let gSoftmaxType = "softmax"
 let gReshapeType = "reshape"
 let gConvAddType = "conv_add"
+let gConvReluType = "conv_relu"
 let gDepthConvType = "depthwise_conv2d"
 let gPriorBoxType = "prior_box"
 let gTransposeType = "transpose"
......
@@ -14,7 +14,7 @@
 import Foundation
-class ConvAddOp<P: PrecisionProtocol>: Operator<ConvAddKernel<P>, ConvAddReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
+class ConvAddOp<P: PrecisionProtocol>: Operator<ConvAddKernel<P>, ConvAddReluParam<P>>, Runable, Creator, InferShaperable, Fusion {
     typealias OpType = ConvAddOp<P>
     static func fusionNode() -> Node {
@@ -33,7 +33,6 @@ class ConvAddOp<P: PrecisionProtocol>: Operator<ConvAddKernel<P>, ConvAddReluPar
     }
     func inferShape() {
         let inDims = para.input.dim
         let filterDim = para.filter.dim
         let strides = para.stride
@@ -64,23 +63,6 @@ class ConvAddOp<P: PrecisionProtocol>: Operator<ConvAddKernel<P>, ConvAddReluPar
     }
     func delogOutput() {
-        // print("op \(type): ")
-        // print(" padding: ")
-        // print(para.paddings)
-        // print("stride: ")
-        // print(para.stride)
-        // print("dilations: ")
-        // print(para.dilations)
-        // print(" para input dim: ")
-        // print(para.input.dim)
-        // print(" para filter dim: ")
-        // print(para.filter.dim)
-        // print(" para output dim: ")
-        // print(para.output.dim)
-        // print(" biase: ")
-        // let biase: [Float32] = para.y.buffer.array()
-        // print(biase)
         print(" \(type) output: ")
         print(para.output.metalTexture)
         print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
......
@@ -26,15 +26,23 @@ class ConvAddReluParam<P: PrecisionProtocol>: OpParam {
         groups = try ConvAddReluParam.getAttr(key: "groups", attrs: opDesc.attrs)
         do {
             y = try ConvAddReluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+        } catch {
+            do {
+                let yTensor: Tensor<P> = try ConvAddReluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+                let device = input.metalTexture!.device
+                y = Texture.init(device: device, inDim: yTensor.dim)
+                let value: [P] = Array(UnsafeBufferPointer(start: yTensor.data.pointer, count: yTensor.dim.numel()))
+                y?.metalTexture = device.tensor2texture(value: value, dim: yTensor.dim.dims, transpose: [0, 2, 3, 1], inComputePrecision: GlobalConfig.shared.computePrecision)
+                self.yTensor = yTensor
             } catch {
             }
+        }
         } catch let error {
             throw error
         }
     }
     let input: Texture
-    var y: Tensor<P>?
     let filter: Tensor<P>
     var output: Texture
     let stride: [Int32]
@@ -42,6 +50,9 @@ class ConvAddReluParam<P: PrecisionProtocol>: OpParam {
     let dilations: [Int32]
     let groups: Int
+    var y: Texture?
+    var yTensor: Tensor<P>?
     open class func hasY() -> Bool {
         return true
     }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class ConvReluOp<P: PrecisionProtocol>: Operator<ConvReluKernel<P>, ConvAddReluParam<P>>, Runable, Creator, InferShaperable, Fusion {
    typealias OpType = ConvReluOp<P>
    static func fusionNode() -> Node {
        let beginNode = Node.init(inType: gConvType)
        _ = beginNode
            --> Node.init(inType: gReluType)
        return beginNode
    }
    static func change() -> [String : [(from: String, to: String)]] {
        return [:]
    }
    static func fusionType() -> String {
        return gConvReluType
    }
    func inferShape() {
        let inDims = para.input.dim
        let filterDim = para.filter.dim
        let strides = para.stride
        let paddings = para.paddings
        let dilations = para.dilations
        var outDim = [inDims[0]]
        for i in 0..<strides.count {
            let dilation: Int = Int(dilations[i])
            let filterSize: Int = filterDim[i + 1]
            let inputSize: Int = inDims[i + 1]
            let padding: Int = Int(paddings[i])
            let stride: Int = Int(strides[i])
            let dKernel = dilation * (filterSize - 1) + 1
            let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
            outDim.append(outputSize)
        }
        outDim.append(filterDim[0])
        para.output.dim = Dim.init(inDim: outDim)
    }
    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
        do {
            try kernel.compute(commandBuffer: buffer, param: para)
        } catch let error {
            throw error
        }
    }
    func delogOutput() {
        print(" \(type) output: ")
        print(para.output.metalTexture)
        print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
    }
}
@@ -23,9 +23,5 @@ class ConvAddKernel<P: PrecisionProtocol>: ConvAddReluKernel<P> {
     override func hasReluOp() -> Bool {
         return false
     }
-    override func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? {
-        return nil
-    }
 }
@@ -66,7 +66,7 @@ class ConvDataSource<P: PrecisionProtocol>: NSObject, MPSCNNConvolutionDataSourc
     }
     func label() -> String? {
-        return "conv_add_label"
+        return "conv_add_relu_label"
     }
     func copy(with zone: NSZone? = nil) -> Any {
@@ -108,7 +108,7 @@ class ConvDataSource<P: PrecisionProtocol>: NSObject, MPSCNNConvolutionDataSourc
 class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
     var metalParam: MetalConvParam!
     var mpsConvOp: Any?
-    var blankTensor: Tensor<P>?
+    var blankTexture: Texture?
     required init(device: MTLDevice, param: ConvAddReluParam<P>, initContext: InitContext) throws {
         do {
@@ -120,16 +120,10 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
         var shouldUseMPS = false
         let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggresiveOptimization)
         if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggresiveOptimization) {
-            if initContext.useAggresiveOptimization {
-                if (param.input.tensorDim[1] == 1 || param.input.tensorDim[1] > 4) && (param.output.tensorDim[1] == 1 || param.output.tensorDim[1] > 4) {
-                    shouldUseMPS = true
-                }
-            } else {
             if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
                 shouldUseMPS = true
             }
         }
-        }
         if type(of: self).isWinoGrad(functionName: functionName) {
             shouldUseMPS = false
         }
@@ -162,14 +156,10 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
             throw PaddleMobileError.predictError(message: " encode is nil")
         }
         encoder.setTexture(param.input.metalTexture, index: 0)
-        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setTexture(param.y?.metalTexture, index: 1)
+        encoder.setTexture(param.output.metalTexture, index: 2)
         encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
         encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
-        if let y = param.y {
-            encoder.setBuffer(y.buffer, offset: 0, index: 2)
-        } else {
-            encoder.setBuffer(blankTensor?.buffer, offset: 0, index: 2)
-        }
         encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture, groupDepth: type(of: self).isWinoGrad(functionName: functionName) ? 1 : nil)
         encoder.endEncoding()
     }
@@ -196,7 +186,8 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
         desc.strideInPixelsX = Int(param.stride[0])
         desc.strideInPixelsY = Int(param.stride[1])
         let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
-        let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: param.y)
+        let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: param.yTensor)
         let conv = MPSCNNConvolution.init(device: device, weights: dataSource)
         conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0)
         conv.edgeMode = .zero
@@ -219,11 +210,12 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
     }
         let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1])
         param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC)
-        if let y = param.y {
-            y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-        } else {
-            blankTensor = Tensor<P>.init(inDim: Dim(inDim: [1, 1, 1, 4]), inLayout: DataLayout.NHWC(), originDimsCount: 4)
-            blankTensor?.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        if param.y == nil {
+            let blankTensor = Tensor<P>.init(inDim: Dim(inDim: [1, 1, 1, 4]), inLayout: DataLayout.NHWC(), originDimsCount: 4)
+            blankTexture = Texture.init(device: device, inDim: blankTensor.dim)
+            let value: [P] = [P(Float32(1.0)), P(Float32(1.0)), P(Float32(1.0)), P(Float32(1.0))]
+            blankTexture?.metalTexture = device.tensor2texture(value: value, dim: blankTensor.dim.dims, transpose: [0, 2, 3, 1], inComputePrecision: GlobalConfig.shared.computePrecision)
         }
     }
@@ -231,29 +223,32 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
         if GlobalConfig.shared.computePrecision == .Float16 {
             if param.filter.width == 1 && param.filter.height == 1 {
                 return "conv_add_relu_1x1_half"
-            } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
+            }
+            if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
                 if useAggressiveOptimization {
                     let couldUseWinograd = param.filter.width == 3 && param.filter.height == 3
-                        && param.filter.n == 16 && param.stride[0] == 1 && param.stride[1] == 1
+                        && param.filter.n <= 16 && param.stride[0] == 1 && param.stride[1] == 1
                         && param.dilations[0] == 1 && param.dilations[1] == 1
                     if couldUseWinograd {
                         return "depthwise_conv_add_relu_3x3_half_winograd"
                     }
                 }
                 return "depthwise_conv_add_relu_3x3_half"
-            } else if param.filter.width == 3 && param.filter.height == 3 {
+            }
+            if param.filter.width == 3 && param.filter.height == 3 {
                 if param.groups == 1 {
                     return "conv_add_relu_3x3_half"
                 } else {
                     return "group_conv_add_relu_3x3_half"
                 }
-            } else if param.filter.width == 1 && param.filter.height == 5 {
+            }
+            if param.filter.width == 1 && param.filter.height == 5 {
                 return "conv_add_relu_5x1_half"
-            } else if param.filter.width == 5 && param.filter.height == 1 {
+            }
+            if param.filter.width == 5 && param.filter.height == 1 {
                 return "conv_add_relu_1x5_half"
-            } else {
-                return nil
             }
+            return nil
         } else if GlobalConfig.shared.computePrecision == .Float32 {
             if param.filter.width == 1 && param.filter.height == 1 {
                 return "conv_add_relu_1x1"
......
@@ -30,16 +30,10 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
         var shouldUseMPS = false
         let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggresiveOptimization)
         if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggresiveOptimization) {
-            if initContext.useAggresiveOptimization {
-                if (param.input.tensorDim[1] == 1 || param.input.tensorDim[1] > 4) && (param.output.tensorDim[1] == 1 || param.output.tensorDim[1] > 4) {
-                    shouldUseMPS = true
-                }
-            } else {
             if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
                 shouldUseMPS = true
             }
         }
-        }
         if type(of: self).isWinoGrad(functionName: functionName) {
             shouldUseMPS = false
         }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
import MetalPerformanceShaders
class ConvReluKernel<P: PrecisionProtocol>: ConvAddReluKernel<P> {
    override func hasAddOp() -> Bool {
        return false
    }
    override func hasReluOp() -> Bool {
        return true
    }
}
@@ -185,6 +185,7 @@ class ProgramOptimize<P: PrecisionProtocol> {
         // register fusion
         let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp<P>.self,
                                         ConvAddReluOp<P>.self,
+                                        ConvReluOp<P>.self,
                                         // ConvAddAddPreluOp<P>.self,
                                         ConvAddPreluOp<P>.self,
                                         ConvAddOp<P>.self,
......