Unverified commit 7c216695 authored by NazgulLee, committed by GitHub

1. fix add bias logic; 2. fix several typos (#1687)

Parent 5b197f4b
@@ -107,6 +107,15 @@ inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) {
}
}
struct ElementwiseAddParam {
int32_t fast;
int32_t axis;
int32_t ylen;
int32_t xdim[4];
int32_t xtrans[4];
int32_t ydim[4];
int32_t ytrans[4];
};
struct MetalConvParam {
short offsetX;
@@ -122,4 +131,5 @@ struct MetalConvParam {
ushort oC;
ushort hasAddOp;
ushort hasReluOp;
ElementwiseAddParam addParam;
};
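
Note: the ElementwiseAddParam embedded in MetalConvParam above must stay bit-compatible with the Swift struct that is copied in via setBytes. A minimal sketch of the Swift mirror, inferred from the fields assigned in ElementwiseAddKernel.metalParamFrom further down in this diff (the repository's actual declaration may differ):

    // Sketch only: Swift mirror of the Metal ElementwiseAddParam above.
    // Field order and types must match the MSL struct byte for byte.
    struct ElementwiseAddMetalParam {
        var fast: Int32 = 0
        var axis: Int32 = 0
        var ylen: Int32 = 0
        var xdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
        var xtrans: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
        var ydim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
        var ytrans: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
    }
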
@@ -204,3 +204,16 @@ struct ConcatParam {
#undef N
#undef R
#undef V
#define V VY
#define R 4
#define N 3
#define P float
#include "ConcatKernel.inc.metal"
#undef P
#define P half
#include "ConcatKernel.inc.metal"
#undef P
#undef N
#undef R
#undef V
@@ -17,6 +17,56 @@
using namespace metal;
half4 getBiasHalf(uint3 gid, constant ElementwiseAddParam &addParam, texture2d_array<half, access::sample> biasTexture) {
half4 output;
if (addParam.fast) {
output = biasTexture.read(gid.xy, gid.z);
} else {
int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
int32_t xtrans[4] = {addParam.xtrans[0], addParam.xtrans[1], addParam.xtrans[2], addParam.xtrans[3]};
int32_t ytrans[4] = {addParam.ytrans[0], addParam.ytrans[1], addParam.ytrans[2], addParam.ytrans[3]};
int32_t yshift = 4 - addParam.ylen - addParam.axis;
for (int n = 0; n < 4; n++) {
x_xyzn[3] = n;
xyzn2abcd(addParam.xdim[3], x_xyzn, x_abcd);
invtrans(xtrans, x_abcd, t_abcd);
for (int k = addParam.axis; k < (addParam.axis + addParam.ylen); k++) {
y_abcd[yshift+k] = t_abcd[k];
}
trans(ytrans, y_abcd, t_abcd);
abcd2xyzn(addParam.ydim[3], t_abcd, y_xyzn);
output[n] = biasTexture.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
}
}
return output;
}
float4 getBias(uint3 gid, constant ElementwiseAddParam &addParam, texture2d_array<float, access::sample> biasTexture) {
float4 output;
if (addParam.fast) {
output = float4(biasTexture.read(gid.xy, gid.z));
} else {
int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
int32_t xtrans[4] = {addParam.xtrans[0], addParam.xtrans[1], addParam.xtrans[2], addParam.xtrans[3]};
int32_t ytrans[4] = {addParam.ytrans[0], addParam.ytrans[1], addParam.ytrans[2], addParam.ytrans[3]};
int32_t yshift = 4 - addParam.ylen - addParam.axis;
for (int n = 0; n < 4; n++) {
x_xyzn[3] = n;
xyzn2abcd(addParam.xdim[3], x_xyzn, x_abcd);
invtrans(xtrans, x_abcd, t_abcd);
for (int k = addParam.axis; k < (addParam.axis + addParam.ylen); k++) {
y_abcd[yshift+k] = t_abcd[k];
}
trans(ytrans, y_abcd, t_abcd);
abcd2xyzn(addParam.ydim[3], t_abcd, y_xyzn);
output[n] = biasTexture.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
}
}
return output;
}
#pragma mark - convAdd
kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::sample> biasTexture [[texture(1)]],
@@ -39,7 +89,11 @@ kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = getBias(gid, addParam, biasTexture);
}
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
@@ -83,7 +137,11 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = getBias(gid, addParam, biasTexture);
}
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
@@ -146,7 +204,11 @@ kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTex
const uint kernelHXW = 9;
float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = getBias(gid, addParam, biasTexture);
}
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
@@ -205,7 +267,11 @@ kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = getBias(gid, addParam, biasTexture);
}
ushort dilation_y = param.dilationY;
float4 input[5];
@@ -262,7 +328,11 @@ kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = getBias(gid, addParam, biasTexture);
}
ushort dilation_x = param.dilationX;
float4 input[5];
@@ -313,7 +383,13 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> i
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = getBias(gid, addParam, biasTexture);
}
float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
@@ -358,7 +434,11 @@ kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTextu
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = float4(getBiasHalf(gid, addParam, biasTexture));
}
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
@@ -399,7 +479,11 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = float4(getBiasHalf(gid, addParam, biasTexture));
}
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
@@ -452,7 +536,11 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> i
const uint kernelHXW = 9;
float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = float4(getBiasHalf(gid, addParam, biasTexture));
}
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
@@ -505,7 +593,13 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sampl
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = float4(getBiasHalf(gid, addParam, biasTexture));
}
half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
@@ -584,7 +678,7 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce
for (int c = 0; c < 4; ++c) {
if (hasComputedC + c >= param.oC) {
return;
break;
}
half I[16];
for (int i = 0; i < 16; ++i) {
@@ -644,13 +738,14 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce
}
if (param.hasAddOp == 1) {
half4 base = biasTexture.sample(sample, float2(tx, ty), tc);
constant ElementwiseAddParam &addParam = param.addParam;
half4 base = getBiasHalf(uint3(tx, ty, tc), addParam, biasTexture);
res[0] += base;
base = biasTexture.sample(sample, float2(tx + 1, ty), tc);
base = getBiasHalf(uint3(tx + 1, ty, tc), addParam, biasTexture);
res[1] += base;
base = biasTexture.sample(sample, float2(tx, ty + 1), tc);
base = getBiasHalf(uint3(tx, ty + 1, tc), addParam, biasTexture);
res[2] += base;
base = biasTexture.sample(sample, float2(tx + 1, ty + 1), tc);
base = getBiasHalf(uint3(tx + 1, ty + 1, tc), addParam, biasTexture);
res[3] += base;
}
@@ -690,7 +785,11 @@ kernel void conv_add_relu_5x1_half(texture2d_array<half, access::sample> inTextu
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = float4(getBiasHalf(gid, addParam, biasTexture));
}
ushort dilation_y = param.dilationY;
half4 input[5];
@@ -747,7 +846,11 @@ kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTextu
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
float4 output = float4(0.0, 0.0, 0.0, 0.0);
if (param.hasAddOp) {
constant ElementwiseAddParam &addParam = param.addParam;
output = float4(getBiasHalf(gid, addParam, biasTexture));
}
ushort dilation_x = param.dilationX;
half4 input[5];
......
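
Note on getBias/getBiasHalf above: the fast path is a direct texel read when X and Y share dims and transpose. Otherwise each lane n makes a round trip: texture coordinate to X's logical index (xyzn2abcd, then invtrans), X's index to Y's broadcast index via axis/ylen, then back to a Y texture coordinate (trans, abcd2xyzn). A host-side Swift restatement of just the broadcast step, as a sketch under the same conventions (axis already rebased to the 4-D padded shape):

    // Sketch: build Y's padded 4-D index from X's logical 4-D index.
    // Mirrors the shader loop y_abcd[yshift + k] = t_abcd[k].
    func broadcastYIndex(xIndex: [Int], axis: Int, ylen: Int) -> [Int] {
        var yIndex = [0, 0, 0, 0]
        let yshift = 4 - ylen - axis   // Y's dims are right-aligned in the 4-D pad
        for k in axis..<(axis + ylen) {
            yIndex[yshift + k] = xIndex[k]
        }
        return yIndex
    }
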
@@ -17,16 +17,6 @@
using namespace metal;
struct ElementwiseAddParam {
int32_t fast;
int32_t axis;
int32_t ylen;
int32_t xdim[4];
int32_t xtrans[4];
int32_t ydim[4];
int32_t ytrans[4];
};
kernel void elementwise_add(texture2d_array<float, access::read> inputX [[texture(0)]],
texture2d_array<float, access::read> inputY [[texture(1)]],
texture2d_array<float, access::write> outTexture [[texture(2)]],
......
@@ -16,16 +16,6 @@
#include "Common.metal"
using namespace metal;
struct ElementwiseAddParam {
int32_t fast;
int32_t axis;
int32_t ylen;
int32_t xdim[4];
int32_t xtrans[4];
int32_t ydim[4];
int32_t ytrans[4];
};
#define P float
#define PRELU_CHANNEL prelu_channel
......
@@ -287,7 +287,13 @@ extension MTLDevice {
var rcount: Int = (ndim[0] * ndim[3] + 3) / 4
rcount = rcount * 4 * ndim[1] * ndim[2]
var nvalue: [Float32] = .init(repeating: 0.0, count: rcount)
var value32: [Float32]?
if value is [Float16] {
var value16 = value as! [Float16]
value32 = float16To32(input: &value16, count: value.count)
} else {
value32 = value as? [Float32]
}
for i0 in 0..<tdim[0] {
for i1 in 0..<tdim[1] {
for i2 in 0..<tdim[2] {
@@ -298,8 +304,11 @@ extension MTLDevice {
let jg = transpose.map { ig[$0] }
let k = jg[0] * ndim[3] + jg[3]
let jx = ((k / 4) * ndim[1] * ndim[2] * 4) + (jg[1] * ndim[2] * 4) + (jg[2] * 4) + (k % 4)
nvalue[jx] = value[ix] as! Float32
if let value32 = value32 {
nvalue[jx] = value32[ix]
} else {
fatalError("tensor2texture tensor value type not support")
}
}
}
}
......
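
Note: float16To32 used above is defined elsewhere in the repository. A hedged sketch of what such a helper can look like with Accelerate's planar half-to-float conversion (the real implementation and the project's Float16 typealias may differ):

    import Accelerate

    typealias Float16 = UInt16  // assumption: raw 16-bit half-float storage

    func float16To32(input: inout [Float16], count: Int) -> [Float32] {
        var output = [Float32](repeating: 0, count: count)
        input.withUnsafeMutableBufferPointer { srcPtr in
            var src = vImage_Buffer(data: srcPtr.baseAddress, height: 1,
                                    width: vImagePixelCount(count), rowBytes: count * 2)
            output.withUnsafeMutableBufferPointer { dstPtr in
                var dst = vImage_Buffer(data: dstPtr.baseAddress, height: 1,
                                        width: vImagePixelCount(count), rowBytes: count * 4)
                // Planar conversion: 16-bit halfs -> 32-bit floats in one call.
                _ = vImageConvert_Planar16FtoPlanarF(&src, &dst, vImage_Flags(kvImageNoFlags))
            }
        }
        return output
    }
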
@@ -325,7 +325,7 @@ public class PaddleMobileUnitTest {
let fC = 4
let oC = 4
let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1), groups: UInt16(groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0))
let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1), groups: UInt16(groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam())
let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize)
......
@@ -105,8 +105,8 @@ public class Loader<P: PrecisionProtocol>: Loaderable {
} while (false)
} else {
fseek(file, MemoryLayout<CChar>.size * tensorDescSize, SEEK_CUR)
}
nowIndex += MemoryLayout<CChar>.size * tensorDescSize
}
/*
The precision is not determined from the Data Type here; it is specified directly by the external generic parameter.
......
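
Note: the Loader change above fixes cursor bookkeeping. nowIndex now advances by the descriptor size on both branches, so it stays in sync with the file position whether the tensor description was parsed or skipped. Schematically (canParse and parseTensorDesc are stand-ins for the code elided in this hunk):

    if canParse {
        parseTensorDesc()   // the do { ... } while (false) branch: consumes tensorDescSize bytes
    } else {
        fseek(file, MemoryLayout<CChar>.size * tensorDescSize, SEEK_CUR)   // skips them
    }
    nowIndex += MemoryLayout<CChar>.size * tensorDescSize   // advances either way
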
@@ -24,6 +24,11 @@ class ConvAddReluParam<P: PrecisionProtocol>: OpParam {
paddings = try ConvAddReluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
dilations = try ConvAddReluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
groups = try ConvAddReluParam.getAttr(key: "groups", attrs: opDesc.attrs)
do {
axis = try ConvAddReluParam.getAttr(key: "axis", attrs: opDesc.attrs)
} catch {
axis = -1
}
do {
y = try ConvAddReluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
} catch {
@@ -32,7 +37,7 @@ class ConvAddReluParam<P: PrecisionProtocol>: OpParam {
let device = input.metalTexture!.device
y = Texture.init(device: device, inDim: yTensor.dim)
let value: [P] = Array(UnsafeBufferPointer(start: yTensor.data.pointer, count: yTensor.dim.numel()))
y?.metalTexture = device.tensor2texture(value: value, dim: yTensor.dim.dims, transpose: [0, 2, 3, 1], inComputePrecision: GlobalConfig.shared.computePrecision)
y?.metalTexture = device.tensor2texture(value: value, dim: yTensor.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision)
self.yTensor = yTensor
} catch {
}
@@ -49,6 +54,7 @@ class ConvAddReluParam<P: PrecisionProtocol>: OpParam {
let paddings: [Int32]
let dilations: [Int32]
let groups: Int
let axis: Int
var y: Texture?
var yTensor: Tensor<P>?
......
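
Note: axis is an optional attribute, so the do/catch above defaults it to -1 (align Y with X's trailing dimensions, as interpreted by metalParamFrom later in this diff). An equivalent, more compact Swift idiom, shown only as an alternative:

    axis = (try? ConvAddReluParam.getAttr(key: "axis", attrs: opDesc.attrs)) ?? -1
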
@@ -64,7 +64,7 @@ class FeedOp<P: PrecisionProtocol>: Operator<Texture2DTo2DArrayKernel<P>, FeedPa
func delogOutput() {
print(" \(type) output: ")
print(para.output.metalTexture)
print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[3], h: para.output.padToFourDim[2], w: para.output.padToFourDim[1])).strideArray())
print(para.output.toTensor().strideArray())
}
}
@@ -135,7 +135,7 @@ class ConvAddAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0))
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam())
// print("metal param: ")
// print(inMetalParam)
......
@@ -98,7 +98,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionProtocol>: Kernel, Computable, Test
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0))
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam())
var invs: [P] = []
let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
......
@@ -16,11 +16,11 @@ import Foundation
import MetalPerformanceShaders
class ConvAddKernel<P: PrecisionProtocol>: ConvAddReluKernel<P> {
override func hasAddOp() -> Bool {
override class func hasAddOp() -> Bool {
return true
}
override func hasReluOp() -> Bool {
override class func hasReluOp() -> Bool {
return false
}
}
......
@@ -135,7 +135,7 @@ class ConvAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0))
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam())
// print("metal param: ")
// print(inMetalParam)
......
@@ -29,6 +29,7 @@ public struct MetalConvParam {
let oC: UInt16
let hasAddOp: UInt16
let hasReluOp: UInt16
let addParam: ElementwiseAddMetalParam
}
@available(iOS 11.0, *)
@@ -124,7 +125,7 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggressiveOptimization) {
let inputChannel = param.input.tensorDim[1]
let outputChannel = param.output.tensorDim[1]
if (inputChannel == 1 || inputChannel > 4) && (outputChannel == 1 || outputChannel > 4) {
if inputChannel > 4 && outputChannel > 4 {
shouldUseMPS = true
}
}
@@ -135,6 +136,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
if !isDepthWise && param.groups > 1 {
shouldUseMPS = false
}
if type(of: self).hasAddOp() {
if !(type(of: self).canAddUseMPS(param: param)) {
shouldUseMPS = false
}
}
if shouldUseMPS {
super.init(device: device, inFunctionName: nil, initContext: initContext)
setupWithMPS(device: device, param: param)
@@ -195,11 +201,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
param.input.useMPS = true
param.output.useMPS = true
if #available(iOS 11.3, *) {
if param.y != nil {
if type(of: self).hasAddOp() && type(of: self).canMPSAddByElement(param: param) && !type(of: self).canMPSAddByChannel(param: param) {
mpsAddOp = MPSCNNAdd(device: device)
if hasReluOp() {
mpsReluOp = MPSCNNNeuronReLU(device: device, a: 0.0)
}
if type(of: self).hasReluOp() {
mpsReluOp = MPSCNNNeuronReLU(device: device, a: 0.0)
}
}
let neuronFilter: MPSCNNNeuron? = param.y != nil ? nil : (neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron)
@@ -217,7 +223,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
desc.strideInPixelsX = Int(param.stride[0])
desc.strideInPixelsY = Int(param.stride[1])
let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: param.yTensor)
var biasTerms: Tensor<P>? = nil
if type(of: self).hasAddOp() && type(of: self).canMPSAddByChannel(param: param) {
biasTerms = param.yTensor
}
let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: biasTerms)
let conv = MPSCNNConvolution.init(device: device, weights: dataSource)
conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0)
@@ -233,7 +243,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 1 : 0))
var addParam = ElementwiseAddMetalParam()
if let inputY = param.y {
addParam = ElementwiseAddKernel<P>.metalParamFrom(inputX: param.output, inputY: inputY, axis: param.axis)
}
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(type(of: self).hasAddOp() ? 1 : 0), hasReluOp: UInt16(type(of: self).hasReluOp() ? 1 : 0), addParam: addParam)
metalParam = inMetalParam
if type(of: self).isWinoGrad(functionName: functionName) {
@@ -304,7 +318,7 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
}
open func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? {
if hasReluOp() {
if type(of: self).hasReluOp() {
if #available(iOS 10.0, *) {
return MPSCNNNeuronReLU(device: device, a: 0)
}
@@ -312,11 +326,29 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
return nil
}
open func hasAddOp() -> Bool {
open class func canAddUseMPS(param: ConvAddReluParam<P>) -> Bool {
return canMPSAddByChannel(param: param) || canMPSAddByElement(param: param)
}
private class func canMPSAddByChannel(param: ConvAddReluParam<P>) -> Bool {
if let yTensor = param.yTensor, yTensor.dim.cout() == 1 {
return true
}
return false
}
private class func canMPSAddByElement(param: ConvAddReluParam<P>) -> Bool {
if let y = param.y, y.dim.dims == param.input.dim.dims {
return true
}
return false
}
open class func hasAddOp() -> Bool {
return true
}
open func hasReluOp() -> Bool {
open class func hasReluOp() -> Bool {
return true
}
......
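
Note: hasAddOp()/hasReluOp() turning into class methods lets the kernel consult the subclass's answer through type(of: self), and from other class methods such as canAddUseMPS, without needing a fully initialized instance. A minimal sketch of the dispatch pattern, independent of these kernels:

    // Sketch: type(of: self) resolves to the most-derived class,
    // so base-class code sees the subclass override.
    class BaseKernel {
        class func hasAddOp() -> Bool { return true }
        func describe() {
            print(type(of: self).hasAddOp())
        }
    }

    class NoAddKernel: BaseKernel {
        override class func hasAddOp() -> Bool { return false }
    }

    NoAddKernel().describe()   // prints "false"
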
@@ -105,7 +105,7 @@ class ConvBNReluKernel<P: PrecisionProtocol>: Kernel, Computable, Testable {
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0))
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam())
var invs: [P] = []
let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
......
@@ -66,7 +66,7 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setTexture(param.output.metalTexture, index: 2)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
encoder.setBuffer(blankTensor?.buffer, offset: 0, index: 2)
@@ -111,7 +111,7 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 1 : 0))
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 1 : 0), addParam: ElementwiseAddMetalParam())
metalParam = inMetalParam
if type(of: self).isWinoGrad(functionName: functionName) {
@@ -130,7 +130,7 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
} else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
if useAggressiveOptimization {
let couldUseWinograd = param.filter.width == 3 && param.filter.height == 3
&& param.filter.n == 16 && param.stride[0] == 1 && param.stride[1] == 1
&& param.filter.n <= 16 && param.stride[0] == 1 && param.stride[1] == 1
&& param.dilations[0] == 1 && param.dilations[1] == 1
if couldUseWinograd {
return "depthwise_conv_add_relu_3x3_half_winograd"
......
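
Note: the encode fix above binds the output at texture index 2 because the conv_add_relu_* kernels reserve texture(1) for the bias (see the Metal signatures earlier in this diff); binding the output at index 1 would collide with that slot. The resulting layout, sketched with the bias slot called out (ConvKernel runs with hasAddOp == 0, so what occupies texture(1) here is an assumption):

    encoder.setTexture(param.input.metalTexture, index: 0)    // texture(0): input
    // texture(1) is the bias slot consumed when hasAddOp == 1
    encoder.setTexture(param.output.metalTexture, index: 2)   // texture(2): output
    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)   // weights
    encoder.setBuffer(blankTensor?.buffer, offset: 0, index: 2)   // placeholder bias tensor
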
@@ -16,11 +16,11 @@ import Foundation
import MetalPerformanceShaders
class ConvReluKernel<P: PrecisionProtocol>: ConvAddReluKernel<P> {
override func hasAddOp() -> Bool {
override class func hasAddOp() -> Bool {
return false
}
override func hasReluOp() -> Bool {
override class func hasReluOp() -> Bool {
return true
}
}
......
@@ -34,27 +34,8 @@ class ElementwiseAddKernel<P: PrecisionProtocol>: Kernel, Computable {
throw error
}
metalParam = ElementwiseAddMetalParam.init()
metalParam = ElementwiseAddKernel.metalParamFrom(inputX: param.inputX, inputY: param.inputY, axis: param.axis)
let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) }
let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) }
let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) }
let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) }
metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
if param.axis == -1 {
metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout())
} else {
metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis)
}
metalParam.ylen = Int32(param.inputY.tensorDim.cout())
if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
// print("===> elementwise_add fast!!!")
metalParam.fast = 1
}
if GlobalConfig.shared.computePrecision == .Float32 {
super.init(device: device, inFunctionName: "elementwise_add", initContext: initContext)
} else if GlobalConfig.shared.computePrecision == .Float16 {
@@ -75,4 +56,29 @@ class ElementwiseAddKernel<P: PrecisionProtocol>: Kernel, Computable {
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
static func metalParamFrom(inputX: Texture, inputY: Texture, axis: Int) -> ElementwiseAddMetalParam {
var metalParam = ElementwiseAddMetalParam.init()
let xdim: [Int32] = (0..<4).map { Int32(inputX.dim[$0]) }
let ydim: [Int32] = (0..<4).map { Int32(inputY.dim[$0]) }
let xtrans: [Int32] = (0..<4).map { Int32(inputX.transpose[$0]) }
let ytrans: [Int32] = (0..<4).map { Int32(inputY.transpose[$0]) }
metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
if axis == -1 {
metalParam.axis = 4 - Int32(inputY.tensorDim.cout())
} else {
metalParam.axis = 4 - Int32(inputX.tensorDim.cout()) + Int32(axis)
}
metalParam.ylen = Int32(inputY.tensorDim.cout())
if (inputX.dim == inputY.dim) && (inputX.transpose == inputY.transpose) {
// print("===> elementwise_add fast!!!")
metalParam.fast = 1
}
return metalParam
}
}
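
Worked example for metalParamFrom (assumed shapes): X has tensorDim [1, 16, 32, 32] (NCHW) and Y is a 16-element channel bias added with axis = 1. Then metalParam.axis = 4 - 4 + 1 = 1 and ylen = 1, so in the shader yshift = 4 - 1 - 1 = 2 and y_abcd[3] = t_abcd[1]: the bias for channel c is read from Y padded to [1, 1, 1, 16]. fast stays 0 because the dims differ; it becomes 1 only when X and Y match in both dim and transpose.

    // Assumed call site; parameter names follow ElementwiseAddParam<P> usage above.
    let addParam = ElementwiseAddKernel<Float32>.metalParamFrom(
        inputX: param.inputX, inputY: param.inputY, axis: 1)
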
@@ -26,6 +26,11 @@ class ReluKernel<P: PrecisionProtocol>: Kernel, Computable{
}
required init(device: MTLDevice, param: ReluParam<P>, initContext: InitContext) throws {
do {
try param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
} catch let error {
throw error
}
if GlobalConfig.shared.computePrecision == .Float32 {
super.init(device: device, inFunctionName: "relu", initContext: initContext)
} else if GlobalConfig.shared.computePrecision == .Float16 {
......
@@ -34,10 +34,10 @@ class ScaleOpKernel<P: PrecisionProtocol>: Kernel, Computable{
}
var shouldUseMPS = false
if initContext.useMPS && param.biasAfterScale {
if initContext.useMPS && param.biasAfterScale && param.input.tensorDim.cout() == 4 && param.output.tensorDim.cout() == 4 {
let inputChannel = param.input.tensorDim[1]
let outputChannel = param.output.tensorDim[1]
if (inputChannel == 1 || inputChannel > 4) && (outputChannel == 1 || outputChannel > 4) {
if (inputChannel > 4) && (outputChannel > 4) {
shouldUseMPS = true
}
}
......