提交 6b32f9ad 编写于 作者: Y Yanzhan Yang 提交者: GitHub

1. Add groups to conv3x3. 2. Add original size to tensor and texture and adjust...

1. Add groups to conv3x3. 2. Add original size to tensor and texture, and adjust the concat layer to handle tensor truncation logic. (#1643)
上级 75efc594
......@@ -6,14 +6,6 @@
#include <metal_stdlib>
using namespace metal;
// Parameters passed from the host to the convolution kernels through a
// constant buffer.
struct MetalConvParam {
// Signed X offset added to the strided thread position to obtain the
// sampling position in the input texture.
short offsetX;
// Signed Y offset added to the strided thread position.
short offsetY;
// Z offset; NOTE(review): not read by the kernels visible in this view.
short offsetZ;
// Horizontal stride: the output x coordinate is multiplied by this value.
ushort strideX;
// Vertical stride: the output y coordinate is multiplied by this value.
ushort strideY;
};
kernel void batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
const device float4 *new_scale [[buffer(0)]],
......
......@@ -116,4 +116,8 @@ struct MetalConvParam {
ushort strideY;
ushort dilationX;
ushort dilationY;
ushort groups;
ushort iC;
ushort fC;
ushort oC;
};
......@@ -67,8 +67,8 @@ struct ConcatParam {
#undef R
#undef V
// lens: (R=4, N=3, V=y)
#define V VY
// lens: (R=4, N=3, V=normal)
#define V VNORMAL
#define R 4
#define N 3
#define P float
......
......@@ -129,6 +129,61 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [
outTexture.write(relu, gid.xy, gid.z);
}
// Grouped 3x3 convolution fused with bias add and ReLU (float precision).
//
// Each thread produces one texel of outTexture: the four packed channels at
// position gid.xy in array slice gid.z. Threads outside the output texture
// return immediately.
//
// param   - offsets, strides, dilations, group count and channel counts
//           (iC = input channels, fC = channels per filter, oC = output
//           channels) supplied by the host.
// weights - read as [output channel][9 taps][fC rounded up to a multiple of 4].
// biase   - one float4 per output array slice; used as the initial
//           accumulator value.
//
// A flattened channel index is split into (output_n, output_c) with
// output_n = index / oC — presumably the batch index; confirm against the
// host-side texture layout.
kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                                    texture2d_array<float, access::write> outTexture [[texture(1)]],
                                    constant MetalConvParam &param [[buffer(0)]],
                                    const device float *weights [[buffer(1)]],
                                    const device float4 *biase [[buffer(2)]],
                                    uint3 gid [[thread_position_in_grid]]) {
    if (gid.x >= outTexture.get_width() ||
        gid.y >= outTexture.get_height() ||
        gid.z >= outTexture.get_array_size()) {
        return;
    }

    ushort2 stride = ushort2(param.strideX, param.strideY);
    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);

    // Pixel coordinates, nearest filtering; reads outside the texture give 0,
    // which implements zero padding.
    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
    const uint kernelHXW = 9;

    float4 output = biase[gid.z];

    // Signed copies so that (tap * dilation) can be negative.
    const int dilation_x = int(param.dilationX);
    const int dilation_y = int(param.dilationY);

    const uint iC = param.iC, fC = param.fC, oC = param.oC;
    // Guard against a zero group count so the divisions below stay defined.
    const uint groups = max(uint(param.groups), 1u);
    // Number of output channels that share one group's input channels.
    const uint out_per_group = max(oC / groups, 1u);
    // Each filter's channel dimension is padded to a multiple of 4.
    const uint padded_fC = ((fC + 3) / 4) * 4;

    for (uint c = 0; c < 4; ++c) {
        const uint output_depth = gid.z * 4 + c;
        const uint output_c = output_depth % oC;
        const uint output_n = output_depth / oC;

        // The group's first input channel is group * fC. The group index is
        // output_c / out_per_group; using output_c directly is only right
        // when every group has exactly one output channel.
        const uint group = output_c / out_per_group;
        const uint input_base = output_n * iC + group * fC;
        const uint weight_base = output_c * kernelHXW * padded_fC;

        for (uint i = 0; i < fC; ++i) {
            const uint input_depth = input_base + i;
            const uint input_array_index = input_depth / 4;
            const uint input_array_item_index = input_depth % 4;

            // Taps are visited row-major: j = (ky + 1) * 3 + (kx + 1).
            uint j = 0;
            for (int ky = -1; ky <= 1; ++ky) {
                for (int kx = -1; kx <= 1; ++kx, ++j) {
                    const float2 pos = float2(int(posInInput.x) + kx * dilation_x,
                                              int(posInInput.y) + ky * dilation_y);
                    const float value = inTexture.sample(sample, pos, input_array_index)[input_array_item_index];
                    const float weight = weights[weight_base + j * padded_fC + i];
                    output[c] += value * weight;
                }
            }
        }
    }

    const float4 relu = fmax(output, float4(0.0f));
    outTexture.write(relu, gid.xy, gid.z);
}
kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
......@@ -385,6 +440,61 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
outTexture.write(half4(relu), gid.xy, gid.z);
}
// Grouped 3x3 convolution fused with bias add and ReLU (half precision I/O).
//
// Each thread produces one texel of outTexture: the four packed channels at
// position gid.xy in array slice gid.z. Threads outside the output texture
// return immediately.
//
// param   - offsets, strides, dilations, group count and channel counts
//           (iC = input channels, fC = channels per filter, oC = output
//           channels) supplied by the host.
// weights - read as [output channel][9 taps][fC rounded up to a multiple of 4].
// biase   - one half4 per output array slice; used as the initial
//           accumulator value.
//
// Inputs, weights and the stored result are half, but the sum over
// fC * 9 products is accumulated in float to limit rounding error.
//
// A flattened channel index is split into (output_n, output_c) with
// output_n = index / oC — presumably the batch index; confirm against the
// host-side texture layout.
kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
                                         texture2d_array<half, access::write> outTexture [[texture(1)]],
                                         constant MetalConvParam &param [[buffer(0)]],
                                         const device half *weights [[buffer(1)]],
                                         const device half4 *biase [[buffer(2)]],
                                         uint3 gid [[thread_position_in_grid]]) {
    if (gid.x >= outTexture.get_width() ||
        gid.y >= outTexture.get_height() ||
        gid.z >= outTexture.get_array_size()) {
        return;
    }

    ushort2 stride = ushort2(param.strideX, param.strideY);
    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);

    // Pixel coordinates, nearest filtering; reads outside the texture give 0,
    // which implements zero padding.
    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
    const uint kernelHXW = 9;

    // Float accumulator seeded with the bias.
    float4 output = float4(biase[gid.z]);

    // Signed copies so that (tap * dilation) can be negative.
    const int dilation_x = int(param.dilationX);
    const int dilation_y = int(param.dilationY);

    const uint iC = param.iC, fC = param.fC, oC = param.oC;
    // Guard against a zero group count so the divisions below stay defined.
    const uint groups = max(uint(param.groups), 1u);
    // Number of output channels that share one group's input channels.
    const uint out_per_group = max(oC / groups, 1u);
    // Each filter's channel dimension is padded to a multiple of 4.
    const uint padded_fC = ((fC + 3) / 4) * 4;

    for (uint c = 0; c < 4; ++c) {
        const uint output_depth = gid.z * 4 + c;
        const uint output_c = output_depth % oC;
        const uint output_n = output_depth / oC;

        // The group's first input channel is group * fC. The group index is
        // output_c / out_per_group; using output_c directly is only right
        // when every group has exactly one output channel.
        const uint group = output_c / out_per_group;
        const uint input_base = output_n * iC + group * fC;
        const uint weight_base = output_c * kernelHXW * padded_fC;

        for (uint i = 0; i < fC; ++i) {
            const uint input_depth = input_base + i;
            const uint input_array_index = input_depth / 4;
            const uint input_array_item_index = input_depth % 4;

            // Taps are visited row-major: j = (ky + 1) * 3 + (kx + 1).
            uint j = 0;
            for (int ky = -1; ky <= 1; ++ky) {
                for (int kx = -1; kx <= 1; ++kx, ++j) {
                    const float2 pos = float2(int(posInInput.x) + kx * dilation_x,
                                              int(posInInput.y) + ky * dilation_y);
                    const float value = float(inTexture.sample(sample, pos, input_array_index)[input_array_item_index]);
                    const float weight = float(weights[weight_base + j * padded_fC + i]);
                    output[c] += value * weight;
                }
            }
        }
    }

    const float4 relu = fmax(output, float4(0.0f));
    outTexture.write(half4(relu), gid.xy, gid.z);
}
kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
......
......@@ -320,7 +320,12 @@ public class PaddleMobileUnitTest {
let offsetX = filterSize.width/2 - paddings.0
let offsetY = filterSize.height/2 - paddings.1
let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1))
let groups = 1
let iC = 4
let fC = 4
let oC = 4
let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1), groups: UInt16(groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize)
......
......@@ -244,7 +244,7 @@ public class Loader<P: PrecisionProtocol>: Loaderable{
}
let dim = Dim.init(inDim: dimArr)
let tensor = Tensor<P>.init(inDim: dim, inLayout: tensorDesc.dataLayout)
let tensor = Tensor<P>.init(inDim: dim, inLayout: tensorDesc.dataLayout, originDimsCount: tensorDesc.originDimsCount)
do {
if paraLoaderPointer != nil {
try paraLoaderPointer!.read(tensor: tensor)
......@@ -261,7 +261,9 @@ public class Loader<P: PrecisionProtocol>: Loaderable{
scope[varDesc.name] = tensor
} else {
let dim = Dim.init(inDim: tensorDesc.dims)
scope[varDesc.name] = Texture.init(device: device, inDim: dim)
let texture = Texture.init(device: device, inDim: dim)
texture.originDimsCount = tensorDesc.originDimsCount
scope[varDesc.name] = texture
}
} else {
if varDesc.name == fetchKey {
......
......@@ -202,6 +202,7 @@ class Tensor<P: PrecisionProtocol>: Tensorial {
var data: Data
var dim: Dim
var originDimsCount: Int
/// Dimensions as stored in the model: not converted; Paddle model dimensions are in N C H W order
var tensorDim: Dim
......@@ -243,12 +244,13 @@ class Tensor<P: PrecisionProtocol>: Tensorial {
}
}
init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) {
/// Creates a tensor with freshly allocated storage for `inDim.numel()` elements.
///
/// The allocated memory is not initialized here.
///
/// - Parameters:
///   - inDim: Dimensions of the tensor; stored as both `dim` and `tensorDim`.
///   - inLayout: Data layout of the tensor. Defaults to `DataLayout.NCHW()`.
///   - originDimsCount: Dimension count to record in `originDimsCount`.
///     When `nil` (the default), `inDim.cout()` is used instead, so existing
///     call sites that only pass `inDim`/`inLayout` keep compiling.
init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW(), originDimsCount: Int? = nil) {
tensorDim = inDim
dim = inDim
let pointer = UnsafeMutablePointer<P>.allocate(capacity: inDim.numel())
data = Data.init(inCount: inDim.numel(), inPointer: pointer)
layout = inLayout
// Fall back to the current dimension count when the caller gives none.
self.originDimsCount = originDimsCount ?? inDim.cout()
}
func convert(converter: DataConverter<P>) -> UnsafeMutablePointer<P> {
......
......@@ -72,6 +72,7 @@ public class Texture: Tensorial {
public var dim: Dim
public var tensorDim: Dim
public var useMPS = false
public var originDimsCount: Int?
/// tensor dim pad to four
public var padToFourDim: Dim
......
......@@ -35,6 +35,15 @@ class ConcatParam<P: PrecisionProtocol>: OpParam {
input.append(v)
}
axis = try ConcatParam.getAttr(key: "axis", attrs: opDesc.attrs)
if input.count > 0 {
if let originDimsCount = input[0].originDimsCount {
let nowDimsCount = input[0].dim.cout()
let diff = originDimsCount - nowDimsCount
if diff > 0 {
axis -= diff
}
}
}
output = try ConcatParam.outputOut(outputs: opDesc.outputs, from: inScope)
} catch let error {
throw error
......@@ -43,7 +52,7 @@ class ConcatParam<P: PrecisionProtocol>: OpParam {
var input: [Texture] = []
var output: Texture
var transpose: [Int] = []
let axis: Int
var axis: Int
}
class ConcatOp<P: PrecisionProtocol>: Operator<ConcatKernel<P>, ConcatParam<P>>, Runable, Creator, InferShaperable{
......
......@@ -132,7 +132,10 @@ class ConvAddAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
// print("offset y: \(offsetY)")
let offsetZ = 0.0
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
// print("metal param: ")
// print(inMetalParam)
......
......@@ -95,7 +95,10 @@ class ConvAddBatchNormReluKernel<P: PrecisionProtocol>: Kernel, Computable, Test
print("offset y: \(offsetY)")
let offsetZ = 0.0
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
var invs: [P] = []
let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
......
......@@ -181,7 +181,10 @@ class ConvAddKernel<P: PrecisionProtocol>: Kernel, Computable {
let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
let offsetZ = 0.0
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
metalParam = inMetalParam
if type(of: self).isWinoGrad(functionName: functionName) {
......
......@@ -132,7 +132,10 @@ class ConvAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
// print("offset y: \(offsetY)")
let offsetZ = 0.0
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
// print("metal param: ")
// print(inMetalParam)
......
......@@ -25,7 +25,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: ConvAddKernel<P> {
}
return "depthwise_conv_add_relu_3x3_half"
} else if param.filter.width == 3 && param.filter.height == 3 {
if param.groups == 1 {
return "conv_add_relu_3x3_half"
} else {
return "group_conv_add_relu_3x3_half"
}
} else if param.filter.width == 1 && param.filter.height == 5 {
return "conv_add_relu_5x1_half"
} else if param.filter.width == 5 && param.filter.height == 1 {
......@@ -43,7 +47,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: ConvAddKernel<P> {
} else if param.filter.width == 5 && param.filter.height == 1 {
return "conv_add_relu_1x5"
} else if param.filter.width == 3 && param.filter.height == 3 {
if param.groups == 1 {
return "conv_add_relu_3x3"
} else {
return "group_conv_add_relu_3x3"
}
} else {
return nil
}
......
......@@ -102,8 +102,10 @@ class ConvBNReluKernel<P: PrecisionProtocol>: Kernel, Computable, Testable {
// print("ConvBNReluKernel offset y: \(offsetY)")
let offsetZ = 0.0
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
var invs: [P] = []
let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
......
......@@ -22,6 +22,10 @@ public struct MetalConvParam {
let strideY: UInt16
let dilationX: UInt16
let dilationY: UInt16
let groups: UInt16
let iC: UInt16
let fC: UInt16
let oC: UInt16
}
class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
......@@ -41,8 +45,11 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0])
let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1])
let offsetZ = 0.0
let iC = param.input.tensorDim[1];
let fC = param.filter.tensorDim[1];
let oC = param.output.tensorDim[1];
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
}
func compute(commandBuffer: MTLCommandBuffer, param: ConvParam<P>) throws {
......
......@@ -16,6 +16,7 @@ import Foundation
class TensorDesc {
let dims: [Int]
let originDimsCount: Int
let dataType: VarTypeType
let dataLayout: DataLayout = DataLayout.NCHW()
var NCHWDim: [Int] {
......@@ -63,6 +64,8 @@ class TensorDesc {
dimsArray.append(dim)
}
originDimsCount = Int(dimsCount)
if dimsCount > 4 {
let headDims = Int(dimsCount - 4)
for i in 0..<headDims {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册