diff --git a/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift b/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift index 2764146929dbd6057dfeed15cbeb3add3fb76ee1..8d73f73087253f624256482326e6553632c0c0d8 100644 --- a/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift +++ b/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift @@ -314,7 +314,7 @@ public class PaddleMobileUnitTest { let offsetX = filterSize.width/2 - paddings.0 let offsetY = filterSize.height/2 - paddings.1 - let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), paddedZ: UInt16(paddings.0), dilationX: UInt16(1), dilationY: UInt16(1)) + let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1)) let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize) diff --git a/metal/paddle-mobile/paddle-mobile/Executor.swift b/metal/paddle-mobile/paddle-mobile/Executor.swift index 47769527d355fae1fde4fd2c4e82631df8d6bf04..55a8c9bed49bd56534a4b726a2723f93c3d5a0a8 100644 --- a/metal/paddle-mobile/paddle-mobile/Executor.swift +++ b/metal/paddle-mobile/paddle-mobile/Executor.swift @@ -15,6 +15,8 @@ import Foundation let testTo = 54 +var isTest = false + let computePrecision: ComputePrecision = .Float32 @@ -131,8 +133,9 @@ public class Executor { print(" 第 \(i) 个 op: ") op.delogOutput() } +// self.ops[53].delogOutput() -// return + return let afterDate = Date.init() diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift index 5e344014188061c3dbb411226b2655a3bc2659b8..b4220c87a648299bb212f910832c6d9aadebe2d5 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift @@ -93,11 +93,23 @@ class ConvAddOp: Operator, ConvAddParam

>, } func delogOutput() { + + print(" padding: ") + print(para.paddings) print("stride: ") print(para.stride) print("dilations: ") print(para.dilations) print(" \(type) output: ") + print(" para input dim: ") + print(para.input.dim) + print(" para filter dim: ") + print(para.filter.dim) + print(" para output dim: ") + print(para.output.dim) + print(" biase: ") + let biase: [Float32] = para.y.buffer.array() + print(biase) print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift index 092207cfb7b9fda63cd6b5aa7082640bae515149..bcdcc732b08e5f578bf1b35eb754cf221217144c 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift @@ -77,7 +77,7 @@ class ConvAddBatchNormReluKernel: Kernel, Computable, Testable print("offset y: \(offsetY)") let offsetZ = 0.0 - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) var invs: [P] = [] let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift index ce1e0f6560e9911e862ead537089d37fdb4fe1c4..51195fcf8e4456db61730412f86bfc26e2817936 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift @@ -17,7 +17,6 @@ import Foundation class ConvAddKernel: Kernel, Computable { var metalParam: MetalConvParam! required init(device: MTLDevice, param: ConvAddParam

) { - if computePrecision == .Float16 { if param.filter.width == 1 && param.filter.height == 1 { super.init(device: device, inFunctionName: "conv_add_1x1_half") @@ -47,11 +46,12 @@ class ConvAddKernel: Kernel, Computable { param.filter.initBuffer(device: device, precision: computePrecision) param.y.initBuffer(device: device, precision: computePrecision) + print(" function: \(functionName)") print("offset x: \(offsetX)") print("offset y: \(offsetY)") let offsetZ = 0.0 - let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) print("metal param: ") print(inMetalParam) diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift index cd528bb588849958722b24ea77a0e14a6abc502e..4b978e24d2b968d4ee2ee8443d74ce54f502be8b 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift @@ -93,7 +93,7 @@ class ConvBNReluKernel: Kernel, Computable, Testable { let offsetZ = 0.0 print(" fuck ") - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) var invs: [P] = [] let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift index e0485851fd610781f475eb43be1ce6fd4937a4ef..6c10ba8d18f3c0e386769c11867e8bb361f49b21 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift @@ -20,7 +20,6 @@ public struct MetalConvParam { let offsetZ: Int16 let strideX: UInt16 let strideY: UInt16 - let paddedZ: UInt16 let dilationX: UInt16 let dilationY: UInt16 } @@ -41,7 +40,7 @@ class ConvKernel: Kernel, Computable { let offsetZ = 0.0 param.filter.initBuffer(device: device, precision: ComputePrecision.Float32) - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) } func compute(commandBuffer: MTLCommandBuffer, param: ConvParam

) throws { diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal index d37be42be64f8fdd7325fd62a68e646737b6dedf..da703d163f1f78dbfeb0d33e106c4f8e4ab0c4a2 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal @@ -68,3 +68,4 @@ struct MetalConvParam { ushort dilationX; ushort dilationY; }; + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal index 9244b2ec4631015ffd192567f734bee4cc1c7c85..e8eb31733e5a4a1832cf0499326046cbee73e75d 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal @@ -66,8 +66,6 @@ kernel void conv_add_3x3(texture2d_array inTexture [[text constant MetalConvParam ¶m [[buffer(0)]], const device float4 *weights [[buffer(1)]], const device float4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], uint3 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || @@ -80,8 +78,11 @@ kernel void conv_add_3x3(texture2d_array inTexture [[text const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; float4 output = float4(0.0); @@ -90,16 +91,35 @@ kernel void conv_add_3x3(texture2d_array inTexture [[text ushort dilation_y = param.dilationY; float4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); +// input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); +// input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); +// input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); +// input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); +// input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); +// input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); +// input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); +// input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); +// input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); + for (int j = 0; j < 9; ++j) { float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; output.x += dot(input[j], weight_x); @@ -118,13 +138,83 @@ kernel void conv_add_3x3(texture2d_array inTexture [[text outTexture.write(output, gid.xy, gid.z); } + +kernel void test_conv_add_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + if (gid.x > 0 || gid.y > 0 || gid.z > 0) { return; } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 9; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + ushort dilation_x = param.dilationX; + ushort dilation_y = param.dilationY; + + float4 input[9]; + + for (uint i = 0; i < input_arr_size; ++i) { + + input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); + + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); + + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } +// output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); +} + kernel void depthwise_conv_add_3x3(texture2d_array inTexture [[texture(0)]], texture2d_array outTexture [[texture(1)]], constant MetalConvParam ¶m [[buffer(0)]], const device float *weights [[buffer(1)]], const device float4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], uint3 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || @@ -211,8 +301,6 @@ kernel void conv_add_3x3_half(texture2d_array inTexture [[ constant MetalConvParam ¶m [[buffer(0)]], const device half4 *weights [[buffer(1)]], const device half4 *biase [[buffer(2)]], - const device half4 *new_scale [[buffer(3)]], - const device half4 *new_biase [[buffer(4)]], uint3 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || @@ -268,8 +356,6 @@ kernel void depthwise_conv_add_3x3_half(texture2d_array in constant MetalConvParam ¶m [[buffer(0)]], const device half *weights [[buffer(1)]], const device half4 *biase [[buffer(2)]], - const device half4 *new_scale [[buffer(3)]], - const device half4 *new_biase [[buffer(4)]], uint3 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() ||