提交 40642fe0 编写于 作者: L liuruilong

fix dilation bug

上级 8cbed747
...@@ -314,7 +314,7 @@ public class PaddleMobileUnitTest { ...@@ -314,7 +314,7 @@ public class PaddleMobileUnitTest {
let offsetX = filterSize.width/2 - paddings.0 let offsetX = filterSize.width/2 - paddings.0
let offsetY = filterSize.height/2 - paddings.1 let offsetY = filterSize.height/2 - paddings.1
let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), paddedZ: UInt16(paddings.0), dilationX: UInt16(1), dilationY: UInt16(1)) let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1))
let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize) let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize)
......
...@@ -15,6 +15,8 @@ ...@@ -15,6 +15,8 @@
import Foundation import Foundation
let testTo = 54 let testTo = 54
var isTest = false
let computePrecision: ComputePrecision = .Float32 let computePrecision: ComputePrecision = .Float32
...@@ -131,8 +133,9 @@ public class Executor<P: PrecisionType> { ...@@ -131,8 +133,9 @@ public class Executor<P: PrecisionType> {
print(" 第 \(i) 个 op: ") print(" 第 \(i) 个 op: ")
op.delogOutput() op.delogOutput()
} }
// self.ops[53].delogOutput()
// return return
let afterDate = Date.init() let afterDate = Date.init()
......
...@@ -93,11 +93,23 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>, ...@@ -93,11 +93,23 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>,
} }
// Debug helper: prints this op's convolution settings, tensor dims, bias values
// and output tensor to stdout via `print`.
// NOTE(review): this span is captured from a side-by-side diff view, so lines that
// were unchanged by the commit appear twice on one line; lines added by the commit
// appear once.
func delogOutput() { func delogOutput() {
// Convolution settings read from `para`: paddings, stride, dilations.
print(" padding: ")
print(para.paddings)
print("stride: ") print("stride: ")
print(para.stride) print(para.stride)
print("dilations: ") print("dilations: ")
print(para.dilations) print(para.dilations)
print(" \(type) output: ") print(" \(type) output: ")
// Dims of the input, filter and output as stored on `para`.
print(" para input dim: ")
print(para.input.dim)
print(" para filter dim: ")
print(para.filter.dim)
print(" para output dim: ")
print(para.output.dim)
// Bias values: the `y` buffer is read back as a [Float32] array.
// NOTE(review): the element type is fixed to Float32 here while the texture below is
// read with `computePrecision` — confirm this is intended when running in Float16.
print(" biase: ")
let biase: [Float32] = para.y.buffer.array()
print(biase)
// Output texture converted to a tensor using tensorDim[0...3] as (n, c, h, w), then printed via strideArray().
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray())
} }
......
...@@ -77,7 +77,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable ...@@ -77,7 +77,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
print("offset y: \(offsetY)") print("offset y: \(offsetY)")
let offsetZ = 0.0 let offsetZ = 0.0
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
var invs: [P] = [] var invs: [P] = []
let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
......
...@@ -17,7 +17,6 @@ import Foundation ...@@ -17,7 +17,6 @@ import Foundation
class ConvAddKernel<P: PrecisionType>: Kernel, Computable { class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
var metalParam: MetalConvParam! var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvAddParam<P>) { required init(device: MTLDevice, param: ConvAddParam<P>) {
if computePrecision == .Float16 { if computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 { if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_1x1_half") super.init(device: device, inFunctionName: "conv_add_1x1_half")
...@@ -47,11 +46,12 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable { ...@@ -47,11 +46,12 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
param.filter.initBuffer(device: device, precision: computePrecision) param.filter.initBuffer(device: device, precision: computePrecision)
param.y.initBuffer(device: device, precision: computePrecision) param.y.initBuffer(device: device, precision: computePrecision)
print(" function: \(functionName)")
print("offset x: \(offsetX)") print("offset x: \(offsetX)")
print("offset y: \(offsetY)") print("offset y: \(offsetY)")
let offsetZ = 0.0 let offsetZ = 0.0
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
print("metal param: ") print("metal param: ")
print(inMetalParam) print(inMetalParam)
......
...@@ -93,7 +93,7 @@ class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable { ...@@ -93,7 +93,7 @@ class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
let offsetZ = 0.0 let offsetZ = 0.0
print(" fuck ") print(" fuck ")
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
var invs: [P] = [] var invs: [P] = []
let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
......
...@@ -20,7 +20,6 @@ public struct MetalConvParam { ...@@ -20,7 +20,6 @@ public struct MetalConvParam {
let offsetZ: Int16 let offsetZ: Int16
let strideX: UInt16 let strideX: UInt16
let strideY: UInt16 let strideY: UInt16
let paddedZ: UInt16
let dilationX: UInt16 let dilationX: UInt16
let dilationY: UInt16 let dilationY: UInt16
} }
...@@ -41,7 +40,7 @@ class ConvKernel<P: PrecisionType>: Kernel, Computable { ...@@ -41,7 +40,7 @@ class ConvKernel<P: PrecisionType>: Kernel, Computable {
let offsetZ = 0.0 let offsetZ = 0.0
param.filter.initBuffer(device: device, precision: ComputePrecision.Float32) param.filter.initBuffer(device: device, precision: ComputePrecision.Float32)
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
} }
func compute(commandBuffer: MTLCommandBuffer, param: ConvParam<P>) throws { func compute(commandBuffer: MTLCommandBuffer, param: ConvParam<P>) throws {
......
...@@ -68,3 +68,4 @@ struct MetalConvParam { ...@@ -68,3 +68,4 @@ struct MetalConvParam {
ushort dilationX; ushort dilationX;
ushort dilationY; ushort dilationY;
}; };
...@@ -66,8 +66,6 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text ...@@ -66,8 +66,6 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text
constant MetalConvParam &param [[buffer(0)]], constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]], const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]], const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
...@@ -80,8 +78,11 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text ...@@ -80,8 +78,11 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9; const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size(); uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0); float4 output = float4(0.0);
...@@ -90,16 +91,35 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text ...@@ -90,16 +91,35 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text
ushort dilation_y = param.dilationY; ushort dilation_y = param.dilationY;
float4 input[9]; float4 input[9];
for (uint i = 0; i < input_arr_size; ++i) { for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); // input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
// input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
// input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
// input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
// input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
// input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
// input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
// input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
// input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i);
for (int j = 0; j < 9; ++j) { for (int j = 0; j < 9; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x); output.x += dot(input[j], weight_x);
...@@ -118,13 +138,83 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text ...@@ -118,13 +138,83 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text
outTexture.write(output, gid.xy, gid.z); outTexture.write(output, gid.xy, gid.z);
} }
// Debug variant of the 3x3 convolution kernel.
//
// Only the thread at grid position (0, 0, 0) does any work; every other thread
// returns immediately, so exactly one output texel (four output channels) is
// written. The bias add is left disabled, as in the original, so the value
// written is the raw convolution sum.
//
// Parameters:
//   inTexture  - input feature map, one float4 (four channels) per array slice.
//   outTexture - output feature map; only texel (0, 0) of slice 0 is written.
//   param      - convolution offsets, strides and dilations.
//   weights    - filter weights, indexed as
//                [outputSlice*4 + outputChannel][tap 0..8][inputSlice].
//   biase      - per-output-slice bias (currently unused, see the end of the body).
//   gid        - thread position in the dispatch grid.
kernel void test_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                              texture2d_array<float, access::write> outTexture [[texture(1)]],
                              constant MetalConvParam &param [[buffer(0)]],
                              const device float4 *weights [[buffer(1)]],
                              const device float4 *biase [[buffer(2)]],
                              uint3 gid [[thread_position_in_grid]]) {
    if (gid.x >= outTexture.get_width() ||
        gid.y >= outTexture.get_height() ||
        gid.z >= outTexture.get_array_size()) {
        return;
    }

    // Debug restriction: compute a single output texel only.
    if (gid.x > 0 || gid.y > 0 || gid.z > 0) { return; }

    // Coordinates are computed in signed 32-bit integers. The host fills
    // offsetX/offsetY from Int16 values that can be negative (filter/2 - padding).
    // Building the position as ushort2 wrapped a negative offset to ~65535, so a
    // tap at "position + dilation" landed far outside the texture and sampled
    // zero even when the true coordinate (e.g. -1 + 2 = 1) was inside it.
    // NOTE(review): assumes the Metal-side MetalConvParam declares offsetX/offsetY
    // as signed (mirroring the Swift Int16 fields) — confirm against the struct.
    const int2 stride = int2(param.strideX, param.strideY);
    const int2 dilation = int2(param.dilationX, param.dilationY);
    const int2 posInInput = int2(gid.xy) * stride + int2(param.offsetX, param.offsetY);

    // Pixel coordinates, nearest filtering, zero outside the texture bounds.
    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);

    const uint kernelHXW = 9;
    const uint input_arr_size = inTexture.get_array_size();
    // Start of the weights for the four output channels packed in slice gid.z.
    const uint weithTo = gid.z * kernelHXW * input_arr_size * 4;

    float4 output = float4(0.0);
    float4 input[9];

    for (uint i = 0; i < input_arr_size; ++i) {
        // Gather the 3x3 dilated neighbourhood in row-major tap order (0..8).
        for (int ky = 0; ky < 3; ++ky) {
            for (int kx = 0; kx < 3; ++kx) {
                const int2 tapPos = posInInput + int2(kx - 1, ky - 1) * dilation;
                input[ky * 3 + kx] = inTexture.sample(sample, float2(tapPos), i);
            }
        }

        // Accumulate each of the four output channels over the nine taps.
        for (uint j = 0; j < kernelHXW; ++j) {
            for (uint c = 0; c < 4; ++c) {
                const float4 weight = weights[weithTo + c * kernelHXW * input_arr_size + j * input_arr_size + i];
                output[c] += dot(input[j], weight);
            }
        }
    }

    // Bias add intentionally left disabled, as in the original debug kernel.
    // output = output + biase[gid.z];
    outTexture.write(output, gid.xy, gid.z);
}
kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]], kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]], texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]], constant MetalConvParam &param [[buffer(0)]],
const device float *weights [[buffer(1)]], const device float *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]], const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
...@@ -211,8 +301,6 @@ kernel void conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[ ...@@ -211,8 +301,6 @@ kernel void conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[
constant MetalConvParam &param [[buffer(0)]], constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]], const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]], const device half4 *biase [[buffer(2)]],
const device half4 *new_scale [[buffer(3)]],
const device half4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
...@@ -268,8 +356,6 @@ kernel void depthwise_conv_add_3x3_half(texture2d_array<half, access::sample> in ...@@ -268,8 +356,6 @@ kernel void depthwise_conv_add_3x3_half(texture2d_array<half, access::sample> in
constant MetalConvParam &param [[buffer(0)]], constant MetalConvParam &param [[buffer(0)]],
const device half *weights [[buffer(1)]], const device half *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]], const device half4 *biase [[buffer(2)]],
const device half4 *new_scale [[buffer(3)]],
const device half4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册