Unverified commit 76a6a4c5 authored by dolphin8, committed by GitHub

Merge pull request #978 from dolphin8/metal

xx
@@ -71,7 +71,128 @@ extension MTLDevice {
return buffer!
}
// Reads back every slice of a texture2d_array and calls `cb` with the texture
// coordinate [x, y, slice, channel-within-RGBA] and the corresponding value.
func texture2tensor_loop<P>(texture: MTLTexture, cb: ([Int], P)->Void) -> Void {
let bpR = texture.width * 4 * MemoryLayout<P>.size
let bpI = texture.height * bpR
let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: 1))
// Allocate once (capacity is in elements of P, not bytes) and free it when done.
let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: texture.width * texture.height * 4)
defer { pointer.deallocate() }
for i in 0..<texture.arrayLength {
texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i)
for tx in 0..<texture.width * texture.height * 4 {
var k = tx
var xyzn: [Int] = [0, 0, 0, 0]
xyzn[1] = k / (texture.width * 4) // y
k %= (texture.width * 4)
xyzn[3] = k % 4 // channel within the RGBA texel
xyzn[0] = k / 4 // x
xyzn[2] = i // array slice
cb(xyzn, pointer[tx])
}
}
}
// Rank-3 tensors: slices pack ndim[1] four channels per texel; height = ndim[2], width = ndim[3].
func texture2tensor_3<P>(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] {
var tdim: [Int] = [1, 1, 1, 1]
for i in 0..<dim.count {
tdim[4 - dim.count + i] = dim[i]
}
let count = dim.reduce(1) { $0 * $1 }
var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
let ndim: [Int] = transpose.map { tdim[$0] }
assert(dim.count == 3)
assert(texture.width == ndim[3])
assert(texture.height == ndim[2])
assert(ndim[0] == 1)
assert(texture.arrayLength == (ndim[1] + 3) / 4)
texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
var tg: [Int] = [0, 0, 0, 0]
tg[1] = xyzn[2] * 4 + xyzn[3]
tg[2] = xyzn[1]
tg[3] = xyzn[0]
var ig: [Int] = [0, 0, 0, 0]
for k in 0..<4 {
ig[transpose[k]] = tg[k]
}
let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
if ix < count {
tensor[ix] = v
}
}
return tensor
}
// Rank-2 tensors: a single slice; width packs ndim[3] four values per texel, height = ndim[2].
func texture2tensor_2<P>(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] {
var tdim: [Int] = [1, 1, 1, 1]
for i in 0..<dim.count {
tdim[4 - dim.count + i] = dim[i]
}
let count = dim.reduce(1) { $0 * $1 }
var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
let ndim: [Int] = transpose.map { tdim[$0] }
assert(dim.count == 2)
let w = (ndim[3] + 3) / 4
assert(texture.width == w)
assert(texture.height == ndim[2])
assert(ndim[0] == 1)
assert(ndim[1] == 1)
assert(texture.arrayLength == 1)
texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
var tg: [Int] = [0, 0, 0, 0]
tg[2] = xyzn[1]
tg[3] = xyzn[0] * 4 + xyzn[3]
var ig: [Int] = [0, 0, 0, 0]
for k in 0..<4 {
ig[transpose[k]] = tg[k]
}
let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
if ix < count {
tensor[ix] = v
}
}
return tensor
}
// Rank-1 tensors: a single one-pixel-high slice; width packs ndim[3] four values per texel.
func texture2tensor_1<P>(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] {
var tdim: [Int] = [1, 1, 1, 1]
for i in 0..<dim.count {
tdim[4 - dim.count + i] = dim[i]
}
let count = dim.reduce(1) { $0 * $1 }
var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
let ndim: [Int] = transpose.map { tdim[$0] }
assert(dim.count == 1)
let w = (ndim[3] + 3) / 4
assert(texture.width == w)
assert(texture.height == 1)
assert(ndim[0] == 1)
assert(ndim[1] == 1)
assert(ndim[2] == 1)
assert(texture.arrayLength == 1)
texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
var tg: [Int] = [0, 0, 0, 0]
tg[3] = xyzn[0] * 4 + xyzn[3]
var ig: [Int] = [0, 0, 0, 0]
for k in 0..<4 {
ig[transpose[k]] = tg[k]
}
let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
if ix < count {
tensor[ix] = v
}
}
return tensor
}
func texture2tensor<P>(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] {
if dim.count == 3 {
return texture2tensor_3(texture: texture, dim: dim, transpose: transpose)
} else if dim.count == 2 {
return texture2tensor_2(texture: texture, dim: dim, transpose: transpose)
} else if dim.count == 1 {
return texture2tensor_1(texture: texture, dim: dim, transpose: transpose)
}
var tdim: [Int] = [1, 1, 1, 1]
for i in 0..<dim.count {
tdim[4 - dim.count + i] = dim[i]
@@ -84,30 +84,19 @@ extension MTLDevice {
assert(texture.height == ndim[1])
assert(texture.arrayLength == (ndim[0] * ndim[3] + 3) / 4)
- let bpR = ndim[2] * 4 * MemoryLayout<P>.size
- let bpI = ndim[1] * bpR
- let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: ndim[2], height: ndim[1], depth: 1))
- for i in 0..<texture.arrayLength {
- let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: ndim[1] * ndim[2] * 4 * MemoryLayout<P>.size)
- texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i)
- for h in 0..<ndim[1] {
- for w in 0..<ndim[2] {
- for k in 0..<4 {
- let tx = (h * ndim[2] + w) * 4 + k
- let n = (i * 4 + k) / ndim[3]
- let c = (i * 4 + k) % ndim[3]
- let jg = [n, h, w, c]
- var ig = [0, 0, 0, 0]
- for d in 0..<4 {
- ig[transpose[d]] = jg[d]
- }
- let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
- if ix < count {
- tensor[ix] = pointer[tx]
- }
- }
- }
- }
- }
+ texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
+ var tg: [Int] = [0, 0, 0, 0]
+ tg[1] = xyzn[1]
+ tg[2] = xyzn[0]
+ tg[0] = (xyzn[2] * 4 + xyzn[3]) / ndim[3]
+ tg[3] = (xyzn[2] * 4 + xyzn[3]) % ndim[3]
+ var ig: [Int] = [0, 0, 0, 0]
+ for k in 0..<4 {
+ ig[transpose[k]] = tg[k]
+ }
+ let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
+ if ix < count {
+ tensor[ix] = v
+ }
+ }
return tensor
......
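Note: the texture2tensor helpers added above exist mainly for debugging. They copy a padded RGBA texture2d_array back into a flat [P] array ordered by the tensor's logical dims and transpose, so op outputs can be printed on the CPU (see the delogOutput changes further down). A minimal usage sketch, assuming a texture produced by paddle-mobile's Texture type whose logical dims and transpose are known to the caller (dumpOutput is an illustrative name, not part of this diff):

import Metal

// Dump a texture-backed op output as Float32 values for inspection.
func dumpOutput(texture: MTLTexture, dims: [Int], transpose: [Int]) {
    let device = texture.device
    let values: [Float32] = device.texture2tensor(texture: texture, dim: dims, transpose: transpose)
    print("first values: \(Array(values.prefix(8)))")
}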
@@ -30,7 +30,7 @@ public class MobileNet_ssd_AR: Net{
class MobilenetssdPreProccess: CusomKernel {
init(device: MTLDevice) {
let s = CusomKernel.Shape.init(inWidth: 160, inHeight: 160, inChannel: 3)
- super.init(device: device, inFunctionName: "mobilent_ar_preprocess_half", outputDim: s, usePaddleMobileLib: false)
+ super.init(device: device, inFunctionName: "mobilent_ar_preprocess", outputDim: s, usePaddleMobileLib: false)
}
}
......
@@ -56,9 +56,11 @@ class BatchNormOp<P: PrecisionType>: Operator<BatchNormKernel<P>, BatchNormParam
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
let device = para.output.metalTexture!.device
let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
print(outputArray.strideArray())
}
}
@@ -53,15 +53,10 @@ class BilinearInterpOp<P: PrecisionType>: Operator<BilinearInterpKernel<P>, Bili
func delogOutput() {
print(" \(type) output: ")
- let padToFourDim = para.output.padToFourDim
- if para.output.transpose == [0, 1, 2, 3] {
- let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
- print(outputArray.strideArray())
- } else if para.output.transpose == [0, 2, 3, 1] {
- print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3])).strideArray())
- } else {
- fatalError(" not implemet")
- }
+ let device = para.output.metalTexture!.device
+ let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+ // print(outputArray)
+ print(outputArray.strideArray())
}
}
......
@@ -65,15 +65,10 @@ class ConcatOp<P: PrecisionType>: Operator<ConcatKernel<P>, ConcatParam<P>>, Run
func delogOutput() {
print(" \(type) output: ")
- let padToFourDim = para.output.padToFourDim
- if para.output.transpose == [0, 1, 2, 3] {
- let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
- print(outputArray.strideArray())
- } else if para.output.transpose == [0, 2, 3, 1] {
- print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3])).strideArray())
- } else {
- fatalError(" not implemet")
- }
+ let device = para.output.metalTexture!.device
+ let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+ print(outputArray.strideArray())
}
}
......
@@ -20,12 +20,13 @@ class BatchNormKernel<P: PrecisionType>: Kernel, Computable {
let varianceP = param.variance.data.pointer
let meanP = param.mean.data.pointer
let scaleP = param.scale.data.pointer
- let biasP = param.scale.data.pointer
+ let biasP = param.bias.data.pointer
for i in 0..<count {
let invStd = P(1 / (Float32(varianceP[i]) + param.epsilon).squareRoot())
biasP[i] = biasP[i] - meanP[i] * invStd * scaleP[i]
scaleP[i] = invStd * scaleP[i]
}
+ param.bias.initBuffer(device: device, precision: computePrecision)
param.scale.initBuffer(device: device, precision: computePrecision)
param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
......
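For reference, the kernel setup above folds the batch-norm statistics into the scale and bias buffers on the CPU, so the GPU pass is a single multiply-add per channel: newScale = scale / sqrt(variance + epsilon) and newBias = bias - mean * newScale. A standalone sketch of that folding in plain Swift (illustrative names, not part of the diff):

// Fold batch-norm parameters so that y = x * newScale + newBias
// reproduces y = scale * (x - mean) / sqrt(variance + epsilon) + bias.
func foldBatchNorm(scale: [Float], bias: [Float], mean: [Float], variance: [Float], epsilon: Float) -> (newScale: [Float], newBias: [Float]) {
    var newScale = [Float](repeating: 0, count: scale.count)
    var newBias = [Float](repeating: 0, count: scale.count)
    for i in 0..<scale.count {
        let invStd = 1 / (variance[i] + epsilon).squareRoot()
        newScale[i] = scale[i] * invStd
        newBias[i] = bias[i] - mean[i] * scale[i] * invStd
    }
    return (newScale, newBias)
}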
@@ -27,10 +27,16 @@ class BilinearInterpKernel<P: PrecisionType>: Kernel, Computable{
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
- let ratio_h: Float32 = Float32(param.input.tensorDim.dims[2]) / Float32(param.output.tensorDim.dims[2])
- let ratio_w: Float32 = Float32(param.input.tensorDim.dims[3]) / Float32(param.output.tensorDim.dims[3])
+ var ratio_h: Float32 = 0
+ var ratio_w: Float32 = 0
+ if param.output.tensorDim.dims[2] > 1 {
+ ratio_h = Float32(param.input.tensorDim.dims[2]-1) / Float32(param.output.tensorDim.dims[2]-1)
+ }
+ if param.output.tensorDim.dims[3] > 1 {
+ ratio_w = Float32(param.input.tensorDim.dims[3]-1) / Float32(param.output.tensorDim.dims[3]-1)
+ }
var p = BilinearInterpMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w)
- encoder.setBytes(&p, length: MemoryLayout<ConcatMetalParam>.size, index: 0)
+ encoder.setBytes(&p, length: MemoryLayout<BilinearInterpMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
......
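The new ratio computation above switches to the (inSize - 1) / (outSize - 1) form, so the first and last samples of input and output line up, and it falls back to 0 when the output has only one row or column (every output pixel then samples input row/column 0). A small sketch of the source coordinate this implies along one axis, matching the kernel's w = gid.x * ratio_w (sourceCoord is an illustrative helper, not from the diff):

// Source coordinate of an output index under the (in-1)/(out-1) ratio.
func sourceCoord(outIndex: Int, inSize: Int, outSize: Int) -> Float {
    let ratio: Float = outSize > 1 ? Float(inSize - 1) / Float(outSize - 1) : 0
    return Float(outIndex) * ratio // the kernel blends the floor/ceil neighbours by the fractional part
}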
@@ -17,14 +17,14 @@ using namespace metal;
kernel void batchnorm(texture2d_array<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
- const device float4 * newScale [[buffer(0)]],
- const device float4 * newBias [[buffer(1)]],
+ const device float4 * nscale [[buffer(0)]],
+ const device float4 * nbias [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
const float4 input = inTexture.read(gid.xy, gid.z);
- float4 output = input * newScale[gid.z] + newBias[gid.z];
+ float4 output = input * nscale[gid.z] + nbias[gid.z];
outTexture.write(output, gid.xy, gid.z);
}
......
@@ -14,8 +14,8 @@ kernel void FUNC(bilinear_interp, P)(texture2d_array<P, access::read> input [[te
if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
r = input.read(gid.xy, gid.z);
} else {
- float w = gid.x * pm.ratio_w;
- float h = gid.y * pm.ratio_h;
+ P w = gid.x * pm.ratio_w;
+ P h = gid.y * pm.ratio_h;
uint w0 = w, h0 = h;
uint w1 = w0 + 1, h1 = h0 + 1;
P w1lambda = w - w0, h1lambda = h - h0;
@@ -26,7 +26,8 @@ kernel void FUNC(bilinear_interp, P)(texture2d_array<P, access::read> input [[te
VECTOR(P, 4) r1 = input.read(uint2(w1, h0), gid.z);
VECTOR(P, 4) r2 = input.read(uint2(w0, h1), gid.z);
VECTOR(P, 4) r3 = input.read(uint2(w1, h1), gid.z);
- r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);
+ r = h2lambda * (w2lambda * r0 + w1lambda * r1)
+     + h1lambda * (w2lambda * r2 + w1lambda * r3);
}
output.write(r, gid.xy, gid.z);
}
......
@@ -16,8 +16,6 @@
using namespace metal;
struct bilinear_interp_param {
- // int32_t out_h;
- // int32_t out_w;
float ratio_h;
float ratio_w;
};
......
@@ -17,16 +17,16 @@ using namespace metal;
inline void xyzn2abcd_1(int xyzn[4], int abcd[4]) {
- abcd[0] = abcd[1] = abcd[2] = 1;
+ abcd[0] = abcd[1] = abcd[2] = 0;
abcd[3] = xyzn[0] * 4 + xyzn[3];
}
inline void xyzn2abcd_2(int xyzn[4], int abcd[4]) {
- abcd[0] = abcd[1] = 1;
+ abcd[0] = abcd[1] = 0;
abcd[2] = xyzn[1];
abcd[3] = xyzn[0] * 4 + xyzn[3];
}
inline void xyzn2abcd_3(int xyzn[4], int abcd[4]) {
- abcd[0] = 1;
+ abcd[0] = 0;
abcd[3] = xyzn[0];
abcd[2] = xyzn[1];
abcd[1] = xyzn[2] * 4 + xyzn[3];
@@ -40,15 +40,15 @@ inline void xyzn2abcd_4(int C, int xyzn[4], int abcd[4]) {
}
inline void abcd2xyzn_1(int abcd[4], int xyzn[4]) {
- xyzn[1] = xyzn[2] = 1;
+ xyzn[1] = xyzn[2] = 0;
xyzn[0] = abcd[3] / 4;
xyzn[1] = abcd[3] % 4;
}
inline void abcd2xyzn_2(int abcd[4], int xyzn[4]) {
- xyzn[2] = 1;
+ xyzn[2] = 0;
xyzn[1] = abcd[2];
xyzn[0] = abcd[3] / 4;
- xyzn[1] = abcd[3] % 4;
+ xyzn[3] = abcd[3] % 4;
}
inline void abcd2xyzn_3(int abcd[4], int xyzn[4]) {
xyzn[0] = abcd[3];
......
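As context for these helpers: xyzn is a texture coordinate (x, y, array slice, channel within the RGBA texel) and abcd is the logical 4-element tensor index, with unused leading dimensions now set to 0 instead of 1. A Swift sketch of the rank-3 mapping and its inverse, mirroring xyzn2abcd_3 above (the tail of abcd2xyzn_3 is not shown in this hunk, so the inverse here is inferred):

// Rank-3 layout: width holds abcd[3], height holds abcd[2], slices pack abcd[1] four at a time.
func xyzn2abcd3(_ xyzn: [Int]) -> [Int] {
    return [0, xyzn[2] * 4 + xyzn[3], xyzn[1], xyzn[0]]
}
func abcd2xyzn3(_ abcd: [Int]) -> [Int] {
    return [abcd[3], abcd[2], abcd[1] / 4, abcd[1] % 4]
}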
@@ -122,20 +122,20 @@ kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[tex
int x = gid.x - pm.offset;
if (x < 0) return;
if (x < pm.vdim[0]) {
VECTOR(P, 4) r = in0.read(gid.xy, gid.z);
out.write(r, gid.xy, gid.z);
return;
}
x -= pm.vdim[0];
if (x < pm.vdim[1]) {
VECTOR(P, 4) r = in1.read(uint2(x, gid.y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#if N >= 3
x -= pm.vdim[1];
if (x < pm.vdim[2]) {
VECTOR(P, 4) r = in2.read(uint2(x, gid.y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
@@ -143,7 +143,7 @@ kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[tex
#if N >= 4
x -= pm.vdim[2];
if (x < pm.vdim[3]) {
VECTOR(P, 4) r = in3.read(uint2(x, gid.y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
@@ -151,7 +151,7 @@ kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[tex
#if N >= 5
x -= pm.vdim[3];
if (x < pm.vdim[4]) {
VECTOR(P, 4) r = in4.read(uint2(x, gid.y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
@@ -159,7 +159,7 @@ kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[tex
#if N >= 6
x -= pm.vdim[4];
if (x < pm.vdim[5]) {
VECTOR(P, 4) r = in5.read(uint2(x, gid.y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
......
@@ -36,7 +36,6 @@ kernel void FUNC(reshape, RIN, ROUT, P)(texture2d_array<P, access::read> inTextu
if (index < count) {
index2abcd(lrp.idim, index, tabcd);
trans(lrp.itrans, tabcd, iabcd);
- abcd2xyzn(iC, iabcd, ixyzn);
#if RIN == 4
abcd2xyzn_4(iC, iabcd, ixyzn);
#else
......
@@ -72,10 +72,21 @@ class PriorBoxOp<P: PrecisionType>: Operator<PriorBoxKernel<P>, PriorBoxParam<P>
print(" \(type) output: ")
// output
- let outputArray = para.output.metalTexture.float32Array()
- print(outputArray)
+ // let outputArray = para.output.metalTexture.float32Array()
+ // print(outputArray.strideArray())
+ let device = para.input.metalTexture!.device
+ let boxes: [Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: [2, 0, 1, 3])
+ let variances: [Float32] = device.texture2tensor(texture: para.outputVariances.metalTexture!, dim: para.outputVariances.tensorDim.dims, transpose: [2, 0, 1, 3])
+ print("boxes: ")
+ print(boxes.strideArray())
+ print("variances: ")
+ print(variances.strideArray())
// output
- // print(" \(type) output: ")
+ print(" \(type) output: ")
- print(para.output.metalTexture.realNHWC(dim: (para.output.dim[0], para.output.dim[1], para.output.dim[2], para.output.dim[3])).strideArray())
+ // print(para.output.realNHWC().strideArray())
// let padToFourDim = para.output.padToFourDim
// if para.output.transpose == [0, 1, 2, 3] {
// let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]), texturePrecision: computePrecision)
......
@@ -47,6 +47,9 @@ class ReluOp<P: PrecisionType>: Operator<ReluKernel<P>, ReluParam<P>>, Runable,
func delogOutput() {
print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
let device = para.output.metalTexture!.device
let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
print(outputArray.strideArray())
}
}
......
@@ -69,15 +69,9 @@ class ReshapeOp<P: PrecisionType>: Operator<ReshapeKernel<P>, ReshapeParam<P>>,
}
func delogOutput() {
print("reshape delog")
- // let _: P? = para.input.metalTexture.logDesc(header: "reshape input: ", stridable: false)
- //
- // let _: P? = para.output.metalTexture.logDesc(header: "reshape output: ", stridable: false)
- let padToFourDim = para.output.padToFourDim
- let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
- // print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3])).strideArray())
+ let device = para.output.metalTexture!.device
+ let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
print(outputArray.strideArray())
- // print(outputArray)
}
}
@@ -64,6 +64,11 @@ class SplitOp<P: PrecisionType>: Operator<SplitKernel<P>, SplitParam<P>>, Runabl
func delogOutput() {
print(" \(type) output: ")
let device = para.input.metalTexture!.device
for out in para.outputList {
let arr: [Float32] = device.texture2tensor(texture: out.metalTexture, dim: out.tensorDim.dims, transpose: out.transpose)
print(arr.strideArray())
}
}
}
......
@@ -57,6 +57,9 @@ class TransposeOp<P: PrecisionType>: Operator<TransposeKernel<P>, TransposeParam
} else {
print(" not implement")
}
let device = para.output.metalTexture!.device
let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
print(outputArray.strideArray())
}
}
......
@@ -16,7 +16,7 @@ import Foundation
class ScaleKernel: CusomKernel {
init(device: MTLDevice, shape: Shape) {
- super.init(device: device, inFunctionName: "scale_half", outputDim: shape, usePaddleMobileLib: false)
+ super.init(device: device, inFunctionName: "scale", outputDim: shape, usePaddleMobileLib: false)
}
}
......
@@ -14,10 +14,10 @@
import Foundation
- let testTo = 2
+ let testTo = 113
var isTest = false
- let computePrecision: ComputePrecision = .Float16
+ let computePrecision: ComputePrecision = .Float32
public class ResultHolder {
public let dim: [Int]
@@ -120,10 +120,10 @@ public class Executor<P: PrecisionType> {
let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
print(inputArr.strideArray())
- // print(dim)
+ print(dim)
- // writeToLibrary(fileName: "test_image_ssd_ar", array: inputArr)
+ writeToLibrary(fileName: "test_image_ssd_ar", array: inputArr)
//
- // print("write to library done")
+ print("write to library done")
// return
// print(inputArr)
//
......