Merge pull request #978 from dolphin8/metal

xx

Merge pull request #978 from dolphin8/metal
xx
70bc0ed0 · dolphin8 · GitHub · 0bb2c1cd · 64e6ac2b · 70bc0ed0
20 changed files
--- a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
@@ -71,7 +71,128 @@ extension MTLDevice {
    return buffer!
  }
+  /// Reads every array slice of `texture` (mipmap level 0) back to the CPU and calls `cb` once
+  /// for each stored component.
+  ///
+  /// Every texel is read as four consecutive values of type `P`.
+  /// NOTE(review): this assumes the texture's pixel format has four components whose storage
+  /// type matches `P` — confirm at the call sites.
+  ///
+  /// - Parameters:
+  ///   - texture: Texture to read; all `arrayLength` slices are visited in order.
+  ///   - cb: Receives `[x, y, slice, component]` and the value stored at that position.
+  func texture2tensor_loop<P>(texture: MTLTexture, cb: ([Int], P)->Void) -> Void {
+    let componentsPerRow = texture.width * 4
+    let componentsPerSlice = componentsPerRow * texture.height
+    let bpR = componentsPerRow * MemoryLayout<P>.size
+    let bpI = texture.height * bpR
+    let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: 1))
+    // `allocate(capacity:)` counts instances of `P`, not bytes, so one slice needs exactly
+    // `componentsPerSlice` elements. A single buffer is reused for all slices and released on
+    // exit so that repeated debug dumps do not leak memory.
+    let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: componentsPerSlice)
+    defer { pointer.deallocate() }
+    for i in 0..<texture.arrayLength {
+      texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i)
+      for tx in 0..<componentsPerSlice {
+        // tx = (y * width + x) * 4 + component
+        let y = tx / componentsPerRow
+        let k = tx % componentsPerRow
+        cb([k / 4, y, i, k % 4], pointer[tx])
+      }
+    }
+  }
+  /// Reads `texture` back into a flat array laid out according to the rank-3 shape `dim`.
+  ///
+  /// `dim` is right-aligned into a 4-entry shape whose leading entry is 1, and `transpose`
+  /// selects which of those entries each texture-side axis holds. The asserts require the
+  /// texture to be `layout[3]` wide and `layout[2]` high, with axis 1 packed four values per
+  /// array slice.
+  ///
+  /// The zero fill is produced by force-casting a `Float32`, so `P` has to be `Float32`.
+  ///
+  /// - Parameters:
+  ///   - texture: Source texture.
+  ///   - dim: Shape of the returned tensor; must have exactly three entries.
+  ///   - transpose: For texture-side axis `k`, the index of the tensor axis it stores.
+  /// - Returns: `dim.reduce(1, *)` values; positions the texture does not cover remain zero.
+  func texture2tensor_3<P>(texture: MTLTexture, dim: [Int],  transpose: [Int] = [0, 1, 2, 3]) -> [P] {
+    // Right-align `dim` inside a 4-entry shape, padding the leading axes with 1.
+    var padded: [Int] = [1, 1, 1, 1]
+    let lead = 4 - dim.count
+    for (offset, extent) in dim.enumerated() {
+      padded[lead + offset] = extent
+    }
+    let total = dim.reduce(1, *)
+    var result = [P](repeating: Float32(0.0) as! P, count: total)
+    let layout: [Int] = transpose.map { padded[$0] }
+    assert(dim.count == 3)
+    assert(texture.width == layout[3])
+    assert(texture.height == layout[2])
+    assert(layout[0] == 1)
+    assert(texture.arrayLength == (layout[1] + 3) / 4)
+    texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
+      // Texture-side coordinate: axis 1 is spread over (slice, component), axis 2 is y, axis 3 is x.
+      let stored: [Int] = [0, xyzn[2] * 4 + xyzn[3], xyzn[1], xyzn[0]]
+      var logical: [Int] = [0, 0, 0, 0]
+      for (axis, coord) in stored.enumerated() {
+        logical[transpose[axis]] = coord
+      }
+      // Row-major offset within the padded shape.
+      let flat = ((logical[0] * padded[1] + logical[1]) * padded[2] + logical[2]) * padded[3] + logical[3]
+      if flat < total {
+        result[flat] = v
+      }
+    }
+    return result
+  }
+  /// Reads `texture` back into a flat array laid out according to the rank-2 shape `dim`.
+  ///
+  /// `dim` is right-aligned into a 4-entry shape whose two leading entries are 1, and
+  /// `transpose` selects which of those entries each texture-side axis holds. The asserts
+  /// require a single-slice texture that is `layout[2]` high, with the last axis packed four
+  /// values per texel along x.
+  ///
+  /// The zero fill is produced by force-casting a `Float32`, so `P` has to be `Float32`.
+  ///
+  /// - Parameters:
+  ///   - texture: Source texture.
+  ///   - dim: Shape of the returned tensor; must have exactly two entries.
+  ///   - transpose: For texture-side axis `k`, the index of the tensor axis it stores.
+  /// - Returns: `dim.reduce(1, *)` values; positions the texture does not cover remain zero.
+  func texture2tensor_2<P>(texture: MTLTexture, dim: [Int],  transpose: [Int] = [0, 1, 2, 3]) -> [P] {
+    // Right-align `dim` inside a 4-entry shape, padding the leading axes with 1.
+    var padded: [Int] = [1, 1, 1, 1]
+    let lead = 4 - dim.count
+    for (offset, extent) in dim.enumerated() {
+      padded[lead + offset] = extent
+    }
+    let total = dim.reduce(1, *)
+    var result = [P](repeating: Float32(0.0) as! P, count: total)
+    let layout: [Int] = transpose.map { padded[$0] }
+    assert(dim.count == 2)
+    let packedWidth = (layout[3] + 3) / 4
+    assert(texture.width == packedWidth)
+    assert(texture.height == layout[2])
+    assert(layout[0] == 1)
+    assert(layout[1] == 1)
+    assert(texture.arrayLength == 1)
+    texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
+      // Texture-side coordinate: axis 2 is y, axis 3 is spread over (x, component).
+      let stored: [Int] = [0, 0, xyzn[1], xyzn[0] * 4 + xyzn[3]]
+      var logical: [Int] = [0, 0, 0, 0]
+      for (axis, coord) in stored.enumerated() {
+        logical[transpose[axis]] = coord
+      }
+      // Row-major offset within the padded shape.
+      let flat = ((logical[0] * padded[1] + logical[1]) * padded[2] + logical[2]) * padded[3] + logical[3]
+      if flat < total {
+        result[flat] = v
+      }
+    }
+    return result
+  }
+  /// Reads `texture` back into a flat array laid out according to the rank-1 shape `dim`.
+  ///
+  /// `dim` is right-aligned into a 4-entry shape whose three leading entries are 1, and
+  /// `transpose` selects which of those entries each texture-side axis holds. The asserts
+  /// require a single-slice, one-row texture with the values packed four per texel along x.
+  ///
+  /// The zero fill is produced by force-casting a `Float32`, so `P` has to be `Float32`.
+  ///
+  /// - Parameters:
+  ///   - texture: Source texture.
+  ///   - dim: Shape of the returned tensor; must have exactly one entry.
+  ///   - transpose: For texture-side axis `k`, the index of the tensor axis it stores.
+  /// - Returns: `dim.reduce(1, *)` values; positions the texture does not cover remain zero.
+  func texture2tensor_1<P>(texture: MTLTexture, dim: [Int],  transpose: [Int] = [0, 1, 2, 3]) -> [P] {
+    // Right-align `dim` inside a 4-entry shape, padding the leading axes with 1.
+    var padded: [Int] = [1, 1, 1, 1]
+    let lead = 4 - dim.count
+    for (offset, extent) in dim.enumerated() {
+      padded[lead + offset] = extent
+    }
+    let total = dim.reduce(1, *)
+    var result = [P](repeating: Float32(0.0) as! P, count: total)
+    let layout: [Int] = transpose.map { padded[$0] }
+    assert(dim.count == 1)
+    let packedWidth = (layout[3] + 3) / 4
+    assert(texture.width == packedWidth)
+    assert(texture.height == 1)
+    assert(layout[0] == 1)
+    assert(layout[1] == 1)
+    assert(layout[2] == 1)
+    assert(texture.arrayLength == 1)
+    texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
+      // Texture-side coordinate: only axis 3 varies, spread over (x, component).
+      let stored: [Int] = [0, 0, 0, xyzn[0] * 4 + xyzn[3]]
+      var logical: [Int] = [0, 0, 0, 0]
+      for (axis, coord) in stored.enumerated() {
+        logical[transpose[axis]] = coord
+      }
+      // Row-major offset within the padded shape.
+      let flat = ((logical[0] * padded[1] + logical[1]) * padded[2] + logical[2]) * padded[3] + logical[3]
+      if flat < total {
+        result[flat] = v
+      }
+    }
+    return result
+  }
  func texture2tensor<P>(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] {
+    if dim.count == 3 {
+      return texture2tensor_3(texture: texture, dim: dim, transpose: transpose)
+    } else if dim.count == 2 {
+      return texture2tensor_2(texture: texture, dim: dim, transpose: transpose)
+    } else if dim.count == 1 {
+      return texture2tensor_1(texture: texture, dim: dim, transpose: transpose)
+    }
    var tdim: [Int] = [1, 1, 1, 1]
    for i in 0..<dim.count {
      tdim[4 - dim.count + i] = dim[i]
@@ -84,30 +205,19 @@ extension MTLDevice {
    assert(texture.height == ndim[1])
    assert(texture.arrayLength == (ndim[0] * ndim[3] + 3) / 4)
-    let bpR = ndim[2] * 4 * MemoryLayout<P>.size
+    texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
-    let bpI = ndim[1] * bpR
+      var tg: [Int] = [0, 0, 0, 0]
-    let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: ndim[2], height: ndim[1], depth: 1))
+      tg[1] = xyzn[1]
-    for i in 0..<texture.arrayLength {
+      tg[2] = xyzn[0]
-      let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: ndim[1] * ndim[2] * 4 * MemoryLayout<P>.size)
+      tg[0] = (xyzn[2] * 4 + xyzn[3]) / ndim[3]
-      texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i)
+      tg[3] = (xyzn[2] * 4 + xyzn[3]) % ndim[3]
+      var ig: [Int] = [0, 0, 0, 0]
-      for h in 0..<ndim[1] {
+      for k in 0..<4 {
-        for w in 0..<ndim[2] {
+        ig[transpose[k]] = tg[k]
-          for k in 0..<4 {
+      }
-            let tx = (h * ndim[2] + w) * 4 + k
+      let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
-            let n = (i * 4 + k) / ndim[3]
+      if ix < count {
-            let c = (i * 4 + k) % ndim[3]
+        tensor[ix] = v
-            let jg = [n, h, w, c]
-            var ig = [0, 0, 0, 0]
-            for d in 0..<4 {
-              ig[transpose[d]] = jg[d]
-            }
-            let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
-            if ix < count {
-              tensor[ix] = pointer[tx]
-            }
-          }
-        }
      }
    }
    return tensor

--- a/metal/paddle-mobile/paddle-mobile/MobilenetSSD_AR.swift
+++ b/metal/paddle-mobile/paddle-mobile/MobilenetSSD_AR.swift
@@ -30,7 +30,7 @@ public class MobileNet_ssd_AR: Net{
  class MobilenetssdPreProccess: CusomKernel {
    init(device: MTLDevice) {
      let s = CusomKernel.Shape.init(inWidth: 160, inHeight: 160, inChannel: 3)
-      super.init(device: device, inFunctionName: "mobilent_ar_preprocess_half", outputDim: s, usePaddleMobileLib: false)
+      super.init(device: device, inFunctionName: "mobilent_ar_preprocess", outputDim: s, usePaddleMobileLib: false)
    }
  }

--- a/metal/paddle-mobile/paddle-mobile/Operators/BatchNormOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/BatchNormOp.swift
@@ -56,9 +56,11 @@ class BatchNormOp<P: PrecisionType>: Operator<BatchNormKernel<P>, BatchNormParam
      throw error
    }
  }
+  /// Debug helper: prints the op type, then reads the output texture back through
+  /// `texture2tensor` using the output's dims and transpose, and prints `strideArray()` of it.
+  func delogOutput() {
+    print(" \(type) output: ")
+    let output = para.output
+    let device = output.metalTexture!.device
+    let values: [Float32] = device.texture2tensor(
+      texture: output.metalTexture,
+      dim: output.tensorDim.dims,
+      transpose: output.transpose)
+    print(values.strideArray())
+  }
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/BilinearInterpOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/BilinearInterpOp.swift
@@ -53,15 +53,10 @@ class BilinearInterpOp<P: PrecisionType>: Operator<BilinearInterpKernel<P>, Bili
  func delogOutput() {
    print(" \(type) output: ")
-    let padToFourDim = para.output.padToFourDim
+    let device = para.output.metalTexture!.device
-    if para.output.transpose == [0, 1, 2, 3] {
+    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-      let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
+//    print(outputArray)
-      print(outputArray.strideArray())
+    print(outputArray.strideArray())
-    } else if para.output.transpose == [0, 2, 3, 1] {
-      print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3])).strideArray())
-    } else {
-      fatalError(" not implemet")
-    }
  }
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift
@@ -65,15 +65,10 @@ class ConcatOp<P: PrecisionType>: Operator<ConcatKernel<P>, ConcatParam<P>>, Run
  func delogOutput() {
    print(" \(type) output: ")
-    let padToFourDim = para.output.padToFourDim
-    if para.output.transpose == [0, 1, 2, 3] {
+    let device = para.output.metalTexture!.device
-      let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
+    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-      print(outputArray.strideArray())
+    print(outputArray.strideArray())
-    } else if para.output.transpose == [0, 2, 3, 1] {
-      print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3])).strideArray())
-    } else {
-      fatalError(" not implemet")
-    }
  }
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift
@@ -20,12 +20,13 @@ class BatchNormKernel<P: PrecisionType>: Kernel, Computable {
    let varianceP = param.variance.data.pointer
    let meanP = param.mean.data.pointer
    let scaleP = param.scale.data.pointer
-    let biasP = param.scale.data.pointer
+    let biasP = param.bias.data.pointer
    for i in 0..<count {
      let invStd = P(1 / (Float32(varianceP[i]) + param.epsilon).squareRoot())
      biasP[i] = biasP[i] - meanP[i] * invStd * scaleP[i]
      scaleP[i] = invStd * scaleP[i]
    }
    param.bias.initBuffer(device: device, precision: computePrecision)
    param.scale.initBuffer(device: device, precision: computePrecision)
    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BilinearInterpKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BilinearInterpKernel.swift
@@ -27,10 +27,16 @@ class BilinearInterpKernel<P: PrecisionType>: Kernel, Computable{
    encoder.setTexture(param.input.metalTexture, index: 0)
    encoder.setTexture(param.output.metalTexture, index: 1)
-    let ratio_h: Float32 = Float32(param.input.tensorDim.dims[2]) / Float32(param.output.tensorDim.dims[2])
+    var ratio_h: Float32 = 0
-    let ratio_w: Float32 = Float32(param.input.tensorDim.dims[3]) / Float32(param.output.tensorDim.dims[3])
+    var ratio_w: Float32 = 0
+    if param.output.tensorDim.dims[2] > 1 {
+      ratio_h = Float32(param.input.tensorDim.dims[2]-1) / Float32(param.output.tensorDim.dims[2]-1)
+    }
+    if param.output.tensorDim.dims[3] > 1 {
+      ratio_w = Float32(param.input.tensorDim.dims[3]-1) / Float32(param.output.tensorDim.dims[3]-1)
+    }
    var p = BilinearInterpMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w)
-    encoder.setBytes(&p, length: MemoryLayout<ConcatMetalParam>.size, index: 0)
+    encoder.setBytes(&p, length: MemoryLayout<BilinearInterpMetalParam>.size, index: 0)
    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
    encoder.endEncoding()
  }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormKernel.metal
@@ -17,14 +17,14 @@ using namespace metal;
 kernel void batchnorm(texture2d_array<float, access::read> inTexture [[texture(0)]],
                      texture2d_array<float, access::write> outTexture [[texture(1)]],
-                      const device float4 * newScale [[buffer(0)]],
+                      const device float4 * nscale [[buffer(0)]],
-                      const device float4 * newBias [[buffer(1)]],
+                      const device float4 * nbias [[buffer(1)]],
                      uint3 gid [[thread_position_in_grid]]) {
  if (gid.x >= outTexture.get_width() ||
      gid.y >= outTexture.get_height() ||
      gid.z >= outTexture.get_array_size()) return;
  const float4 input = inTexture.read(gid.xy, gid.z);
-  float4 output = input * newScale[gid.z] + newBias[gid.z];
+  float4 output = input * nscale[gid.z] + nbias[gid.z];
  outTexture.write(output, gid.xy, gid.z);
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.inc.metal
@@ -14,8 +14,8 @@ kernel void FUNC(bilinear_interp, P)(texture2d_array<P, access::read> input [[te
  if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
    r = input.read(gid.xy, gid.z);
  } else {
-    float w = gid.x * pm.ratio_w;
+    P w = gid.x * pm.ratio_w;
-    float h = gid.y * pm.ratio_h;
+    P h = gid.y * pm.ratio_h;
    uint w0 = w, h0 = h;
    uint w1 = w0 + 1, h1 = h0 + 1;
    P w1lambda = w - w0, h1lambda = h - h0;
@@ -26,7 +26,8 @@ kernel void FUNC(bilinear_interp, P)(texture2d_array<P, access::read> input [[te
    VECTOR(P, 4) r1 = input.read(uint2(w1, h0), gid.z);
    VECTOR(P, 4) r2 = input.read(uint2(w0, h1), gid.z);
    VECTOR(P, 4) r3 = input.read(uint2(w1, h1), gid.z);
-    r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);
+    r = h2lambda * (w2lambda * r0 + w1lambda * r1)
+      + h1lambda * (w2lambda * r2 + w1lambda * r3);
  }
  output.write(r, gid.xy, gid.z);
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.metal
@@ -16,8 +16,6 @@
 using namespace metal;
 struct bilinear_interp_param {
-//  int32_t out_h;
-//  int32_t out_w;
  float ratio_h;
  float ratio_w;
 };

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal
@@ -17,16 +17,16 @@ using namespace metal;
 inline void xyzn2abcd_1(int xyzn[4], int abcd[4]) {
-  abcd[0] = abcd[1] = abcd[2] = 1;
+  abcd[0] = abcd[1] = abcd[2] = 0;
  abcd[3] = xyzn[0] * 4 + xyzn[3];
 }
 inline void xyzn2abcd_2(int xyzn[4], int abcd[4]) {
-  abcd[0] = abcd[1] = 1;
+  abcd[0] = abcd[1] = 0;
  abcd[2] = xyzn[1];
  abcd[3] = xyzn[0] * 4 + xyzn[3];
 }
 inline void xyzn2abcd_3(int xyzn[4], int abcd[4]) {
-  abcd[0] = 1;
+  abcd[0] = 0;
  abcd[3] = xyzn[0];
  abcd[2] = xyzn[1];
  abcd[1] = xyzn[2] * 4 + xyzn[3];
@@ -40,15 +40,15 @@ inline void xyzn2abcd_4(int C, int xyzn[4], int abcd[4]) {
 }
 inline void abcd2xyzn_1(int abcd[4], int xyzn[4]) {
+  // Inverse of xyzn2abcd_1 (abcd[3] = xyzn[0] * 4 + xyzn[3]): the texel column goes to
+  // xyzn[0] and the component within the texel to xyzn[3]; xyzn[1] and xyzn[2] stay 0.
-  xyzn[1] = xyzn[2] = 1;
+  xyzn[1] = xyzn[2] = 0;
   xyzn[0] = abcd[3] / 4;
-  xyzn[1] = abcd[3] % 4;
+  xyzn[3] = abcd[3] % 4;
 }
 inline void abcd2xyzn_2(int abcd[4], int xyzn[4]) {
-  xyzn[2] = 1;
+  xyzn[2] = 0;
  xyzn[1] = abcd[2];
  xyzn[0] = abcd[3] / 4;
-  xyzn[1] = abcd[3] % 4;
+  xyzn[3] = abcd[3] % 4;
 }
 inline void abcd2xyzn_3(int abcd[4], int xyzn[4]) {
  xyzn[0] = abcd[3];

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.inc.metal
@@ -122,20 +122,20 @@ kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[tex
  int x = gid.x - pm.offset;
  if (x < 0) return;
  if (x < pm.vdim[0]) {
-    VECTOR(P, 4)  r = in0.read(gid.xy, gid.z);
+    VECTOR(P, 4) r = in0.read(gid.xy, gid.z);
    out.write(r, gid.xy, gid.z);
    return;
  }
  x -= pm.vdim[0];
  if (x < pm.vdim[1]) {
-    VECTOR(P, 4)  r = in1.read(uint2(x, gid.y), gid.z);
+    VECTOR(P, 4) r = in1.read(uint2(x, gid.y), gid.z);
    out.write(r, gid.xy, gid.z);
    return;
  }
 #if N >= 3
  x -= pm.vdim[1];
  if (x < pm.vdim[2]) {
-    VECTOR(P, 4)  r = in2.read(uint2(x, gid.y), gid.z);
+    VECTOR(P, 4) r = in2.read(uint2(x, gid.y), gid.z);
    out.write(r, gid.xy, gid.z);
    return;
  }
@@ -143,7 +143,7 @@ kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[tex
 #if N >= 4
  x -= pm.vdim[2];
  if (x < pm.vdim[3]) {
-    VECTOR(P, 4)  r = in3.read(uint2(x, gid.y), gid.z);
+    VECTOR(P, 4) r = in3.read(uint2(x, gid.y), gid.z);
    out.write(r, gid.xy, gid.z);
    return;
  }
@@ -151,7 +151,7 @@ kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[tex
 #if N >= 5
  x -= pm.vdim[3];
  if (x < pm.vdim[4]) {
-    VECTOR(P, 4)  r = in4.read(uint2(x, gid.y), gid.z);
+    VECTOR(P, 4) r = in4.read(uint2(x, gid.y), gid.z);
    out.write(r, gid.xy, gid.z);
    return;
  }
@@ -159,7 +159,7 @@ kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[tex
 #if N >= 6
  x -= pm.vdim[4];
  if (x < pm.vdim[5]) {
-    VECTOR(P, 4)  r = in5.read(uint2(x, gid.y), gid.z);
+    VECTOR(P, 4) r = in5.read(uint2(x, gid.y), gid.z);
    out.write(r, gid.xy, gid.z);
    return;
  }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.inc.metal
@@ -36,7 +36,6 @@ kernel void FUNC(reshape, RIN, ROUT, P)(texture2d_array<P, access::read> inTextu
    if (index < count) {
      index2abcd(lrp.idim, index, tabcd);
      trans(lrp.itrans, tabcd, iabcd);
-      abcd2xyzn(iC, iabcd, ixyzn);
 #if RIN == 4
      abcd2xyzn_4(iC, iabcd, ixyzn);
 #else

--- a/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
@@ -72,10 +72,21 @@ class PriorBoxOp<P: PrecisionType>: Operator<PriorBoxKernel<P>, PriorBoxParam<P>
    print(" \(type) output: ")
    // output
-    let outputArray = para.output.metalTexture.float32Array()
+//    let outputArray = para.output.metalTexture.float32Array()
-    print(outputArray)
+//    print(outputArray.strideArray())
+    let device = para.input.metalTexture!.device
+    let boxes:[Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: [2,0,1,3])
+    let variances:[Float32] = device.texture2tensor(texture: para.outputVariances.metalTexture!, dim: para.outputVariances.tensorDim.dims, transpose: [2,0,1,3])
+    print("boxes: ")
+    print(boxes.strideArray())
+    print("variances: ")
+    print(variances.strideArray())
    // output
-//    print(" \(type) output: ")
+    print(" \(type) output: ")
+    print(para.output.metalTexture.realNHWC(dim: (para.output.dim[0], para.output.dim[1], para.output.dim[2], para.output.dim[3])).strideArray())
+//    print(para.output.realNHWC().strideArray())
 //    let padToFourDim = para.output.padToFourDim
 //    if para.output.transpose == [0, 1, 2, 3] {
 //      let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]), texturePrecision: computePrecision)

--- a/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift
@@ -47,6 +47,9 @@ class ReluOp<P: PrecisionType>: Operator<ReluKernel<P>, ReluParam<P>>, Runable,
  func delogOutput() {
    print(" \(type) output: ")
    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+    let device = para.output.metalTexture!.device
+    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+    print(outputArray.strideArray())
  }
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift
@@ -69,15 +69,9 @@ class ReshapeOp<P: PrecisionType>: Operator<ReshapeKernel<P>, ReshapeParam<P>>,
  }
  func delogOutput() {
    print("reshape delog")
-//    let _: P? = para.input.metalTexture.logDesc(header: "reshape input: ", stridable: false)
+    let device = para.output.metalTexture!.device
-//
+    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-//    let _: P? = para.output.metalTexture.logDesc(header: "reshape output: ", stridable: false)
-    let padToFourDim = para.output.padToFourDim
-    let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
-//    print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3])).strideArray())
    print(outputArray.strideArray())
+//    print(outputArray)
  }
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/SplitOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/SplitOp.swift
@@ -64,6 +64,11 @@ class SplitOp<P: PrecisionType>: Operator<SplitKernel<P>, SplitParam<P>>, Runabl
  func delogOutput() {
    print(" \(type) output: ")
+    let device = para.input.metalTexture!.device
+    for out in para.outputList {
+      let arr: [Float32] = device.texture2tensor(texture: out.metalTexture, dim: out.tensorDim.dims, transpose: out.transpose)
+      print(arr.strideArray())
+    }
  }
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift
@@ -57,6 +57,9 @@ class TransposeOp<P: PrecisionType>: Operator<TransposeKernel<P>, TransposeParam
    } else {
      print(" not implement")
    }
+    let device = para.output.metalTexture!.device
+    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+    print(outputArray.strideArray())
  }
 }

--- a/metal/paddle-mobile/paddle-mobile/PaddleMobile.swift
+++ b/metal/paddle-mobile/paddle-mobile/PaddleMobile.swift
@@ -16,7 +16,7 @@ import Foundation
 class ScaleKernel: CusomKernel {
  init(device: MTLDevice, shape: Shape) {
-    super.init(device: device, inFunctionName: "scale_half", outputDim: shape, usePaddleMobileLib: false)
+    super.init(device: device, inFunctionName: "scale", outputDim: shape, usePaddleMobileLib: false)
  }
 }

--- a/metal/paddle-mobile/paddle-mobile/framework/Executor.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Executor.swift
@@ -14,10 +14,10 @@
 import Foundation
-let testTo = 2
+let testTo = 113
 var isTest = false
-let computePrecision: ComputePrecision = .Float16
+let computePrecision: ComputePrecision = .Float32
 public class ResultHolder {
  public let dim: [Int]
@@ -120,10 +120,10 @@ public class Executor<P: PrecisionType> {
      let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
      print(inputArr.strideArray())
-//      print(dim)
+      print(dim)
-//      writeToLibrary(fileName: "test_image_ssd_ar", array: inputArr)
+      writeToLibrary(fileName: "test_image_ssd_ar", array: inputArr)
-//
-//      print("write to library done")
+      print("write to library done")
 //      return
 //                  print(inputArr)
 //