Merge pull request #891 from dolphin8/metal

fix

Merge pull request #891 from dolphin8/metal
fix
5f9f24a3 · dolphin8 · GitHub · f9a87f61 · f70fc456 · 5f9f24a3
6 changed files
--- a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
@@ -145,7 +145,7 @@ extension MTLDevice {
    if value.count > 0 {
      var rcount: Int = (ndim[0] * ndim[3] + 3) / 4
      rcount = rcount * 4 * ndim[1] * ndim[2]
-      var nvalue: [P] = .init(repeating: Float32(0.0) as! P, count: rcount)
+      var nvalue: [Float32] = .init(repeating: 0.0, count: rcount)
      for i0 in 0..<tdim[0] {
        for i1 in 0..<tdim[1] {
@@ -158,19 +158,32 @@ extension MTLDevice {
              let k = jg[0] * ndim[3] + jg[3]
              let jx = ((k / 4) * ndim[1] * ndim[2] * 4) + (jg[1] * ndim[2] * 4) + (jg[2] * 4) + (k % 4)
-              nvalue[jx] = value[ix]
+              nvalue[jx] = value[ix] as! Float32
            }
          }
        }
      }
-      let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer(mutating: nvalue)
      let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: ndim[2], height: ndim[1], depth: 1))
-      let bpR = ndim[2] * 4 * MemoryLayout<P>.size
+      if inComputePrecision == .Float16 {
-      let bpI = ndim[1] * bpR
+        let xvalue: [UInt16] = .init(repeating: 0, count: rcount)
-      for i in 0..<textureDesc.arrayLength {
+        let pointer: UnsafeMutablePointer<Float32> = UnsafeMutablePointer(mutating: nvalue)
-        let p = pointer + texture.width * texture.height * 4 * i
+        let outputP: UnsafeMutablePointer<UInt16> = UnsafeMutablePointer(mutating: xvalue)
-        texture.replace(region: region, mipmapLevel: 0, slice: i, withBytes: p, bytesPerRow: bpR, bytesPerImage: bpI)
+        float32ToFloat16(input: pointer, output: outputP, count: rcount)
+        let bpR = ndim[2] * 4 * 2
+        let bpI = ndim[1] * bpR
+        for i in 0..<textureDesc.arrayLength {
+          let p = outputP + texture.width * texture.height * 4 * i
+          texture.replace(region: region, mipmapLevel: 0, slice: i, withBytes: p, bytesPerRow: bpR, bytesPerImage: bpI)
+        }
+      } else {
+        let pointer: UnsafeMutablePointer<Float32> = UnsafeMutablePointer(mutating: nvalue)
+        let bpR = ndim[2] * 4 * MemoryLayout<P>.size
+        let bpI = ndim[1] * bpR
+        for i in 0..<textureDesc.arrayLength {
+          let p = pointer + texture.width * texture.height * 4 * i
+          texture.replace(region: region, mipmapLevel: 0, slice: i, withBytes: p, bytesPerRow: bpR, bytesPerImage: bpI)
+        }
      }
    }
    return texture

--- a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
@@ -31,9 +31,20 @@ class ElementwiseAddParam<P: PrecisionType>: OpParam {
      let device = inputX.metalTexture!.device
      inputY = Texture.init(device: device, inDim: tensorY.dim)
      let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel()))
-      inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims)
+      inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: computePrecision)
    }
+//    required init(device: MTLDevice, param: ElementwiseAddParam<P>) {
+//      param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
+//      if computePrecision == .Float32 {
+//        super.init(device: device, inFunctionName: "elementwise_add")
+//      } else if computePrecision == .Float16 {
+//        super.init(device: device, inFunctionName: "elementwise_add_half")
+//      } else {
+//        fatalError()
+//      }
+//    }
    var offset = axis
    if axis == -1 {
      offset = inputX.tensorDim.cout() - inputY.tensorDim.cout()
@@ -65,14 +76,8 @@ class ElementwiseAddOp<P: PrecisionType>: Operator<ElementwiseAddKernel<P>, Elem
  }
  func delogOutput() {
-//    print(" \(type) inputX: ")
-//    print(para.inputX.metalTexture.toTensor(dim: (n: para.inputX.tensorDim[0], c: para.inputX.tensorDim[1], h: para.inputX.tensorDim[2], w: para.inputX.tensorDim[3])).strideArray())
-//    print(" \(type) inputY: ")
-//    print(para.inputY.metalTexture.toTensor(dim: (n: para.inputY.tensorDim[0], c: para.inputY.tensorDim[1], h: para.inputY.tensorDim[2], w: para.inputY.tensorDim[3])).strideArray())
    print(" \(type) output: ")
+    print(para.output)
-    print(para.inputY)
    let padToFourDim = para.output.padToFourDim
    if para.output.transpose == [0, 1, 2, 3] {

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift
@@ -15,10 +15,9 @@
 import Foundation
 struct ElementwiseAddMetalParam {
-  var unsafe_one_dim: Int32 = 0
  var fast: Int32 = 0
  var axis: Int32 = 0
-  var yoff: Int32 = 0
+  var ylen: Int32 = 0
  var xdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
  var xtrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
  var ydim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
@@ -60,17 +59,12 @@ class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable {
    } else {
      emp.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis)
    }
-    emp.yoff = 4 - Int32(param.inputY.tensorDim.cout())
+    emp.ylen = Int32(param.inputY.tensorDim.cout())
    if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
 //      print("===> elementwise_add fast!!!")
      emp.fast = 1
    }
-    // TODO: 
-    if param.inputY.tensorDim.cout() == 1 {
-      emp.unsafe_one_dim = 1;
-    }
    encoder.setBytes(&emp, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0)
    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
    encoder.endEncoding()

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift
@@ -50,9 +50,15 @@ class ReshapeKernel<P: PrecisionType>: Kernel, Computable{
    encoder.setTexture(param.input.metalTexture, index: 0)
    encoder.setTexture(param.output.metalTexture, index: 1)
-    let id: [Int32] = (0..<4).map { Int32(param.input.dim[$0]) }
+    var id: [Int32] = [1, 1, 1, 1]
+    for i in 0..<param.input.tensorDim.cout() {
+      id[4-param.input.tensorDim.cout()+i] = Int32(param.input.tensorDim[i])
+    }
    let it: [Int32] = param.input.transpose.map { Int32($0) }
-    let od: [Int32] = (0..<4).map { Int32(param.output.dim[$0]) }
+    var od: [Int32] = [1, 1, 1, 1]
+    for i in 0..<param.output.tensorDim.cout() {
+      od[4-param.output.tensorDim.cout()+i] = Int32(param.output.tensorDim[i])
+    }
    let ot: [Int32] = param.output.transpose.map { Int32($0) }
    var rmp = ReshapeMetalParam.init(
      idim: (id[0], id[1], id[2], id[3]),

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal
@@ -18,10 +18,9 @@
 using namespace metal;
 struct ElementwiseAddParam {
-  int32_t unsafe_one_dim;
  int32_t fast;
  int32_t axis;
-  int32_t yoff;
+  int32_t ylen;
  int32_t xdim[4];
  int32_t xtrans[4];
  int32_t ydim[4];
@@ -37,23 +36,23 @@ kernel void elementwise_add(texture2d_array<float, access::read> inputX [[textur
      gid.y >= outTexture.get_height() ||
      gid.z >= outTexture.get_array_size()) return;
  float4 rx, ry;
-  if (pm.unsafe_one_dim == 1) {
-    rx = inputX.read(gid.xy, gid.z);
+  if (pm.fast == 1) {
-    ry = inputY.read(uint2(0, 0), gid.z);
-  } else if (pm.fast == 1) {
    rx = inputX.read(gid.xy, gid.z);
    ry = inputY.read(gid.xy, gid.z);
  } else {
    rx = inputX.read(gid.xy, gid.z);
    int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
-    int32_t y_abcd[4] = {1, 1, 1, 1}, y_xyzn[4];
+    int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
    int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
    int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
+    int32_t yshift = 4 - pm.ylen - pm.axis;
    for (int n = 0; n < 4; n++) {
+      x_xyzn[3] = n;
      xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd);
      invtrans(xtrans, x_abcd, t_abcd);
-      for (int k = pm.axis; k < (4 - pm.yoff); k++) {
+      for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) {
-        y_abcd[k+pm.yoff] = t_abcd[k];
+        y_abcd[yshift+k] = t_abcd[k];
      }
      trans(ytrans, y_abcd, t_abcd);
      abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
@@ -73,23 +72,22 @@ kernel void elementwise_add_half(texture2d_array<half, access::read> inputX [[te
      gid.y >= outTexture.get_height() ||
      gid.z >= outTexture.get_array_size()) return;
  half4 rx, ry;
-  if (pm.unsafe_one_dim == 1) {
-    rx = inputX.read(gid.xy, gid.z);
+  if (pm.fast == 1) {
-    ry = inputY.read(uint2(0, 0), gid.z);
-  } else if (pm.fast == 1) {
    rx = inputX.read(gid.xy, gid.z);
    ry = inputY.read(gid.xy, gid.z);
  } else {
    rx = inputX.read(gid.xy, gid.z);
    int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
-    int32_t y_abcd[4] = {1, 1, 1, 1}, y_xyzn[4];
+    int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
    int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
    int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
+    int32_t yshift = 4 - pm.ylen - pm.axis;
    for (int n = 0; n < 4; n++) {
      xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd);
      invtrans(xtrans, x_abcd, t_abcd);
-      for (int k = pm.axis; k < (4 - pm.yoff); k++) {
+      for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) {
-        y_abcd[k+pm.yoff] = t_abcd[k];
+        y_abcd[yshift+k] = t_abcd[k];
      }
      trans(ytrans, y_abcd, t_abcd);
      abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);

--- a/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift
@@ -73,10 +73,13 @@ class ReshapeOp<P: PrecisionType>: Operator<ReshapeKernel<P>, ReshapeParam<P>>,
  func delogOutput() {
    print("reshape delog")
 //    let _: P? = para.input.metalTexture.logDesc(header: "reshape input: ", stridable: false)
+//
+//    let _: P? = para.output.metalTexture.logDesc(header: "reshape output: ", stridable: false)
    let padToFourDim = para.output.padToFourDim
    let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
+//    print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3])).strideArray())
    print(outputArray.strideArray())
  }