Commit 882f5dae authored by liuruilong

Merge remote-tracking branch 'upstream/metal' into metal

@@ -25,8 +25,31 @@ struct ElementwiseAddMetalParam {
}
class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable {
var metalParam: ElementwiseAddMetalParam
required init(device: MTLDevice, param: ElementwiseAddParam<P>) {
param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
metalParam = ElementwiseAddMetalParam.init()
let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) }
let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) }
let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) }
let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) }
metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
if param.axis == -1 {
metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout())
} else {
metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis)
}
metalParam.ylen = Int32(param.inputY.tensorDim.cout())
if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
// print("===> elementwise_add fast!!!")
metalParam.fast = 1
}
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "elementwise_add")
} else if computePrecision == .Float16 {
@@ -40,32 +63,10 @@ class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
}
var emp = ElementwiseAddMetalParam.init()
encoder.setTexture(param.inputX.metalTexture, index: 0)
encoder.setTexture(param.inputY.metalTexture, index: 1)
encoder.setTexture(param.output.metalTexture, index: 2)
let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) }
let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) }
let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) }
let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) }
emp.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
emp.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
emp.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
emp.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
if param.axis == -1 {
emp.axis = 4 - Int32(param.inputY.tensorDim.cout())
} else {
emp.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis)
}
emp.ylen = Int32(param.inputY.tensorDim.cout())
if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
// print("===> elementwise_add fast!!!")
emp.fast = 1
}
encoder.setBytes(&emp, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0)
encoder.setBytes(&metalParam, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
......
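In the two hunks above, ElementwiseAddMetalParam is now assembled once in init (dims, transposes, axis, fast-path flag) and compute only binds the textures and the cached metalParam. The axis value follows the kernel's right-aligned 4-slot layout: -1 means Y lines up with the trailing dims of X. A minimal sketch of that rule, with effectiveAxis as an illustrative helper that is not part of this commit:

// Illustrative helper (not part of this commit) mirroring the axis rule above.
// xRank / yRank are the original tensor ranks; axis is the elementwise_add
// axis attribute, where -1 means "align Y against the trailing dims of X".
func effectiveAxis(xRank: Int, yRank: Int, axis: Int) -> Int32 {
    if axis == -1 {
        return 4 - Int32(yRank)
    }
    return 4 - Int32(xRank) + Int32(axis)
}

// Example: X is rank 4, Y is a rank-1 per-channel tensor added at axis 1:
// effectiveAxis(xRank: 4, yRank: 1, axis: 1) == 1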
@@ -25,24 +25,10 @@ struct PoolMetalParam {
}
class PoolKernel<P: PrecisionType>: Kernel, Computable{
var metalParam: PoolMetalParam
required init(device: MTLDevice, param: PoolParam<P>) {
param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "pool")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "pool_half")
} else {
fatalError()
}
}
func compute(commandBuffer: MTLCommandBuffer, param: PoolParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
var poolType: Int32
switch param.poolType {
case "max":
@@ -50,9 +36,9 @@ class PoolKernel<P: PrecisionType>: Kernel, Computable{
case "avg":
poolType = 1
default:
throw PaddleMobileError.predictError(message: " unknown pooltype " + param.poolType)
fatalError()
}
var pmp = PoolMetalParam.init(
metalParam = PoolMetalParam.init(
ksizeX: param.ksize[0],
ksizeY: param.ksize[1],
strideX: param.stride[0],
@@ -61,7 +47,24 @@ class PoolKernel<P: PrecisionType>: Kernel, Computable{
paddingY: param.padding[1],
poolType: poolType
)
encoder.setBytes(&pmp, length: MemoryLayout<PoolMetalParam>.size, index: 0)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "pool")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "pool_half")
} else {
fatalError()
}
}
func compute(commandBuffer: MTLCommandBuffer, param: PoolParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<PoolMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
......
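The pool kernel gets the same treatment: PoolMetalParam is built once in init from ksize, stride and padding, the "avg" pool type maps to 1, and an unknown pool type now hits fatalError() because the non-throwing init replaces the old throw. For reference, ksize, stride and padding determine the output extent via the usual floor-mode pooling size rule; pooledExtent below is an illustrative helper, not code from this commit:

// Illustrative helper (not from this commit): the common floor-mode
// pooling output-size rule that ksize / stride / padding feed into.
func pooledExtent(input: Int, ksize: Int, stride: Int, padding: Int) -> Int {
    return (input + 2 * padding - ksize) / stride + 1
}

// Example: a 112-wide feature map with 2x2 pooling, stride 2, padding 0 gives 56.
// pooledExtent(input: 112, ksize: 2, stride: 2, padding: 0) == 56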
@@ -28,8 +28,27 @@ struct ReshapeTestParam: TestParam {
}
class ReshapeKernel<P: PrecisionType>: Kernel, Computable{
var metalParam: ReshapeMetalParam
required init(device: MTLDevice, param: ReshapeParam<P>) {
param.output.initTexture(device: device, computePrecision: computePrecision)
var id: [Int32] = [1, 1, 1, 1]
for i in 0..<param.input.tensorDim.cout() {
id[4-param.input.tensorDim.cout()+i] = Int32(param.input.tensorDim[i])
}
let it: [Int32] = param.input.transpose.map { Int32($0) }
var od: [Int32] = [1, 1, 1, 1]
for i in 0..<param.output.tensorDim.cout() {
od[4-param.output.tensorDim.cout()+i] = Int32(param.output.tensorDim[i])
}
let ot: [Int32] = param.output.transpose.map { Int32($0) }
metalParam = ReshapeMetalParam.init(
idim: (id[0], id[1], id[2], id[3]),
itrans: (it[0], it[1], it[2], it[3]),
odim: (od[0], od[1], od[2], od[3]),
otrans: (ot[0], ot[1], ot[2], ot[3])
)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "reshape")
} else if computePrecision == .Float16 {
@@ -40,6 +59,12 @@ class ReshapeKernel<P: PrecisionType>: Kernel, Computable{
}
required init(device: MTLDevice, testParam: ReshapeTestParam) {
metalParam = ReshapeMetalParam.init(
idim: (0, 0, 0, 0),
itrans: (0, 0, 0, 0),
odim: (0, 0, 0, 0),
otrans: (0, 0, 0, 0)
)
super.init(device: device, inFunctionName: "reshape")
}
@@ -50,23 +75,8 @@ class ReshapeKernel<P: PrecisionType>: Kernel, Computable{
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
var id: [Int32] = [1, 1, 1, 1]
for i in 0..<param.input.tensorDim.cout() {
id[4-param.input.tensorDim.cout()+i] = Int32(param.input.tensorDim[i])
}
let it: [Int32] = param.input.transpose.map { Int32($0) }
var od: [Int32] = [1, 1, 1, 1]
for i in 0..<param.output.tensorDim.cout() {
od[4-param.output.tensorDim.cout()+i] = Int32(param.output.tensorDim[i])
}
let ot: [Int32] = param.output.transpose.map { Int32($0) }
var rmp = ReshapeMetalParam.init(
idim: (id[0], id[1], id[2], id[3]),
itrans: (it[0], it[1], it[2], it[3]),
odim: (od[0], od[1], od[2], od[3]),
otrans: (ot[0], ot[1], ot[2], ot[3])
)
encoder.setBytes(&rmp, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
encoder.setBytes(&metalParam, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
......
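ReshapeMetalParam moves into init as well. The two loops above right-align the input and output tensorDim values into a fixed 4-element layout, filling the leading slots with 1. A self-contained sketch of that padding step, with padTo4 as an illustrative name rather than an API from this repository:

// Illustrative helper (not part of this commit) matching the right-aligned
// padding loops above: a rank-n shape (n <= 4) is packed into the trailing
// slots of a fixed 4-element layout, with leading slots left as 1.
func padTo4(_ dims: [Int]) -> (Int32, Int32, Int32, Int32) {
    var d: [Int32] = [1, 1, 1, 1]
    for i in 0..<dims.count {
        d[4 - dims.count + i] = Int32(dims[i])
    }
    return (d[0], d[1], d[2], d[3])
}

// Example: padTo4([3, 224]) == (1, 1, 3, 224)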
@@ -21,8 +21,13 @@ struct SoftmaxMetalParam {
class SoftmaxKernel<P: PrecisionType>: Kernel, Computable{
var metalParam: SoftmaxMetalParam
required init(device: MTLDevice, param: SoftmaxParam<P>) {
param.output.initTexture(device: device, computePrecision: computePrecision)
metalParam = SoftmaxMetalParam.init(
N: Int32(param.input.tensorDim[0]),
K: Int32(param.input.tensorDim[1])
)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "softmax")
} else if computePrecision == .Float16 {
@@ -38,13 +43,7 @@ class SoftmaxKernel<P: PrecisionType>: Kernel, Computable{
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
var smp = SoftmaxMetalParam.init(
N: Int32(param.input.tensorDim[0]),
K: Int32(param.input.tensorDim[1])
)
encoder.setBytes(&smp, length: MemoryLayout<SoftmaxMetalParam>.size, index: 0)
encoder.setBytes(&metalParam, length: MemoryLayout<SoftmaxMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
......
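SoftmaxMetalParam reduces to two numbers taken from the input's tensorDim: N rows of K logits each, which the kernel normalizes row by row. A CPU reference of that computation, for illustration only (this is not the Metal kernel):

import Foundation

// CPU reference (illustration only) of row-wise softmax over an N x K matrix:
// out[n][k] = exp(x[n][k]) / sum_j exp(x[n][j])
func softmaxRows(_ x: [[Float]]) -> [[Float]] {
    return x.map { row in
        let m = row.max() ?? 0                 // subtract the row max for numerical stability
        let e = row.map { exp($0 - m) }
        let s = e.reduce(0, +)
        return e.map { $0 / s }
    }
}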
@@ -57,13 +57,7 @@ class TransposeKernel<P: PrecisionType>: Kernel, Computable, Testable {
invT[v] = i
}
var axis: [Int] = [0, 1, 2, 3]
// var doNothing = false
// if param.axis.count == param.input.transpose.count {
// doNothing = param.axis == param.input.transpose.map { Int32($0) }
// }
for i in 0..<param.axis.count {
axis[4-param.axis.count+i] = 4 - param.axis.count + Int(param.axis[i])
}
@@ -72,9 +66,9 @@ class TransposeKernel<P: PrecisionType>: Kernel, Computable, Testable {
tmp.iC = Int32(param.input.dim[param.input.transpose[3]])
tmp.oC = Int32(param.output.dim[3])
if realAxis == [0, 1, 2, 3] {
print("====> transpose! FAST :)")
// print("====> transpose! FAST :)")
} else {
print("====> transpose! SLOW :(")
// print("====> transpose! SLOW :(")
}
metalParam = tmp
}
......
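The transpose kernel first inverts the input's stored transpose (invT[v] = i), pads param.axis into a 4-element realAxis, and treats the identity permutation [0, 1, 2, 3] as a fast path; the debug prints for both paths are commented out here. A small sketch of the inversion step, with inversePermutation as an illustrative helper that does not exist in the repository:

// Illustrative helper (not part of this commit) for the inverse-permutation
// step above: if perm maps source axis i to destination axis perm[i], the
// inverse maps each destination axis back to its source.
func inversePermutation(_ perm: [Int]) -> [Int] {
    var inv = [Int](repeating: 0, count: perm.count)
    for (i, v) in perm.enumerated() {
        inv[v] = i
    }
    return inv
}

// Example: inversePermutation([0, 2, 3, 1]) == [0, 3, 1, 2]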
@@ -84,6 +84,7 @@ kernel void elementwise_add_half(texture2d_array<half, access::read> inputX [[te
int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
int32_t yshift = 4 - pm.ylen - pm.axis;
for (int n = 0; n < 4; n++) {
x_xyzn[3] = n;
xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd);
invtrans(xtrans, x_abcd, t_abcd);
for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) {
......
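In the half-precision shader, each output texel carries four channel slots (n = 0..3); xyzn2abcd converts the texture coordinate plus slot into a padded 4-d index, invtrans undoes X's transpose, and yshift = 4 - ylen - axis relocates the dims covered by Y into its right-aligned shape. The hunk is truncated before the lookup itself, so the sketch below is a hypothetical CPU-side illustration of that broadcast indexing, assuming the loop copies the dims in axis..<axis+ylen into Y's trailing slots:

// Hypothetical CPU-side illustration (the loop body is assumed, since the
// shader hunk is truncated): given X's padded 4-d index, pick the Y index by
// keeping only the ylen dims starting at axis, shifted into Y's 4-slot shape.
func broadcastYIndex(xIndex: [Int], axis: Int, ylen: Int) -> [Int] {
    let yshift = 4 - ylen - axis               // same shift the shader computes
    var yIndex = [0, 0, 0, 0]
    for k in axis..<(axis + ylen) {
        yIndex[k + yshift] = xIndex[k]
    }
    return yIndex
}

// Example: axis == 1, ylen == 1 (per-channel Y): [n, c, h, w] maps to [0, 0, 0, c]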