Add dilation support to the Metal convolution kernels

6ec031ff · liuruilong · ee6ef4d9 · 6ec031ff · 6ec031ff · 6ec031ff
18 changed files
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
@@ -26,7 +26,7 @@ let modelHelperMap: [SupportModel : Net] = [.mobilenet_ssd : MobileNet_ssd_hand.
 enum SupportModel: String{
  //  case mobilenet = "mobilenet"
  case mobilenet_ssd = "mobilenetssd"
-  case genet          = "enet"
+  case genet          = "genet"
  static func supportedModels() -> [SupportModel] {
    //.mobilenet,
    return [.mobilenet_ssd ,.genet]
@@ -79,7 +79,7 @@ class ViewController: UIViewController {
      return
    }
    do {
-      let max = 1
+      let max = 10
      let startDate = Date.init()
      for i in 0..<max {
        try net.predict(inTexture: inTexture) { [weak self] (result) in
@@ -87,6 +87,7 @@ class ViewController: UIViewController {
            fatalError()
          }
          
+          print(result.resultArray)
          if i == max - 1 {
            let time = Date.init().timeIntervalSince(startDate)
            DispatchQueue.main.async {

--- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
@@ -699,6 +699,7 @@
 					"@executable_path/Frameworks",
 					"@loader_path/Frameworks",
 				);
+				MACH_O_TYPE = mh_dylib;
 				MTL_LANGUAGE_REVISION = UseDeploymentTarget;
 				PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile";
 				PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
@@ -727,6 +728,7 @@
 					"@executable_path/Frameworks",
 					"@loader_path/Frameworks",
 				);
+				MACH_O_TYPE = mh_dylib;
 				MTL_LANGUAGE_REVISION = UseDeploymentTarget;
 				PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile";
 				PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";

--- a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
@@ -342,7 +342,7 @@ public extension MTLTexture {
  
  // n c h w - dim
  func toTensor(dim: (n: Int, c: Int, h: Int, w: Int)) -> [Float32] {
-    print("origin dim: \(dim)")
+//    print("origin dim: \(dim)")
    print("texture: ")
    print(self)
    

--- a/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift
@@ -314,7 +314,7 @@ public class PaddleMobileUnitTest {
        let offsetX = filterSize.width/2 - paddings.0
        let offsetY = filterSize.height/2 - paddings.1
        
-        let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), paddedZ: UInt16(paddings.0))
+        let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), paddedZ: UInt16(paddings.0), dilationX: UInt16(1), dilationY: UInt16(1))
        
        let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize)
        

--- a/metal/paddle-mobile/paddle-mobile/Executor.swift
+++ b/metal/paddle-mobile/paddle-mobile/Executor.swift
@@ -14,7 +14,7 @@

 import Foundation

-let testTo = 12
+let testTo = 54

 public class ResultHolder<P: PrecisionType> {
  public let dim: [Int]
@@ -62,7 +62,7 @@ public class Executor<P: PrecisionType> {
    queue = inQueue
    for block in inProgram.programDesc.blocks {
      //block.ops.count
-      for i in 0..<testTo {
+      for i in 0..<block.ops.count {
        let op = block.ops[i]
        do {
          let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
@@ -124,13 +124,13 @@ public class Executor<P: PrecisionType> {
      //            print(stridableInput)
      
      //            let _: Flo? = input.logDesc(header: "input: ", stridable: true)
-      for i in 0..<self.ops.count {
-        let op = self.ops[i]
-        print(" 第 \(i) 个 op: ")
-        op.delogOutput()
-      }
-      
-      return
+//      for i in 0..<self.ops.count {
+//        let op = self.ops[i]
+//        print(" 第 \(i) 个 op: ")
+//        op.delogOutput()
+//      }
+//      
+//      return
      
      let afterDate = Date.init()
     
@@ -146,7 +146,6 @@ public class Executor<P: PrecisionType> {
        }), inElapsedTime: afterDate.timeIntervalSince(beforeDate))
      }

-
      completionHandle(resultHolder)
    }
    buffer.commit()

--- a/metal/paddle-mobile/paddle-mobile/Loader.swift
+++ b/metal/paddle-mobile/paddle-mobile/Loader.swift
@@ -70,8 +70,8 @@ public class Loader<P: PrecisionType> {
       */
      
      //现在模型传入模型为  Float 类型, 这块应该根据模型来
-//            let tmpCapacity = MemoryLayout<Float>.size * tensor.numel()
-//            let tmpPointer = UnsafeMutablePointer<Float>.allocate(capacity: tmpCapacity);
+      //            let tmpCapacity = MemoryLayout<Float>.size * tensor.numel()
+      //            let tmpPointer = UnsafeMutablePointer<Float>.allocate(capacity: tmpCapacity);
      let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file)
      
      guard bytesRead == tensor.data.size else {
@@ -79,12 +79,12 @@ public class Loader<P: PrecisionType> {
      }
      
      // TODO: use script to convert
-//            let bytesRead = fread(tmpPointer, 1, tmpCapacity, file)
-//            for i in 0..<tensor.numel() {
-//                tensor.data[i] = P.init(inFloat: tmpPointer[i])
-//            }
-//            tmpPointer.deinitialize(count: tmpCapacity)
-//            tmpPointer.deallocate()
+      //            let bytesRead = fread(tmpPointer, 1, tmpCapacity, file)
+      //            for i in 0..<tensor.numel() {
+      //                tensor.data[i] = P.init(inFloat: tmpPointer[i])
+      //            }
+      //            tmpPointer.deinitialize(count: tmpCapacity)
+      //            tmpPointer.deallocate()
      
      nowIndex += bytesRead
    }
@@ -95,6 +95,7 @@ public class Loader<P: PrecisionType> {
  }
  /// Creates a loader. The initializer body is empty, so no state is set up here.
  public init(){}
  public func load(device: MTLDevice, modelPath: String, paraPath: String) throws -> Program{
+    
    guard let modelData = try? Data.init(contentsOf: URL.init(fileURLWithPath: modelPath)) else {
      throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !")
    }
@@ -120,6 +121,7 @@ public class Loader<P: PrecisionType> {
      guard let firstOp = block.ops.first, let lastOp = block.ops.last else {
        throw PaddleMobileError.loaderError(message: "at least two operator")
      }
+      
      guard firstOp.type == gFeedType, lastOp.type == gFetchType else {
        throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch")
      }
@@ -158,7 +160,7 @@ public class Loader<P: PrecisionType> {
                throw error
              }
              tensor.convert(to: DataLayout.NHWC())
-//                            tensor.initBuffer(device: device)
+              //                            tensor.initBuffer(device: device)
              scope[varDesc.name] = tensor
            } else {
              let dim = Dim.init(inDim: tensorDesc.dims)

--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
@@ -97,6 +97,13 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>,
  }
  
  /// Debug helper that prints this op's convolution settings and its output.
  ///
  /// Prints `para.stride` and `para.dilations`, then converts the output Metal
  /// texture with `toTensor(dim:)` (passing `tensorDim[0...3]` as n, c, h, w)
  /// and prints the result of `strideArray()` on it.
  func delogOutput() {
+    print("stride: ")
+    print(para.stride)
+    print("dilations: ")
+    print(para.dilations)
+    
+    
+    
    print(" \(type) output: ")
    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
  }

--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
@@ -43,8 +43,15 @@ class ConvTransposeOp<P: PrecisionType>: Operator<ConvTransposeKernel<P>, ConvTr
  }
  
  /// Debug helper that prints this op's output texture.
  ///
  /// After this change the added lines print a " <type> output: " header and then
  /// choose how to read the texture from `para.output.transpose`:
  /// - `[0, 1, 2, 3]`: uses `realNHWC(dim:)` with `originDim[0...3]` as n, h, w, c.
  /// - `[0, 2, 3, 1]`: uses `toTensor(dim:)` with `tensorDim[0...3]` as n, c, h, w.
  /// - anything else: prints " not implement".
  /// The removed lines logged both the input and output textures via `logDesc`.
  func delogOutput() {
-    print("conv transpose delog")
-    let _: P? = para.input.metalTexture.logDesc(header: "conv transpose input: ", stridable: true)
-    let _: P? = para.output.metalTexture.logDesc(header: "conv transpose output: ", stridable: true)
+    print(" \(type) output: ")
+    let originDim = para.output.originDim
+    if para.output.transpose == [0, 1, 2, 3] {
+      let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
+      print(outputArray.strideArray())
+    } else if para.output.transpose == [0, 2, 3, 1] {
+      print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+    } else {
+      print(" not implement")
+    }
  }
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
@@ -61,11 +61,18 @@ class ElementwiseAddOp<P: PrecisionType>: Operator<ElementwiseAddKernel<P>, Elem
    print(para.inputX.metalTexture.toTensor(dim: (n: para.inputX.tensorDim[0], c: para.inputX.tensorDim[1], h: para.inputX.tensorDim[2], w: para.inputX.tensorDim[3])).strideArray())
    print(" \(type) inputY: ")
    print(para.inputY.metalTexture.toTensor(dim: (n: para.inputY.tensorDim[0], c: para.inputY.tensorDim[1], h: para.inputY.tensorDim[2], w: para.inputY.tensorDim[3])).strideArray())
+    
    print(" \(type) output: ")
    let originDim = para.output.originDim
+    if para.output.transpose == [0, 1, 2, 3] {
      let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
      print(outputArray.strideArray())
+    } else if para.output.transpose == [0, 2, 3, 1] {
      print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+    } else {
+      print(" not implement")
+    }
+    
  }
  
  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
@@ -75,7 +75,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
        print("offset y: \(offsetY)")
        
        let offsetZ = 0.0
-        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]))
+        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
        
        var invs: [P] = []
        let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
@@ -27,8 +27,9 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
    
    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1])
    
-    let offsetX = param.filter.width/2 - Int(param.paddings[0])
-    let offsetY = param.filter.height/2 - Int(param.paddings[1])
+    let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
+    
+    let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])

    param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
    param.y.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
@@ -37,7 +38,11 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
    print("offset y: \(offsetY)")
    
    let offsetZ = 0.0
-    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]))
+    let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+    print("metal param: ")
+    print(inMetalParam)
+    
+    metalParam = inMetalParam
  }
  
  func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam<P>) throws {

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
@@ -81,7 +81,7 @@ class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
    let offsetZ = 0.0
    
    print(" fuck ")
-    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]))
+    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
    
    var invs: [P] = []
    let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
@@ -21,6 +21,8 @@ public struct MetalConvParam {
  let strideX: UInt16
  let strideY: UInt16
  let paddedZ: UInt16
+  let dilationX: UInt16
+  let dilationY: UInt16
 }

 class ConvKernel<P: PrecisionType>: Kernel, Computable {
@@ -39,7 +41,7 @@ class ConvKernel<P: PrecisionType>: Kernel, Computable {
    let offsetZ = 0.0
    param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
    
-    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]))
+    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
  }
  
  func compute(commandBuffer: MTLCommandBuffer, param: ConvParam<P>) throws {

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift
@@ -45,6 +45,7 @@ class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{
    metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY)
    
    param.output.initTexture(device: device, inTranspose: param.input.transpose)
+    param.filter.initBuffer(device: device)
  }
  
  func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam<P>) throws {

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift
@@ -55,7 +55,7 @@ class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable {
    }
    emp.yoff = 4 - Int32(param.inputY.tensorDim.cout())
    if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
-      print("===> elementwise_add fast!!!")
+//      print("===> elementwise_add fast!!!")
      emp.fast = 1
    }
    

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
@@ -21,9 +21,10 @@ struct MetalConvParam {
  short offsetZ;
  ushort strideX;
  ushort strideY;
+  ushort dilationX;
+  ushort dilationY;
 };

-
 kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
                                              texture2d_array<half, access::write> outTexture [[texture(1)]],
                                              constant MetalConvParam &param [[buffer(0)]],
@@ -39,7 +40,6 @@ kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::samp
    return;
  }
  
-    
  ushort2 stride = ushort2(param.strideX, param.strideY);
  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
  
@@ -556,17 +556,20 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text
  
  float4 output = float4(0.0);
  
+  ushort dilation_x = param.dilationX;
+  ushort dilation_y = param.dilationY;
+  
  float4 input[9];
  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+    input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y - dilation_y), i);
+    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
+    input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y - dilation_y), i);
+    input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+    input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
+    input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
+    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
+    input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
    for (int j = 0; j < 9; ++j) {
      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
      output.x += dot(input[j], weight_x);

--- a/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift
@@ -48,7 +48,9 @@ extension BlockDesc: CustomStringConvertible, CustomDebugStringConvertible {
    var description: String {
        var str = ""
        
-        for op in ops {
+        for i in 0..<ops.count {
+          str += " op \(i): "
+          let op = ops[i]
          str += op.description
        }
        

--- a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
@@ -57,7 +57,7 @@ class Tensor<P: PrecisionType>: Tensorial {
      pointer.deallocate()
    }
    // The only statement here, a call to release(), is commented out, so this
    // deinit currently performs no work.
    // NOTE(review): presumably the pointer must then be released explicitly
    // elsewhere — confirm that the allocation is not leaked.
    deinit {
-//            release()
+      //            release()
    }
  }
  
@@ -163,12 +163,13 @@ class Tensor<P: PrecisionType>: Tensorial {
        }
      }
    } else if dim.cout() == 1 {
-            buffer = device.makeBuffer(length: numel() * precisionSize)
+      let num = ((numel() + 3) / 4) * 4
+      buffer = device.makeBuffer(length: num * precisionSize)
      switch precision {
      case .Float32:
-                buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
+        buffer?.contents().copyMemory(from: data.pointer, byteCount: num * MemoryLayout<P>.stride)
      case .Float16:
-                float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
+        float32ToFloat16(input: floatPointer, output: buffer.contents(), count: num)
      }
    } else {
      fatalError(" not support !")