提交 742b0fc0 编写于 作者: X xiaohaichun

merge metal branch to pointer

......@@ -24,6 +24,7 @@
# Executables
// !$*UTF8*$!
archiveVersion = 1;
classes = {
objectVersion = 50;
objects = {
/* Begin PBXBuildFile section */
FCEB6843212F00CC00D2448E /* PreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCEB6842212F00CC00D2448E /* PreluKernel.metal */; };
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
FCEB683F212F00CC00D2448E /* PreluKernel.metallib */ = {isa = PBXFileReference; explicitFileType = "archive.metal-library"; includeInIndex = 0; path = PreluKernel.metallib; sourceTree = BUILT_PRODUCTS_DIR; };
FCEB6842212F00CC00D2448E /* PreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PreluKernel.metal; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXGroup section */
FCEB6838212F00CC00D2448E = {
isa = PBXGroup;
children = (
FCEB6841212F00CC00D2448E /* PreluKernel */,
FCEB6840212F00CC00D2448E /* Products */,
sourceTree = "<group>";
FCEB6840212F00CC00D2448E /* Products */ = {
isa = PBXGroup;
children = (
FCEB683F212F00CC00D2448E /* PreluKernel.metallib */,
name = Products;
sourceTree = "<group>";
FCEB6841212F00CC00D2448E /* PreluKernel */ = {
isa = PBXGroup;
children = (
FCEB6842212F00CC00D2448E /* PreluKernel.metal */,
path = PreluKernel;
sourceTree = "<group>";
/* End PBXGroup section */
/* Begin PBXNativeTarget section */
FCEB683E212F00CC00D2448E /* PreluKernel */ = {
isa = PBXNativeTarget;
buildConfigurationList = FCEB6846212F00CC00D2448E /* Build configuration list for PBXNativeTarget "PreluKernel" */;
buildPhases = (
FCEB683D212F00CC00D2448E /* Sources */,
buildRules = (
dependencies = (
name = PreluKernel;
productName = PreluKernel;
productReference = FCEB683F212F00CC00D2448E /* PreluKernel.metallib */;
productType = "com.apple.product-type.metal-library";
/* End PBXNativeTarget section */
/* Begin PBXProject section */
FCEB6839212F00CC00D2448E /* Project object */ = {
isa = PBXProject;
attributes = {
LastUpgradeCheck = 0940;
TargetAttributes = {
FCEB683E212F00CC00D2448E = {
CreatedOnToolsVersion = 9.4.1;
buildConfigurationList = FCEB683C212F00CC00D2448E /* Build configuration list for PBXProject "PreluKernel" */;
compatibilityVersion = "Xcode 9.3";
developmentRegion = en;
hasScannedForEncodings = 0;
knownRegions = (
mainGroup = FCEB6838212F00CC00D2448E;
productRefGroup = FCEB6840212F00CC00D2448E /* Products */;
projectDirPath = "";
projectRoot = "";
targets = (
FCEB683E212F00CC00D2448E /* PreluKernel */,
/* End PBXProject section */
/* Begin PBXSourcesBuildPhase section */
FCEB683D212F00CC00D2448E /* Sources */ = {
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
FCEB6843212F00CC00D2448E /* PreluKernel.metal in Sources */,
runOnlyForDeploymentPostprocessing = 0;
/* End PBXSourcesBuildPhase section */
/* Begin XCBuildConfiguration section */
FCEB6844212F00CC00D2448E /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
SDKROOT = iphoneos;
name = Debug;
FCEB6845212F00CC00D2448E /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
SDKROOT = iphoneos;
name = Release;
FCEB6847212F00CC00D2448E /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
CODE_SIGN_STYLE = Automatic;
name = Debug;
FCEB6848212F00CC00D2448E /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
CODE_SIGN_STYLE = Automatic;
name = Release;
/* End XCBuildConfiguration section */
/* Begin XCConfigurationList section */
FCEB683C212F00CC00D2448E /* Build configuration list for PBXProject "PreluKernel" */ = {
isa = XCConfigurationList;
buildConfigurations = (
FCEB6844212F00CC00D2448E /* Debug */,
FCEB6845212F00CC00D2448E /* Release */,
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
FCEB6846212F00CC00D2448E /* Build configuration list for PBXNativeTarget "PreluKernel" */ = {
isa = XCConfigurationList;
buildConfigurations = (
FCEB6847212F00CC00D2448E /* Debug */,
FCEB6848212F00CC00D2448E /* Release */,
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
/* End XCConfigurationList section */
rootObject = FCEB6839212F00CC00D2448E /* Project object */;
<?xml version="1.0" encoding="UTF-8"?>
version = "1.0">
location = "self:PreluKernel.xcodeproj">
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
// PreluKernel.metal
// PreluKernel
// Created by liuRuiLong on 2018/8/23.
// Copyright © 2018年 orange. All rights reserved.
#include <metal_stdlib>
using namespace metal;
......@@ -11,6 +11,34 @@
<capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
<!--Multi Predict View Controller-->
<scene sceneID="ec4-AW-9Vs">
<viewController id="Vwd-lt-764" customClass="MultiPredictViewController" customModule="paddle_mobile_demo" customModuleProvider="target" sceneMemberID="viewController">
<view key="view" contentMode="scaleToFill" id="55D-rz-Ex6">
<rect key="frame" x="0.0" y="0.0" width="375" height="667"/>
<autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="TQt-X9-PdF">
<rect key="frame" x="164" y="318" width="46" height="30"/>
<state key="normal" title="Button"/>
<action selector="predictAct:" destination="Vwd-lt-764" eventType="touchUpInside" id="d4z-Cv-6jY"/>
<color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
<constraint firstItem="TQt-X9-PdF" firstAttribute="centerY" secondItem="55D-rz-Ex6" secondAttribute="centerY" id="bL3-wr-TcH"/>
<constraint firstItem="TQt-X9-PdF" firstAttribute="centerX" secondItem="55D-rz-Ex6" secondAttribute="centerX" id="sBi-RQ-sJn"/>
<viewLayoutGuide key="safeArea" id="bsd-h4-RYZ"/>
<placeholder placeholderIdentifier="IBFirstResponder" id="68E-SG-96s" userLabel="First Responder" sceneMemberID="firstResponder"/>
<point key="canvasLocation" x="-559" y="686"/>
<!--View Controller-->
<scene sceneID="tne-QT-ifu">
......@@ -20,9 +48,9 @@
<autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
<imageView userInteractionEnabled="NO" contentMode="scaleAspectFit" horizontalHuggingPriority="251" verticalHuggingPriority="251" translatesAutoresizingMaskIntoConstraints="NO" id="ZZh-fw-LwK">
<rect key="frame" x="0.0" y="20" width="375" height="247"/>
<rect key="frame" x="0.0" y="20" width="225" height="247"/>
<label opaque="NO" userInteractionEnabled="NO" contentMode="left" horizontalHuggingPriority="251" verticalHuggingPriority="251" text="Thread:" textAlignment="natural" lineBreakMode="tailTruncation" baselineAdjustment="alignBaselines" adjustsFontSizeToFit="NO" translatesAutoresizingMaskIntoConstraints="NO" id="2EB-m2-a3L">
<label opaque="NO" userInteractionEnabled="NO" contentMode="left" horizontalHuggingPriority="251" verticalHuggingPriority="251" text="Platform:" textAlignment="natural" lineBreakMode="tailTruncation" baselineAdjustment="alignBaselines" adjustsFontSizeToFit="NO" translatesAutoresizingMaskIntoConstraints="NO" id="2EB-m2-a3L">
<rect key="frame" x="10" y="538" width="68" height="24"/>
<constraint firstAttribute="width" constant="68" id="Q5J-tq-JSX"/>
......@@ -142,9 +170,14 @@
<fontDescription key="fontDescription" type="system" pointSize="15"/>
<textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
<view contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="Cil-py-NiA">
<rect key="frame" x="225" y="20" width="150" height="247"/>
<color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
<color key="backgroundColor" red="1" green="1" blue="1" alpha="1" colorSpace="custom" customColorSpace="sRGB"/>
<constraint firstItem="m5L-O7-P31" firstAttribute="top" secondItem="Cil-py-NiA" secondAttribute="bottom" constant="10" id="16p-IK-b5X"/>
<constraint firstItem="6Tk-OE-BBY" firstAttribute="trailing" secondItem="VQn-bS-fWp" secondAttribute="trailing" constant="10" id="1Xg-0h-9SE"/>
<constraint firstItem="avL-VK-Kha" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" constant="10" id="2t9-hS-VXa"/>
<constraint firstItem="R90-Yf-S6g" firstAttribute="centerY" secondItem="wUL-9N-u1V" secondAttribute="centerY" id="76b-Ny-1Og"/>
......@@ -159,11 +192,12 @@
<constraint firstItem="XpL-9M-UOp" firstAttribute="centerY" secondItem="wUL-9N-u1V" secondAttribute="centerY" id="KWW-qT-Rzf"/>
<constraint firstItem="6MG-gv-hD5" firstAttribute="centerY" secondItem="avL-VK-Kha" secondAttribute="centerY" id="KZa-YZ-DEs"/>
<constraint firstItem="2EB-m2-a3L" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" constant="10" id="Le3-TN-zOL"/>
<constraint firstItem="ZZh-fw-LwK" firstAttribute="trailing" secondItem="6Tk-OE-BBY" secondAttribute="trailing" id="MeS-HQ-voE"/>
<constraint firstItem="ZZh-fw-LwK" firstAttribute="trailing" secondItem="6Tk-OE-BBY" secondAttribute="trailing" constant="-150" id="MeS-HQ-voE"/>
<constraint firstItem="m5L-O7-P31" firstAttribute="top" secondItem="ZZh-fw-LwK" secondAttribute="bottom" constant="10" id="NUL-Ta-VI8"/>
<constraint firstItem="m5L-O7-P31" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" constant="15" id="RFA-z1-9aB"/>
<constraint firstItem="wUL-9N-u1V" firstAttribute="width" secondItem="a3K-ri-NVs" secondAttribute="width" id="Rp6-Bh-BN3"/>
<constraint firstItem="6MG-gv-hD5" firstAttribute="trailing" secondItem="6Tk-OE-BBY" secondAttribute="trailing" id="S0W-0G-75m"/>
<constraint firstItem="Cil-py-NiA" firstAttribute="top" secondItem="6Tk-OE-BBY" secondAttribute="top" id="UNc-Et-9Yv"/>
<constraint firstItem="w7H-Sk-Rai" firstAttribute="leading" secondItem="wUL-9N-u1V" secondAttribute="trailing" id="VBM-8b-jP0"/>
<constraint firstItem="VQn-bS-fWp" firstAttribute="top" secondItem="m5L-O7-P31" secondAttribute="bottom" constant="8" id="VpS-4N-mOo"/>
<constraint firstItem="wUL-9N-u1V" firstAttribute="top" secondItem="2EB-m2-a3L" secondAttribute="bottom" constant="35" id="VpU-j2-gaE"/>
......@@ -175,10 +209,12 @@
<constraint firstItem="ZZh-fw-LwK" firstAttribute="top" secondItem="6Tk-OE-BBY" secondAttribute="top" id="eIC-fZ-OEE"/>
<constraint firstItem="976-fk-Kx2" firstAttribute="centerY" secondItem="wUL-9N-u1V" secondAttribute="centerY" id="fFg-pB-eyU"/>
<constraint firstItem="6Tk-OE-BBY" firstAttribute="bottom" secondItem="wUL-9N-u1V" secondAttribute="bottom" constant="40" id="fG6-0p-I0P"/>
<constraint firstItem="Cil-py-NiA" firstAttribute="trailing" secondItem="6Tk-OE-BBY" secondAttribute="trailing" id="gGK-DB-ibv"/>
<constraint firstItem="XpL-9M-UOp" firstAttribute="leading" secondItem="w7H-Sk-Rai" secondAttribute="trailing" id="guC-Db-cA9"/>
<constraint firstItem="6MG-gv-hD5" firstAttribute="leading" secondItem="avL-VK-Kha" secondAttribute="trailing" constant="10" id="jNW-iC-u7V"/>
<constraint firstItem="4ey-Xr-U4e" firstAttribute="bottom" secondItem="6Tk-OE-BBY" secondAttribute="bottom" id="o1X-q5-P7j"/>
<constraint firstItem="6MG-gv-hD5" firstAttribute="top" secondItem="VQn-bS-fWp" secondAttribute="bottom" constant="8" id="tAE-ss-jlA"/>
<constraint firstItem="Cil-py-NiA" firstAttribute="leading" secondItem="ZZh-fw-LwK" secondAttribute="trailing" id="teJ-PP-h2R"/>
<constraint firstItem="4ey-Xr-U4e" firstAttribute="top" secondItem="wUL-9N-u1V" secondAttribute="bottom" constant="10" id="udc-wT-jqd"/>
<constraint firstItem="ZZh-fw-LwK" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" id="vXI-l2-CjL"/>
<constraint firstItem="VQn-bS-fWp" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" constant="10" id="wtI-Dl-YPq"/>
......@@ -195,11 +231,12 @@
<outlet property="resultTextView" destination="VQn-bS-fWp" id="306-c7-3vM"/>
<outlet property="selectImageView" destination="ZZh-fw-LwK" id="afR-Bv-6AW"/>
<outlet property="threadPickerView" destination="DlO-dk-RMr" id="Kk4-QV-b5o"/>
<outlet property="videoView" destination="Cil-py-NiA" id="QY2-BP-SNS"/>
<placeholder placeholderIdentifier="IBFirstResponder" id="dkx-z0-nzr" sceneMemberID="firstResponder"/>
<point key="canvasLocation" x="-724" y="98.50074962518741"/>
<point key="canvasLocation" x="-1127" y="-3"/>
// Multi-Predict-ViewController.swift
// paddle-mobile-demo
// Created by liuRuiLong on 2018/9/14.
// Copyright © 2018年 orange. All rights reserved.
import UIKit
import paddle_mobile
// View controller that holds two `Runner` instances, each wrapping a different
// network, both configured for the `.GPU` platform.
class MultiPredictViewController: UIViewController {
// Runner wrapping the `MobileNet_ssd_hand` network.
var runner1: Runner!
// Runner wrapping the `Genet` network.
var runner2: Runner!
// Builds both networks on the shared Metal device and wraps each in a Runner.
override func viewDidLoad() {
let mobileNet = MobileNet_ssd_hand.init(device: MetalHelper.shared.device)
let genet = Genet.init(device: MetalHelper.shared.device)
runner1 = Runner.init(inNet: mobileNet, commandQueue: MetalHelper.shared.queue, inPlatform: .GPU)
// NOTE(review): `queue2` is created here, but `runner2` below is still given
// `MetalHelper.shared.queue`, so both runners share one command queue.
// Confirm whether `queue2` was meant to be passed to `runner2`.
let queue2 = MetalHelper.shared.device.makeCommandQueue()
runner2 = Runner.init(inNet: genet, commandQueue: MetalHelper.shared.queue, inPlatform: .GPU)
// Button handler: loads `runner2` and looks up the bundled "hand.jpg" image.
// NOTE(review): neither `success` nor `image1` is read by any active statement
// visible here; the prediction loops below are all commented out.
@IBAction func predictAct(_ sender: Any) {
let success = self.runner2.load()
// DispatchQueue.global().async {
let image1 = UIImage.init(named: "hand.jpg")
// let success = self.runner2.load()
// if success {
// for i in 0..<10000 {
// print(i)
// self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in
// print("result1: ")
//// print(res)
// })
// }
// } else {
// print("load failed")
// }
// self.runner1.clear()
// }
// return
// DispatchQueue.global().async {
//// sleep(1)
// let image1 = UIImage.init(named: "banana.jpeg")
//// if success {
// for _ in 0..<10 {
// self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in
// print("result2: ")
// print(res)
// })
// }
//// } else {
//// print("load failed")
//// }
//// self.runner2.clear()
// }
import Foundation
import QuartzCore
// Estimates a smoothed frames-per-second value from calls to `frameCompleted()`.
public class FPSCounter {
// Latest smoothed estimate; readable publicly, written only by this class.
private(set) public var fps: Double = 0
// Frames counted since `startTime`.
var frames = 0
// Start of the current measurement window, as returned by `CACurrentMediaTime()`.
var startTime: CFTimeInterval = 0
// Resets the frame count and begins a new measurement window at the current time.
public func start() {
frames = 0
startTime = CACurrentMediaTime()
// Records one finished frame and refreshes `fps`.
public func frameCompleted() {
frames += 1
let now = CACurrentMediaTime()
let elapsed = now - startTime
// Skip the update until more than 0.1 s has elapsed in this window.
if elapsed > 0.1 {
let current = Double(frames) / elapsed
// Blend: 75% of the previous estimate plus 25% of the current measurement.
let smoothing = 0.75
fps = smoothing*fps + (1 - smoothing)*current
// Once more than one second has elapsed, restart the measurement window.
if elapsed > 1 {
frames = 0
startTime = CACurrentMediaTime()
import UIKit
import Metal
import CoreVideo
import AVFoundation
// Callbacks emitted by `VideoCapture`. Every method is `@objc optional`, so a
// conforming type implements only the ones it needs.
@available(iOS 10.0, *)
@objc public protocol VideoCaptureDelegate: NSObjectProtocol {
// Receives a captured video frame as a raw sample buffer plus its timestamp.
@objc optional func videoCapture(_ capture: VideoCapture, didCaptureSampleBuffer sampleBuffer: CMSampleBuffer, timestamp: CMTime)
// Receives a captured video frame as a Metal texture (may be nil) plus its timestamp.
@objc optional func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime)
// Receives the preview image of a captured photo (may be nil).
@objc optional func videoCapture(_ capture: VideoCapture, didCapturePhoto previewImage: UIImage?)
// Receives a captured photo as a Metal texture (may be nil).
@objc optional func videoCapture(_ capture: VideoCapture, didCapturePhotoTexture texture: MTLTexture?)
Simple interface to the iPhone's camera.
// Wraps an `AVCaptureSession` with a video-data output and a photo output, and
// can convert captured sample buffers to Metal textures or UIImages.
@available(iOS 10.0, *)
public class VideoCapture: NSObject {
// Preview layer created in `setUpCamera`; nil until then.
public var previewLayer: AVCaptureVideoPreviewLayer?
// Receiver of capture callbacks; held weakly.
public weak var delegate: VideoCaptureDelegate?
// Frame rate at which the delegate is called; -1 forwards every captured frame.
public var fps = -1
// Metal device used to create the texture cache; no cache is created when nil.
private let device: MTLDevice?
// Orientation applied to the preview connection and the video output connection.
private let videoOrientation: AVCaptureVideoOrientation
// Cache used by `convertToMTLTexture`; set in `setUpCamera` when `device` is non-nil.
private var textureCache: CVMetalTextureCache?
private let captureSession = AVCaptureSession()
private let videoOutput = AVCaptureVideoDataOutput()
private let photoOutput = AVCapturePhotoOutput()
// Queue used both for running `setUpCamera` and for sample-buffer callbacks.
private let queue = DispatchQueue(label: "net.machinethink.camera-queue")
// Presentation timestamp of the last frame forwarded to the delegate.
private var lastTimestamp = CMTime()
// Which camera (`.back` or `.front`) `setUpCamera` selects.
private let cameraPosition: AVCaptureDevice.Position
// Stores the configuration; defaults are no Metal device, portrait, back camera.
public init(device: MTLDevice? = nil, orientation: AVCaptureVideoOrientation = .portrait, position: AVCaptureDevice.Position = .back) {
self.device = device
self.videoOrientation = orientation
self.cameraPosition = position
// Runs `setUpCamera` on the capture queue, then hops to the main queue.
// - Parameter sessionPreset: capture preset, `.medium` by default.
// - Parameter completion: escaping callback taking the setup success flag.
public func setUp(sessionPreset: AVCaptureSession.Preset = .medium,
completion: @escaping (Bool) -> Void) {
queue.async {
let success = self.setUpCamera(sessionPreset: sessionPreset)
DispatchQueue.main.async {
// Returns the first front-facing built-in wide-angle video camera, or nil if
// the discovery session finds none.
func fontCamera() -> AVCaptureDevice? {
let deveices = AVCaptureDevice.DiscoverySession.init(deviceTypes: [.builtInWideAngleCamera], mediaType: AVMediaType.video, position: .front).devices
return deveices.first
// Configures the session: texture cache, input device, preview layer and outputs.
// Returns false when the texture cache, capture device or device input cannot
// be obtained; returns true otherwise.
func setUpCamera(sessionPreset: AVCaptureSession.Preset) -> Bool {
// A texture cache is only created when a Metal device was supplied.
if let inDevice = device{
guard CVMetalTextureCacheCreate(kCFAllocatorDefault, nil, inDevice, nil, &textureCache) == kCVReturnSuccess else {
print("Error: could not create a texture cache")
return false
captureSession.sessionPreset = sessionPreset
// Choose the capture device from the configured camera position.
var oCaptureDevice: AVCaptureDevice?
switch cameraPosition {
case .back:
oCaptureDevice = AVCaptureDevice.default(for: AVMediaType.video)
case .front:
oCaptureDevice = fontCamera()
guard let captureDevice = oCaptureDevice else {
print("Error: no video devices available")
return false
guard let videoInput = try? AVCaptureDeviceInput(device: captureDevice) else {
print("Error: could not create AVCaptureDeviceInput")
return false
if captureSession.canAddInput(videoInput) {
// Preview layer keeps the aspect ratio and uses the configured orientation.
let previewLayer = AVCaptureVideoPreviewLayer(session: captureSession)
previewLayer.videoGravity = AVLayerVideoGravity.resizeAspect
previewLayer.connection?.videoOrientation = self.videoOrientation
self.previewLayer = previewLayer
// Request 32-bit BGRA pixel buffers from the video output.
let settings: [String : Any] = [
kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA)
videoOutput.videoSettings = settings
videoOutput.alwaysDiscardsLateVideoFrames = true
videoOutput.setSampleBufferDelegate(self, queue: queue)
if captureSession.canAddOutput(videoOutput) {
// We want the buffers to be in portrait orientation otherwise they are
// rotated by 90 degrees. Need to set this _after_ addOutput()!
videoOutput.connection(with: AVMediaType.video)?.videoOrientation = self.videoOrientation
if captureSession.canAddOutput(photoOutput) {
return true
// Acts only when the capture session is not already running.
public func start() {
if !captureSession.isRunning {
// Acts only when the capture session is currently running.
public func stop() {
if captureSession.isRunning {
/* Captures a single frame of the camera input. */
// The photo is requested as 32-bit BGRA with a 480x360 preview; results are
// delivered to this object as the photo capture delegate.
public func capturePhoto() {
let settings = AVCapturePhotoSettings(format: [kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA)])
// NOTE(review): indexing element 0 would trap if the available preview format
// list were empty; confirm it is always non-empty here.
settings.previewPhotoFormat = [
kCVPixelBufferPixelFormatTypeKey as String: settings.__availablePreviewPhotoPixelFormatTypes[0],
kCVPixelBufferWidthKey as String: 480,
kCVPixelBufferHeightKey as String: 360,
photoOutput.capturePhoto(with: settings, delegate: self)
// Wraps the sample buffer's image buffer in a `.bgra8Unorm` Metal texture via
// the texture cache. Returns nil when the cache, the sample buffer or its image
// buffer is missing, or when no texture was produced.
func convertToMTLTexture(sampleBuffer: CMSampleBuffer?) -> MTLTexture? {
if let textureCache = textureCache, let sampleBuffer = sampleBuffer, let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) {
let width = CVPixelBufferGetWidth(imageBuffer)
let height = CVPixelBufferGetHeight(imageBuffer)
var texture: CVMetalTexture?
// The return code is not inspected; failure is detected via `texture` staying nil.
CVMetalTextureCacheCreateTextureFromImage(kCFAllocatorDefault, textureCache, imageBuffer, nil, .bgra8Unorm, width, height, 0, &texture)
if let texture = texture {
return CVMetalTextureGetTexture(texture)
return nil
// Renders the sample buffer's image buffer to a CGImage through Core Image and
// wraps it in a UIImage. Returns nil when the sample buffer or its image buffer
// is missing, or when the CGImage cannot be created.
func convertToUIImage(sampleBuffer: CMSampleBuffer?) -> UIImage? {
if let sampleBuffer = sampleBuffer,
let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) {
let width = CVPixelBufferGetWidth(imageBuffer)
let height = CVPixelBufferGetHeight(imageBuffer)
// Full-frame rectangle covering the whole pixel buffer.
let rect = CGRect(x: 0, y: 0, width: CGFloat(width), height: CGFloat(height))
let ciImage = CIImage(cvPixelBuffer: imageBuffer)
// A new CIContext is created on every call.
let ciContext = CIContext(options: nil)
if let cgImage = ciContext.createCGImage(ciImage, from: rect) {
return UIImage(cgImage: cgImage)
return nil
// Video-data output callbacks: throttles captured frames to `fps` and forwards
// them to the delegate.
extension VideoCapture: AVCaptureVideoDataOutputSampleBufferDelegate {
// Called for each captured frame; forwards it when the throttle allows.
public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
// Because lowering the capture device's FPS looks ugly in the preview,
// we capture at full speed but only call the delegate at its desired
// framerate. If `fps` is -1, we run at the full framerate.
let timestamp = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
let deltaTime = timestamp - lastTimestamp
// Forward when unthrottled, or when at least 1/fps seconds have passed since
// the last forwarded frame.
// NOTE(review): `fps == 0` would build a CMTime with timescale 0; confirm
// callers never set that value.
if fps == -1 || deltaTime >= CMTimeMake(1, Int32(fps)) {
lastTimestamp = timestamp
self.delegate?.videoCapture?(self, didCaptureSampleBuffer: sampleBuffer, timestamp: timestamp)
// Only pay for the texture conversion if the delegate implements the texture callback.
if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCaptureVideoTexture:timestamp:))) ?? false{
let texture = convertToMTLTexture(sampleBuffer: sampleBuffer)
delegate?.videoCapture?(self, didCaptureVideoTexture: texture, timestamp: timestamp)
// Called when the output drops a frame; only logs the event.
public func captureOutput(_ output: AVCaptureOutput, didDrop sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
print("dropped frame")
// Photo capture callback: converts the captured photo and its preview, then
// hands them to the delegate.
extension VideoCapture: AVCapturePhotoCaptureDelegate {
// Called when a photo requested through `capturePhoto()` has been processed.
public func photoOutput(_ captureOutput: AVCapturePhotoOutput,
didFinishProcessingPhoto photoSampleBuffer: CMSampleBuffer?,
previewPhoto previewPhotoSampleBuffer: CMSampleBuffer?,
resolvedSettings: AVCaptureResolvedPhotoSettings,
bracketSettings: AVCaptureBracketedStillImageSettings?,
error: Error?) {
var imageTexture: MTLTexture?
var previewImage: UIImage?
// Conversions are attempted only when the capture reported no error.
if error == nil {
// Convert to a Metal texture only if the delegate implements that callback.
if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhotoTexture:))) ?? false{
imageTexture = convertToMTLTexture(sampleBuffer: photoSampleBuffer)
self.delegate?.videoCapture?(self, didCapturePhotoTexture: imageTexture)
// Convert the preview to a UIImage only if the delegate implements that callback.
if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhoto:))) ?? false{
previewImage = convertToUIImage(sampleBuffer: previewPhotoSampleBuffer)
self.delegate?.videoCapture?(self, didCapturePhoto: previewImage)
......@@ -14,27 +14,32 @@
import UIKit
import MetalKit
import CoreMedia
import paddle_mobile
import MetalPerformanceShaders
let platform: Platform = .GPU
let threadSupport = [1]
var platform: Platform = .GPU
let threadSupport: [(Platform, String)] = [(.GPU, "GPU"), (.CPU, "CPU")]
let modelHelperMap: [SupportModel : Runner] = [.mobilenet_ssd : Runner.init(inNet: MobileNet_ssd_hand.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform),
//.mobilenet_ssd : Runner.init(inNet: MobileNet_ssd_hand.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform),
let modelHelperMap: [SupportModel : Runner] = [
.genet : Runner.init(inNet: Genet.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform),
.mobilenet_ssd_ar : Runner.init(inNet: MobileNet_ssd_AR.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform)]
//, .genet : Genet.init()
//let modelHelperMap: [SupportModel : Net] = [.mobilenet : MobileNet.init(), .mobilenet_ssd : MobileNet_ssd_hand.init()]
let netSupport: [SupportModel : Net] = [.genet : Genet.init(device: MetalHelper.shared.device), .mobilenet_ssd_ar : MobileNet_ssd_AR.init(device: MetalHelper.shared.device)]
enum SupportModel: String{
// case mobilenet = "mobilenet"
case mobilenet_ssd = "mobilenetssd"
// case mobilenet_ssd = "mobilenetssd"
case genet = "genet"
case mobilenet_ssd_ar = "mobilenetssd_ar"
static func supportedModels() -> [SupportModel] {
return [.mobilenet_ssd, .genet, .mobilenet_ssd_ar]
// .mobilenet,
// .mobilenet_ssd,
return [.genet, .mobilenet_ssd_ar]
......@@ -44,24 +49,36 @@ class ViewController: UIViewController {
@IBOutlet weak var elapsedTimeLabel: UILabel!
@IBOutlet weak var modelPickerView: UIPickerView!
@IBOutlet weak var threadPickerView: UIPickerView!
@IBOutlet weak var videoView: UIView!
var videoCapture: VideoCapture!
var selectImage: UIImage?
var inputPointer: UnsafeMutablePointer<Float32>?
var modelType: SupportModel = SupportModel.supportedModels()[0]
var toPredictTexture: MTLTexture?
var runner: Runner {
get {
return modelHelperMap[modelType] ?! " has no this type "
set {
var runner: Runner!
var threadNum = 1
@IBAction func loadAct(_ sender: Any) {
runner = Runner.init(inNet: netSupport[modelType]!, commandQueue: MetalHelper.shared.queue, inPlatform: platform)
if platform == .CPU {
if inputPointer == nil {
inputPointer = runner.preproccess(image: selectImage!.cgImage!)
} else if platform == .GPU {
if self.toPredictTexture == nil {
runner.getTexture(image: selectImage!.cgImage!) {[weak self] (texture) in
self?.toPredictTexture = texture
} else {
fatalError( " unsupport " )
if runner.load() {
print(" load success ! ")
} else {
......@@ -81,7 +98,7 @@ class ViewController: UIViewController {
@IBAction func predictAct(_ sender: Any) {
let max = 1
let max = 50
switch platform {
case .GPU:
guard let inTexture = toPredictTexture else {
......@@ -91,7 +108,7 @@ class ViewController: UIViewController {
let startDate = Date.init()
for i in 0..<max {
runner.predict(texture: inTexture) { [weak self] (success, res) in
runner.predict(texture: inTexture) { [weak self] (success, resultHolder) in
guard let sSelf = self else {
......@@ -99,11 +116,19 @@ class ViewController: UIViewController {
if i == max - 1 {
let time = Date.init().timeIntervalSince(startDate)
DispatchQueue.main.async {
sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: res)
// print(resultHolder!.result![0])
sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: resultHolder!)
sSelf.elapsedTimeLabel.text = "平均耗时: \(time/Double(max) * 1000.0) ms"
DispatchQueue.main.async {
// print("释放")
// print("sleep before ")
// usleep(33000)
......@@ -116,6 +141,7 @@ class ViewController: UIViewController {
for _ in 0..<10 {
runner.predict(inputPointer: inInputPointer) { (success, res) in
......@@ -129,11 +155,12 @@ class ViewController: UIViewController {
if i == max - 1 {
let time = Date.init().timeIntervalSince(startDate)
DispatchQueue.main.async {
sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: res)
// sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: res)
sSelf.elapsedTimeLabel.text = "平均耗时: \(time/Double(max) * 1000.0) ms"
......@@ -141,6 +168,13 @@ class ViewController: UIViewController {
override func viewDidLoad() {
// if runner.load() {
// print(" load success ! ")
// } else {
// print(" load error ! ")
// }
modelPickerView.delegate = self
modelPickerView.dataSource = self
threadPickerView.delegate = self
......@@ -149,15 +183,29 @@ class ViewController: UIViewController {
selectImage = UIImage.init(named: "hand.jpg")
selectImageView.image = selectImage
if platform == .CPU {
inputPointer = runner.preproccess(image: selectImage!.cgImage!)
} else if platform == .GPU {
runner.getTexture(image: selectImage!.cgImage!) {[weak self] (texture) in
self?.toPredictTexture = texture
} else {
fatalError( " unsupport " )
// if platform == .CPU {
// inputPointer = runner.preproccess(image: selectImage!.cgImage!)
// } else if platform == .GPU {
// runner.getTexture(image: selectImage!.cgImage!) {[weak self] (texture) in
// self?.toPredictTexture = texture
// }
// } else {
// fatalError( " unsupport " )
// }
// videoCapture = VideoCapture.init(device: MetalHelper.shared.device, orientation: .portrait, position: .back)
// videoCapture.fps = 30
// videoCapture.delegate = self
// videoCapture.setUp { (success) in
// DispatchQueue.main.async {
// if let preViewLayer = self.videoCapture.previewLayer {
// self.videoView.layer.addSublayer(preViewLayer)
// self.videoCapture.previewLayer?.frame = self.videoView.bounds
// }
// self.videoCapture.start()
// }
// }
......@@ -186,7 +234,7 @@ extension ViewController: UIPickerViewDataSource, UIPickerViewDelegate{
if pickerView == modelPickerView {
return SupportModel.supportedModels()[row].rawValue
} else if pickerView == threadPickerView {
return "\(threadSupport[row])"
return threadSupport[row].1
} else {
......@@ -196,7 +244,8 @@ extension ViewController: UIPickerViewDataSource, UIPickerViewDelegate{
if pickerView == modelPickerView {
self.modelType = SupportModel.supportedModels()[row]
} else if pickerView == threadPickerView {
self.threadNum = threadSupport[row]
platform = threadSupport[row].0
} else {
......@@ -218,4 +267,32 @@ extension ViewController: UIImagePickerControllerDelegate, UINavigationControll
var bool1 = false
// Feeds camera frames delivered by `VideoCapture` into the current runner.
extension ViewController: VideoCaptureDelegate{
// Scales `texture` through the runner, then runs a prediction on the scaled texture.
// Both closures reference `self` strongly.
func predictTexture(texture: MTLTexture){
runner.scaleTexture(input: texture) { (scaledTexture) in
self.runner.predict(texture: scaledTexture, completion: { (success, resultHolder) in
// print(resultHolder!.result![0])
// Delegate callback for each forwarded video frame; runs a prediction on it.
// NOTE(review): `texture` is optional and is force-unwrapped below; a nil
// texture would trap. Confirm the sender never passes nil.
func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime) {
// if !bool1 {
// DispatchQueue.main.asyncAfter(deadline: DispatchTime.init(uptimeNanoseconds: 500000000)) {
self.predictTexture(texture: texture!)
// }
// bool1 = true
// }
......@@ -33,7 +33,7 @@
buildConfiguration = "Release"
buildConfiguration = "Debug"
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
launchStyle = "0"
......@@ -17,7 +17,17 @@
#import <CoreImage/CoreImage.h>
#import <Foundation/Foundation.h>
@interface PaddleMobile : NSObject
@interface PaddleMobileCPUResult: NSObject
@property (assign, nonatomic, readonly) float *output;
@property (assign, nonatomic, readonly) int outputSize;
@interface PaddleMobileCPU : NSObject
......@@ -42,25 +52,8 @@
andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf;
* 进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict
- (NSArray *)predict:(CGImageRef)image
dim:(NSArray<NSNumber *> *)dim
means:(NSArray<NSNumber *> *)means
* 预测输入
* */
- (NSArray *)predictInput:(float *)input
dim:(NSArray<NSNumber *> *)dim
means:(NSArray<NSNumber *> *)means
* 对图像进行预处理
* 对图像进行预处理, 需要外部开辟 output 内存, 外部释放 output 内存
* */
output:(float *)output
......@@ -68,6 +61,22 @@
dim:(NSArray<NSNumber *> *)dim;
* 预测预处理后的数据, 返回结果使用结束需要调用其 releaseOutput 函数进行释放
* */
- (PaddleMobileCPUResult *)predictInput:(float *)input
dim:(NSArray<NSNumber *> *)dim;
进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
进行预测, 默认 means 为 0, scale 为 1.0
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
......@@ -16,6 +16,12 @@
#import <Foundation/Foundation.h>
@interface CPUResult: NSObject
@property (assign, nonatomic) float *output;
@property (assign, nonatomic) int outputSize;
@interface NMSCompute: NSObject
@property (assign, nonatomic) float scoreThredshold;
......@@ -34,6 +40,6 @@
@property (strong, nonatomic) NSArray<NSNumber *> *bboxDim;
-(NSArray<NSNumber *> *)computeWithScore:(float *)score andBBoxs:(float *)bbox;
-(CPUResult *)computeWithScore:(float *)score andBBoxs:(float *)bbox;
......@@ -21,6 +21,8 @@
#import <algorithm>
struct NMSParam {
float *score_data;
......@@ -282,9 +284,12 @@ void MultiClassNMSCompute(NMSParam *param) {
param->output_size = output_size;
@implementation CPUResult
@implementation NMSCompute
-(NSArray<NSNumber *> *)computeWithScore:(float *)score andBBoxs:(float *)bbox {
-(CPUResult *)computeWithScore:(float *)score andBBoxs:(float *)bbox {
NMSParam param;
param.box_data = bbox;
param.score_data = score;
......@@ -306,12 +311,10 @@ void MultiClassNMSCompute(NMSParam *param) {
param.box_dim = box_dim;
NSMutableArray<NSNumber *> *output = [NSMutableArray arrayWithCapacity:param.output_size];
for (int i = 0; i < param.output_size; ++i) {
[output addObject:[NSNumber numberWithFloat:param.output[i]]];
delete param.output;
return output;
CPUResult *cr = [[CPUResult alloc] init];
cr.output = param.output;
cr.outputSize = param.output_size;
return cr;
......@@ -71,7 +71,128 @@ extension MTLDevice {
return buffer!
// Reads every slice of `texture` into a temporary buffer and invokes `cb` once
// per stored component.
// - Parameter texture: texture to read; treated as having 4 components per pixel.
// - Parameter cb: called with `[x, y, slice, component]` and the component value.
func texture2tensor_loop<P>(texture: MTLTexture, cb: ([Int], P)->Void) -> Void {
// Bytes per row: width * 4 components * size of one component.
let bpR = texture.width * 4 * MemoryLayout<P>.size
// Bytes per slice image.
let bpI = texture.height * bpR
// Region covering the whole 2D extent of one slice.
let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: 1))
for i in 0..<texture.arrayLength {
// NOTE(review): `capacity` counts elements of P, but `bpI` is a byte count, so
// this buffer is MemoryLayout<P>.size times larger than the data read into it.
// No release of `pointer` is visible among these lines; confirm it is freed.
let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: bpI)
texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i)
// Walk every component of the slice; `tx` is the flat index into `pointer`.
for tx in 0..<texture.width * texture.height * 4 {
var k = tx
var xyzn: [Int] = [0, 0, 0, 0]
// Row (y) = flat index / components per row.
xyzn[1] = k / (texture.width * 4)
k %= (texture.width * 4)
// Component within the pixel (0...3).
xyzn[3] = k % 4
// Column (x) within the row.
xyzn[0] = k / 4
// Slice index.
xyzn[2] = i
cb(xyzn, pointer[tx])
/// Reads a texture that stores a rank-3 tensor back into a flat array indexed
/// in `dim` order.
///
/// `dim` is right-aligned into a four-entry shape (missing leading axes are 1)
/// and `transpose` selects, per texture axis, which of those four axes it
/// holds. The `assert`s check that the texture's width, height and array
/// length agree with that permuted shape.
///
/// - Parameters:
///   - texture: Source texture.
///   - dim: Tensor shape; must have exactly three entries.
///   - transpose: Axis permutation between tensor order and texture order.
/// - Returns: `dim.reduce(1, *)` values; positions never written stay at zero.
func texture2tensor_3<P>(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] {
    var shape4 = [Int](repeating: 1, count: 4)
    for (offset, extent) in dim.enumerated() {
        shape4[4 - dim.count + offset] = extent
    }
    let total = dim.reduce(1, *)
    var flat = [P](repeating: Float32(0.0) as! P, count: total)
    let textureShape: [Int] = transpose.map { shape4[$0] }
    assert(dim.count == 3)
    assert(texture.width == textureShape[3])
    assert(texture.height == textureShape[2])
    assert(textureShape[0] == 1)
    assert(texture.arrayLength == (textureShape[1] + 3) / 4)

    texture2tensor_loop(texture: texture) { (coords: [Int], value: P) in
        // Position in texture-axis order: axis 1 comes from slice * 4 +
        // component, axis 2 from the row, axis 3 from the column.
        let texturePos: [Int] = [0, coords[2] * 4 + coords[3], coords[1], coords[0]]
        // Scatter through `transpose` to get the position in tensor order.
        var tensorPos = [Int](repeating: 0, count: 4)
        for axis in 0..<4 {
            tensorPos[transpose[axis]] = texturePos[axis]
        }
        let flatIndex = tensorPos[0] * shape4[1] * shape4[2] * shape4[3]
            + tensorPos[1] * shape4[2] * shape4[3]
            + tensorPos[2] * shape4[3]
            + tensorPos[3]
        // Padding components fall beyond the tensor and are dropped.
        guard flatIndex < total else { return }
        flat[flatIndex] = value
    }
    return flat
}
/// Reads a texture that stores a rank-2 tensor back into a flat array indexed
/// in `dim` order.
///
/// `dim` is right-aligned into a four-entry shape (missing leading axes are 1)
/// and `transpose` selects, per texture axis, which of those four axes it
/// holds. The `assert`s require a single-slice texture whose height is the
/// permuted axis 2 and whose width is the permuted axis 3 divided by four,
/// rounded up.
///
/// - Parameters:
///   - texture: Source texture.
///   - dim: Tensor shape; must have exactly two entries.
///   - transpose: Axis permutation between tensor order and texture order.
/// - Returns: `dim.reduce(1, *)` values; positions never written stay at zero.
func texture2tensor_2<P>(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] {
    var shape4 = [Int](repeating: 1, count: 4)
    for (offset, extent) in dim.enumerated() {
        shape4[4 - dim.count + offset] = extent
    }
    let total = dim.reduce(1, *)
    var flat = [P](repeating: Float32(0.0) as! P, count: total)
    let textureShape: [Int] = transpose.map { shape4[$0] }
    assert(dim.count == 2)
    let texelsPerRow = (textureShape[3] + 3) / 4
    assert(texture.width == texelsPerRow)
    assert(texture.height == textureShape[2])
    assert(textureShape[0] == 1)
    assert(textureShape[1] == 1)
    assert(texture.arrayLength == 1)

    texture2tensor_loop(texture: texture) { (coords: [Int], value: P) in
        // Position in texture-axis order: axis 2 comes from the row, axis 3
        // from column * 4 + component.
        let texturePos: [Int] = [0, 0, coords[1], coords[0] * 4 + coords[3]]
        // Scatter through `transpose` to get the position in tensor order.
        var tensorPos = [Int](repeating: 0, count: 4)
        for axis in 0..<4 {
            tensorPos[transpose[axis]] = texturePos[axis]
        }
        let flatIndex = tensorPos[0] * shape4[1] * shape4[2] * shape4[3]
            + tensorPos[1] * shape4[2] * shape4[3]
            + tensorPos[2] * shape4[3]
            + tensorPos[3]
        // Padding components fall beyond the tensor and are dropped.
        guard flatIndex < total else { return }
        flat[flatIndex] = value
    }
    return flat
}
/// Reads a texture that stores a rank-1 tensor back into a flat array.
///
/// `dim` is right-aligned into a four-entry shape (missing leading axes are 1)
/// and `transpose` selects, per texture axis, which of those four axes it
/// holds. The `assert`s require a single-slice, single-row texture whose width
/// is the permuted axis 3 divided by four, rounded up.
///
/// - Parameters:
///   - texture: Source texture.
///   - dim: Tensor shape; must have exactly one entry.
///   - transpose: Axis permutation between tensor order and texture order.
/// - Returns: `dim.reduce(1, *)` values; positions never written stay at zero.
func texture2tensor_1<P>(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] {
    var shape4 = [Int](repeating: 1, count: 4)
    for (offset, extent) in dim.enumerated() {
        shape4[4 - dim.count + offset] = extent
    }
    let total = dim.reduce(1, *)
    var flat = [P](repeating: Float32(0.0) as! P, count: total)
    let textureShape: [Int] = transpose.map { shape4[$0] }
    assert(dim.count == 1)
    let texelsPerRow = (textureShape[3] + 3) / 4
    assert(texture.width == texelsPerRow)
    assert(texture.height == 1)
    assert(textureShape[0] == 1)
    assert(textureShape[1] == 1)
    assert(textureShape[2] == 1)
    assert(texture.arrayLength == 1)

    texture2tensor_loop(texture: texture) { (coords: [Int], value: P) in
        // Only axis 3 varies: column * 4 + component.
        let texturePos: [Int] = [0, 0, 0, coords[0] * 4 + coords[3]]
        // Scatter through `transpose` to get the position in tensor order.
        var tensorPos = [Int](repeating: 0, count: 4)
        for axis in 0..<4 {
            tensorPos[transpose[axis]] = texturePos[axis]
        }
        let flatIndex = tensorPos[0] * shape4[1] * shape4[2] * shape4[3]
            + tensorPos[1] * shape4[2] * shape4[3]
            + tensorPos[2] * shape4[3]
            + tensorPos[3]
        // Padding components fall beyond the tensor and are dropped.
        guard flatIndex < total else { return }
        flat[flatIndex] = value
    }
    return flat
}
func texture2tensor<P>(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] {
if dim.count == 3 {
return texture2tensor_3(texture: texture, dim: dim, transpose: transpose)
} else if dim.count == 2 {
return texture2tensor_2(texture: texture, dim: dim, transpose: transpose)
} else if dim.count == 1 {
return texture2tensor_1(texture: texture, dim: dim, transpose: transpose)
var tdim: [Int] = [1, 1, 1, 1]
for i in 0..<dim.count {
tdim[4 - dim.count + i] = dim[i]
......@@ -84,30 +205,19 @@ extension MTLDevice {
assert(texture.height == ndim[1])
assert(texture.arrayLength == (ndim[0] * ndim[3] + 3) / 4)
let bpR = ndim[2] * 4 * MemoryLayout<P>.size
let bpI = ndim[1] * bpR
let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: ndim[2], height: ndim[1], depth: 1))
for i in 0..<texture.arrayLength {
let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: ndim[1] * ndim[2] * 4 * MemoryLayout<P>.size)
texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i)
for h in 0..<ndim[1] {
for w in 0..<ndim[2] {
for k in 0..<4 {
let tx = (h * ndim[2] + w) * 4 + k
let n = (i * 4 + k) / ndim[3]
let c = (i * 4 + k) % ndim[3]
let jg = [n, h, w, c]
var ig = [0, 0, 0, 0]
for d in 0..<4 {
ig[transpose[d]] = jg[d]
let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
if ix < count {
tensor[ix] = pointer[tx]
texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
var tg: [Int] = [0, 0, 0, 0]
tg[1] = xyzn[1]
tg[2] = xyzn[0]
tg[0] = (xyzn[2] * 4 + xyzn[3]) / ndim[3]
tg[3] = (xyzn[2] * 4 + xyzn[3]) % ndim[3]
var ig: [Int] = [0, 0, 0, 0]
for k in 0..<4 {
ig[transpose[k]] = tg[k]
let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
if ix < count {
tensor[ix] = v
return tensor
......@@ -83,38 +83,38 @@ public class PaddleMobileUnitTest {
public func testConcat() {
let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
var it: [[Float32]] = []
for _ in 0..<7 {
it.append((0..<12).map { Float32($0) })
let input = it.map { device.tensor2texture(value: $0, dim: [3, 4]) }
let output = device.tensor2texture(value: [Float32](), dim: [3, 28])
let param = ConcatTestParam.init(
input: input,
output: output,
dims: [[3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]],
axis: 1,
odim: [3, 28]
let concatKernel = ConcatKernel<Float32>.init(device: device, testParam: param)
concatKernel.test(cmdBuffer: buffer, param: param)
buffer.addCompletedHandler { (buffer) in
for i in 0..<it.count {
let _: Float32? = input[i].logDesc()
self.tensorPrint(tensor: it[i], dim: [3, 4])
let _: Float32? = output.logDesc()
let tx: [Float32] = self.device.texture2tensor(texture: output, dim: [3, 28])
self.tensorPrint(tensor: tx, dim: [3, 28])
// let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
// var it: [[Float32]] = []
// for _ in 0..<7 {
// it.append((0..<12).map { Float32($0) })
// }
// let input = it.map { device.tensor2texture(value: $0, dim: [3, 4]) }
// let output = device.tensor2texture(value: [Float32](), dim: [3, 28])
// let param = ConcatTestParam.init(
// input: input,
// output: output,
// dims: [[3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]],
// axis: 1,
// odim: [3, 28]
// )
// let concatKernel = ConcatKernel<Float32>.init(device: device, testParam: param)
// concatKernel.test(cmdBuffer: buffer, param: param)
// buffer.addCompletedHandler { (buffer) in
// for i in 0..<it.count {
// let _: Float32? = input[i].logDesc()
// self.tensorPrint(tensor: it[i], dim: [3, 4])
// }
// let _: Float32? = output.logDesc()
// let tx: [Float32] = self.device.texture2tensor(texture: output, dim: [3, 28])
// self.tensorPrint(tensor: tx, dim: [3, 28])
// }
// buffer.commit()
public func testReshape() {
let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
// let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
// let input: [Float32] = (0..<24).map { Float32($0) }
// let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
// let outTexture = device.tensor2texture(value: [Float32](), dim: [4, 6])
......@@ -139,32 +139,32 @@ public class PaddleMobileUnitTest {
// self.tensorPrint(tensor: tx, dim: [4, 6])
// }
let input: [Float32] = (0..<24).map { Float32($0) }
let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
let outTexture = device.tensor2texture(value: [Float32](), dim: [24])
let mp = ReshapeMetalParam.init(
idim: (1, 2, 3, 4),
itrans: (0, 1, 2, 3),
odim: (1, 1, 1, 24),
otrans: (0, 1, 2, 3)
let param = ReshapeTestParam.init(
inputTexture: inTexture,
outputTexture: outTexture,
param: mp
let reshapeKernel = ReshapeKernel<Float32>.init(device: device, testParam: param)
reshapeKernel.test(commandBuffer: buffer, testParam: param)
buffer.addCompletedHandler { (buffer) in
let _: Float32? = inTexture.logDesc()
let _: Float32? = outTexture.logDesc()
self.tensorPrint(tensor: input, dim: [2, 3, 4])
let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [24])
self.tensorPrint(tensor: tx, dim: [24])
// let input: [Float32] = (0..<24).map { Float32($0) }
// let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
// let outTexture = device.tensor2texture(value: [Float32](), dim: [24])
// let mp = ReshapeMetalParam.init(
// idim: (1, 2, 3, 4),
// itrans: (0, 1, 2, 3),
// odim: (1, 1, 1, 24),
// otrans: (0, 1, 2, 3)
// )
// let param = ReshapeTestParam.init(
// inputTexture: inTexture,
// outputTexture: outTexture,
// param: mp
// )
// let reshapeKernel = ReshapeKernel<Float32>.init(device: device, testParam: param)
// reshapeKernel.test(commandBuffer: buffer, testParam: param)
// buffer.addCompletedHandler { (buffer) in
// let _: Float32? = inTexture.logDesc()
// let _: Float32? = outTexture.logDesc()
// self.tensorPrint(tensor: input, dim: [2, 3, 4])
// let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [24])
// self.tensorPrint(tensor: tx, dim: [24])
// }
// buffer.commit()
public func testTranspose() {
......@@ -195,23 +195,23 @@ public class PaddleMobileUnitTest {
// let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 3, 2, 4])
// self.tensorPrint(tensor: tx, dim: [3, 3, 2, 4])
// }
let input: [Float32] = (0..<24).map { Float32($0) }
let inputTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
let outputTexture = device.tensor2texture(value: [Float](), dim: [3, 4, 2])
let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [0, 2, 3, 1])
let transposeKernel = TransposeKernel<Float32>.init(device: device, testParam: param)
transposeKernel.test(commandBuffer: buffer, param: param)
buffer.addCompletedHandler { (buffer) in
let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false)
let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false)
self.tensorPrint(tensor: input, dim: [2, 3, 4])
let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 4, 2])
self.tensorPrint(tensor: tx, dim: [3, 4, 2])
// let input: [Float32] = (0..<24).map { Float32($0) }
// let inputTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
// let outputTexture = device.tensor2texture(value: [Float](), dim: [3, 4, 2])
// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [0, 2, 3, 1])
// let transposeKernel = TransposeKernel<Float32>.init(device: device, testParam: param)
// transposeKernel.test(commandBuffer: buffer, param: param)
// buffer.addCompletedHandler { (buffer) in
// let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false)
// let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false)
// self.tensorPrint(tensor: input, dim: [2, 3, 4])
// let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 4, 2])
// self.tensorPrint(tensor: tx, dim: [3, 4, 2])
// }
......@@ -243,7 +243,7 @@ extension Tensor: Variant {
extension Texture: Variant {
extension ResultHolder: Variant {
extension GPUResultHolder: Variant {
extension InputTexture: Variant {
......@@ -252,3 +252,43 @@ extension InputTexture: Variant {
extension MTLTexture where Self: Variant {
// Variant that owns an optional Metal buffer of Float32 values together with
// the dims and element count describing it.
class FetchHolder: Variant {
// Backing buffer; stays nil until initBuffer(device:) is called.
var resultBuffer: MTLBuffer?
// Shape supplied at construction time.
var dim: [Int]
// Number of Float32 elements the buffer is sized for.
var capacity: Int
// Stores the element count and shape; no buffer is created here.
init(inCapacity: Int, inDim: [Int]) {
capacity = inCapacity
dim = inDim
// Creates the backing buffer on `device`: capacity * 4 bytes, where 4 is the
// byte size of one Float32.
func initBuffer(device: MTLDevice) {
resultBuffer = device.makeBuffer(length: capacity * 4, options: [])
// Typed view of the buffer contents as `capacity` Float32 values. Requires
// resultBuffer to be non-nil.
// NOTE(review): the guard's else-branch is not visible in this excerpt —
// confirm how the nil case is handled.
var result: UnsafeMutablePointer<Float32> {
guard let inResultBuffer = resultBuffer else {
return inResultBuffer.contents().bindMemory(to: Float32.self, capacity: capacity)
// String-convertible conformances for FetchHolder.
// NOTE(review): only commented-out return statements are visible in the two
// getters below — confirm what they actually do before relying on them.
extension FetchHolder: CustomStringConvertible, CustomDebugStringConvertible {
var description: String {
// return "\(result)"
var debugDescription: String {
// return "\(result)"
......@@ -46,8 +46,9 @@ public class Genet: Net {
override public func resultStr(res: [Float]) -> String {
return " \(Array<Float>(res.suffix(10))) ... "
override public func resultStr(res: ResultHolder) -> String {
// fatalError()
return " \(res.result![0]) ... "
......@@ -42,9 +42,12 @@ class MobileNet: Net{
let labels = PreWords.init(fileName: "synset")
override public func resultStr(res: [Float]) -> String {
override public func resultStr(res: ResultHolder) -> String {
guard let resPointer = res.result else {
var s: [String] = []
res.top(r: 5).enumerated().forEach{
(0..<res.capacity).map { resPointer[$0] }.top(r: 5).enumerated().forEach{
s.append(String(format: "%d: %@ (%3.2f%%)", $0 + 1, labels[$1.0], $1.1 * 100))
return s.joined(separator: "\n")
......@@ -46,51 +46,52 @@ public class MobileNet_ssd_hand: Net{
override public func resultStr(res: [Float]) -> String {
override public func resultStr(res: ResultHolder) -> String {
return " \(res)"
override func fetchResult(paddleMobileRes: ResultHolder) -> [Float32] {
override func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder {
guard let interRes = paddleMobileRes.intermediateResults else {
fatalError(" need have inter result ")
guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as? Texture<Float32> else {
fatalError(" need score ")
guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? Texture<Float32> else {
var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3]))
// print("score: ")
// print(scoreFormatArr.strideArray())
// guard let interRes = paddleMobileRes.intermediateResults else {
// fatalError(" need have inter result ")
// }
var bboxArr = bbox.metalTexture.float32Array()
// print("bbox: ")
// print(bboxArr.strideArray())
let nmsCompute = NMSCompute.init()
nmsCompute.scoreThredshold = 0.01
nmsCompute.nmsTopK = 400
nmsCompute.keepTopK = 200
nmsCompute.nmsEta = 1.0
nmsCompute.nmsThreshold = 0.45
nmsCompute.background_label = 0;
nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])]
nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])]
guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else {
fatalError( " result error " )
let output: [Float32] = result.map { $0.floatValue }
return output
// guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as? Texture<Float32> else {
// fatalError(" need score ")
// }
// guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? Texture<Float32> else {
// fatalError()
// }
// var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3]))
//// print("score: ")
//// print(scoreFormatArr.strideArray())
// var bboxArr = bbox.metalTexture.float32Array()
//// print("bbox: ")
//// print(bboxArr.strideArray())
// let nmsCompute = NMSCompute.init()
// nmsCompute.scoreThredshold = 0.01
// nmsCompute.nmsTopK = 400
// nmsCompute.keepTopK = 200
// nmsCompute.nmsEta = 1.0
// nmsCompute.nmsThreshold = 0.45
// nmsCompute.background_label = 0;
// nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])]
// nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])]
// guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else {
// fatalError( " result error " )
// }
// let output: [Float32] = result.map { $0.floatValue }
// return output
......@@ -30,50 +30,112 @@ public class MobileNet_ssd_AR: Net{
class MobilenetssdPreProccess: CusomKernel {
init(device: MTLDevice) {
let s = CusomKernel.Shape.init(inWidth: 160, inHeight: 160, inChannel: 3)
super.init(device: device, inFunctionName: "mobilent_ar_preprocess_half", outputDim: s, usePaddleMobileLib: false)
super.init(device: device, inFunctionName: "mobilent_ar_preprocess", outputDim: s, usePaddleMobileLib: false)
override public func resultStr(res: [Float]) -> String {
return " \(res)"
override public func resultStr(res: ResultHolder) -> String {
return " \(res.result![0])"
override func fetchResult(paddleMobileRes: ResultHolder) -> [Float32] {
override func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder {
guard let interRes = paddleMobileRes.intermediateResults else {
fatalError(" need have inter result ")
guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as? Texture<Float32> else {
guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as? FetchHolder else {
fatalError(" need score ")
guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? Texture<Float32> else {
guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? FetchHolder else {
var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3]))
// print("score: ")
// print(scoreFormatArr.strideArray())
var bboxArr = bbox.metalTexture.float32Array()
// print("bbox: ")
// print(bboxArr.strideArray())
// let startDate = Date.init()
// print("scoreFormatArr: ")
//print((0..<score.capacity).map{ score.result[$0] }.strideArray())
// print("bbox arr: ")
// print((0..<bbox.capacity).map{ bbox.result[$0] }.strideArray())
let nmsCompute = NMSCompute.init()
nmsCompute.scoreThredshold = 0.01
nmsCompute.nmsTopK = 400
nmsCompute.keepTopK = 200
nmsCompute.scoreThredshold = 0.25
nmsCompute.nmsTopK = 100
nmsCompute.keepTopK = 100
nmsCompute.nmsEta = 1.0
nmsCompute.nmsThreshold = 0.45
nmsCompute.nmsThreshold = 0.449999988
nmsCompute.background_label = 0;
nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])]
nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])]
guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else {
nmsCompute.scoreDim = [NSNumber.init(value: score.dim[0]), NSNumber.init(value: score.dim[1]), NSNumber.init(value: score.dim[2])]
nmsCompute.bboxDim = [NSNumber.init(value: bbox.dim[0]), NSNumber.init(value: bbox.dim[1]), NSNumber.init(value: bbox.dim[2])]
guard let result = nmsCompute.compute(withScore: score.result, andBBoxs: bbox.result) else {
fatalError( " result error " )
let resultHolder = ResultHolder.init(inResult: result.output, inCapacity: Int(result.outputSize))
// for i in 0..<Int(result.outputSize) {
// print("i \(i) : \(result.output[i])")
// }
// print(Date.init().timeIntervalSince(startDate))
// print(resultHolder.result![0])
return resultHolder
override func updateProgram(program: Program) {
for i in [56, 66, 76, 86, 93, 99] {
let opDesc = program.programDesc.blocks[0].ops[i]
let output = opDesc.outputs["Out"]!.first!
let v = program.scope[output]!
let originTexture = v as! Texture<Float32>
originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1] / 7, originTexture.tensorDim[0] * 7])
originTexture.dim = Dim.init(inDim: [1, 1, originTexture.dim[3] / 7, originTexture.dim[2] * 7])
originTexture.padToFourDim = Dim.init(inDim: [1, 1, originTexture.padToFourDim[3] / 7, originTexture.padToFourDim[2] * 7])
program.scope[output] = originTexture
if i == 99 {
opDesc.attrs["axis"] = 0
} else {
opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
let output: [Float32] = result.map { $0.floatValue }
return output
for i in [58, 59, 88, 89, 95, 96, 68, 69, 78, 79] {
let opDesc = program.programDesc.blocks[0].ops[i]
let output = opDesc.outputs["Out"]!.first!
let v = program.scope[output]!
let originTexture = v as! Texture<Float32>
originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
for i in [60, 101, 90, 97, 70, 80] {
let opDesc = program.programDesc.blocks[0].ops[i]
let output = opDesc.outputs["Out"]!.first!
let v = program.scope[output]!
let originTexture = v as! Texture<Float32>
originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1
for i in [102] {
let opDesc = program.programDesc.blocks[0].ops[i]
for output in opDesc.outputs["Out"]! {
let v = program.scope[output]!
let originTexture = v as! Texture<Float32>
originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1
print(" split axis \(opDesc.attrs["axis"])")
// 99
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
// Pairs a raw Float32 result pointer with an element count; both are exposed
// to Objective-C.
public class ResultHolder: NSObject {
// Pointer to the result values; may be nil.
@objc public let result: UnsafeMutablePointer<Float32>?
// Element count passed to deinitialize(count:) in releasePointer —
// presumably the number of Float32 values behind `result`; confirm with the
// producers.
@objc public let capacity: Int
// Stores the pointer and count as given; nothing is copied.
init(inResult: UnsafeMutablePointer<Float32>?, inCapacity: Int) {
result = inResult
capacity = inCapacity
// Deinitializes `capacity` elements behind `result`; does nothing when
// `result` is nil.
// NOTE(review): deinitialize(count:) does not free memory and no
// deallocation is visible in this excerpt — confirm who releases the buffer.
public func releasePointer() {
result?.deinitialize(count: capacity)
// Base class describing a network: preprocessing settings, where the model
// and parameters come from, and hooks for turning raw results into output.
// NOTE(review): several method bodies are not visible in this excerpt; the
// comments below describe only what is shown.
public class Net: NSObject {
var except: Int = 0
// Preprocessing values — presumably the per-channel means and the scale
// applied to the input; confirm against the preprocessing kernels.
var means: [Float] = []
var scale: Float = 0.0
// Input shape as (n, h, w, c).
var dim: (n: Int, h: Int, w: Int, c: Int) = (n: 0, h: 0, w: 0, c: 0)
// Optional custom preprocessing kernel.
var preprocessKernel: CusomKernel? = nil
// In-memory parameter data and its size, set by the pointer-based initializer.
var paramPointer: UnsafeMutableRawPointer? = nil
var paramSize: Int = 0
// In-memory model data and its size, set by the pointer-based initializer.
var modelPointer: UnsafeMutableRawPointer? = nil
var modelSize: Int = 0
// Path-based alternatives to the in-memory pointers; empty by default.
var modelPath: String = ""
var paramPath: String = ""
var modelDir: String = ""
// Creates a net whose model and parameters are supplied as in-memory buffers.
// Note the argument label is spelled `modePointer`; it is stored in
// `modelPointer`.
@objc public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) {
self.paramPointer = paramPointer
self.paramSize = paramSize
self.modelPointer = modePointer
self.modelSize = modelSize
// Formats a result for display. Body not visible in this excerpt.
public func resultStr(res: ResultHolder) -> String {
// Wraps the GPU result's pointer and capacity in a ResultHolder without
// copying.
func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder {
return ResultHolder.init(inResult: paddleMobileRes.resultPointer, inCapacity: paddleMobileRes.capacity)
// Creates a net without in-memory model data. Body not visible in this excerpt.
@objc public init(device: MTLDevice) {
// Hook taking the loaded program. Body not visible in this excerpt.
func updateProgram(program: Program) {
......@@ -64,7 +64,8 @@ class OpCreator<P: PrecisionType> {
gBilinearInterpType : BilinearInterpOp<P>.creat,
gSplit : SplitOp<P>.creat,
gShape : ShapeOp<P>.creat,
gFlatten : FlattenOp<P>.creat]
gFlatten : FlattenOp<P>.creat,
gConvAddPreluType : ConvAddPreluOp<P>.creat]
private init(){}
......@@ -19,6 +19,12 @@ protocol Fusion {
static func fusionNode() -> Node
static func change() -> [String : [(from: String, to: String)]]
static func fusionType() -> String
static func needCheck() -> [(Int, String)]
/// Default implementation shared by every `Fusion` conformer.
extension Fusion {
    /// Returns an empty list; conformers that have entries to check provide
    /// their own `needCheck()`.
    static func needCheck() -> [(Int, String)] {
        let nothingToCheck: [(Int, String)] = []
        return nothingToCheck
    }
}
protocol Runable {
......@@ -26,6 +32,7 @@ protocol Runable {
func runImpl(device: MTLDevice,buffer: MTLCommandBuffer) throws
func delogOutput()
func inputVariant() -> [String : [Variant]]
func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer)
extension Runable where Self: OperatorProtocol{
......@@ -38,11 +45,16 @@ extension Runable where Self: OperatorProtocol{
func inputVariant() -> [String : [Variant]] {
return [:]
// fatalError(" op \(type) need implement inputVariant")
// return [:]
fatalError(" op \(type) need implement inputVariant")
func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) {
fatalError(" need implement ")
func delogOutput() {
print(type + ": has no implementation" )
......@@ -144,6 +156,7 @@ let gBilinearInterpType = "bilinear_interp"
let gSplit = "split"
let gShape = "shape"
let gFlatten = "flatten"
let gConvAddPreluType = "conv_add_prelu"
let opInfos = [gConvType : (inputs: ["Input"], outputs: ["Output"]),
gBatchNormType : (inputs: ["X"], outputs: ["Y"]),
......@@ -169,5 +182,7 @@ let opInfos = [gConvType : (inputs: ["Input"], outputs: ["Out
gBilinearInterpType : (inputs: ["X"], outputs: ["Out"]),
gSplit : (inputs: ["X"], outputs: ["Out"]),
gShape : (inputs: ["Input"], outputs: ["Out"]),
gFlatten : (inputs: ["X"], outputs: ["Out"])
gFlatten : (inputs: ["X"], outputs: ["Out"]),
gConvAddPreluType : (inputs: ["Input"], outputs: ["Out"])
......@@ -19,11 +19,14 @@ class BatchNormParam<P: PrecisionType>: OpParam {
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
input = try BatchNormParam.inputX(inputs: opDesc.inputs, from: inScope)
if input.transpose != [0, 2, 3, 1] {
fatalError("batch norm only accepts NHWC")
output = try BatchNormParam.outputY(outputs: opDesc.outputs, from: inScope)
inputBias = try BatchNormParam.inputBiase(inputs: opDesc.paraInputs, from: inScope)
inputMean = try BatchNormParam.inputMean(inputs: opDesc.paraInputs, from: inScope)
inputScale = try BatchNormParam.inputScale(inputs: opDesc.paraInputs, from: inScope)
inputVariance = try BatchNormParam.inputVariance(inputs: opDesc.paraInputs, from: inScope)
bias = try BatchNormParam.getFirstTensor(key: "Bias", map: opDesc.paraInputs, from: inScope)
mean = try BatchNormParam.getFirstTensor(key: "Mean", map: opDesc.paraInputs, from: inScope)
scale = try BatchNormParam.getFirstTensor(key: "Scale", map: opDesc.paraInputs, from: inScope)
variance = try BatchNormParam.getFirstTensor(key: "Variance", map: opDesc.paraInputs, from: inScope)
epsilon = try BatchNormParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
momentum = try BatchNormParam.getAttr(key: "momentum", attrs: opDesc.attrs)
} catch let error {
......@@ -32,10 +35,10 @@ class BatchNormParam<P: PrecisionType>: OpParam {
let input: Texture<P>
var output: Texture<P>
let inputBias: Tensor<ParamPrecisionType>
let inputMean: Tensor<ParamPrecisionType>
let inputScale: Tensor<ParamPrecisionType>
let inputVariance: Tensor<ParamPrecisionType>
let bias: Tensor<P>
let mean: Tensor<P>
let scale: Tensor<P>
let variance: Tensor<P>
let epsilon: Float
let momentum: Float
......@@ -53,9 +56,11 @@ class BatchNormOp<P: PrecisionType>: Operator<BatchNormKernel<P>, BatchNormParam
throw error
func delogOutput() {
print(" \(type) output: ")
let device = para.output.metalTexture!.device
let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
......@@ -19,15 +19,15 @@ class BilinearInterpParam<P: PrecisionType>: OpParam {
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
input = try BilinearInterpParam.inputX(inputs: opDesc.inputs, from: inScope)
// if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) {
// fatalError()
// }
output = try BilinearInterpParam.outputOut(outputs: opDesc.outputs, from: inScope)
out_h = try BilinearInterpParam.getAttr(key: "out_h", attrs: opDesc.attrs)
out_w = try BilinearInterpParam.getAttr(key: "out_w", attrs: opDesc.attrs)
} catch let error {
throw error
if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) {
let input: Texture<P>
var output: Texture<P>
......@@ -53,6 +53,10 @@ class BilinearInterpOp<P: PrecisionType>: Operator<BilinearInterpKernel<P>, Bili
func delogOutput() {
print(" \(type) output: ")
let device = para.output.metalTexture!.device
let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
// print(outputArray)
......@@ -27,6 +27,10 @@ class BoxcoderParam<P: PrecisionType>: OpParam {
} catch let error {
throw error
assert(priorBox.tensorDim.cout() == 2)
assert(priorBoxVar.tensorDim.cout() == 2)
assert(targetBox.tensorDim.cout() == 3)
assert(output.tensorDim.cout() == 3)
assert(priorBox.transpose == [0, 1, 2, 3])
assert(priorBoxVar.transpose == [0, 1, 2, 3])
assert(targetBox.transpose == [0, 1, 2, 3])
......@@ -59,30 +63,19 @@ class BoxcoderOp<P: PrecisionType>: Operator<BoxcoderKernel<P>, BoxcoderParam<P>
func delogOutput() {
print(" \(type) output: ")
// let priorBoxpadToFourDim = para.priorBox.padToFourDim
// let priorBoxArray: [Float32] = para.priorBox.metalTexture.realNHWC(dim: (n: priorBoxpadToFourDim[0], h: priorBoxpadToFourDim[1], w: priorBoxpadToFourDim[2], c: priorBoxpadToFourDim[3]))
// print(" prior box ")
// print(priorBoxArray.strideArray())
// let priorBoxVarpadToFourDim = para.priorBoxVar.padToFourDim
// let priorBoxVarArray: [Float32] = para.priorBoxVar.metalTexture.realNHWC(dim: (n: priorBoxVarpadToFourDim[0], h: priorBoxVarpadToFourDim[1], w: priorBoxVarpadToFourDim[2], c: priorBoxVarpadToFourDim[3]))
// print(" prior box var ")
// print(priorBoxVarArray.strideArray())
// let targetBoxpadToFourDim = para.targetBox.padToFourDim
// let targetBoxArray: [Float32] = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxpadToFourDim[0], h: targetBoxpadToFourDim[1], w: targetBoxpadToFourDim[2], c: targetBoxpadToFourDim[3]))
// print(" target box ")
// print(targetBoxArray.strideArray())
let targetBoxpadToFourDim = para.targetBox.padToFourDim
let targetBoxArray = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxpadToFourDim[0], h: targetBoxpadToFourDim[1], w: targetBoxpadToFourDim[2], c: targetBoxpadToFourDim[3]))
let device = para.output.metalTexture!.device
let pbv : [Float32] = device.texture2tensor(texture: para.priorBoxVar.metalTexture!, dim: para.priorBoxVar.tensorDim.dims, transpose: para.priorBoxVar.transpose)
let pb : [Float32] = device.texture2tensor(texture: para.priorBox.metalTexture!, dim: para.priorBox.tensorDim.dims, transpose: para.priorBox.transpose)
let tb : [Float32] = device.texture2tensor(texture: para.targetBox.metalTexture!, dim: para.targetBox.tensorDim.dims, transpose: para.targetBox.transpose)
let out : [Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
print(" prior box var ")
print(" target box ")
let padToFourDim = para.output.padToFourDim
let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
print(" prior box ")
print(" output ")
......@@ -65,15 +65,10 @@ class ConcatOp<P: PrecisionType>: Operator<ConcatKernel<P>, ConcatParam<P>>, Run
func delogOutput() {
print(" \(type) output: ")
let padToFourDim = para.output.padToFourDim
if para.output.transpose == [0, 1, 2, 3] {
let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
} else if para.output.transpose == [0, 2, 3, 1] {
print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3])).strideArray())
} else {
fatalError(" not implemet")
let device = para.output.metalTexture!.device
let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
/// Parameter bundle for the conv-add-prelu op. Every field is resolved once in `init`
/// from the op description and the scope.
class ConvAddPreluParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
/// Resolves all inputs, outputs and attributes.
/// - Throws: whatever error a lookup raises; it is rethrown unchanged.
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
// `filter` is looked up in `opDesc.paraInputs`; `input`/`output` in `opDesc.inputs`/`opDesc.outputs`.
filter = try ConvAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
input = try ConvAddPreluParam.input(inputs: opDesc.inputs, from: inScope)
output = try ConvAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
// Attributes are read by key from `opDesc.attrs`.
stride = try ConvAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs)
paddings = try ConvAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
dilations = try ConvAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
groups = try ConvAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs)
// `alpha` and `y` also come from `opDesc.paraInputs`; `mode` is an attribute.
alpha = try ConvAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
mode = try ConvAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
y = try ConvAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
} catch let error {
throw error
// Stored properties populated by `init`. Only `output` is declared `var`.
let input: Texture<P>
let y: Tensor<ParamPrecisionType>
let filter: Tensor<ParamPrecisionType>
let mode: String
let alpha: Tensor<P>
var output: Texture<P>
let stride: [Int32]
let paddings: [Int32]
let dilations: [Int32]
let groups: Int
/// Fused operator that replaces a conv -> elementwise-add -> prelu chain (see `fusionNode`).
class ConvAddPreluOp<P: PrecisionType>: Operator<ConvAddPreluKernel<P>, ConvAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
typealias OpType = ConvAddPreluOp<P>
/// Builds the node pattern to match: conv, then elementwise add, then prelu.
/// Returns the first node of that chain.
static func fusionNode() -> Node {
let beginNode = Node.init(inType: gConvType)
// The chained result is discarded; only the head node is returned.
_ = beginNode
--> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType)
return beginNode
/// No key renames are declared for this fusion.
static func change() -> [String : [(from: String, to: String)]] {
return [:]
/// Type identifier of the fused op.
static func fusionType() -> String {
return gConvAddPreluType
/// Derives the output dims from the input dims, filter dims, strides, paddings and dilations.
func inferShape() {
let inDims = para.input.dim
let filterDim = para.filter.dim
let strides = para.stride
let paddings = para.paddings
let dilations = para.dilations
// The output starts with the input's leading dimension.
var outDim = [inDims[0]]
for i in 0..<strides.count {
let dilation: Int = Int(dilations[i])
let filterSize: Int = filterDim[i + 1]
let inputSize: Int = inDims[i + 1]
let padding: Int = Int(paddings[i])
let stride: Int = Int(strides[i])
// Extent covered by the dilated kernel, then the strided output extent.
let dKernel = dilation * (filterSize - 1) + 1
let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
// NOTE(review): no line in this view stores `outputSize` into `outDim` — confirm against the full file.
para.output.dim = Dim.init(inDim: outDim)
/// Encodes the kernel into `buffer`; any error from the kernel is rethrown unchanged.
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
/// Debug helper: prints the op type and the output converted with (n, c, h, w) taken from `tensorDim`.
func delogOutput() {
print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
......@@ -15,14 +15,15 @@
import Foundation
/// Parameters of the fetch op: the input texture, the scope, and the holder that receives the result.
// NOTE(review): `output` is declared and assigned twice below (Texture<P> / FetchHolder);
// this looks like a removed and an added diff line shown together — confirm which is live.
class FetchParam<P: PrecisionType>: OpParam{
var output: Texture<P>
var output: FetchHolder
let input: Texture<P>
let scope: Scope
/// Resolves the input from `opDesc.inputs` and prepares the output.
/// - Throws: any lookup error, rethrown unchanged.
required init(opDesc: OpDesc, inScope: Scope) throws {
scope = inScope
do {
input = try FetchParam.inputX(inputs: opDesc.inputs, from: inScope)
output = input
// The holder is sized from the input's element count and tensor dims, then registered on the scope.
output = FetchHolder.init(inCapacity: input.numel(), inDim: input.tensorDim.dims)
scope.setOutput(output: output)
} catch let error {
throw error
......@@ -34,14 +35,40 @@ class FetchParam<P: PrecisionType>: OpParam{
/// Kernel that dispatches a fetch function over the input texture with the output's result buffer bound.
class FetchKernel<P: PrecisionType>: Kernel, Computable {
/// Binds the input texture (texture index 0) and the output's result buffer (buffer index 0),
/// then dispatches over the input texture.
/// - Throws: `PaddleMobileError.predictError` when no compute encoder can be created.
func compute(commandBuffer: MTLCommandBuffer, param: FetchParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setBuffer(param.output.resultBuffer!, offset: 0, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture)
/// Allocates the output buffer and selects the Metal function by precision and input transpose.
// NOTE(review): an unconditional "place_holder" `super.init` and the precision-specific ones
// are both visible here; presumably the former is the removed diff line — confirm.
required init(device: MTLDevice, param: FetchParam<P>) {
super.init(device: device, inFunctionName: "place_holder")
param.output.initBuffer(device: device)
if computePrecision == .Float16 {
// Only the [0, 2, 3, 1] transpose gets a real fetch function; other layouts fall back to a placeholder.
if param.input.transpose == [0, 2, 3, 1] {
super.init(device: device, inFunctionName: "fetch_half")
} else {
// fatalError(" not support ")
super.init(device: device, inFunctionName: "fetch_placeholder_half")
print(" not support ")
} else if computePrecision == .Float32 {
if param.input.transpose == [0, 2, 3, 1] {
super.init(device: device, inFunctionName: "fetch")
} else {
print(" not support ")
super.init(device: device, inFunctionName: "fetch_placeholder")
// fatalError(" not support ")
} else {
fatalError(" not support ")
class FetchOp<P: PrecisionType>: Operator< FetchKernel<P>, FetchParam<P>>, Runable, Creator, InferShaperable{
class FetchOp<P: PrecisionType>: Operator< FetchKernel<P>, FetchParam<P>>, Runable, Creator, InferShaperable {
typealias OpType = FetchOp<P>
......@@ -50,7 +77,11 @@ class FetchOp<P: PrecisionType>: Operator< FetchKernel<P>, FetchParam<P>>, Runab
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
scope.setOutput(output: para.output)
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
......@@ -14,7 +14,24 @@
import Foundation
class FlattenOp<P: PrecisionType>: Operator<ReshapeKernel<P>, ReshapeParam<P>>, Runable, Creator, InferShaperable{
/// Parameters of the flatten op: input texture, output texture and the `axis` attribute.
class FlattenParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
/// Resolves input, output and the "axis" attribute from the op description.
/// - Throws: any lookup error, rethrown unchanged.
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
input = try FlattenParam.inputX(inputs: opDesc.inputs, from: inScope)
output = try FlattenParam.outputOut(outputs: opDesc.outputs, from: inScope)
axis = try FlattenParam.getAttr(key: "axis", attrs: opDesc.attrs)
} catch let error {
throw error
// Stored properties populated by `init`.
let input: Texture<P>
var output: Texture<P>
let axis: Int
class FlattenOp<P: PrecisionType>: Operator<FlattenKernel<P>, FlattenParam<P>>, Runable, Creator, InferShaperable{
typealias OpType = FlattenOp<P>
......@@ -32,6 +49,9 @@ class FlattenOp<P: PrecisionType>: Operator<ReshapeKernel<P>, ReshapeParam<P>>,
func delogOutput() {
print(" \(type) output: ")
let device = para.output.metalTexture!.device
let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
......@@ -15,20 +15,21 @@
import Foundation
/// Batch-norm kernel. `init` folds mean and variance into the scale and bias arrays in place,
/// so `compute` only needs to bind scale and bias.
class BatchNormKernel<P: PrecisionType>: Kernel, Computable {
// var newScale: MTLBuffer
// var newBias: MTLBuffer
/// Rewrites `param.scale` / `param.bias` in place, creates their buffers, initialises the
/// output texture with the input's transpose, and selects the function by precision.
required init(device: MTLDevice, param: BatchNormParam<P>) {
// guard let newScale = device.makeBuffer(length: param.inputScale.buffer.length) else {
// fatalError()
// }
// guard let newBias = device.makeBuffer(length: param.inputBias.buffer.length) else {
// fatalError()
// }
// self.newScale = newScale
// self.newBias = newBias
let count = param.variance.dim.numel()
let varianceP = param.variance.data.pointer
let meanP = param.mean.data.pointer
let scaleP = param.scale.data.pointer
let biasP = param.bias.data.pointer
for i in 0..<count {
// invStd = 1 / sqrt(variance + epsilon), computed in Float32 and converted to P.
let invStd = P(1 / (Float32(varianceP[i]) + param.epsilon).squareRoot())
// Order matters: bias is updated first because it reads the not-yet-rescaled scale.
biasP[i] = biasP[i] - meanP[i] * invStd * scaleP[i]
scaleP[i] = invStd * scaleP[i]
param.bias.initBuffer(device: device, precision: computePrecision)
param.scale.initBuffer(device: device, precision: computePrecision)
param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "batchnorm")
} else if computePrecision == .Float16 {
......@@ -36,37 +37,16 @@ class BatchNormKernel<P: PrecisionType>: Kernel, Computable {
} else {
// let varianceBuffer : MTLBuffer = param.inputVariance.buffer
// var invStd: [Float32] = Array(repeating: 0, count: varianceBuffer.length)
// let varianceContents = varianceBuffer.contents().assumingMemoryBound(to: P.self)
// for i in 0..<(varianceBuffer.length / MemoryLayout<P>.stride) {
// invStd[i] = 1 / (Float32(varianceContents[i]) + param.epsilon).squareRoot()
// }
// let newScaleContents = newScale.contents().assumingMemoryBound(to: P.self)
// let newBiasContents = newBias.contents().assumingMemoryBound(to: P.self)
// let scale : MTLBuffer = param.inputScale.buffer
// let scaleContents = scale.contents().assumingMemoryBound(to: P.self)
// let bias : MTLBuffer = param.inputBias.buffer
// let biasContents = bias.contents().assumingMemoryBound(to: P.self)
// let meanContents = param.inputMean.buffer.contents().assumingMemoryBound(to: P.self)
// for i in 0..<(newScale.length / MemoryLayout<P>.stride) {
// newScaleContents[i] = P(invStd[i] * Float32(scaleContents[i]))
// newBiasContents[i] = P(Float32(biasContents[i]) - Float32(meanContents[i]) * invStd[i] * Float32(scaleContents[i]))
// }
/// Binds input (texture 0), output (texture 1), scale (buffer 0) and bias (buffer 1),
/// then dispatches over the output texture.
/// - Throws: `PaddleMobileError.predictError` when no compute encoder can be created.
func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
// encoder.setTexture(param.input.metalTexture, index: 0)
// encoder.setTexture(param.output.metalTexture, index: 1)
// encoder.setBuffer(newScale, offset: 0, index: 0)
// encoder.setBuffer(newBias, offset: 0, index: 1)
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBuffer(param.scale.buffer, offset: 0, index: 0)
encoder.setBuffer(param.bias.buffer, offset: 0, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
......@@ -27,10 +27,16 @@ class BilinearInterpKernel<P: PrecisionType>: Kernel, Computable{
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
let ratio_h: Float32 = Float32(param.input.tensorDim.dims[2]) / Float32(param.output.tensorDim.dims[2])
let ratio_w: Float32 = Float32(param.input.tensorDim.dims[3]) / Float32(param.output.tensorDim.dims[3])
var ratio_h: Float32 = 0
var ratio_w: Float32 = 0
if param.output.tensorDim.dims[2] > 1 {
ratio_h = Float32(param.input.tensorDim.dims[2]-1) / Float32(param.output.tensorDim.dims[2]-1)
if param.output.tensorDim.dims[3] > 1 {
ratio_w = Float32(param.input.tensorDim.dims[3]-1) / Float32(param.output.tensorDim.dims[3]-1)
var p = BilinearInterpMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w)
encoder.setBytes(&p, length: MemoryLayout<ConcatMetalParam>.size, index: 0)
encoder.setBytes(&p, length: MemoryLayout<BilinearInterpMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
......@@ -38,7 +44,7 @@ class BilinearInterpKernel<P: PrecisionType>: Kernel, Computable{
required init(device: MTLDevice, param: BilinearInterpParam<P>) {
param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "bilinear_interp")
super.init(device: device, inFunctionName: "bilinear_interp_float")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "bilinear_interp_half")
} else {
......@@ -33,9 +33,9 @@ class BoxcoderKernel<P: PrecisionType>: Kernel, Computable{
required init(device: MTLDevice, param: BoxcoderParam<P>) {
param.output.initTexture(device: device, computePrecision: computePrecision)
param.output.initTexture(device: device, inTranspose: [0, 3, 1, 2], computePrecision: computePrecision)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "boxcoder")
super.init(device: device, inFunctionName: "boxcoder_float")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "boxcoder_half")
} else {
......@@ -31,101 +31,111 @@ struct ConcatMetalParam {
/// Concat kernel. The visible `init` precomputes a `ConcatMetalParam` (`pm`) and a variant tag (`v`)
/// that is baked into the Metal function name; `compute` then binds inputs, output and `pm`.
// NOTE(review): this span shows several methods interleaved (`encodeTest`, `encode`, `test`, and
// two `compute` / `init` declarations); it reads like removed and added diff lines shown
// together — confirm which revision each line belongs to before editing.
class ConcatKernel<P: PrecisionType>: Kernel, Computable{
func encodeTest(_ cmdBuffer: MTLCommandBuffer, _ param: ConcatTestParam, _ istart: Int, _ iend: Int) {
let encoder = cmdBuffer.makeComputeCommandEncoder()!
var p = ConcatMetalParam.init()
// Right-align the output dims into four slots, padding the leading slots with 1.
var odim: [Int32] = [1, 1, 1, 1]
for i in 0..<param.odim.count {
odim[4-param.odim.count+i] = Int32(param.odim[i])
p.odim = (odim[0], odim[1], odim[2], odim[3])
p.axis = Int32(4 - param.odim.count + param.axis)
for i in 0..<istart {
p.offset += Int32(param.dims[i][param.axis])
// Variant tag used in the function name; stays "normal" unless `init` picks "x", "y" or "z".
var v = "normal"
var pm = ConcatMetalParam.init()
/// Binds every input texture at indices 0..<num and the output at index num
/// (and again at num + 1 for the "normal" variant), passes `pm`, and dispatches over the output.
func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
var vdim: [Int32] = []
for i in 0..<(iend - istart) {
encoder.setTexture(param.input[i+istart], index: i)
let num = param.input.count
for i in 0..<num {
encoder.setTexture(param.input[i].metalTexture, index: i)
for i in (iend-istart)..<6 {
encoder.setTexture(param.input[0], index: i)
encoder.setTexture(param.output.metalTexture, index: num)
if v == "normal" {
encoder.setTexture(param.output.metalTexture, index: num + 1)
p.vdim = (vdim[0], vdim[1], vdim[2], vdim[3], vdim[4], vdim[5])
encoder.setTexture(param.output, index: 6)
encoder.setTexture(param.output, index: 7)
encoder.setBytes(&p, length: MemoryLayout<ConcatMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output)
encoder.setBytes(&pm, length: MemoryLayout<ConcatMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
func encode(_ cmdBuffer: MTLCommandBuffer, _ param: ConcatParam<P>, _ istart: Int, _ iend: Int) throws {
guard let encoder = cmdBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
var p = ConcatMetalParam.init()
let odim = (0..<4).map { Int32(param.output.dim[$0]) }
p.odim = (odim[0], odim[1], odim[2], odim[3])
p.axis = Int32(4 - param.output.tensorDim.cout() + param.axis)
/// Initialises the output texture, derives the concat axis in four-dim space, fills `pm`,
/// and chooses the function name from output rank, input count, variant and precision.
required init(device: MTLDevice, param: ConcatParam<P>) {
param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: computePrecision)
let orank = param.output.tensorDim.cout()
let num = param.input.count
// `vdim` below has six slots, so at most six inputs are supported.
assert(num <= 6)
// Shift the axis so it indexes a four-slot shape, then map it through the transpose.
var axis = 4 - param.output.tensorDim.cout() + param.axis
for i in 0..<4 {
if Int32(param.transpose[i]) == p.axis {
p.axis = Int32(i)
if param.transpose[i] == axis {
axis = i
for i in 0..<istart {
p.offset += Int32(param.input[i+istart].dim[Int(p.axis)])
var vdim: [Int32] = []
for i in 0..<(iend - istart) {
encoder.setTexture(param.input[i+istart].metalTexture, index: i)
for i in (iend-istart)..<6 {
encoder.setTexture(param.input[0].metalTexture, index: i)
p.trans = (Int32(param.transpose[0]), Int32(param.transpose[1]), Int32(param.transpose[2]), Int32(param.transpose[3]))
p.vdim = (vdim[0], vdim[1], vdim[2], vdim[3], vdim[4], vdim[5])
encoder.setTexture(param.output.metalTexture, index: 6)
encoder.setTexture(param.output.metalTexture, index: 7)
encoder.setBytes(&p, length: MemoryLayout<ConcatMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam<P>) throws {
// Inputs are processed in groups of six, plus one call for any remainder.
let group = param.input.count / 6
let remain = param.input.count % 6
for i in 0..<group {
try self.encode(commandBuffer, param, 6 * i, 6 * (i + 1))
if remain > 0 {
try self.encode(commandBuffer, param, 6 * group, param.input.count)
func test(cmdBuffer: MTLCommandBuffer, param: ConcatTestParam) {
let group = param.input.count / 6
let remain = param.input.count % 6
for i in 0..<group {
self.encodeTest(cmdBuffer, param, 6 * i, 6 * (i + 1))
pm.axis = Int32(axis)
pm.odim = (Int32(param.output.dim[0]), Int32(param.output.dim[1]), Int32(param.output.dim[2]), Int32(param.output.dim[3]))
pm.trans = (Int32(param.output.transpose[0]), Int32(param.output.transpose[1]), Int32(param.output.transpose[2]), Int32(param.output.transpose[3]))
// Per-input extent along the concat axis; unused slots stay 0.
var vdim: [Int] = [0, 0, 0, 0, 0, 0]
for i in 0..<num {
vdim[i] = param.input[i].dim[axis]
if remain > 0 {
self.encodeTest(cmdBuffer, param, 6 * group, param.input.count)
// Variant selection by output rank and axis. The "z" variants (and "x" in the last branch)
// are chosen only when every input extent is a multiple of 4, and the extents are then divided by 4.
if orank == 4 {
if axis == 1 {
v = "y"
} else if axis == 2 {
v = "x"
} else {
if (param.output.dim[0] == 1) && axis == 3 {
var vz = true
for i in 0..<num {
if vdim[i] % 4 != 0 {
vz = false
if vz {
v = "z"
for i in 0..<num {
vdim[i] = vdim[i] / 4
} else if orank == 3 {
if axis == 2 {
v = "y"
} else if axis == 3 {
v = "x"
} else if axis == 1 {
var vz = true
for i in 0..<num {
if vdim[i] % 4 != 0 {
vz = false
if vz {
v = "z"
for i in 0..<num {
vdim[i] = vdim[i] / 4
} else {
if axis == 2 {
v = "y"
} else if axis == 3 {
var vx = true
for i in 0..<num {
if vdim[i] % 4 != 0 {
vx = false
if vx {
v = "x"
for i in 0..<num {
vdim[i] = vdim[i] / 4
required init(device: MTLDevice, param: ConcatParam<P>) {
param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: computePrecision)
pm.vdim = (Int32(vdim[0]), Int32(vdim[1]), Int32(vdim[2]), Int32(vdim[3]), Int32(vdim[4]), Int32(vdim[5]))
// Function name pattern: concat_<output rank>_<input count>_<variant>_<precision>.
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "concat")
super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_float")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "concat_half")
super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_half")
} else {
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
/// Kernel for the fused conv + add + prelu op. `init` prepares the buffers, picks the Metal
/// function from precision, filter shape and prelu mode, and precomputes `metalParam`.
class ConvAddPreluKernel<P: PrecisionType>: Kernel, Computable {
var metalParam: MetalConvParam!
/// Initialises the output texture with transpose [0, 2, 3, 1], creates the filter / y / alpha
/// buffers, selects the function name and builds the conv offsets, strides and dilations.
required init(device: MTLDevice, param: ConvAddPreluParam<P>) {
param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
param.filter.initBuffer(device: device, precision: computePrecision)
param.y.initBuffer(device: device, precision: computePrecision)
param.alpha.initBuffer(device: device, precision: computePrecision)
// Function-name selection. Branch order matters: 1x1 is tested first, then `filter.channel == 1`
// (which always maps to a depthwise 3x3 name, whatever the filter size), then 3x3, then the
// two 1-by-5 shapes. Any mode other than "channel" / "element" uses the "other" variant.
// The digits in the names follow height x width (width 1, height 5 -> "5x1").
if computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half")
} else {
super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half")
} else if param.filter.channel == 1 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half")
} else {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half")
} else if param.filter.width == 3 && param.filter.height == 3 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half")
} else {
super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half")
} else if param.filter.width == 1 && param.filter.height == 5 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half")
} else {
super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half")
} else if param.filter.width == 5 && param.filter.height == 1 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half")
} else {
super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half")
} else {
fatalError(" unsupport yet ")
// Float32 mirrors the Float16 table with a "_float" suffix.
} else if computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float")
} else {
super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float")
} else if param.filter.channel == 1 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float")
} else {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float")
} else if param.filter.width == 3 && param.filter.height == 3 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float")
} else {
super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float")
} else if param.filter.width == 1 && param.filter.height == 5 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float")
} else {
super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float")
} else if param.filter.width == 5 && param.filter.height == 1 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float")
} else {
super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float")
} else {
fatalError(" unsupport yet ")
} else {
// Offsets: half of the dilated kernel extent minus the padding, per axis.
// X uses index 0 of dilations/paddings with the filter width; Y uses index 1 with the height.
let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
// print(" function: \(functionName)")
// print("offset x: \(offsetX)")
// print("offset y: \(offsetY)")
let offsetZ = 0.0
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
// print("metal param: ")
// print(inMetalParam)
metalParam = inMetalParam
/// NOTE(review): every statement of this method is commented out in this view, so as shown
/// it encodes nothing — confirm whether live lines exist in the full file.
func compute(commandBuffer: MTLCommandBuffer, param: ConvAddPreluParam<P>) throws {
// guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
// throw PaddleMobileError.predictError(message: " encode is nil")
// }
// encoder.setTexture(param.input.metalTexture, index: 0)
// encoder.setTexture(param.output.metalTexture, index: 1)
// encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
// encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
// encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
// encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3)
// encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
// encoder.endEncoding()
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
/// Value passed by `FlattenKernel.compute` to the Metal function via `setBytes`.
/// `FlattenKernel.init` fills the dims right-aligned into four slots (leading slots padded with 1)
/// and copies the transposes from the input and output textures.
struct FlattenMetalParam {
// Input tensor dims, padded to four entries.
var idim: (Int32, Int32, Int32, Int32)
// Input texture transpose.
var itrans: (Int32, Int32, Int32, Int32)
// Output tensor dims, padded to four entries.
var odim: (Int32, Int32, Int32, Int32)
// Output texture transpose.
var otrans: (Int32, Int32, Int32, Int32)
/// Flatten kernel: reuses the `reshape_<irank>_2_*` Metal functions, passing the input and
/// output dims and transposes in a `FlattenMetalParam`.
class FlattenKernel<P: PrecisionType>: Kernel, Computable{
var metalParam: FlattenMetalParam
/// Initialises the output texture, builds `metalParam`, and selects the function by
/// input rank and precision. The output rank is asserted to be 2.
required init(device: MTLDevice, param: FlattenParam<P>) {
param.output.initTexture(device: device, computePrecision: computePrecision)
// Right-align the input dims into four slots; leading slots stay 1.
var id: [Int32] = [1, 1, 1, 1]
for i in 0..<param.input.tensorDim.cout() {
id[4-param.input.tensorDim.cout()+i] = Int32(param.input.tensorDim[i])
let it: [Int32] = param.input.transpose.map { Int32($0) }
// Same right-alignment for the output dims.
var od: [Int32] = [1, 1, 1, 1]
for i in 0..<param.output.tensorDim.cout() {
od[4-param.output.tensorDim.cout()+i] = Int32(param.output.tensorDim[i])
let ot: [Int32] = param.output.transpose.map { Int32($0) }
metalParam = FlattenMetalParam.init(
idim: (id[0], id[1], id[2], id[3]),
itrans: (it[0], it[1], it[2], it[3]),
odim: (od[0], od[1], od[2], od[3]),
otrans: (ot[0], ot[1], ot[2], ot[3])
let irank = param.input.tensorDim.cout()
let orank = param.output.tensorDim.cout()
// Only rank-2 outputs are supported; the function names below hard-code the "_2_".
assert(orank == 2)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "reshape_\(irank)_2_float")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "reshape_\(irank)_2_half")
} else {
/// Binds input (texture 0), output (texture 1) and `metalParam` (bytes at index 0),
/// then dispatches over the output texture.
/// - Throws: `PaddleMobileError.predictError` when no compute encoder can be created.
func compute(commandBuffer: MTLCommandBuffer, param: FlattenParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
// The length must describe the value actually passed, i.e. FlattenMetalParam (not ReshapeMetalParam).
encoder.setBytes(&metalParam, length: MemoryLayout<FlattenMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
......@@ -15,11 +15,41 @@
import Foundation
/// Multiclass-NMS kernel: dispatches one function over the scores texture and a second
/// pipeline (`pipline1`) over the bboxes texture, each with its own result buffer bound.
class MulticlassNMSKernel<P: PrecisionType>: Kernel, Computable{
// Second pipeline, used for the bbox pass in `compute`.
let pipline1: MTLComputePipelineState
/// Allocates both output buffers and selects the two functions by precision.
// NOTE(review): an unconditional "place_holder" `super.init` is visible next to the
// precision-specific ones; presumably it is the removed diff line — confirm.
required init(device: MTLDevice, param: MulticlassNMSParam<P>) {
super.init(device: device, inFunctionName: "place_holder")
param.middleOutput.initBuffer(device: device)
param.bboxOutput.initBuffer(device: device)
if computePrecision == .Float32 {
pipline1 = device.pipeLine(funcName: "nms_fetch_bbox", inPaddleMobileLib: true)
super.init(device: device, inFunctionName: "nms_fetch_result")
} else if computePrecision == .Float16 {
pipline1 = device.pipeLine(funcName: "nms_fetch_bbox_half", inPaddleMobileLib: true)
super.init(device: device, inFunctionName: "nms_fetch_result_half")
} else {
fatalError( " unsupport precision " )
/// Encodes two passes on the same command buffer.
/// - Throws: `PaddleMobileError.predictError` when either compute encoder cannot be created.
func compute(commandBuffer: MTLCommandBuffer, param: MulticlassNMSParam<P>) throws {
// Pass 1: scores texture with `middleOutput.resultBuffer` bound at buffer index 0.
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
encoder.setTexture(param.scores.metalTexture, index: 0)
encoder.setBuffer(param.middleOutput.resultBuffer!, offset: 0, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.scores.metalTexture)
// Pass 2: bboxes texture with `bboxOutput.resultBuffer` bound at buffer index 0, using `pipline1`.
guard let encoderBox = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
encoderBox.setTexture(param.bboxes.metalTexture, index: 0)
encoderBox.setBuffer(param.bboxOutput.resultBuffer!, offset: 0, index: 0)
encoderBox.dispatch(computePipline: pipline1, outTexture: param.bboxes.metalTexture)
......@@ -34,24 +34,44 @@ class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{
required init(device: MTLDevice, param: PriorBoxParam<P>) {
param.output.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
let originDim = param.output.tensorDim;
param.output.tensorDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]])
param.output.padToFourDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]])
param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: computePrecision)
param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "prior_box")
if param.min_max_aspect_ratios_order {
super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder")
} else {
super.init(device: device, inFunctionName: "prior_box")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "prior_box_half")
if param.min_max_aspect_ratios_order {
super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder_half")
} else {
super.init(device: device, inFunctionName: "prior_box_half")
} else {
let n = 1
let h = param.output.dim[1]
let w = param.output.dim[2]
let c = param.output.dim[3] * param.output.dim[0]
param.output.dim = Dim.init(inDim: [n, h, w, c])
param.output.transpose = [0, 1, 2, 3]
guard param.minSizes.count == 1 else {
fatalError(" need implement ")
// let n = 1
// let h = param.output.dim[1]
// let w = param.output.dim[2]
// let c = param.output.dim[3] * param.output.dim[0]
// param.output.dim = Dim.init(inDim: [n, h, w, c])
// param.output.transpose = [0, 1, 2, 3]
let imageWidth = Float32(param.inputImage.padToFourDim[3])
let imageHeight = Float32(param.inputImage.padToFourDim[2])
......@@ -49,10 +49,12 @@ class ReshapeKernel<P: PrecisionType>: Kernel, Computable{
odim: (od[0], od[1], od[2], od[3]),
otrans: (ot[0], ot[1], ot[2], ot[3])
let irank = param.input.tensorDim.cout()
let orank = param.output.tensorDim.cout()
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "reshape")
super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_float")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "reshape_half")
super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_half")
} else {
......@@ -72,7 +74,7 @@ class ReshapeKernel<P: PrecisionType>: Kernel, Computable{
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
......@@ -81,15 +83,15 @@ class ReshapeKernel<P: PrecisionType>: Kernel, Computable{
func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
encoder.setTexture(testParam.inputTexture, index: 0)
encoder.setTexture(testParam.outputTexture, index: 1)
var pm: ReshapeMetalParam = testParam.param
encoder.setBytes(&pm, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture)
// func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) {
// guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
// fatalError()
// }
// encoder.setTexture(testParam.inputTexture, index: 0)
// encoder.setTexture(testParam.outputTexture, index: 1)
// var pm: ReshapeMetalParam = testParam.param
// encoder.setBytes(&pm, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
// encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture)
// encoder.endEncoding()
// }
......@@ -19,19 +19,20 @@ struct ShapeMetalParam {
/// Shape kernel.
// NOTE(review): `compute` appears both as live encoder lines and as a commented-out copy, and
// `init` shows "split" and "shape" function names side by side; presumably old/new diff lines
// shown together — confirm which are live.
class ShapeKernel<P: PrecisionType>: Kernel, Computable{
func compute(commandBuffer: MTLCommandBuffer, param: ShapeParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
encoder.setTexture(param.output.metalTexture, index: 0)
// print("shape compute")
// guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
// throw PaddleMobileError.predictError(message: " encode is nil")
// }
// encoder.setTexture(param.output.metalTexture, index: 0)
// encoder.endEncoding()
/// Initialises the output texture and selects the function name by precision.
required init(device: MTLDevice, param: ShapeParam<P>) {
param.output.initTexture(device: device, computePrecision: computePrecision)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "split")
super.init(device: device, inFunctionName: "shape")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "split_half")
super.init(device: device, inFunctionName: "shape_half")
} else {
......@@ -29,7 +29,7 @@ class SoftmaxKernel<P: PrecisionType>: Kernel, Computable{
K: Int32(param.input.tensorDim[1])
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "softmax")
super.init(device: device, inFunctionName: "softmax_float")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "softmax_half")
} else {
......@@ -15,23 +15,76 @@
import Foundation
/// Parameter block for the split Metal kernels.
/// `SplitKernel` fills it in during `init` and uploads it with `setBytes` at buffer index 0.
struct SplitMetalParam {
// Four extents of the input, copied from `param.input.dim[0...3]` by `SplitKernel.init`.
var idim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1)
// Split axis. `SplitKernel.init` first offsets it by (dim count - tensorDim count) and then
// replaces it with the position at which the input's transpose holds that axis.
var axis: Int32 = 0
// Not assigned anywhere in the visible Swift code, so it keeps this default unless set elsewhere.
var offset: Int32 = 0
// The input's transpose permutation; identity by default.
var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
// Extent of each output along the split axis; slots beyond the number of outputs stay 0.
var vdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
class SplitKernel<P: PrecisionType>: Kernel, Computable{
var smp: SplitMetalParam
func compute(commandBuffer: MTLCommandBuffer, param: SplitParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
encoder.setTexture(param.output.metalTexture, index: 0)
encoder.setTexture(param.input.metalTexture, index: 0)
for i in 0..<param.outputList.count {
encoder.setTexture(param.outputList[i].metalTexture, index: i + 1)
encoder.setBytes(&smp, length: MemoryLayout<SplitMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture)
required init(device: MTLDevice, param: SplitParam<P>) {
param.output.initTexture(device: device, computePrecision: computePrecision)
// param.output.initTexture(device: device, computePrecision: computePrecision)
let num = param.outputList.count
let rank = param.input.tensorDim.cout()
assert(num >= 2 && num <= 4)
for output in param.outputList {
output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
smp = SplitMetalParam.init()
smp.idim = (Int32(param.input.dim[0]), Int32(param.input.dim[1]), Int32(param.input.dim[2]), Int32(param.input.dim[3]))
smp.axis = Int32(param.axis + param.input.dim.cout() - param.input.tensorDim.cout())
for i in 0..<4 {
if param.input.transpose[i] == smp.axis {
smp.axis = Int32(i)
smp.trans = (Int32(param.input.transpose[0]), Int32(param.input.transpose[1]), Int32(param.input.transpose[2]), Int32(param.input.transpose[3]))
var vdim: [Int32] = [0, 0, 0, 0]
for i in 0..<num {
vdim[i] = Int32(param.outputList[i].tensorDim[param.axis])
smp.vdim = (vdim[0], vdim[1], vdim[2], vdim[3])
var v = "normal"
if rank == 4 {
if smp.axis == 1 {
v = "y"
} else if smp.axis == 2 {
v = "x"
} else if rank == 3 {
if smp.axis == 2 {
v = "y"
} else if smp.axis == 3 {
v = "x"
} else if rank == 2 {
if smp.axis == 2 {
v = "y"
if v == "normal" {
fatalError("split unsupported")
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "split")
super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)_float")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "split_half")
super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)_half")
} else {
......@@ -17,73 +17,52 @@ import Foundation
struct TransposeMetalParam {
var iC: Int32 = 0
var oC: Int32 = 0
var i0: Int32
var i1: Int32
var i2: Int32
var i3: Int32
init(_ i0: Int32, _ i1: Int32, _ i2: Int32, _ i3: Int32) {
self.i0 = i0
self.i1 = i1
self.i2 = i2
self.i3 = i3
init(_ axis: [Int]) {
self.init(Int32(axis[0]), Int32(axis[1]), Int32(axis[2]), Int32(axis[3]))
struct TransposeTestParam: TestParam {
let inputTexture: MTLTexture
let outputTexture: MTLTexture
let iC: Int
let oC: Int
let axis: [Int]
var axis: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
class TransposeKernel<P: PrecisionType>: Kernel, Computable, Testable {
class TransposeKernel<P: PrecisionType>: Kernel, Computable {
var metalParam: TransposeMetalParam = TransposeMetalParam.init()
required init(device: MTLDevice, param: TransposeParam<P>) {
param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: computePrecision)
if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "transpose_half")
} else if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "transpose")
} else {
var invT: [Int] = [0, 1, 2, 3]
for (i, v) in param.input.transpose.enumerated() {
invT[v] = i
param.output.initTexture(device: device, computePrecision: computePrecision)
let rank = param.input.tensorDim.cout()
var axis: [Int] = [0, 1, 2, 3]
for i in 0..<param.axis.count {
axis[4-param.axis.count+i] = 4 - param.axis.count + Int(param.axis[i])
axis[4-rank+i] = 4 - rank + Int(param.axis[i])
let realAxis = axis.map {invT[$0]}
var tmp = TransposeMetalParam.init(realAxis)
tmp.iC = Int32(param.input.dim[param.input.transpose[3]])
tmp.oC = Int32(param.output.dim[3])
if realAxis == [0, 1, 2, 3] {
// print("====> transpose! FAST :)")
} else {
// print("====> transpose! SLOW :(")
var naxis: [Int] = [0, 0, 0, 0]
for i in 0..<4 {
for j in 0..<4 {
if param.input.transpose[j] == axis[i] {
naxis[i] = j
metalParam = tmp
required init(device: MTLDevice, testParam: TransposeTestParam) {
metalParam.iC = Int32(param.input.dim[param.input.transpose[3]])
metalParam.oC = Int32(param.output.dim[3])
metalParam.axis = (Int32(naxis[0]), Int32(naxis[1]), Int32(naxis[2]), Int32(naxis[3]))
var kernelFunc = "transpose_undefined"
if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "transpose_half")
if param.input.transpose == axis {
kernelFunc = "transpose_copy_half"
} else {
kernelFunc = "transpose_\(rank)_half"
} else if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "transpose")
if param.input.transpose == axis {
kernelFunc = "transpose_copy_float"
} else {
kernelFunc = "transpose_\(rank)_float"
} else {
print("===========>", kernelFunc)
super.init(device: device, inFunctionName: kernelFunc)
var metalParam: TransposeMetalParam!
func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
......@@ -95,20 +74,6 @@ class TransposeKernel<P: PrecisionType>: Kernel, Computable, Testable {
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
public func test(commandBuffer: MTLCommandBuffer, param: TransposeTestParam) {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
encoder.setTexture(param.inputTexture, index: 0)
encoder.setTexture(param.outputTexture, index: 1)
var tmp = TransposeMetalParam.init(param.axis)
tmp.iC = Int32(param.iC)
tmp.oC = Int32(param.oC)
encoder.setBytes(&tmp, length: MemoryLayout<TransposeMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
......@@ -15,28 +15,28 @@
#include <metal_stdlib>
using namespace metal;
kernel void batchnorm_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
const device half4 * newScale [[buffer(0)]],
const device half4 * newBias [[buffer(1)]],
kernel void batchnorm(texture2d_array<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
const device float4 * nscale [[buffer(0)]],
const device float4 * nbias [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
const half4 input = inTexture.read(gid.xy, gid.z);
half4 output = input * newScale[gid.z] + newBias[gid.z];
const float4 input = inTexture.read(gid.xy, gid.z);
float4 output = input * nscale[gid.z] + nbias[gid.z];
outTexture.write(output, gid.xy, gid.z);
kernel void batchnorm(texture2d_array<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
const device float4 * newScale [[buffer(0)]],
const device float4 * newBias [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
kernel void batchnorm_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
const device half4 * newScale [[buffer(0)]],
const device half4 * newBias [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
const float4 input = inTexture.read(gid.xy, gid.z);
float4 output = input * newScale[gid.z] + newBias[gid.z];
const half4 input = inTexture.read(gid.xy, gid.z);
half4 output = input * newScale[gid.z] + newBias[gid.z];
outTexture.write(output, gid.xy, gid.z);
// Precision-generic body of the bilinear interpolation kernel.
// The including file defines P (float or half) before each #include of this fragment,
// so one kernel is emitted per precision.
#ifdef P
// Token-pasting helpers: FUNC(bilinear_interp, P) expands to bilinear_interp_<P>
// (for example bilinear_interp_float), VECTOR(P, 4) expands to <P>4 (for example float4).
// FUNC and VECTOR add a level of indirection so that P is expanded before it is pasted.
#define CONCAT2(a, b) a ## b
#define CONCAT2_(a, b) a ## _ ## b
#define FUNC(f, p) CONCAT2_(f, p)
#define VECTOR(p, n) CONCAT2(p, n)
// Each thread computes one output texel at gid (x, y, array slice).
kernel void FUNC(bilinear_interp, P)(texture2d_array<P, access::read> input [[texture(0)]],
texture2d_array<P, access::write> output [[texture(1)]],
constant bilinear_interp_param & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
VECTOR(P, 4) r;
// Same width and height on both textures: plain copy, no interpolation.
if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
r = input.read(gid.xy, gid.z);
} else {
// Source position for this output texel.
// NOTE(review): w and h are held in P; with P == half the fractional precision of these
// coordinates presumably shrinks as they grow — confirm this is acceptable.
P w = gid.x * pm.ratio_w;
P h = gid.y * pm.ratio_h;
// Integer part of the source position and the next texel in each direction.
uint w0 = w, h0 = h;
uint w1 = w0 + 1, h1 = h0 + 1;
// Fractional parts are the weights of the far neighbours; their complements weight the near ones.
P w1lambda = w - w0, h1lambda = h - h0;
P w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
// Clamp the far neighbours so they never index past the last column / row of the input.
if (w1 >= input.get_width()) w1 = w0;
if (h1 >= input.get_height()) h1 = h0;
// The four surrounding texels from the same array slice.
VECTOR(P, 4) r0 = input.read(uint2(w0, h0), gid.z);
VECTOR(P, 4) r1 = input.read(uint2(w1, h0), gid.z);
VECTOR(P, 4) r2 = input.read(uint2(w0, h1), gid.z);
VECTOR(P, 4) r3 = input.read(uint2(w1, h1), gid.z);
// Blend horizontally within each row, then vertically between the two rows.
r = h2lambda * (w2lambda * r0 + w1lambda * r1)
+ h1lambda * (w2lambda * r2 + w1lambda * r3);
// NOTE(review): no check of gid against the output size is visible here, unlike the
// batchnorm kernels in this project — confirm the dispatch never exceeds the texture.
output.write(r, gid.xy, gid.z);
......@@ -16,60 +16,14 @@
using namespace metal;
// Parameters for the bilinear_interp kernels, bound at buffer(0).
struct bilinear_interp_param {
// Explicit output sizes are disabled; the kernels query the textures for their sizes instead.
// int32_t out_h;
// int32_t out_w;
// Factor the kernels multiply gid.y by to obtain the source row position.
float ratio_h;
// Factor the kernels multiply gid.x by to obtain the source column position.
float ratio_w;
kernel void bilinear_interp(texture2d_array<float, access::read> input [[texture(0)]],
texture2d_array<float, access::write> output [[texture(2)]],
constant bilinear_interp_param & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
float4 r;
if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
r = input.read(gid.xy, gid.z);
} else {
float w = gid.x * pm.ratio_w;
float h = gid.y * pm.ratio_h;
uint w0 = w, h0 = h;
uint w1 = w0 + 1, h1 = h0 + 1;
float w1lambda = w - w0, h1lambda = h - h0;
float w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
if (w1 >= input.get_width()) w1 = w0;
if (h1 >= input.get_height()) h1 = h0;
float4 r0 = input.read(uint2(w0, h0), gid.z);
float4 r1 = input.read(uint2(w1, h0), gid.z);
float4 r2 = input.read(uint2(w0, h1), gid.z);
float4 r3 = input.read(uint2(w1, h1), gid.z);
r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);
output.write(r, gid.xy, gid.z);
#define P float
#include "BilinearInterp.inc.metal"
#undef P
kernel void bilinear_interp_half(texture2d_array<half, access::read> input [[texture(0)]],
texture2d_array<half, access::write> output [[texture(2)]],
constant bilinear_interp_param & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
half4 r;
if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
r = input.read(gid.xy, gid.z);
} else {
half w = gid.x * pm.ratio_w;
half h = gid.y * pm.ratio_h;
uint w0 = w, h0 = h;
uint w1 = w0 + 1, h1 = h0 + 1;
half w1lambda = w - w0, h1lambda = h - h0;
half w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
if (w1 >= input.get_width()) w1 = w0;
if (h1 >= input.get_height()) h1 = h0;
half4 r0 = input.read(uint2(w0, h0), gid.z);
half4 r1 = input.read(uint2(w1, h0), gid.z);
half4 r2 = input.read(uint2(w0, h1), gid.z);
half4 r3 = input.read(uint2(w1, h1), gid.z);
r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);
output.write(r, gid.xy, gid.z);
output.write(r, gid.xy, gid.z);
#define P half
#include "BilinearInterp.inc.metal"
#undef P
// Precision-generic body of the box decoding kernel.
// The including file defines P (float or half) before each #include of this fragment.
#ifdef P
// Token-pasting helpers: FUNC(boxcoder, P) expands to boxcoder_<P>,
// VECTOR(P, 4) expands to <P>4. The extra macro level lets P expand before pasting.
#define CONCAT2(a, b) a ## b
#define CONCAT2_(a, b) a ## _ ## b
#define FUNC(f, p) CONCAT2_(f, p)
#define VECTOR(p, n) CONCAT2(p, n)
kernel void FUNC(boxcoder, P)(texture2d_array<P, access::read> priorBox [[texture(0)]],
texture2d_array<P, access::read> priorBoxVar [[texture(1)]],
texture2d_array<P, access::read> targetBox [[texture(2)]],
texture2d_array<P, access::write> output[[texture(3)]],
uint3 gid [[thread_position_in_grid]]) {
// Prior box and its variances are read from column 0 of row gid.x.
VECTOR(P, 4) p = priorBox.read(uint2(0, gid.x), gid.z);
VECTOR(P, 4) pv = priorBoxVar.read(uint2(0, gid.x), gid.z);
// The target is assembled from component 0 of four texels (columns 0..3) of row gid.x.
VECTOR(P, 4) t;
t[0] = targetBox.read(uint2(0, gid.x), gid.z)[0];
t[1] = targetBox.read(uint2(1, gid.x), gid.z)[0];
t[2] = targetBox.read(uint2(2, gid.x), gid.z)[0];
t[3] = targetBox.read(uint2(3, gid.x), gid.z)[0];
// Prior box centre (px, py) and size (pw, ph); p is treated as (x, y, z, w) corner values.
P px = (p.x + p.z) / 2;
P py = (p.y + p.w) / 2;
P pw = p.z - p.x;
P ph = p.w - p.y;
// Decoded centre: target offset scaled by the variance and the prior size, shifted by the prior centre.
P tx = pv.x * t.x * pw + px;
P ty = pv.y * t.y * ph + py;
// Decoded size: exponential of the variance-scaled target, times the prior size.
P tw = exp(pv.z * t.z) * pw;
P th = exp(pv.w * t.w) * ph;
// Convert centre/size back to two corners.
VECTOR(P, 4) r;
r.x = tx - tw / 2;
r.y = ty - th / 2;
r.z = tx + tw / 2;
r.w = ty + th / 2;
// NOTE(review): the inputs are indexed by gid.x as a row, while the result is written at gid.xy —
// confirm the intended output layout and dispatch size.
output.write(r, gid.xy, gid.z);
......@@ -15,58 +15,9 @@
#include <metal_stdlib>
using namespace metal;
kernel void boxcoder(texture2d_array<float, access::read> priorBox [[texture(0)]],
texture2d_array<float, access::read> priorBoxVar [[texture(1)]],
texture2d_array<float, access::read> targetBox [[texture(2)]],
texture2d_array<float, access::write> output[[texture(3)]],
uint3 gid [[thread_position_in_grid]]) {
float4 t = targetBox.read(gid.xy, gid.z);
float4 p = priorBox.read(gid.xy, gid.z);
float4 pv = priorBoxVar.read(gid.xy, gid.z);
float px = (p.x + p.z) / 2;
float py = (p.y + p.w) / 2;
float pw = p.z - p.x;
float ph = p.w - p.y;
float tx = pv.x * t.x * pw + px;
float ty = pv.y * t.y * ph + py;
float tw = exp(pv.z * t.z) * pw;
float th = exp(pv.w * t.w) * ph;
float4 r;
r.x = tx - tw / 2;
r.y = ty - th / 2;
r.z = tx + tw / 2;
r.w = ty + th / 2;
output.write(r, gid.xy, gid.z);
kernel void boxcoder_half(texture2d_array<half, access::read> priorBox [[texture(0)]],
texture2d_array<half, access::read> priorBoxVar [[texture(1)]],
texture2d_array<half, access::read> targetBox [[texture(2)]],
texture2d_array<half, access::write> output[[texture(3)]],
uint3 gid [[thread_position_in_grid]]) {
half4 t = targetBox.read(gid.xy, gid.z);
half4 p = priorBox.read(gid.xy, gid.z);
half4 pv = priorBoxVar.read(gid.xy, gid.z);
float px = (float(p.x) + float(p.z)) / 2;
float py = (float(p.y) + float(p.w)) / 2;
float pw = float(p.z) - float(p.x);
float ph = float(p.w) - float(p.y);
float tx = float(pv.x) * float(t.x) * pw + px;
float ty = float(pv.y) * float(t.y) * ph + py;
float tw = exp(float(pv.z) * float(t.z)) * pw;
float th = exp(float(pv.w) * float(t.w)) * ph;
float4 r;
r.x = tx - tw / 2;
r.y = ty - th / 2;
r.z = tx + tw / 2;
r.w = ty + th / 2;
output.write(half4(r), gid.xy, gid.z);
#define P float
#include "BoxCoder.inc.metal"
#undef P
#define P half
#include "BoxCoder.inc.metal"
#undef P
......@@ -15,6 +15,55 @@
#include <metal_stdlib>
using namespace metal;
// Rank-1 mapping from xyzn to abcd: only abcd[3] carries data, formed as xyzn[0] * 4 + xyzn[3];
// abcd[0..2] are zeroed. xyzn[1] and xyzn[2] are not read.
// (xyzn is presumably x, y, array slice and component index, as the concat kernels build it — confirm.)
inline void xyzn2abcd_1(int xyzn[4], int abcd[4]) {
abcd[0] = abcd[1] = abcd[2] = 0;
abcd[3] = xyzn[0] * 4 + xyzn[3];
// Rank-2 mapping from xyzn to abcd: abcd[2] = xyzn[1], abcd[3] = xyzn[0] * 4 + xyzn[3];
// abcd[0] and abcd[1] are zeroed. xyzn[2] is not read.
inline void xyzn2abcd_2(int xyzn[4], int abcd[4]) {
abcd[0] = abcd[1] = 0;
abcd[2] = xyzn[1];
abcd[3] = xyzn[0] * 4 + xyzn[3];
// Rank-3 mapping from xyzn to abcd: abcd[3] = xyzn[0], abcd[2] = xyzn[1],
// abcd[1] = xyzn[2] * 4 + xyzn[3]; abcd[0] is zeroed.
inline void xyzn2abcd_3(int xyzn[4], int abcd[4]) {
abcd[0] = 0;
abcd[3] = xyzn[0];
abcd[2] = xyzn[1];
abcd[1] = xyzn[2] * 4 + xyzn[3];
// Rank-4 mapping from xyzn to abcd. C is the extent of the last abcd axis.
// abcd[2] = xyzn[0], abcd[1] = xyzn[1]; the flattened value xyzn[2] * 4 + xyzn[3]
// is split into abcd[0] (quotient by C) and abcd[3] (remainder by C).
inline void xyzn2abcd_4(int C, int xyzn[4], int abcd[4]) {
abcd[2] = xyzn[0];
abcd[1] = xyzn[1];
// t is unsigned, so the division and remainder below are carried out in unsigned arithmetic.
uint t = xyzn[2] * 4 + xyzn[3];
abcd[0] = t / C;
abcd[3] = t % C;
// Rank-1 mapping from abcd back to xyzn; inverse of xyzn2abcd_1.
// abcd[3] is split into xyzn[0] (quotient by 4) and xyzn[3] (remainder by 4);
// xyzn[1] and xyzn[2] are zeroed. abcd[0..2] are not read.
//
// Fix: the remainder was previously stored into xyzn[1], which overwrote the zero written
// just above and left xyzn[3] unset. It now goes to xyzn[3], matching abcd2xyzn_2 and
// the forward mapping abcd[3] = xyzn[0] * 4 + xyzn[3].
inline void abcd2xyzn_1(int abcd[4], int xyzn[4]) {
    xyzn[1] = xyzn[2] = 0;
    xyzn[0] = abcd[3] / 4;
    xyzn[3] = abcd[3] % 4;
}
// Rank-2 mapping from abcd back to xyzn; inverse of xyzn2abcd_2.
// xyzn[1] = abcd[2]; abcd[3] is split into xyzn[0] (quotient by 4) and xyzn[3] (remainder by 4);
// xyzn[2] is zeroed.
inline void abcd2xyzn_2(int abcd[4], int xyzn[4]) {
xyzn[2] = 0;
xyzn[1] = abcd[2];
xyzn[0] = abcd[3] / 4;
xyzn[3] = abcd[3] % 4;
// Rank-3 mapping from abcd back to xyzn; inverse of xyzn2abcd_3.
// xyzn[0] = abcd[3], xyzn[1] = abcd[2]; abcd[1] is split into xyzn[2] (quotient by 4)
// and xyzn[3] (remainder by 4). abcd[0] is not read.
inline void abcd2xyzn_3(int abcd[4], int xyzn[4]) {
xyzn[0] = abcd[3];
xyzn[1] = abcd[2];
xyzn[2] = abcd[1] / 4;
xyzn[3] = abcd[1] % 4;
// Rank-4 mapping from abcd back to xyzn; inverse of xyzn2abcd_4. C is the extent of the last abcd axis.
// xyzn[0] = abcd[2], xyzn[1] = abcd[1]; the flattened value abcd[0] * C + abcd[3]
// is split into xyzn[2] (quotient by 4) and xyzn[3] (remainder by 4).
inline void abcd2xyzn_4(int C, int abcd[4], int xyzn[4]) {
xyzn[0] = abcd[2];
xyzn[1] = abcd[1];
// t is unsigned, so the division and remainder below are carried out in unsigned arithmetic.
uint t = abcd[0] * C + abcd[3];
xyzn[2] = t / 4;
xyzn[3] = t % 4;
inline void xyzn2abcd(int C, int xyzn[4], int abcd[4]) {
abcd[2] = xyzn[0];
abcd[1] = xyzn[1];
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
// Parameters for the concat kernels, bound at buffer(0).
struct ConcatParam {
// Extents of the output; the kernels pass odim[3] as C to the index helpers and
// temporarily overwrite odim[axis] on a local copy while addressing an input.
int32_t odim[4];
// Index (0..3) of the abcd axis along which inputs are concatenated.
int32_t axis;
// Position along the axis where in0 begins; positions before it are copied from inx.
int32_t offset;
// Not read by the concat kernels visible in this file.
int32_t trans[4];
// Extent along the axis of each of the up to six inputs in0..in5.
int32_t vdim[6];
// Concatenation kernel, float precision. Each thread produces the texel of `out` at gid.
// For each of the four components it converts (gid, component) into an abcd index, then either
// copies the component from `inx` (position along the axis lies before pm.offset) or selects
// one of in0..in5 through vdim and reads the corresponding element from it.
// NOTE(review): in this text no loop exit and no closing braces appear between the `if` inside
// the search loop and `k -= ...`; lines seem to be missing, so confirm the control flow
// against the full source before relying on the comments below.
kernel void concat(texture2d_array<float, access::read> in0 [[texture(0)]],
texture2d_array<float, access::read> in1 [[texture(1)]],
texture2d_array<float, access::read> in2 [[texture(2)]],
texture2d_array<float, access::read> in3 [[texture(3)]],
texture2d_array<float, access::read> in4 [[texture(4)]],
texture2d_array<float, access::read> in5 [[texture(5)]],
texture2d_array<float, access::read> inx [[texture(6)]],
texture2d_array<float, access::write> out [[texture(7)]],
constant ConcatParam & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
// Local copy: odim[axis] is modified and restored below, which the constant buffer does not allow.
ConcatParam cp = pm;
// xyzn = (x, y, array slice, component); abcd and oxyzn are scratch index arrays.
int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4];
float4 r;
for (int i = 0; i < 4; i++) {
xyzn[3] = i;
xyzn2abcd(cp.odim[3], xyzn, abcd);
// Position along the concat axis relative to where in0 starts.
int k = abcd[cp.axis] - cp.offset;
int j = 0;
if (k < 0) {
// Before the first input: take the value from inx at the same location.
r[i] = inx.read(gid.xy, gid.z)[i];
} else {
// Walk the inputs, comparing k with each input's extent along the axis.
for (; j < 6; j++) {
if (k < cp.vdim[j]) {
k -= cp.vdim[j];
// Address input j: substitute its own extent for odim[axis] while converting
// the index back to texture coordinates, then restore the output extent.
int ta = cp.odim[cp.axis];
abcd[cp.axis] = k;
cp.odim[cp.axis] = cp.vdim[j];
abcd2xyzn(cp.odim[3], abcd, oxyzn);
cp.odim[cp.axis] = ta;
// Textures cannot be indexed dynamically here, hence one case per input.
switch (j) {
case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
out.write(r, gid.xy, gid.z);
// Concatenation kernel, half precision; same steps as `concat` with half textures.
// Each thread produces the texel of `out` at gid: per component it converts (gid, component)
// into an abcd index, then either copies from `inx` (position along the axis lies before
// pm.offset) or selects one of in0..in5 through vdim and reads the corresponding element.
// NOTE(review): in this text no loop exit and no closing braces appear between the `if` inside
// the search loop and `k -= ...`; lines seem to be missing, so confirm the control flow
// against the full source before relying on the comments below.
kernel void concat_half(texture2d_array<half, access::read> in0 [[texture(0)]],
texture2d_array<half, access::read> in1 [[texture(1)]],
texture2d_array<half, access::read> in2 [[texture(2)]],
texture2d_array<half, access::read> in3 [[texture(3)]],
texture2d_array<half, access::read> in4 [[texture(4)]],
texture2d_array<half, access::read> in5 [[texture(5)]],
texture2d_array<half, access::read> inx [[texture(6)]],
texture2d_array<half, access::write> out [[texture(7)]],
constant ConcatParam & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
// Local copy: odim[axis] is modified and restored below, which the constant buffer does not allow.
ConcatParam cp = pm;
// xyzn = (x, y, array slice, component); abcd and oxyzn are scratch index arrays.
int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4];
half4 r;
for (int i = 0; i < 4; i++) {
xyzn[3] = i;
xyzn2abcd(cp.odim[3], xyzn, abcd);
// Position along the concat axis relative to where in0 starts.
int k = abcd[cp.axis] - cp.offset;
int j = 0;
if (k < 0) {
// Before the first input: take the value from inx at the same location.
r[i] = inx.read(gid.xy, gid.z)[i];
} else {
// Walk the inputs, comparing k with each input's extent along the axis.
for (; j < 6; j++) {
if (k < cp.vdim[j]) {
k -= cp.vdim[j];
// Address input j: substitute its own extent for odim[axis] while converting
// the index back to texture coordinates, then restore the output extent.
int ta = cp.odim[cp.axis];
abcd[cp.axis] = k;
cp.odim[cp.axis] = cp.vdim[j];
abcd2xyzn(cp.odim[3], abcd, oxyzn);
cp.odim[cp.axis] = ta;
// Textures cannot be indexed dynamically here, hence one case per input.
switch (j) {
case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
out.write(r, gid.xy, gid.z);
......@@ -17,14 +17,15 @@
using namespace metal;
kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
kernel void conv_add_batch_norm_relu_1x1_half(
texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device half4 *new_scale [[buffer(3)]],
const device half4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
......@@ -41,7 +42,7 @@ kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::samp
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
half4 output = half4(0.0);
float4 output = float4(0.0);
half4 input;
for (uint i = 0; i < input_arr_size; ++i) {
......@@ -58,19 +59,19 @@ kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::samp
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
outTexture.write(output, gid.xy, gid.z);
output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
outTexture.write(half4(output), gid.xy, gid.z);
kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
kernel void conv_add_batch_norm_relu_3x3_half(
texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device half4 *new_scale [[buffer(3)]],
const device half4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
......@@ -86,7 +87,7 @@ kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::samp
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
half4 output = half4(0.0);
float4 output = float4(0.0);
half4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
......@@ -113,19 +114,19 @@ kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::samp
output.w += dot(input[j], weight_w);
output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
outTexture.write(output, gid.xy, gid.z);
output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
outTexture.write(half4(output), gid.xy, gid.z);
kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
kernel void depthwise_conv_add_batch_norm_relu_3x3_half(
texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device half4 *new_scale [[buffer(3)]],
const device half4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
......@@ -138,7 +139,7 @@ kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array<half, ac
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
half4 output = half4(0.0);
float4 output = float4(0.0);
half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
......@@ -156,11 +157,12 @@ kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array<half, ac
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
outTexture.write(output, gid.xy, gid.z);
output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
outTexture.write(half4(output), gid.xy, gid.z);
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
想要评论请 注册