From 005115a1957690e935f3b14c19ac1035ca28219e Mon Sep 17 00:00:00 2001 From: liuruilong Date: Tue, 5 Mar 2019 21:03:47 +0800 Subject: [PATCH] format files, improve accuracy --- .../MobileNetDemo/AppDelegate.swift | 64 +- .../MobileNetDemo/MobileNet.swift | 82 +- .../MobileNetDemo/MobilenetPreProcess.metal | 28 +- .../MobileNetDemo/ViewController.swift | 132 +- .../paddle-mobile-demo/AppDelegate.swift | 18 +- .../Base.lproj/Main.storyboard | 2 +- .../paddle-mobile-demo/MetalHelper.swift | 20 +- .../MultiPredictViewController.swift | 92 +- .../paddle-mobile-demo/Net/CPUCompute.mm | 434 +++---- .../paddle-mobile-demo/Net/Genet.swift | 64 +- .../paddle-mobile-demo/Net/MobileNet.swift | 86 +- .../Net/MobileNetCombined.swift | 28 +- .../paddle-mobile-demo/Net/MobileNetSSD.swift | 158 +-- .../Net/MobilenetSSD_AR.swift | 256 ++-- .../Net/PreProcessKernel.metal | 80 +- .../paddle-mobile-demo/Net/YoloNet.swift | 28 +- .../OCDemo/LoadPointerViewController.m | 136 +- .../OCInterface/PaddleMobileGPU.h | 4 +- .../OCInterface/PaddleMobileGPU.m | 84 +- .../OCInterface/SuperResolutionNet.swift | 96 +- .../VideoCapture/FPSCounter.swift | 42 +- .../VideoCapture/VideoCapture.swift | 112 +- .../paddle-mobile-demo/ViewController.swift | 411 +++--- .../project.pbxproj | 6 +- .../BatchNormKernel.metal | 32 +- .../BatchNormRelu.metal | 10 +- .../BilinearInterp.inc.metal | 46 +- .../BilinearInterp.metal | 4 +- .../paddle-mobile-metallib/BoxCoder.inc.metal | 58 +- .../BufferToTexture.metal | 28 +- .../paddle-mobile-metallib/Common.metal | 124 +- .../ConcatKernel.inc.metal | 328 ++--- .../paddle-mobile-metallib/ConcatKernel.metal | 190 +-- .../ConvAddBNReluKernel.metal | 476 +++---- .../paddle-mobile-metallib/ConvAddMetal.metal | 1040 +++++++-------- .../ConvAddPrelu.inc.metal | 692 +++++----- .../ConvAddPreluKernel.metal | 60 +- .../ConvBNReluKernel.metal | 464 +++---- .../paddle-mobile-metallib/ConvKernel.metal | 440 +++---- .../ConvTransposeKernel.metal | 158 +-- .../paddle-mobile-metallib/Elementwise.metal | 134 +- .../ElementwiseAddPreluKernel.inc.metal | 104 +- .../ElementwiseAddPreluKernel.metal | 14 +- .../FetchKernel.inc.metal | 58 +- .../paddle-mobile-metallib/FetchKernel.metal | 2 +- .../paddle-mobile-metallib/Kernels.metal | 52 +- .../NMSFetchResultKernel.metal | 100 +- .../PoolKernel.inc.metal | 58 +- .../paddle-mobile-metallib/PoolKernel.metal | 14 +- .../paddle-mobile-metallib/PreluKernel.metal | 222 ++-- .../PriorBoxKernel.metal | 584 ++++----- .../paddle-mobile-metallib/ReluKernel.metal | 36 +- .../ReshapeKernel.inc.metal | 60 +- .../ReshapeKernel.metal | 8 +- .../ResizeBilinear.metal | 100 +- .../paddle-mobile-metallib/Scale.metal | 28 +- .../paddle-mobile-metallib/Softmax.inc.metal | 66 +- .../paddle-mobile-metallib/Softmax.metal | 4 +- .../paddle-mobile-metallib/Split.inc.metal | 98 +- .../paddle-mobile-metallib/Split.metal | 50 +- .../TransposeKernel.inc.metal | 42 +- .../TransposeKernel.metal | 58 +- .../paddle-mobile-unit-test/AppDelegate.swift | 24 +- .../ViewController.swift | 12 +- .../paddle-mobile.xcodeproj/project.pbxproj | 4 +- .../paddle-mobile/API/GlobalConfig.swift | 30 +- .../paddle-mobile/paddle-mobile/API/Net.swift | 136 +- .../paddle-mobile/API/Runner.swift | 322 ++--- .../paddle-mobile/Src/Common/Extensions.swift | 156 +-- .../Src/Common/MetalExtension.swift | 1122 ++++++++--------- .../Src/Common/PaddleMobileUnitTest.swift | 376 +++--- .../paddle-mobile/Src/Common/Types.swift | 414 +++--- .../paddle-mobile/Src/Framework/Dim.swift | 74 +- .../Src/Framework/Executor.swift | 233 ++-- .../paddle-mobile/Src/Framework/Loader.swift | 464 +++---- .../paddle-mobile/Src/Framework/Tensor.swift | 586 ++++----- .../paddle-mobile/Src/Framework/Texture.swift | 288 ++--- .../Src/Operators/Base/OpCreator.swift | 4 +- .../Src/Operators/Base/OpParam.swift | 376 +++--- .../Src/Operators/Base/Operator.swift | 180 +-- .../Src/Operators/BatchNormOp.swift | 86 +- .../Src/Operators/BilinearInterpOp.swift | 78 +- .../Src/Operators/BoxcoderOp.swift | 116 +- .../Src/Operators/ConcatOp.swift | 98 +- .../Src/Operators/ConvAddAddPreluOp.swift | 166 +-- .../Operators/ConvAddBatchNormReluOp.swift | 202 +-- .../Src/Operators/ConvAddOp.swift | 180 +-- .../Src/Operators/ConvAddPreluOp.swift | 152 +-- .../Src/Operators/ConvBNReluOp.swift | 180 +-- .../paddle-mobile/Src/Operators/ConvOp.swift | 112 +- .../Src/Operators/ConvTransposeOp.swift | 66 +- .../Src/Operators/DepthwiseConvOp.swift | 68 +- .../Src/Operators/DwConvBNReluOp.swift | 98 +- .../Src/Operators/ElementwiseAddOp.swift | 132 +- .../Src/Operators/ElementwiseAddPreluOp.swift | 172 +-- .../paddle-mobile/Src/Operators/FeedOp.swift | 86 +- .../paddle-mobile/Src/Operators/FetchOp.swift | 66 +- .../Src/Operators/FlattenOp.swift | 66 +- .../Src/Operators/Kernels/Base/Kernel.swift | 184 +-- .../Operators/Kernels/BatchNormKernel.swift | 66 +- .../Kernels/BilinearInterpKernel.swift | 62 +- .../Operators/Kernels/BoxcoderKernel.swift | 46 +- .../Src/Operators/Kernels/ConcatKernel.swift | 228 ++-- .../Kernels/ConvAddAddPreluKernel.swift | 248 ++-- .../Kernels/ConvAddBatchNormReluKernel.swift | 302 ++--- .../Src/Operators/Kernels/ConvAddKernel.swift | 134 +- .../Kernels/ConvAddPreluKernel.swift | 248 ++-- .../Operators/Kernels/ConvBNReluKernel.swift | 302 ++--- .../Src/Operators/Kernels/ConvKernel.swift | 74 +- .../Kernels/ConvTransposeKernel.swift | 114 +- .../Kernels/ElementwiseAddKernel.swift | 100 +- .../Kernels/ElementwiseAddPreluKernel.swift | 114 +- .../Src/Operators/Kernels/FetchKernel.swift | 80 +- .../Src/Operators/Kernels/FlattenKernel.swift | 94 +- .../Kernels/MulticlassNMSKernel.swift | 66 +- .../Src/Operators/Kernels/PoolKernel.swift | 94 +- .../Src/Operators/Kernels/PreluKernel.swift | 66 +- .../Operators/Kernels/PriorBoxKernel.swift | 250 ++-- .../Src/Operators/Kernels/ReluKernel.swift | 34 +- .../Src/Operators/Kernels/ReshapeKernel.swift | 140 +- .../Kernels/ResizeBilinearKernel.swift | 54 +- .../Src/Operators/Kernels/Scale.swift | 16 +- .../Src/Operators/Kernels/ShapeKernel.swift | 38 +- .../Src/Operators/Kernels/SoftmaxKernel.swift | 56 +- .../Src/Operators/Kernels/SplitKernel.swift | 140 +- .../Kernels/Texture2DTo2DArrayKernel.swift | 44 +- .../Operators/Kernels/TransposeKernel.swift | 110 +- .../Src/Operators/MulticlassNMSOp.swift | 90 +- .../paddle-mobile/Src/Operators/PoolOp.swift | 100 +- .../paddle-mobile/Src/Operators/PreluOp.swift | 80 +- .../Src/Operators/PriorBoxOp.swift | 188 +-- .../paddle-mobile/Src/Operators/ReluOp.swift | 66 +- .../Src/Operators/ReshapeOp.swift | 106 +- .../Src/Operators/ResizeBilinearOp.swift | 76 +- .../paddle-mobile/Src/Operators/ShapeOp.swift | 56 +- .../Src/Operators/SoftmaxOp.swift | 76 +- .../paddle-mobile/Src/Operators/SplitOp.swift | 100 +- .../Src/Operators/TransposeOp.swift | 64 +- .../Src/Program/PMBlockDesc.swift | 10 +- .../paddle-mobile/Src/Program/PMOpDesc.swift | 50 +- .../paddle-mobile/Src/Program/PMVarDesc.swift | 4 +- .../Src/Program/ProgramOptimize.swift | 468 +++---- .../paddle-mobile/Src/Program/Scope.swift | 2 +- 143 files changed, 10224 insertions(+), 10246 deletions(-) diff --git a/metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift b/metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift index 4152b9be89..9596c1a535 100644 --- a/metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift +++ b/metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift @@ -10,37 +10,37 @@ import UIKit @UIApplicationMain class AppDelegate: UIResponder, UIApplicationDelegate { - - var window: UIWindow? - - - func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplication.LaunchOptionsKey: Any]?) -> Bool { - // Override point for customization after application launch. - return true - } - - func applicationWillResignActive(_ application: UIApplication) { - // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state. - // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game. - } - - func applicationDidEnterBackground(_ application: UIApplication) { - // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later. - // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits. - } - - func applicationWillEnterForeground(_ application: UIApplication) { - // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background. - } - - func applicationDidBecomeActive(_ application: UIApplication) { - // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface. - } - - func applicationWillTerminate(_ application: UIApplication) { - // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:. - } - - + + var window: UIWindow? + + + func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplication.LaunchOptionsKey: Any]?) -> Bool { + // Override point for customization after application launch. + return true + } + + func applicationWillResignActive(_ application: UIApplication) { + // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state. + // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game. + } + + func applicationDidEnterBackground(_ application: UIApplication) { + // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later. + // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits. + } + + func applicationWillEnterForeground(_ application: UIApplication) { + // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background. + } + + func applicationDidBecomeActive(_ application: UIApplication) { + // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface. + } + + func applicationWillTerminate(_ application: UIApplication) { + // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:. + } + + } diff --git a/metal/MobileNetDemo/MobileNetDemo/MobileNet.swift b/metal/MobileNetDemo/MobileNetDemo/MobileNet.swift index f0902855cc..7f26427f2b 100644 --- a/metal/MobileNetDemo/MobileNetDemo/MobileNet.swift +++ b/metal/MobileNetDemo/MobileNetDemo/MobileNet.swift @@ -16,51 +16,51 @@ import Foundation import paddle_mobile public class MobileNet: Net{ - class MobilenetPreProccess: CusomKernel { - init(device: MTLDevice) { - let s = Shape.init(inWidth: 224, inHeight: 224, inChannel: 3) - super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil) + class MobilenetPreProccess: CusomKernel { + init(device: MTLDevice) { + let s = Shape.init(inWidth: 224, inHeight: 224, inChannel: 3) + super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil) + } } - } - - class PreWords { - var contents: [String] = [] - init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) { - if let filePath = inBundle.path(forResource: fileName, ofType: type) { - let string = try! String.init(contentsOfFile: filePath) - contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{ - String($0[$0.index($0.startIndex, offsetBy: 10)...]) + + class PreWords { + var contents: [String] = [] + init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) { + if let filePath = inBundle.path(forResource: fileName, ofType: type) { + let string = try! String.init(contentsOfFile: filePath) + contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{ + String($0[$0.index($0.startIndex, offsetBy: 10)...]) + } + }else{ + fatalError("no file call \(fileName)") + } + } + subscript(index: Int) -> String { + return contents[index] } - }else{ - fatalError("no file call \(fileName)") - } } - subscript(index: Int) -> String { - return contents[index] + + let labels = PreWords.init(fileName: "synset") + + override public func resultStr(res: [ResultHolder]) -> String { + let firstRes = res[0] + let resPointer = firstRes.result + var s: [String] = [] + (0.. String { - let firstRes = res[0] - let resPointer = firstRes.result - var s: [String] = [] - (0.. outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) { - return; - } - const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f); - const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; - outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f); + const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); } kernel void mobilenet_preprocess_half( @@ -28,11 +28,11 @@ kernel void mobilenet_preprocess_half( texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) { - return; - } - const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f); - const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; - outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f); + const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); } diff --git a/metal/MobileNetDemo/MobileNetDemo/ViewController.swift b/metal/MobileNetDemo/MobileNetDemo/ViewController.swift index 4e31282f03..a0d69c5c06 100644 --- a/metal/MobileNetDemo/MobileNetDemo/ViewController.swift +++ b/metal/MobileNetDemo/MobileNetDemo/ViewController.swift @@ -10,84 +10,84 @@ import UIKit import paddle_mobile class ViewController: UIViewController { - @IBOutlet weak var resultTextView: UITextView! - @IBOutlet weak var selectImageView: UIImageView! - @IBOutlet weak var elapsedTimeLabel: UILabel! - var net: MobileNet! - var runner: Runner! - var toPredictTexture: MTLTexture? - - override func viewDidLoad() { - super.viewDidLoad() - GlobalConfig.shared.computePrecision = .Float16 - net = MobileNet.init(device: MetalHelper.shared.device) - runner = Runner.init(inNet: net, commandQueue: MetalHelper.shared.queue) + @IBOutlet weak var resultTextView: UITextView! + @IBOutlet weak var selectImageView: UIImageView! + @IBOutlet weak var elapsedTimeLabel: UILabel! + var net: MobileNet! + var runner: Runner! + var toPredictTexture: MTLTexture? - if let selectImage = UIImage.init(named: "banana.jpeg") { - selectImageView.image = selectImage - runner.getTexture(image: selectImage.cgImage!) {[weak self] (texture) in - self?.toPredictTexture = texture - } + override func viewDidLoad() { + super.viewDidLoad() + GlobalConfig.shared.computePrecision = .Float16 + net = MobileNet.init(device: MetalHelper.shared.device) + runner = Runner.init(inNet: net, commandQueue: MetalHelper.shared.queue) + + if let selectImage = UIImage.init(named: "banana.jpeg") { + selectImageView.image = selectImage + runner.getTexture(image: selectImage.cgImage!) {[weak self] (texture) in + self?.toPredictTexture = texture + } + } + + } + + @IBAction func loadAct(_ sender: Any) { + if runner.load() { + let resutText = " load success ! " + print(resutText) + self.resultTextView.text = resutText + } else { + fatalError(" load error ") + } + } + + @IBAction func selectImageAct(_ sender: Any) { + let imagePicker = UIImagePickerController() + imagePicker.sourceType = .camera + imagePicker.delegate = self + self.present(imagePicker, animated: true, completion: nil) } - } - - @IBAction func loadAct(_ sender: Any) { - if runner.load() { - let resutText = " load success ! " - print(resutText) - self.resultTextView.text = resutText - } else { - fatalError(" load error ") + @IBAction func clearAct(_ sender: Any) { + runner.clear() } - } - - @IBAction func selectImageAct(_ sender: Any) { - let imagePicker = UIImagePickerController() - imagePicker.sourceType = .camera - imagePicker.delegate = self - self.present(imagePicker, animated: true, completion: nil) - } - - @IBAction func clearAct(_ sender: Any) { - runner.clear() - } - - @IBAction func predictAct(_ sender: Any) { - if let texture = toPredictTexture { - let beginDate = Date.init() - runner.predict(texture: texture) { [weak self] (success, resultHolder) in - if success, let inResultHolder = resultHolder { - let timeUse = Date.init().timeIntervalSince(beginDate) - DispatchQueue.main.async { - self?.elapsedTimeLabel.text = "\(timeUse * 1000)ms" - self?.resultTextView.text = self?.net.resultStr(res: inResultHolder) - } - + @IBAction func predictAct(_ sender: Any) { + + if let texture = toPredictTexture { + let beginDate = Date.init() + runner.predict(texture: texture) { [weak self] (success, resultHolder) in + if success, let inResultHolder = resultHolder { + let timeUse = Date.init().timeIntervalSince(beginDate) + DispatchQueue.main.async { + self?.elapsedTimeLabel.text = "\(timeUse * 1000)ms" + self?.resultTextView.text = self?.net.resultStr(res: inResultHolder) + } + + } else { + print(" predict fail ") + } + } } else { - print(" predict fail ") + print(" toPredictTexture is nil ") } - } - } else { - print(" toPredictTexture is nil ") + } - } - } extension ViewController: UIImagePickerControllerDelegate, UINavigationControllerDelegate { - func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) { - picker.dismiss(animated: true){[weak self] in - guard let sSelf = self, let image = info["UIImagePickerControllerOriginalImage"] as? UIImage else { - fatalError("no image") - } - sSelf.selectImageView.image = image - sSelf.runner.getTexture(image: image.cgImage!, getTexture: { (texture) in - sSelf.toPredictTexture = texture - }) + func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) { + picker.dismiss(animated: true){[weak self] in + guard let sSelf = self, let image = info["UIImagePickerControllerOriginalImage"] as? UIImage else { + fatalError("no image") + } + sSelf.selectImageView.image = image + sSelf.runner.getTexture(image: image.cgImage!, getTexture: { (texture) in + sSelf.toPredictTexture = texture + }) + } } - } } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift index 537fb06ed9..557f5eef35 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift @@ -16,36 +16,36 @@ import UIKit @UIApplicationMain class AppDelegate: UIResponder, UIApplicationDelegate { - + var window: UIWindow? - + func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplicationLaunchOptionsKey: Any]?) -> Bool { // Override point for customization after application launch. return true } - + func applicationWillResignActive(_ application: UIApplication) { // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state. // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game. } - + func applicationDidEnterBackground(_ application: UIApplication) { // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later. // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits. } - + func applicationWillEnterForeground(_ application: UIApplication) { // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background. } - + func applicationDidBecomeActive(_ application: UIApplication) { // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface. } - + func applicationWillTerminate(_ application: UIApplication) { // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:. } - - + + } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard index 88445bfdb4..d67403f272 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard @@ -1,5 +1,5 @@ - + diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift index ca19c166c3..8252258c97 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift @@ -18,14 +18,14 @@ import Foundation import paddle_mobile @objc public class MetalHelper: NSObject { - @objc let device: MTLDevice - @objc let queue: MTLCommandQueue - @objc let textureLoader: MTKTextureLoader - @objc static let shared: MetalHelper = MetalHelper.init() - private override init(){ - device = MTLCreateSystemDefaultDevice()! - queue = device.makeCommandQueue()! - textureLoader = MTKTextureLoader.init(device: device) - super.init() - } + @objc let device: MTLDevice + @objc let queue: MTLCommandQueue + @objc let textureLoader: MTKTextureLoader + @objc static let shared: MetalHelper = MetalHelper.init() + private override init(){ + device = MTLCreateSystemDefaultDevice()! + queue = device.makeCommandQueue()! + textureLoader = MTKTextureLoader.init(device: device) + super.init() + } } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift index 22fb5723ac..8af436d779 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift @@ -16,51 +16,51 @@ import UIKit import paddle_mobile class MultiPredictViewController: UIViewController { - var runner1: Runner! - var runner2: Runner! - override func viewDidLoad() { - super.viewDidLoad() - let mobileNet = MobileNet_ssd_hand.init(device: MetalHelper.shared.device) - let genet = Genet.init(device: MetalHelper.shared.device) - runner1 = Runner.init(inNet: mobileNet, commandQueue: MetalHelper.shared.queue) - let queue2 = MetalHelper.shared.device.makeCommandQueue() + var runner1: Runner! + var runner2: Runner! + override func viewDidLoad() { + super.viewDidLoad() + let mobileNet = MobileNet_ssd_hand.init(device: MetalHelper.shared.device) + let genet = Genet.init(device: MetalHelper.shared.device) + runner1 = Runner.init(inNet: mobileNet, commandQueue: MetalHelper.shared.queue) + let queue2 = MetalHelper.shared.device.makeCommandQueue() + + runner2 = Runner.init(inNet: genet, commandQueue: MetalHelper.shared.queue) + } - runner2 = Runner.init(inNet: genet, commandQueue: MetalHelper.shared.queue) - } - - @IBAction func predictAct(_ sender: Any) { - let success = self.runner2.load() -// DispatchQueue.global().async { - let image1 = UIImage.init(named: "hand.jpg") -// let success = self.runner2.load() -// if success { -// for i in 0..<10000 { -// print(i) -// self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in -// print("result1: ") -//// print(res) -// }) -// } -// } else { -// print("load failed") -// } -// self.runner1.clear() -// } -// return -// DispatchQueue.global().async { -//// sleep(1) -// let image1 = UIImage.init(named: "banana.jpeg") -//// if success { -// for _ in 0..<10 { -// self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in -// print("result2: ") -// print(res) -// }) -// } -//// } else { -//// print("load failed") -//// } -//// self.runner2.clear() -// } - } + @IBAction func predictAct(_ sender: Any) { + let success = self.runner2.load() + // DispatchQueue.global().async { + let image1 = UIImage.init(named: "hand.jpg") + // let success = self.runner2.load() + // if success { + // for i in 0..<10000 { + // print(i) + // self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in + // print("result1: ") + //// print(res) + // }) + // } + // } else { + // print("load failed") + // } + // self.runner1.clear() + // } + // return + // DispatchQueue.global().async { + //// sleep(1) + // let image1 = UIImage.init(named: "banana.jpeg") + //// if success { + // for _ in 0..<10 { + // self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in + // print("result2: ") + // print(res) + // }) + // } + //// } else { + //// print("load failed") + //// } + //// self.runner2.clear() + // } + } } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.mm b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.mm index fac8af2527..ddfc5f770d 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.mm +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.mm @@ -20,30 +20,30 @@ #import struct NMSParam { - - float *score_data; - - float *box_data; - - float *output; - - int output_size; - - std::vector score_dim; - - std::vector box_dim; - - float scoreThredshold; - - int nmsTopK; - - int keepTopK; - - float nmsEta; - - float nmsThreshold; - - int background_label; + + float *score_data; + + float *box_data; + + float *output; + + int output_size; + + std::vector score_dim; + + std::vector box_dim; + + float scoreThredshold; + + int nmsTopK; + + int keepTopK; + + float nmsEta; + + float nmsThreshold; + + int background_label; }; @@ -53,63 +53,63 @@ constexpr int kBBoxSize = 4; template bool SortScorePairDescend(const std::pair& pair1, const std::pair& pair2) { - return pair1.first > pair2.first; + return pair1.first > pair2.first; } template static inline void GetMaxScoreIndex( const std::vector& scores, const T threshold, int top_k, std::vector>* sorted_indices) { - for (size_t i = 0; i < scores.size(); ++i) { - if (scores[i] > threshold) { - sorted_indices->push_back(std::make_pair(scores[i], i)); + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { + sorted_indices->resize(top_k); } - } - // Sort the score pair according to the scores in descending order - std::stable_sort(sorted_indices->begin(), sorted_indices->end(), - SortScorePairDescend); - // Keep top_k scores if needed. - if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { - sorted_indices->resize(top_k); - } } template static inline T BBoxArea(const T* box, const bool normalized) { - if (box[2] < box[0] || box[3] < box[1]) { - // If coordinate values are is invalid - // (e.g. xmax < xmin or ymax < ymin), return 0. - return static_cast(0.); - } else { - const T w = box[2] - box[0]; - const T h = box[3] - box[1]; - if (normalized) { - return w * h; + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); } else { - // If coordinate values are not within range [0, 1]. - return (w + 1) * (h + 1); + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } } - } } template static inline T JaccardOverlap(const T* box1, const T* box2, const bool normalized) { - if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || - box2[3] < box1[1]) { - return static_cast(0.); - } else { - const T inter_xmin = std::max(box1[0], box2[0]); - const T inter_ymin = std::max(box1[1], box2[1]); - const T inter_xmax = std::min(box1[2], box2[2]); - const T inter_ymax = std::min(box1[3], box2[3]); - const T inter_w = inter_xmax - inter_xmin; - const T inter_h = inter_ymax - inter_ymin; - const T inter_area = inter_w * inter_h; - const T bbox1_area = BBoxArea(box1, normalized); - const T bbox2_area = BBoxArea(box2, normalized); - return inter_area / (bbox1_area + bbox2_area - inter_area); - } + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = inter_xmax - inter_xmin; + const T inter_h = inter_ymax - inter_ymin; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } } template @@ -120,40 +120,40 @@ static inline void NMSFast( const T score_threshold, const T nms_threshold, const T eta, const int top_k, std::vector* selected_indices) { - // The total boxes for each instance. - int num_boxes = bbox_dim[0]; - // 4: [xmin ymin xmax ymax] - int box_size = bbox_dim[1]; - - std::vector scores_data(num_boxes); - std::copy_n(score_data, num_boxes, scores_data.begin()); - std::vector> sorted_indices; - GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); - - selected_indices->clear(); - T adaptive_threshold = nms_threshold; - - while (sorted_indices.size() != 0) { - const int idx = sorted_indices.front().second; - bool keep = true; - for (size_t k = 0; k < selected_indices->size(); ++k) { - if (keep) { - const int kept_idx = (*selected_indices)[k]; - T overlap = JaccardOverlap(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, true); - keep = overlap <= adaptive_threshold; - } else { - break; - } - } - if (keep) { - selected_indices->push_back(idx); - } - sorted_indices.erase(sorted_indices.begin()); - if (keep && eta < 1 && adaptive_threshold > 0.5) { - adaptive_threshold *= eta; + // The total boxes for each instance. + int num_boxes = bbox_dim[0]; + // 4: [xmin ymin xmax ymax] + int box_size = bbox_dim[1]; + + std::vector scores_data(num_boxes); + std::copy_n(score_data, num_boxes, scores_data.begin()); + std::vector> sorted_indices; + GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); + + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (size_t k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, true); + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } } - } } template @@ -165,48 +165,48 @@ void MultiClassNMS(const T *boxes_data, const int& background_label, const int& nms_top_k, const int& keep_top_k, const T& nms_threshold, const T& nms_eta, const T& score_threshold) { - - int64_t class_num = score_dim[0]; - int64_t predict_dim = score_dim[1]; - int num_det = 0; - for (int c = 0; c < class_num; ++c) { - if (c == background_label) continue; - const T *score_data = scores_data + c * predict_dim; - /// [c] is key - NMSFast(boxes_data, box_dim, score_data, score_threshold, nms_threshold, nms_eta, + int64_t class_num = score_dim[0]; + int64_t predict_dim = score_dim[1]; + int num_det = 0; + for (int c = 0; c < class_num; ++c) { + if (c == background_label) continue; + const T *score_data = scores_data + c * predict_dim; + + /// [c] is key + NMSFast(boxes_data, box_dim, score_data, score_threshold, nms_threshold, nms_eta, nms_top_k, &((*indices)[c])); - num_det += (*indices)[c].size(); - } - - *num_nmsed_out = num_det; - if (keep_top_k > -1 && num_det > keep_top_k) { - std::vector>> score_index_pairs; - for (const auto& it : *indices) { - int label = it.first; - const T* sdata = scores_data + label * predict_dim; - const std::vector& label_indices = it.second; - for (size_t j = 0; j < label_indices.size(); ++j) { - int idx = label_indices[j]; - // PADDLE_ENFORCE_LT(idx, predict_dim); - score_index_pairs.push_back(std::make_pair(sdata[idx], std::make_pair(label, idx))); - } + num_det += (*indices)[c].size(); } - // Keep top k results per image. - std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), - SortScorePairDescend>); - score_index_pairs.resize(keep_top_k); - - // Store the new indices. - std::map> new_indices; - for (size_t j = 0; j < score_index_pairs.size(); ++j) { - int label = score_index_pairs[j].second.first; - int idx = score_index_pairs[j].second.second; - new_indices[label].push_back(idx); + + *num_nmsed_out = num_det; + if (keep_top_k > -1 && num_det > keep_top_k) { + std::vector>> score_index_pairs; + for (const auto& it : *indices) { + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& label_indices = it.second; + for (size_t j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + // PADDLE_ENFORCE_LT(idx, predict_dim); + score_index_pairs.push_back(std::make_pair(sdata[idx], std::make_pair(label, idx))); + } + } + // Keep top k results per image. + std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(keep_top_k); + + // Store the new indices. + std::map> new_indices; + for (size_t j = 0; j < score_index_pairs.size(); ++j) { + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + new_indices[label].push_back(idx); + } + new_indices.swap(*indices); + *num_nmsed_out = keep_top_k; } - new_indices.swap(*indices); - *num_nmsed_out = keep_top_k; - } } template @@ -215,69 +215,69 @@ void MultiClassOutput(const T *scores_data, const T *bboxes_data, T *outputs_data, const std::map>& selected_indices) { - int predict_dim = score_dim[1]; - int count = 0; - for (const auto& it : selected_indices) { - /// one batch - int label = it.first; - const T* sdata = scores_data + label * predict_dim; - const std::vector& indices = it.second; - for (size_t j = 0; j < indices.size(); ++j) { - int idx = indices[j]; - const T* bdata = bboxes_data + idx * kBBoxSize; - outputs_data[count * kOutputDim] = label; // label - outputs_data[count * kOutputDim + 1] = sdata[idx]; // score - // xmin, ymin, xmax, ymax - std::memcpy(outputs_data + count * kOutputDim + 2, bdata, 4 * sizeof(T)); - count++; + int predict_dim = score_dim[1]; + int count = 0; + for (const auto& it : selected_indices) { + /// one batch + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& indices = it.second; + for (size_t j = 0; j < indices.size(); ++j) { + int idx = indices[j]; + const T* bdata = bboxes_data + idx * kBBoxSize; + outputs_data[count * kOutputDim] = label; // label + outputs_data[count * kOutputDim + 1] = sdata[idx]; // score + // xmin, ymin, xmax, ymax + std::memcpy(outputs_data + count * kOutputDim + 2, bdata, 4 * sizeof(T)); + count++; + } } - } } void MultiClassNMSCompute(NMSParam *param) { - assert(param->score_dim[0] == 1); - assert(param->box_dim[0] == 1); - assert (param->score_dim.size() == 3); - assert(param->box_dim.size() == 3); - - float* outputs; - auto background_label = param->background_label; - auto nms_top_k = param->nmsTopK; - auto keep_top_k = param->keepTopK; - auto nms_threshold = param->nmsThreshold; - auto nms_eta = param->nmsEta; - auto score_threshold = param->scoreThredshold; - - std::vector score_dim_one_batch = {param->score_dim[1], param->score_dim[2]}; - std::vector box_dim_one_batch = {param->box_dim[1], param->box_dim[2]}; - - std::vector batch_starts = {0}; - - std::map> indices; - int num_nmsed_out = 0; - - MultiClassNMS(param->box_data, box_dim_one_batch, param->score_data, score_dim_one_batch, &indices, &num_nmsed_out, - background_label, nms_top_k, keep_top_k, nms_threshold, - nms_eta, score_threshold); - batch_starts.push_back(batch_starts.back() + num_nmsed_out); - - int output_size = 0; - int num_kept = batch_starts.back(); - if (num_kept == 0) { - outputs = new float[1]; - outputs[0] = -1; - output_size = 1; - } else { - outputs = new float[num_kept * kOutputDim]; - int64_t s = batch_starts[0]; - int64_t e = batch_starts[1]; - if (e > s) { - MultiClassOutput(param->score_data, score_dim_one_batch, param->box_data, outputs, indices); + assert(param->score_dim[0] == 1); + assert(param->box_dim[0] == 1); + assert (param->score_dim.size() == 3); + assert(param->box_dim.size() == 3); + + float* outputs; + auto background_label = param->background_label; + auto nms_top_k = param->nmsTopK; + auto keep_top_k = param->keepTopK; + auto nms_threshold = param->nmsThreshold; + auto nms_eta = param->nmsEta; + auto score_threshold = param->scoreThredshold; + + std::vector score_dim_one_batch = {param->score_dim[1], param->score_dim[2]}; + std::vector box_dim_one_batch = {param->box_dim[1], param->box_dim[2]}; + + std::vector batch_starts = {0}; + + std::map> indices; + int num_nmsed_out = 0; + + MultiClassNMS(param->box_data, box_dim_one_batch, param->score_data, score_dim_one_batch, &indices, &num_nmsed_out, + background_label, nms_top_k, keep_top_k, nms_threshold, + nms_eta, score_threshold); + batch_starts.push_back(batch_starts.back() + num_nmsed_out); + + int output_size = 0; + int num_kept = batch_starts.back(); + if (num_kept == 0) { + outputs = new float[1]; + outputs[0] = -1; + output_size = 1; + } else { + outputs = new float[num_kept * kOutputDim]; + int64_t s = batch_starts[0]; + int64_t e = batch_starts[1]; + if (e > s) { + MultiClassOutput(param->score_data, score_dim_one_batch, param->box_data, outputs, indices); + } + output_size = num_kept * kOutputDim; } - output_size = num_kept * kOutputDim; - } - param->output = outputs; - param->output_size = output_size; + param->output = outputs; + param->output_size = output_size; } @implementation CPUResult @@ -286,31 +286,31 @@ void MultiClassNMSCompute(NMSParam *param) { @implementation NMSCompute -(CPUResult *)computeWithScore:(float *)score andBBoxs:(float *)bbox { - NMSParam param; - param.box_data = bbox; - param.score_data = score; - param.background_label = self.background_label; - param.scoreThredshold = self.scoreThredshold; - param.nmsTopK = self.nmsTopK; - param.keepTopK = self.keepTopK; - param.nmsEta = self.nmsEta; - param.nmsThreshold = self.nmsThreshold; - std::vector score_dim; - for (int i = 0; i < self.scoreDim.count; ++i) { - score_dim.push_back(self.scoreDim[i].intValue); - } - param.score_dim = score_dim; - - std::vector box_dim; - for (int i = 0; i < self.bboxDim.count; ++i) { - box_dim.push_back(self.bboxDim[i].intValue); - } - param.box_dim = box_dim; - MultiClassNMSCompute(¶m); - CPUResult *cr = [[CPUResult alloc] init]; - cr.output = param.output; - cr.outputSize = param.output_size; - return cr; + NMSParam param; + param.box_data = bbox; + param.score_data = score; + param.background_label = self.background_label; + param.scoreThredshold = self.scoreThredshold; + param.nmsTopK = self.nmsTopK; + param.keepTopK = self.keepTopK; + param.nmsEta = self.nmsEta; + param.nmsThreshold = self.nmsThreshold; + std::vector score_dim; + for (int i = 0; i < self.scoreDim.count; ++i) { + score_dim.push_back(self.scoreDim[i].intValue); + } + param.score_dim = score_dim; + + std::vector box_dim; + for (int i = 0; i < self.bboxDim.count; ++i) { + box_dim.push_back(self.bboxDim[i].intValue); + } + param.box_dim = box_dim; + MultiClassNMSCompute(¶m); + CPUResult *cr = [[CPUResult alloc] init]; + cr.output = param.output; + cr.outputSize = param.output_size; + return cr; } @end diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Genet.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Genet.swift index 91bf014e9f..b248e53bac 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Genet.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Genet.swift @@ -16,37 +16,37 @@ import Foundation import paddle_mobile public class Genet: Net { - @objc public override init(device: MTLDevice) { - super.init(device: device) - modelPath = Bundle.main.path(forResource: "genet_model", ofType: nil) ?! "model null" - paramPath = Bundle.main.path(forResource: "genet_params", ofType: nil) ?! "para null" - preprocessKernel = GenetPreProccess.init(device: device) - inputDim = Dim.init(inDim: [1, 128, 128, 3]) - metalLoadMode = .LoadMetalInCustomMetalLib - metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") - } - - @objc override public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) { - super.init(device: device, - inParamPointer: inParamPointer, - inParamSize: inParamSize, - inModelPointer: inModelPointer, - inModelSize: inModelSize) - metalLoadMode = .LoadMetalInCustomMetalLib - metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") - preprocessKernel = GenetPreProccess.init(device: device) - inputDim = Dim.init(inDim: [1, 128, 128, 3]) - } - - class GenetPreProccess: CusomKernel { - init(device: MTLDevice) { - let s = Shape.init(inWidth: 128, inHeight: 128, inChannel: 3) - super.init(device: device, inFunctionName: "genet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil) + @objc public override init(device: MTLDevice) { + super.init(device: device) + modelPath = Bundle.main.path(forResource: "genet_model", ofType: nil) ?! "model null" + paramPath = Bundle.main.path(forResource: "genet_params", ofType: nil) ?! "para null" + preprocessKernel = GenetPreProccess.init(device: device) + inputDim = Dim.init(inDim: [1, 128, 128, 3]) + metalLoadMode = .LoadMetalInCustomMetalLib + metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") + } + + @objc override public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) { + super.init(device: device, + inParamPointer: inParamPointer, + inParamSize: inParamSize, + inModelPointer: inModelPointer, + inModelSize: inModelSize) + metalLoadMode = .LoadMetalInCustomMetalLib + metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") + preprocessKernel = GenetPreProccess.init(device: device) + inputDim = Dim.init(inDim: [1, 128, 128, 3]) + } + + class GenetPreProccess: CusomKernel { + init(device: MTLDevice) { + let s = Shape.init(inWidth: 128, inHeight: 128, inChannel: 3) + super.init(device: device, inFunctionName: "genet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil) + } + } + + override public func resultStr(res: [ResultHolder]) -> String { + return " \(res[0].result[0]) ... " } - } - - override public func resultStr(res: [ResultHolder]) -> String { - return " \(res[0].result[0]) ... " - } - + } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift index d35fde97d7..608cd3180b 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift @@ -16,53 +16,53 @@ import Foundation import paddle_mobile public class MobileNet: Net{ - - class MobilenetPreProccess: CusomKernel { - init(device: MTLDevice) { - let s = Shape.init(inWidth: 224, inHeight: 224, inChannel: 3) - super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil) - } - } - - class PreWords { - var contents: [String] = [] - init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) { - if let filePath = inBundle.path(forResource: fileName, ofType: type) { - let string = try! String.init(contentsOfFile: filePath) - contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{ - String($0[$0.index($0.startIndex, offsetBy: 10)...]) + + class MobilenetPreProccess: CusomKernel { + init(device: MTLDevice) { + let s = Shape.init(inWidth: 224, inHeight: 224, inChannel: 3) + super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil) } - }else{ - fatalError("no file call \(fileName)") - } } - subscript(index: Int) -> String { - return contents[index] + + class PreWords { + var contents: [String] = [] + init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) { + if let filePath = inBundle.path(forResource: fileName, ofType: type) { + let string = try! String.init(contentsOfFile: filePath) + contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{ + String($0[$0.index($0.startIndex, offsetBy: 10)...]) + } + }else{ + fatalError("no file call \(fileName)") + } + } + subscript(index: Int) -> String { + return contents[index] + } } - } - - let labels = PreWords.init(fileName: "synset") - - override public func resultStr(res: [ResultHolder]) -> String { - let resPointer = res[0].result - var s: [String] = [] - (0.. String { + let resPointer = res[0].result + var s: [String] = [] + (0.. String { - return " \(res[0].result[0]) ... " - } - + @objc public override init(device: MTLDevice) { + super.init(device: device) + except = 0 + modelPath = Bundle.main.path(forResource: "combined_mobilenet_model", ofType: nil) ?! "model null" + paramPath = Bundle.main.path(forResource: "combined_mobilenet_params", ofType: nil) ?! "para null" + inputDim = Dim.init(inDim: [1, 224, 224, 3]) + metalLoadMode = .LoadMetalInCustomMetalLib + metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") + } + + override public func resultStr(res: [ResultHolder]) -> String { + return " \(res[0].result[0]) ... " + } + } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift index 140aefdfb3..38d20557d2 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift @@ -16,84 +16,84 @@ import Foundation import paddle_mobile public class MobileNet_ssd_hand: Net { - @objc public override init(device: MTLDevice) { - super.init(device: device) - except = 2 - modelPath = Bundle.main.path(forResource: "ssd_hand_model", ofType: nil) ?! "model null" - paramPath = Bundle.main.path(forResource: "ssd_hand_params", ofType: nil) ?! "para null" - metalLoadMode = .LoadMetalInCustomMetalLib - metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") - preprocessKernel = MobilenetssdPreProccess.init(device: device) - inputDim = Dim.init(inDim: [1, 300, 300, 3]) - } - - @objc override public init(device: MTLDevice,inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer inModePointer: UnsafeMutableRawPointer, inModelSize: Int) { - super.init(device:device,inParamPointer:inParamPointer,inParamSize:inParamSize,inModelPointer:inModePointer,inModelSize:inModelSize) - except = 2 - modelPath = "" - paramPath = "" - metalLoadMode = .LoadMetalInCustomMetalLib - metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") - preprocessKernel = MobilenetssdPreProccess.init(device: device) - inputDim = Dim.init(inDim: [1, 300, 300, 3]) - } - - class MobilenetssdPreProccess: CusomKernel { - init(device: MTLDevice) { - let s = Shape.init(inWidth: 300, inHeight: 300, inChannel: 3) - super.init(device: device, inFunctionName: "mobilenet_ssd_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil) + @objc public override init(device: MTLDevice) { + super.init(device: device) + except = 2 + modelPath = Bundle.main.path(forResource: "ssd_hand_model", ofType: nil) ?! "model null" + paramPath = Bundle.main.path(forResource: "ssd_hand_params", ofType: nil) ?! "para null" + metalLoadMode = .LoadMetalInCustomMetalLib + metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") + preprocessKernel = MobilenetssdPreProccess.init(device: device) + inputDim = Dim.init(inDim: [1, 300, 300, 3]) } - } - - override public func resultStr(res: [ResultHolder]) -> String { - return " \(res[0])" - } - - override public func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] { - -// guard let interRes = paddleMobileRes.intermediateResults else { -// fatalError(" need have inter result ") -// } -// -// guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as? Texture else { -// fatalError(" need score ") -// } -// -// guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? Texture else { -// fatalError() -// } -// -// var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3])) -//// print("score: ") -//// print(scoreFormatArr.strideArray()) -//// -// var bboxArr = bbox.metalTexture.float32Array() -//// print("bbox: ") -//// print(bboxArr.strideArray()) -// -// let nmsCompute = NMSCompute.init() -// nmsCompute.scoreThredshold = 0.01 -// nmsCompute.nmsTopK = 400 -// nmsCompute.keepTopK = 200 -// nmsCompute.nmsEta = 1.0 -// nmsCompute.nmsThreshold = 0.45 -// nmsCompute.background_label = 0; -// -// nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])] -// -// nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])] -// guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else { -// fatalError( " result error " ) -// } -// -// let output: [Float32] = result.map { $0.floatValue } -// -// -// return output - fatalError() - } - - - - + + @objc override public init(device: MTLDevice,inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer inModePointer: UnsafeMutableRawPointer, inModelSize: Int) { + super.init(device:device,inParamPointer:inParamPointer,inParamSize:inParamSize,inModelPointer:inModePointer,inModelSize:inModelSize) + except = 2 + modelPath = "" + paramPath = "" + metalLoadMode = .LoadMetalInCustomMetalLib + metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") + preprocessKernel = MobilenetssdPreProccess.init(device: device) + inputDim = Dim.init(inDim: [1, 300, 300, 3]) + } + + class MobilenetssdPreProccess: CusomKernel { + init(device: MTLDevice) { + let s = Shape.init(inWidth: 300, inHeight: 300, inChannel: 3) + super.init(device: device, inFunctionName: "mobilenet_ssd_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil) + } + } + + override public func resultStr(res: [ResultHolder]) -> String { + return " \(res[0])" + } + + override public func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] { + + // guard let interRes = paddleMobileRes.intermediateResults else { + // fatalError(" need have inter result ") + // } + // + // guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as? Texture else { + // fatalError(" need score ") + // } + // + // guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? Texture else { + // fatalError() + // } + // + // var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3])) + //// print("score: ") + //// print(scoreFormatArr.strideArray()) + //// + // var bboxArr = bbox.metalTexture.float32Array() + //// print("bbox: ") + //// print(bboxArr.strideArray()) + // + // let nmsCompute = NMSCompute.init() + // nmsCompute.scoreThredshold = 0.01 + // nmsCompute.nmsTopK = 400 + // nmsCompute.keepTopK = 200 + // nmsCompute.nmsEta = 1.0 + // nmsCompute.nmsThreshold = 0.45 + // nmsCompute.background_label = 0; + // + // nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])] + // + // nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])] + // guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else { + // fatalError( " result error " ) + // } + // + // let output: [Float32] = result.map { $0.floatValue } + // + // + // return output + fatalError() + } + + + + } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobilenetSSD_AR.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobilenetSSD_AR.swift index 134a07bba6..76feb0ecd0 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobilenetSSD_AR.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobilenetSSD_AR.swift @@ -16,137 +16,137 @@ import Foundation import paddle_mobile public class MobileNet_ssd_AR: Net { - @objc public override init(device: MTLDevice) { - super.init(device: device) - except = 2 - modelPath = Bundle.main.path(forResource: "ar_model", ofType: nil) ?! "model null" - paramPath = Bundle.main.path(forResource: "ar_params", ofType: nil) ?! "para null" - preprocessKernel = MobilenetssdPreProccess.init(device: device) - inputDim = Dim.init(inDim: [1, 160, 160, 3]) - metalLoadMode = .LoadMetalInCustomMetalLib - metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") - } - - @objc override public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) { - super.init(device:device,inParamPointer:inParamPointer,inParamSize:inParamSize,inModelPointer:inModelPointer,inModelSize:inModelSize) - except = 2 - preprocessKernel = MobilenetssdPreProccess.init(device: device) - inputDim = Dim.init(inDim: [1, 160, 160, 3]) - metalLoadMode = .LoadMetalInCustomMetalLib - metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") - } - - class MobilenetssdPreProccess: CusomKernel { - init(device: MTLDevice) { - let s = Shape.init(inWidth: 160, inHeight: 160, inChannel: 3) - super.init(device: device, inFunctionName: "mobilent_ar_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil) + @objc public override init(device: MTLDevice) { + super.init(device: device) + except = 2 + modelPath = Bundle.main.path(forResource: "ar_model", ofType: nil) ?! "model null" + paramPath = Bundle.main.path(forResource: "ar_params", ofType: nil) ?! "para null" + preprocessKernel = MobilenetssdPreProccess.init(device: device) + inputDim = Dim.init(inDim: [1, 160, 160, 3]) + metalLoadMode = .LoadMetalInCustomMetalLib + metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") } - } - - override public func resultStr(res: [ResultHolder]) -> String { - return " \(res[0].result[0])" - } - - override public func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] { - fatalError() -// guard let interRes = paddleMobileRes.intermediateResults else { -// fatalError(" need have inter result ") -// } -// -// guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as? FetchHolder else { -// fatalError(" need score ") -// } -// -// guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? FetchHolder else { -// fatalError() -// } -// let startDate = Date.init() + @objc override public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) { + super.init(device:device,inParamPointer:inParamPointer,inParamSize:inParamSize,inModelPointer:inModelPointer,inModelSize:inModelSize) + except = 2 + preprocessKernel = MobilenetssdPreProccess.init(device: device) + inputDim = Dim.init(inDim: [1, 160, 160, 3]) + metalLoadMode = .LoadMetalInCustomMetalLib + metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") + } -// print("scoreFormatArr: ") -//print((0.. String { + return " \(res[0].result[0])" + } + + override public func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] { + fatalError() + // guard let interRes = paddleMobileRes.intermediateResults else { + // fatalError(" need have inter result ") + // } + // + // guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as? FetchHolder else { + // fatalError(" need score ") + // } + // + // guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? FetchHolder else { + // fatalError() + // } + + // let startDate = Date.init() + + // print("scoreFormatArr: ") + //print((0.. inTexture [[texture(0)]], - texture2d outTexture [[texture(1)]], - uint2 gid [[thread_position_in_grid]]) + texture2d inTexture [[texture(0)]], + texture2d outTexture [[texture(1)]], + uint2 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height()) { @@ -31,9 +31,9 @@ kernel void mobilenet_preprocess( } kernel void mobilenet_preprocess_half( - texture2d inTexture [[texture(0)]], - texture2d outTexture [[texture(1)]], - uint2 gid [[thread_position_in_grid]]) + texture2d inTexture [[texture(0)]], + texture2d outTexture [[texture(1)]], + uint2 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height()) { @@ -45,9 +45,9 @@ kernel void mobilenet_preprocess_half( } kernel void mobilenet_ssd_preprocess( - texture2d inTexture [[texture(0)]], - texture2d outTexture [[texture(1)]], - uint2 gid [[thread_position_in_grid]]) + texture2d inTexture [[texture(0)]], + texture2d outTexture [[texture(1)]], + uint2 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height()) { @@ -59,9 +59,9 @@ kernel void mobilenet_ssd_preprocess( } kernel void mobilenet_ssd_preprocess_half( - texture2d inTexture [[texture(0)]], - texture2d outTexture [[texture(1)]], - uint2 gid [[thread_position_in_grid]]) + texture2d inTexture [[texture(0)]], + texture2d outTexture [[texture(1)]], + uint2 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height()) { @@ -74,44 +74,44 @@ kernel void mobilenet_ssd_preprocess_half( kernel void genet_preprocess(texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) { - return; - } - const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f); - const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; - outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f); + const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); } kernel void genet_preprocess_half(texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) { - return; - } - const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f); - const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; - outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f); + const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); } kernel void mobilent_ar_preprocess(texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) { - return; - } - const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f); - const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; - outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f); + const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); } kernel void mobilent_ar_preprocess_half(texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) { - return; - } - const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f); - const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; - outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f); + const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift index f5f4ef81e9..caaef97695 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift @@ -17,18 +17,18 @@ import Foundation import paddle_mobile public class YoloNet: Net { - @objc public override init(device: MTLDevice) { - super.init(device: device) - except = 0 - modelPath = Bundle.main.path(forResource: "yolo_model", ofType: nil) ?! "model null" - paramPath = Bundle.main.path(forResource: "yolo_params", ofType: nil) ?! "para null" - inputDim = Dim.init(inDim: [1, 416, 416, 3]) - metalLoadMode = .LoadMetalInCustomMetalLib - metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") - } - - override public func resultStr(res: [ResultHolder]) -> String { - return " \(res[0].result[0]) ... " - } - + @objc public override init(device: MTLDevice) { + super.init(device: device) + except = 0 + modelPath = Bundle.main.path(forResource: "yolo_model", ofType: nil) ?! "model null" + paramPath = Bundle.main.path(forResource: "yolo_params", ofType: nil) ?! "para null" + inputDim = Dim.init(inDim: [1, 416, 416, 3]) + metalLoadMode = .LoadMetalInCustomMetalLib + metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") + } + + override public func resultStr(res: [ResultHolder]) -> String { + return " \(res[0].result[0]) ... " + } + } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.m b/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.m index 586fc91a7f..5bef9317b1 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.m +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.m @@ -34,83 +34,83 @@ @implementation LoadPointerViewController - (void)viewDidLoad { - [super viewDidLoad]; - - self.imageView.image = [UIImage imageNamed:@"banana.jpeg"]; - - NSString *modelPath = [[NSBundle mainBundle] URLForResource:@"super_model" withExtension:nil].path; - NSString *paramPath = [[NSBundle mainBundle] URLForResource:@"super_params" withExtension:nil].path; - - long fileSize; - FILE *fp; - fp = fopen([modelPath UTF8String], "rb"); - fseek(fp, 0, SEEK_END); - fileSize = ftell(fp); - rewind(fp); - void *buffer = malloc(fileSize); - fread(buffer, 1, fileSize, fp); - fclose(fp); - - long paramfileSize; - FILE *parmaFilePointer; - parmaFilePointer = fopen([paramPath UTF8String], "rb"); - fseek(parmaFilePointer, 0, SEEK_END); - paramfileSize = ftell(parmaFilePointer); - rewind(parmaFilePointer); - void *parmaBuffer = malloc(paramfileSize); - fread(parmaBuffer, 1, paramfileSize, parmaFilePointer); - fclose(parmaFilePointer); - - _modelConfig = [[ModelConfig alloc] init]; - _modelConfig.modelPointer = buffer; - _modelConfig.modelSize = (int)fileSize; - _modelConfig.paramPointer = parmaBuffer; - _modelConfig.paramSize = (int)paramfileSize; + [super viewDidLoad]; + + self.imageView.image = [UIImage imageNamed:@"banana.jpeg"]; + + NSString *modelPath = [[NSBundle mainBundle] URLForResource:@"super_model" withExtension:nil].path; + NSString *paramPath = [[NSBundle mainBundle] URLForResource:@"super_params" withExtension:nil].path; + + long fileSize; + FILE *fp; + fp = fopen([modelPath UTF8String], "rb"); + fseek(fp, 0, SEEK_END); + fileSize = ftell(fp); + rewind(fp); + void *buffer = malloc(fileSize); + fread(buffer, 1, fileSize, fp); + fclose(fp); + + long paramfileSize; + FILE *parmaFilePointer; + parmaFilePointer = fopen([paramPath UTF8String], "rb"); + fseek(parmaFilePointer, 0, SEEK_END); + paramfileSize = ftell(parmaFilePointer); + rewind(parmaFilePointer); + void *parmaBuffer = malloc(paramfileSize); + fread(parmaBuffer, 1, paramfileSize, parmaFilePointer); + fclose(parmaFilePointer); + + _modelConfig = [[ModelConfig alloc] init]; + _modelConfig.modelPointer = buffer; + _modelConfig.modelSize = (int)fileSize; + _modelConfig.paramPointer = parmaBuffer; + _modelConfig.paramSize = (int)paramfileSize; } - (IBAction)loaderButtonPressed:(id)sender { - self.paddleMobile = [[PaddleMobileGPU alloc] initWithCommandQueue:MetalHelper.shared.queue net:SuperResolutionNetType modelConfig:_modelConfig]; - _loaded = [self.paddleMobile load]; - NSLog(@" load 结果: %@", _loaded ? @"成功" : @"失败"); + self.paddleMobile = [[PaddleMobileGPU alloc] initWithCommandQueue:MetalHelper.shared.queue net:SuperResolutionNetType modelConfig:_modelConfig]; + _loaded = [self.paddleMobile load]; + NSLog(@" load 结果: %@", _loaded ? @"成功" : @"失败"); } - (IBAction)predictButtonPressed:(id)sender { - [self predict]; + [self predict]; } - (void)predict { - UIImage *image = self.imageView.image; - if (!image) { - NSLog(@" image is nil"); - return; - } - id texture = [MetalHelper.shared.textureLoader newTextureWithCGImage:image.CGImage options:nil error:nil]; - _texture = texture; - if (!_texture) { - NSLog(@" texture is nil"); - return; - } - - if (!self.loaded) { - NSLog(@" not load "); - return; - } - - NSTimeInterval startTime = [[NSDate date] timeIntervalSince1970]; - NSInteger max = 1; - for (int i = 0;i < max; i ++) { - [self.paddleMobile predict:_texture withCompletion:^(BOOL success , NSArray *result) { - if (success) { - if (i == max -1) { - double time = [[NSDate date] timeIntervalSince1970] - startTime; - time = (time/max)*1000; - NSLog(@"gap ==== %fms",time); - } - } - }]; - } + UIImage *image = self.imageView.image; + if (!image) { + NSLog(@" image is nil"); + return; + } + id texture = [MetalHelper.shared.textureLoader newTextureWithCGImage:image.CGImage options:nil error:nil]; + _texture = texture; + if (!_texture) { + NSLog(@" texture is nil"); + return; + } + + if (!self.loaded) { + NSLog(@" not load "); + return; + } + + NSTimeInterval startTime = [[NSDate date] timeIntervalSince1970]; + NSInteger max = 1; + for (int i = 0;i < max; i ++) { + [self.paddleMobile predict:_texture withCompletion:^(BOOL success , NSArray *result) { + if (success) { + if (i == max -1) { + double time = [[NSDate date] timeIntervalSince1970] - startTime; + time = (time/max)*1000; + NSLog(@"gap ==== %fms",time); + } + } + }]; + } } - (IBAction)clear:(id)sender { - [self.paddleMobile clear]; - self.loaded = NO; + [self.paddleMobile clear]; + self.loaded = NO; } @end diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.h b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.h index cd99ddad43..d45d7daaa1 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.h +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.h @@ -16,8 +16,8 @@ #import typedef enum : NSUInteger { - SuperResolutionNetType, - MobileNetSSDType + SuperResolutionNetType, + MobileNetSSDType } NetType; @interface PaddleMobileGPUResult: NSObject diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.m b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.m index 670753fd9f..881a6cb505 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.m +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.m @@ -30,75 +30,75 @@ @implementation PaddleMobileGPUResult - (void)setOutputResult:(ResultHolder *)resultHolder { - self.resultHolder = resultHolder; - self.output = resultHolder.result; - self.outputSize = resultHolder.capacity; + self.resultHolder = resultHolder; + self.output = resultHolder.result; + self.outputSize = resultHolder.capacity; } -(void)releaseOutput { - [self.resultHolder releasePointer]; + [self.resultHolder releasePointer]; } @end @interface PaddleMobileGPU () { - Runner *runner; + Runner *runner; } @end @implementation PaddleMobileGPU -(instancetype)initWithCommandQueue:(id)queue net:(NetType)netType modelConfig:(ModelConfig *)config { - self = [super init]; - if (self) { - Net *net = nil; - if (netType == SuperResolutionNetType) { - net = [[SuperResolutionNet alloc] initWithDevice:queue.device inParamPointer:config.paramPointer inParamSize:config.paramSize inModelPointer:config.modelPointer inModelSize:config.modelSize]; - } else if (netType == MobileNetSSDType) { - net = [[MobileNet_ssd_AR alloc] initWithDevice:queue.device inParamPointer:config.paramPointer inParamSize:config.paramSize inModelPointer:config.modelPointer inModelSize:config.modelSize]; + self = [super init]; + if (self) { + Net *net = nil; + if (netType == SuperResolutionNetType) { + net = [[SuperResolutionNet alloc] initWithDevice:queue.device inParamPointer:config.paramPointer inParamSize:config.paramSize inModelPointer:config.modelPointer inModelSize:config.modelSize]; + } else if (netType == MobileNetSSDType) { + net = [[MobileNet_ssd_AR alloc] initWithDevice:queue.device inParamPointer:config.paramPointer inParamSize:config.paramSize inModelPointer:config.modelPointer inModelSize:config.modelSize]; + } + runner = [[Runner alloc] initInNet:net commandQueue:queue]; } - runner = [[Runner alloc] initInNet:net commandQueue:queue]; - } - return self; + return self; } -(BOOL)load { - return [runner load]; + return [runner load]; } -(void)predict:(id)texture withCompletion:(void (^)(BOOL, NSArray*> *))completion { - - [runner predictWithTexture:texture completion:^(BOOL success, NSArray * _Nullable resultArr) { - NSMutableArray*> *ocResultArray = [NSMutableArray arrayWithCapacity:resultArr.count]; - for (int i = 0; i < resultArr.count; ++i) { - ResultHolder *resultHolder = resultArr[i]; - NSMutableArray *res = [NSMutableArray arrayWithCapacity:resultHolder.capacity]; - for (int j = 0; j < resultHolder.capacity; ++j) { - [res addObject:[NSNumber numberWithFloat:resultHolder.result[i]]]; - } - [ocResultArray addObject:res]; - [resultHolder releasePointer]; - } - completion(success, ocResultArray); - }]; + + [runner predictWithTexture:texture completion:^(BOOL success, NSArray * _Nullable resultArr) { + NSMutableArray*> *ocResultArray = [NSMutableArray arrayWithCapacity:resultArr.count]; + for (int i = 0; i < resultArr.count; ++i) { + ResultHolder *resultHolder = resultArr[i]; + NSMutableArray *res = [NSMutableArray arrayWithCapacity:resultHolder.capacity]; + for (int j = 0; j < resultHolder.capacity; ++j) { + [res addObject:[NSNumber numberWithFloat:resultHolder.result[i]]]; + } + [ocResultArray addObject:res]; + [resultHolder releasePointer]; + } + completion(success, ocResultArray); + }]; } -(void)predict:(id)texture withResultCompletion:(void (^)(BOOL, NSArray *))completion { - [runner predictWithTexture:texture completion:^(BOOL success, NSArray * _Nullable resultArr) { - NSMutableArray *ocResultArr = [NSMutableArray arrayWithCapacity:resultArr.count]; - for (int i = 0; i < resultArr.count; ++i) { - ResultHolder *result = resultArr[i]; - PaddleMobileGPUResult *gpuResult = [[PaddleMobileGPUResult alloc] init]; - gpuResult.dim = result.dim; - [gpuResult setOutputResult:result]; - [ocResultArr addObject:gpuResult]; - } - completion(success, ocResultArr); - }]; + [runner predictWithTexture:texture completion:^(BOOL success, NSArray * _Nullable resultArr) { + NSMutableArray *ocResultArr = [NSMutableArray arrayWithCapacity:resultArr.count]; + for (int i = 0; i < resultArr.count; ++i) { + ResultHolder *result = resultArr[i]; + PaddleMobileGPUResult *gpuResult = [[PaddleMobileGPUResult alloc] init]; + gpuResult.dim = result.dim; + [gpuResult setOutputResult:result]; + [ocResultArr addObject:gpuResult]; + } + completion(success, ocResultArr); + }]; } -(void)clear { - [runner clear]; + [runner clear]; } @end diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift index d2bebb2668..50dd29095e 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift @@ -16,57 +16,57 @@ import Foundation import paddle_mobile @objc public class SuperResolutionNet: Net{ - override public func resultStr(res: [ResultHolder]) -> String { - return "未实现" - } - - public override init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize: Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) { - super.init(device: device) - except = 0 - metalLoadMode = .LoadMetalInCustomMetalLib - metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") - inputDim = Dim.init(inDim: [1, 224, 224, 3]) - self.paramPointer = inParamPointer - self.paramSize = inParamSize - self.modelPointer = inModelPointer - self.modelSize = inModelSize - } + override public func resultStr(res: [ResultHolder]) -> String { + return "未实现" + } + + public override init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize: Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) { + super.init(device: device) + except = 0 + metalLoadMode = .LoadMetalInCustomMetalLib + metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") + inputDim = Dim.init(inDim: [1, 224, 224, 3]) + self.paramPointer = inParamPointer + self.paramSize = inParamSize + self.modelPointer = inModelPointer + self.modelSize = inModelSize + } + + @objc override public init(device: MTLDevice) { + super.init(device: device) + except = 0 + modelPath = Bundle.main.path(forResource: "super_model", ofType: nil) ?! "model null" + paramPath = Bundle.main.path(forResource: "super_params", ofType: nil) ?! "para null" + preprocessKernel = nil + inputDim = Dim.init(inDim: [1, 224, 224, 1]) + metalLoadMode = .LoadMetalInCustomMetalLib + metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") + } - @objc override public init(device: MTLDevice) { - super.init(device: device) - except = 0 - modelPath = Bundle.main.path(forResource: "super_model", ofType: nil) ?! "model null" - paramPath = Bundle.main.path(forResource: "super_params", ofType: nil) ?! "para null" - preprocessKernel = nil - inputDim = Dim.init(inDim: [1, 224, 224, 1]) - metalLoadMode = .LoadMetalInCustomMetalLib - metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") - } - - override public func updateProgram(program: Program) { - // n h w c - for block in program.programDesc.blocks { - for varDesc in block.vars { - if !varDesc.persistable { - if varDesc.type == .LodTensor { - let varEle = program.scope.vars[varDesc.name] - if let texture = varEle as? Texture { - let newDim = Dim.init(inDim: [texture.dim[0], inputDim[1], inputDim[2], texture.tensorDim[1]]) - print(" var desc name " + varDesc.name + " new dim" + "\(newDim)") - - texture.updateDims(inTensorDim: Dim.init(inDim: [texture.tensorDim[0], texture.tensorDim[1], inputDim[1], inputDim[2]]), inDim: newDim) - texture.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: GlobalConfig.shared.computePrecision) - - let output: FetchHolder = program.scope.output() as! FetchHolder - output.dim = newDim - output.capacity = newDim.numel() - output.paddedCapacity = newDim.numel() * 4 - output.initBuffer(device: device) + override public func updateProgram(program: Program) { + // n h w c + for block in program.programDesc.blocks { + for varDesc in block.vars { + if !varDesc.persistable { + if varDesc.type == .LodTensor { + let varEle = program.scope.vars[varDesc.name] + if let texture = varEle as? Texture { + let newDim = Dim.init(inDim: [texture.dim[0], inputDim[1], inputDim[2], texture.tensorDim[1]]) + print(" var desc name " + varDesc.name + " new dim" + "\(newDim)") + + texture.updateDims(inTensorDim: Dim.init(inDim: [texture.tensorDim[0], texture.tensorDim[1], inputDim[1], inputDim[2]]), inDim: newDim) + texture.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: GlobalConfig.shared.computePrecision) + + let output: FetchHolder = program.scope.output() as! FetchHolder + output.dim = newDim + output.capacity = newDim.numel() + output.paddedCapacity = newDim.numel() * 4 + output.initBuffer(device: device) + } + } + } } - } } - } } - } } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift index f9e841f9c2..0080aa80f6 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift @@ -4,28 +4,28 @@ import Foundation import QuartzCore public class FPSCounter { - private(set) public var fps: Double = 0 - - var frames = 0 - var startTime: CFTimeInterval = 0 - - public func start() { - frames = 0 - startTime = CACurrentMediaTime() - } - - public func frameCompleted() { - frames += 1 - let now = CACurrentMediaTime() - let elapsed = now - startTime - if elapsed > 0.1 { - let current = Double(frames) / elapsed - let smoothing = 0.75 - fps = smoothing*fps + (1 - smoothing)*current - if elapsed > 1 { + private(set) public var fps: Double = 0 + + var frames = 0 + var startTime: CFTimeInterval = 0 + + public func start() { frames = 0 startTime = CACurrentMediaTime() - } } - } + + public func frameCompleted() { + frames += 1 + let now = CACurrentMediaTime() + let elapsed = now - startTime + if elapsed > 0.1 { + let current = Double(frames) / elapsed + let smoothing = 0.75 + fps = smoothing*fps + (1 - smoothing)*current + if elapsed > 1 { + frames = 0 + startTime = CACurrentMediaTime() + } + } + } } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift index c235ed2f03..cb63954487 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift @@ -6,15 +6,15 @@ import AVFoundation @available(iOS 10.0, *) @objc public protocol VideoCaptureDelegate: NSObjectProtocol { - @objc optional func videoCapture(_ capture: VideoCapture, didCaptureSampleBuffer sampleBuffer: CMSampleBuffer, timestamp: CMTime) + @objc optional func videoCapture(_ capture: VideoCapture, didCaptureSampleBuffer sampleBuffer: CMSampleBuffer, timestamp: CMTime) @objc optional func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime) @objc optional func videoCapture(_ capture: VideoCapture, didCapturePhoto previewImage: UIImage?) @objc optional func videoCapture(_ capture: VideoCapture, didCapturePhotoTexture texture: MTLTexture?) } /** - Simple interface to the iPhone's camera. -*/ + Simple interface to the iPhone's camera. + */ @available(iOS 10.0, *) public class VideoCapture: NSObject { public var previewLayer: AVCaptureVideoPreviewLayer? @@ -35,9 +35,9 @@ public class VideoCapture: NSObject { self.cameraPosition = position super.init() } - + public func setUp(sessionPreset: AVCaptureSession.Preset = .medium, - completion: @escaping (Bool) -> Void) { + completion: @escaping (Bool) -> Void) { queue.async { let success = self.setUpCamera(sessionPreset: sessionPreset) DispatchQueue.main.async { @@ -45,7 +45,7 @@ public class VideoCapture: NSObject { } } } - + func fontCamera() -> AVCaptureDevice? { let deveices = AVCaptureDevice.DiscoverySession.init(deviceTypes: [.builtInWideAngleCamera], mediaType: AVMediaType.video, position: .front).devices return deveices.first @@ -62,7 +62,7 @@ public class VideoCapture: NSObject { captureSession.beginConfiguration() captureSession.sessionPreset = sessionPreset - + var oCaptureDevice: AVCaptureDevice? switch cameraPosition { case .back: @@ -79,56 +79,56 @@ public class VideoCapture: NSObject { print("Error: no video devices available") return false } - + guard let videoInput = try? AVCaptureDeviceInput(device: captureDevice) else { print("Error: could not create AVCaptureDeviceInput") return false } - + if captureSession.canAddInput(videoInput) { captureSession.addInput(videoInput) } - + let previewLayer = AVCaptureVideoPreviewLayer(session: captureSession) previewLayer.videoGravity = AVLayerVideoGravity.resizeAspect previewLayer.connection?.videoOrientation = self.videoOrientation self.previewLayer = previewLayer - + let settings: [String : Any] = [ - kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA) + kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA) ] - + videoOutput.videoSettings = settings videoOutput.alwaysDiscardsLateVideoFrames = true videoOutput.setSampleBufferDelegate(self, queue: queue) if captureSession.canAddOutput(videoOutput) { captureSession.addOutput(videoOutput) } - + // We want the buffers to be in portrait orientation otherwise they are // rotated by 90 degrees. Need to set this _after_ addOutput()! videoOutput.connection(with: AVMediaType.video)?.videoOrientation = self.videoOrientation - + if captureSession.canAddOutput(photoOutput) { captureSession.addOutput(photoOutput) } - + captureSession.commitConfiguration() return true } - + public func start() { if !captureSession.isRunning { captureSession.startRunning() } } - + public func stop() { if captureSession.isRunning { captureSession.stopRunning() } } - + /* Captures a single frame of the camera input. */ public func capturePhoto() { let settings = AVCapturePhotoSettings(format: [kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA)]) @@ -139,7 +139,7 @@ public class VideoCapture: NSObject { ] photoOutput.capturePhoto(with: settings, delegate: self) } - + func convertToMTLTexture(sampleBuffer: CMSampleBuffer?) -> MTLTexture? { if let textureCache = textureCache, let sampleBuffer = sampleBuffer, let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) { let width = CVPixelBufferGetWidth(imageBuffer) @@ -152,7 +152,7 @@ public class VideoCapture: NSObject { } return nil } - + func convertToUIImage(sampleBuffer: CMSampleBuffer?) -> UIImage? { if let sampleBuffer = sampleBuffer, let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) { @@ -172,47 +172,47 @@ public class VideoCapture: NSObject { @available(iOS 10.0, *) extension VideoCapture: AVCaptureVideoDataOutputSampleBufferDelegate { - public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { - // Because lowering the capture device's FPS looks ugly in the preview, - // we capture at full speed but only call the delegate at its desired - // framerate. If `fps` is -1, we run at the full framerate. - let timestamp = CMSampleBufferGetPresentationTimeStamp(sampleBuffer) - let deltaTime = timestamp - lastTimestamp - if fps == -1 || deltaTime >= CMTimeMake(1, Int32(fps)) { - lastTimestamp = timestamp - self.delegate?.videoCapture?(self, didCaptureSampleBuffer: sampleBuffer, timestamp: timestamp) - if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCaptureVideoTexture:timestamp:))) ?? false{ - let texture = convertToMTLTexture(sampleBuffer: sampleBuffer) - delegate?.videoCapture?(self, didCaptureVideoTexture: texture, timestamp: timestamp) + public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { + // Because lowering the capture device's FPS looks ugly in the preview, + // we capture at full speed but only call the delegate at its desired + // framerate. If `fps` is -1, we run at the full framerate. + let timestamp = CMSampleBufferGetPresentationTimeStamp(sampleBuffer) + let deltaTime = timestamp - lastTimestamp + if fps == -1 || deltaTime >= CMTimeMake(1, Int32(fps)) { + lastTimestamp = timestamp + self.delegate?.videoCapture?(self, didCaptureSampleBuffer: sampleBuffer, timestamp: timestamp) + if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCaptureVideoTexture:timestamp:))) ?? false{ + let texture = convertToMTLTexture(sampleBuffer: sampleBuffer) + delegate?.videoCapture?(self, didCaptureVideoTexture: texture, timestamp: timestamp) + } } } - } - - public func captureOutput(_ output: AVCaptureOutput, didDrop sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { - print("dropped frame") - } + + public func captureOutput(_ output: AVCaptureOutput, didDrop sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { + print("dropped frame") + } } @available(iOS 10.0, *) extension VideoCapture: AVCapturePhotoCaptureDelegate { - public func photoOutput(_ captureOutput: AVCapturePhotoOutput, - didFinishProcessingPhoto photoSampleBuffer: CMSampleBuffer?, - previewPhoto previewPhotoSampleBuffer: CMSampleBuffer?, - resolvedSettings: AVCaptureResolvedPhotoSettings, - bracketSettings: AVCaptureBracketedStillImageSettings?, - error: Error?) { - var imageTexture: MTLTexture? - var previewImage: UIImage? - if error == nil { - if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhotoTexture:))) ?? false{ - imageTexture = convertToMTLTexture(sampleBuffer: photoSampleBuffer) - self.delegate?.videoCapture?(self, didCapturePhotoTexture: imageTexture) - } - - if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhoto:))) ?? false{ - previewImage = convertToUIImage(sampleBuffer: previewPhotoSampleBuffer) - self.delegate?.videoCapture?(self, didCapturePhoto: previewImage) + public func photoOutput(_ captureOutput: AVCapturePhotoOutput, + didFinishProcessingPhoto photoSampleBuffer: CMSampleBuffer?, + previewPhoto previewPhotoSampleBuffer: CMSampleBuffer?, + resolvedSettings: AVCaptureResolvedPhotoSettings, + bracketSettings: AVCaptureBracketedStillImageSettings?, + error: Error?) { + var imageTexture: MTLTexture? + var previewImage: UIImage? + if error == nil { + if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhotoTexture:))) ?? false{ + imageTexture = convertToMTLTexture(sampleBuffer: photoSampleBuffer) + self.delegate?.videoCapture?(self, didCapturePhotoTexture: imageTexture) + } + + if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhoto:))) ?? false{ + previewImage = convertToUIImage(sampleBuffer: previewPhotoSampleBuffer) + self.delegate?.videoCapture?(self, didCapturePhoto: previewImage) + } } } - } } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift index 612a986d85..42d6c2b7ab 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift @@ -19,265 +19,242 @@ import paddle_mobile import MetalPerformanceShaders class FileReader { - let file: UnsafeMutablePointer - let fileSize: Int - init(paramPath: String) throws { - guard let tmpFile = fopen(paramPath, "rb") else { - throw PaddleMobileError.loaderError(message: "open param file error" + paramPath) + let file: UnsafeMutablePointer + let fileSize: Int + init(paramPath: String) throws { + guard let tmpFile = fopen(paramPath, "rb") else { + throw PaddleMobileError.loaderError(message: "open param file error" + paramPath) + } + file = tmpFile + fseek(file, 0, SEEK_END) + fileSize = ftell(file) + guard fileSize > 0 else { + throw PaddleMobileError.loaderError(message: "param file size is too small") + } + rewind(file) + } + + func read() -> UnsafeMutablePointer { + let ptr = UnsafeMutablePointer.allocate(capacity: MemoryLayout.size * fileSize) + fread(ptr, fileSize, 1, file) + return ptr } - file = tmpFile - fseek(file, 0, SEEK_END) - fileSize = ftell(file) - guard fileSize > 0 else { - throw PaddleMobileError.loaderError(message: "param file size is too small") + + deinit { + fclose(file) } - rewind(file) - } - - func read() -> UnsafeMutablePointer { - let ptr = UnsafeMutablePointer.allocate(capacity: MemoryLayout.size * fileSize) - fread(ptr, fileSize, 1, file) - return ptr - } - - deinit { - fclose(file) - } } enum Platform { - case GPU + case GPU } let platformSupport: [(Platform, String)] = [(.GPU, "GPU")] enum SupportModel: String{ - case yolo = "yolo" - case mobilenet_combined = "mobilenet_combined" - case super_resolution = "superresoltion" - case mobilenet = "mobilenet" - - static func supportedModels() -> [SupportModel] { - return [.super_resolution, .yolo, .mobilenet_combined, .mobilenet] - } + case yolo = "yolo" + case mobilenet_combined = "mobilenet_combined" + case super_resolution = "superresoltion" + case mobilenet = "mobilenet" + + static func supportedModels() -> [SupportModel] { + return [.super_resolution, .yolo, .mobilenet_combined, .mobilenet] + } } let netSupport: [SupportModel : Net] = [ - .super_resolution : SuperResolutionNet.init(device: MetalHelper.shared.device), - .yolo : YoloNet.init(device: MetalHelper.shared.device), - .mobilenet_combined : MobileNetCombined.init(device: MetalHelper.shared.device), - .mobilenet : MobileNet.init(device: MetalHelper.shared.device)] + .super_resolution : SuperResolutionNet.init(device: MetalHelper.shared.device), + .yolo : YoloNet.init(device: MetalHelper.shared.device), + .mobilenet_combined : MobileNetCombined.init(device: MetalHelper.shared.device), + .mobilenet : MobileNet.init(device: MetalHelper.shared.device)] class ViewController: UIViewController { - @IBOutlet weak var resultTextView: UITextView! - @IBOutlet weak var selectImageView: UIImageView! - @IBOutlet weak var elapsedTimeLabel: UILabel! - @IBOutlet weak var modelPickerView: UIPickerView! - @IBOutlet weak var threadPickerView: UIPickerView! - @IBOutlet weak var videoView: UIView! - // var videoCapture: VideoCapture! - - var selectImage: UIImage? - var inputPointer: UnsafeMutablePointer? - var modelType: SupportModel = SupportModel.supportedModels()[0] - var toPredictTexture: MTLTexture? - - var runner: Runner! - var platform: Platform = .GPU - var threadNum = 1 - - @IBAction func loadAct(_ sender: Any) { - runner = Runner.init(inNet: netSupport[modelType]!, commandQueue: MetalHelper.shared.queue) - if platform == .GPU { -// let filePath = Bundle.main.path(forResource: "mingren_input_data", ofType: nil) -// let fileReader = try! FileReader.init(paramPath: filePath!) -// let pointer: UnsafeMutablePointer = fileReader.read() -// -// -// let buffer = MetalHelper.shared.device.makeBuffer(length: fileReader.fileSize, options: .storageModeShared) -// -// buffer?.contents().copyMemory(from: pointer, byteCount: fileReader.fileSize) - - - if self.toPredictTexture == nil { - -// runner.getTexture(inBuffer: buffer!) { [weak self] (texture) in -// self?.toPredictTexture = texture -// } + @IBOutlet weak var resultTextView: UITextView! + @IBOutlet weak var selectImageView: UIImageView! + @IBOutlet weak var elapsedTimeLabel: UILabel! + @IBOutlet weak var modelPickerView: UIPickerView! + @IBOutlet weak var threadPickerView: UIPickerView! + @IBOutlet weak var videoView: UIView! + // var videoCapture: VideoCapture! + + var selectImage: UIImage? + var inputPointer: UnsafeMutablePointer? + var modelType: SupportModel = SupportModel.supportedModels()[0] + var toPredictTexture: MTLTexture? + + var runner: Runner! + var platform: Platform = .GPU + var threadNum = 1 + + @IBAction func loadAct(_ sender: Any) { + runner = Runner.init(inNet: netSupport[modelType]!, commandQueue: MetalHelper.shared.queue) + if platform == .GPU { + // let filePath = Bundle.main.path(forResource: "mingren_input_data", ofType: nil) + // let fileReader = try! FileReader.init(paramPath: filePath!) + // let pointer: UnsafeMutablePointer = fileReader.read() + // + // + // let buffer = MetalHelper.shared.device.makeBuffer(length: fileReader.fileSize, options: .storageModeShared) + // + // buffer?.contents().copyMemory(from: pointer, byteCount: fileReader.fileSize) + + + if self.toPredictTexture == nil { + + // runner.getTexture(inBuffer: buffer!) { [weak self] (texture) in + // self?.toPredictTexture = texture + // } + + runner.getTexture(image: selectImage!.cgImage!) { [weak self] (texture) in + self?.toPredictTexture = texture + } + } + } else { + fatalError( " unsupport " ) + } - runner.getTexture(image: selectImage!.cgImage!) { [weak self] (texture) in - self?.toPredictTexture = texture + if runner.load() { + print(" load success ! ") + } else { + print(" load error ! ") } - } - } else { - fatalError( " unsupport " ) } - if runner.load() { - print(" load success ! ") - } else { - print(" load error ! ") + @IBAction func selectImageAct(_ sender: Any) { + let imagePicker = UIImagePickerController() + imagePicker.sourceType = .camera + imagePicker.delegate = self + self.present(imagePicker, animated: true, completion: nil) } - } - - @IBAction func selectImageAct(_ sender: Any) { - let imagePicker = UIImagePickerController() - imagePicker.sourceType = .camera - imagePicker.delegate = self - self.present(imagePicker, animated: true, completion: nil) - } - - @IBAction func clearAct(_ sender: Any) { - runner.clear() - } - - @IBAction func predictAct(_ sender: Any) { - let max = 1 - switch platform { - case .GPU: - guard let inTexture = toPredictTexture else { - resultTextView.text = "请选择图片 ! " - return - } - - let startDate = Date.init() - for i in 0.. Int { - if pickerView == modelPickerView { - return 1 - } else if pickerView == threadPickerView { - return 1 - } else { - fatalError() + func numberOfComponents(in pickerView: UIPickerView) -> Int { + if pickerView == modelPickerView { + return 1 + } else if pickerView == threadPickerView { + return 1 + } else { + fatalError() + } } - } - - func pickerView(_ pickerView: UIPickerView, numberOfRowsInComponent component: Int) -> Int { - if pickerView == modelPickerView { - return SupportModel.supportedModels().count - } else if pickerView == threadPickerView { - return platformSupport.count - } else { - fatalError() + + func pickerView(_ pickerView: UIPickerView, numberOfRowsInComponent component: Int) -> Int { + if pickerView == modelPickerView { + return SupportModel.supportedModels().count + } else if pickerView == threadPickerView { + return platformSupport.count + } else { + fatalError() + } } - } - - public func pickerView(_ pickerView: UIPickerView, titleForRow row: Int, forComponent component: Int) -> String? { - if pickerView == modelPickerView { - return SupportModel.supportedModels()[row].rawValue - } else if pickerView == threadPickerView { - return platformSupport[row].1 - } else { - fatalError() + + public func pickerView(_ pickerView: UIPickerView, titleForRow row: Int, forComponent component: Int) -> String? { + if pickerView == modelPickerView { + return SupportModel.supportedModels()[row].rawValue + } else if pickerView == threadPickerView { + return platformSupport[row].1 + } else { + fatalError() + } } - } - - public func pickerView(_ pickerView: UIPickerView, didSelectRow row: Int, inComponent component: Int) { - if pickerView == modelPickerView { - self.modelType = SupportModel.supportedModels()[row] - } else if pickerView == threadPickerView { - platform = platformSupport[row].0 - } else { - fatalError() + + public func pickerView(_ pickerView: UIPickerView, didSelectRow row: Int, inComponent component: Int) { + if pickerView == modelPickerView { + self.modelType = SupportModel.supportedModels()[row] + } else if pickerView == threadPickerView { + platform = platformSupport[row].0 + } else { + fatalError() + } } - } } extension ViewController: UIImagePickerControllerDelegate, UINavigationControllerDelegate { - func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) { - picker.dismiss(animated: true){[weak self] in - guard let sSelf = self, let image = info["UIImagePickerControllerOriginalImage"] as? UIImage else{ - fatalError("no image") - } - sSelf.selectImage = image - sSelf.selectImageView.image = image - sSelf.runner.getTexture(image: image.cgImage!, getTexture: { (texture) in - sSelf.toPredictTexture = texture - }) + func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) { + picker.dismiss(animated: true){[weak self] in + guard let sSelf = self, let image = info["UIImagePickerControllerOriginalImage"] as? UIImage else{ + fatalError("no image") + } + sSelf.selectImage = image + sSelf.selectImageView.image = image + sSelf.runner.getTexture(image: image.cgImage!, getTexture: { (texture) in + sSelf.toPredictTexture = texture + }) + } } - } } var bool1 = false extension ViewController: VideoCaptureDelegate{ - func predictTexture(texture: MTLTexture){ - runner.scaleTexture(input: texture) { (scaledTexture) in - self.runner.predict(texture: scaledTexture, completion: { (success, resultHolder) in - // print(resultHolder!.result![0]) - resultHolder?.first?.releasePointer() - }) + func predictTexture(texture: MTLTexture){ + runner.scaleTexture(input: texture) { (scaledTexture) in + self.runner.predict(texture: scaledTexture, completion: { (success, resultHolder) in + // print(resultHolder!.result![0]) + resultHolder?.first?.releasePointer() + }) + } } - } - + } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj b/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj index 5b7b65da7c..007fd5e429 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj @@ -326,9 +326,10 @@ isa = XCBuildConfiguration; buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; - IPHONEOS_DEPLOYMENT_TARGET = 12.1; + IPHONEOS_DEPLOYMENT_TARGET = 9.0; MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; MTL_FAST_MATH = YES; + MTL_LANGUAGE_REVISION = Metal12; SDKROOT = iphoneos; }; name = Debug; @@ -337,9 +338,10 @@ isa = XCBuildConfiguration; buildSettings = { ALWAYS_SEARCH_USER_PATHS = NO; - IPHONEOS_DEPLOYMENT_TARGET = 12.1; + IPHONEOS_DEPLOYMENT_TARGET = 9.0; MTL_ENABLE_DEBUG_INFO = NO; MTL_FAST_MATH = YES; + MTL_LANGUAGE_REVISION = Metal12; SDKROOT = iphoneos; }; name = Release; diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormKernel.metal index 96333a07a9..ab1dcfae68 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormKernel.metal @@ -20,23 +20,23 @@ kernel void batchnorm(texture2d_array inTexture [[texture(0 const device float4 * nscale [[buffer(0)]], const device float4 * nbias [[buffer(1)]], uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - const float4 input = inTexture.read(gid.xy, gid.z); - float4 output = input * nscale[gid.z] + nbias[gid.z]; - outTexture.write(output, gid.xy, gid.z); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + const float4 input = inTexture.read(gid.xy, gid.z); + float4 output = input * nscale[gid.z] + nbias[gid.z]; + outTexture.write(output, gid.xy, gid.z); } kernel void batchnorm_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - const device half4 * newScale [[buffer(0)]], - const device half4 * newBias [[buffer(1)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - const half4 input = inTexture.read(gid.xy, gid.z); - half4 output = input * newScale[gid.z] + newBias[gid.z]; - outTexture.write(output, gid.xy, gid.z); + texture2d_array outTexture [[texture(1)]], + const device half4 * newScale [[buffer(0)]], + const device half4 * newBias [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + const half4 input = inTexture.read(gid.xy, gid.z); + half4 output = input * newScale[gid.z] + newBias[gid.z]; + outTexture.write(output, gid.xy, gid.z); } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal index eb94408c8a..98ba10d847 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal @@ -15,10 +15,10 @@ struct MetalConvParam { }; kernel void batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - const device float4 *new_scale [[buffer(0)]], - const device float4 *new_biase [[buffer(1)]], - uint3 gid [[thread_position_in_grid]]) { + texture2d_array outTexture [[texture(1)]], + const device float4 *new_scale [[buffer(0)]], + const device float4 *new_biase [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() || @@ -32,5 +32,5 @@ kernel void batch_norm_relu_3x3(texture2d_array inTexture input = inTexture.sample(sample, gid.x, gid.y, gid.z); output = fmax(input * new_scale[gid.z] + new_biase[gid.z], 0.0); outTexture.write(output, gid.xy, gid.z); - + } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.inc.metal index a590f80898..188c31019d 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.inc.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.inc.metal @@ -21,29 +21,29 @@ #define VECTOR(p, n) CONCAT2(p, n) kernel void FUNC(bilinear_interp, P)(texture2d_array input [[texture(0)]], - texture2d_array output [[texture(1)]], - constant bilinear_interp_param & pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - VECTOR(P, 4) r; - if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) { - r = input.read(gid.xy, gid.z); - } else { - P w = gid.x * pm.ratio_w; - P h = gid.y * pm.ratio_h; - uint w0 = w, h0 = h; - uint w1 = w0 + 1, h1 = h0 + 1; - P w1lambda = w - w0, h1lambda = h - h0; - P w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda; - if (w1 >= input.get_width()) w1 = w0; - if (h1 >= input.get_height()) h1 = h0; - VECTOR(P, 4) r0 = input.read(uint2(w0, h0), gid.z); - VECTOR(P, 4) r1 = input.read(uint2(w1, h0), gid.z); - VECTOR(P, 4) r2 = input.read(uint2(w0, h1), gid.z); - VECTOR(P, 4) r3 = input.read(uint2(w1, h1), gid.z); - r = h2lambda * (w2lambda * r0 + w1lambda * r1) - + h1lambda * (w2lambda * r2 + w1lambda * r3); - } - output.write(r, gid.xy, gid.z); + texture2d_array output [[texture(1)]], + constant bilinear_interp_param & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + VECTOR(P, 4) r; + if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) { + r = input.read(gid.xy, gid.z); + } else { + P w = gid.x * pm.ratio_w; + P h = gid.y * pm.ratio_h; + uint w0 = w, h0 = h; + uint w1 = w0 + 1, h1 = h0 + 1; + P w1lambda = w - w0, h1lambda = h - h0; + P w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda; + if (w1 >= input.get_width()) w1 = w0; + if (h1 >= input.get_height()) h1 = h0; + VECTOR(P, 4) r0 = input.read(uint2(w0, h0), gid.z); + VECTOR(P, 4) r1 = input.read(uint2(w1, h0), gid.z); + VECTOR(P, 4) r2 = input.read(uint2(w0, h1), gid.z); + VECTOR(P, 4) r3 = input.read(uint2(w1, h1), gid.z); + r = h2lambda * (w2lambda * r0 + w1lambda * r1) + + h1lambda * (w2lambda * r2 + w1lambda * r3); + } + output.write(r, gid.xy, gid.z); } #endif diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.metal index 394cf89db0..6104abb01d 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.metal @@ -16,8 +16,8 @@ using namespace metal; struct bilinear_interp_param { - float ratio_h; - float ratio_w; + float ratio_h; + float ratio_w; }; #define P float diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.inc.metal index 918fbac1a7..184ee2bb71 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.inc.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.inc.metal @@ -20,35 +20,35 @@ #define FUNC(f, p) CONCAT2_(f, p) #define VECTOR(p, n) CONCAT2(p, n) kernel void FUNC(boxcoder, P)(texture2d_array priorBox [[texture(0)]], - texture2d_array priorBoxVar [[texture(1)]], - texture2d_array targetBox [[texture(2)]], - texture2d_array output[[texture(3)]], - uint3 gid [[thread_position_in_grid]]) { - VECTOR(P, 4) p = priorBox.read(uint2(0, gid.x), gid.z); - VECTOR(P, 4) pv = priorBoxVar.read(uint2(0, gid.x), gid.z); - VECTOR(P, 4) t; - t[0] = targetBox.read(uint2(0, gid.x), gid.z)[0]; - t[1] = targetBox.read(uint2(1, gid.x), gid.z)[0]; - t[2] = targetBox.read(uint2(2, gid.x), gid.z)[0]; - t[3] = targetBox.read(uint2(3, gid.x), gid.z)[0]; - - P px = (p.x + p.z) / 2; - P py = (p.y + p.w) / 2; - P pw = p.z - p.x; - P ph = p.w - p.y; - - P tx = pv.x * t.x * pw + px; - P ty = pv.y * t.y * ph + py; - P tw = exp(pv.z * t.z) * pw; - P th = exp(pv.w * t.w) * ph; - - VECTOR(P, 4) r; - r.x = tx - tw / 2; - r.y = ty - th / 2; - r.z = tx + tw / 2; - r.w = ty + th / 2; - - output.write(r, gid.xy, gid.z); + texture2d_array priorBoxVar [[texture(1)]], + texture2d_array targetBox [[texture(2)]], + texture2d_array output[[texture(3)]], + uint3 gid [[thread_position_in_grid]]) { + VECTOR(P, 4) p = priorBox.read(uint2(0, gid.x), gid.z); + VECTOR(P, 4) pv = priorBoxVar.read(uint2(0, gid.x), gid.z); + VECTOR(P, 4) t; + t[0] = targetBox.read(uint2(0, gid.x), gid.z)[0]; + t[1] = targetBox.read(uint2(1, gid.x), gid.z)[0]; + t[2] = targetBox.read(uint2(2, gid.x), gid.z)[0]; + t[3] = targetBox.read(uint2(3, gid.x), gid.z)[0]; + + P px = (p.x + p.z) / 2; + P py = (p.y + p.w) / 2; + P pw = p.z - p.x; + P ph = p.w - p.y; + + P tx = pv.x * t.x * pw + px; + P ty = pv.y * t.y * ph + py; + P tw = exp(pv.z * t.z) * pw; + P th = exp(pv.w * t.w) * ph; + + VECTOR(P, 4) r; + r.x = tx - tw / 2; + r.y = ty - th / 2; + r.z = tx + tw / 2; + r.w = ty + th / 2; + + output.write(r, gid.xy, gid.z); } #endif diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BufferToTexture.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BufferToTexture.metal index 3c07872616..12450f5741 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BufferToTexture.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BufferToTexture.metal @@ -13,24 +13,24 @@ kernel void buffer_to_texture_kernel( const device float *input [[buffer(0)]], texture2d outTexture [[texture(0)]], uint2 gid [[thread_position_in_grid]]){ - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) { - return; - } - - float y = input[outTexture.get_width() * gid.y + gid.x]; - outTexture.write(float4(y, 0.0f, 0.0f, 0.0f), gid); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + + float y = input[outTexture.get_width() * gid.y + gid.x]; + outTexture.write(float4(y, 0.0f, 0.0f, 0.0f), gid); } kernel void buffer_to_texture_kernel_half(const device float *input [[buffer(0)]], texture2d outTexture [[texture(0)]], uint2 gid [[thread_position_in_grid]]){ - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) { - return; - } - - float y = input[outTexture.get_width() * gid.y + gid.x]; - outTexture.write(half4(y, 0.0f, 0.0f, 0.0f), gid); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + + float y = input[outTexture.get_width() * gid.y + gid.x]; + outTexture.write(half4(y, 0.0f, 0.0f, 0.0f), gid); } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal index 40bae035c0..099b8ca77c 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal @@ -17,104 +17,104 @@ using namespace metal; inline void xyzn2abcd_1(int xyzn[4], int abcd[4]) { - abcd[0] = abcd[1] = abcd[2] = 0; - abcd[3] = xyzn[0] * 4 + xyzn[3]; + abcd[0] = abcd[1] = abcd[2] = 0; + abcd[3] = xyzn[0] * 4 + xyzn[3]; } inline void xyzn2abcd_2(int xyzn[4], int abcd[4]) { - abcd[0] = abcd[1] = 0; - abcd[2] = xyzn[1]; - abcd[3] = xyzn[0] * 4 + xyzn[3]; + abcd[0] = abcd[1] = 0; + abcd[2] = xyzn[1]; + abcd[3] = xyzn[0] * 4 + xyzn[3]; } inline void xyzn2abcd_3(int xyzn[4], int abcd[4]) { - abcd[0] = 0; - abcd[3] = xyzn[0]; - abcd[2] = xyzn[1]; - abcd[1] = xyzn[2] * 4 + xyzn[3]; + abcd[0] = 0; + abcd[3] = xyzn[0]; + abcd[2] = xyzn[1]; + abcd[1] = xyzn[2] * 4 + xyzn[3]; } inline void xyzn2abcd_4(int C, int xyzn[4], int abcd[4]) { - abcd[2] = xyzn[0]; - abcd[1] = xyzn[1]; - uint t = xyzn[2] * 4 + xyzn[3]; - abcd[0] = t / C; - abcd[3] = t % C; + abcd[2] = xyzn[0]; + abcd[1] = xyzn[1]; + uint t = xyzn[2] * 4 + xyzn[3]; + abcd[0] = t / C; + abcd[3] = t % C; } inline void abcd2xyzn_1(int abcd[4], int xyzn[4]) { - xyzn[1] = xyzn[2] = 0; - xyzn[0] = abcd[3] / 4; - xyzn[1] = abcd[3] % 4; + xyzn[1] = xyzn[2] = 0; + xyzn[0] = abcd[3] / 4; + xyzn[1] = abcd[3] % 4; } inline void abcd2xyzn_2(int abcd[4], int xyzn[4]) { - xyzn[2] = 0; - xyzn[1] = abcd[2]; - xyzn[0] = abcd[3] / 4; - xyzn[3] = abcd[3] % 4; + xyzn[2] = 0; + xyzn[1] = abcd[2]; + xyzn[0] = abcd[3] / 4; + xyzn[3] = abcd[3] % 4; } inline void abcd2xyzn_3(int abcd[4], int xyzn[4]) { - xyzn[0] = abcd[3]; - xyzn[1] = abcd[2]; - xyzn[2] = abcd[1] / 4; - xyzn[3] = abcd[1] % 4; + xyzn[0] = abcd[3]; + xyzn[1] = abcd[2]; + xyzn[2] = abcd[1] / 4; + xyzn[3] = abcd[1] % 4; } inline void abcd2xyzn_4(int C, int abcd[4], int xyzn[4]) { - xyzn[0] = abcd[2]; - xyzn[1] = abcd[1]; - uint t = abcd[0] * C + abcd[3]; - xyzn[2] = t / 4; - xyzn[3] = t % 4; + xyzn[0] = abcd[2]; + xyzn[1] = abcd[1]; + uint t = abcd[0] * C + abcd[3]; + xyzn[2] = t / 4; + xyzn[3] = t % 4; } inline void xyzn2abcd(int C, int xyzn[4], int abcd[4]) { - abcd[2] = xyzn[0]; - abcd[1] = xyzn[1]; - uint t = xyzn[2] * 4 + xyzn[3]; - abcd[0] = t / C; - abcd[3] = t % C; + abcd[2] = xyzn[0]; + abcd[1] = xyzn[1]; + uint t = xyzn[2] * 4 + xyzn[3]; + abcd[0] = t / C; + abcd[3] = t % C; } inline void abcd2xyzn(int C, int abcd[4], int xyzn[4]) { - xyzn[0] = abcd[2]; - xyzn[1] = abcd[1]; - uint t = abcd[0] * C + abcd[3]; - xyzn[2] = t / 4; - xyzn[3] = t % 4; + xyzn[0] = abcd[2]; + xyzn[1] = abcd[1]; + uint t = abcd[0] * C + abcd[3]; + xyzn[2] = t / 4; + xyzn[3] = t % 4; } inline int32_t abcd2index(int32_t dim[4], int32_t abcd[4]) { - int32_t r = abcd[0]; - r = r * dim[1] + abcd[1]; - r = r * dim[2] + abcd[2]; - r = r * dim[3] + abcd[3]; - return r; + int32_t r = abcd[0]; + r = r * dim[1] + abcd[1]; + r = r * dim[2] + abcd[2]; + r = r * dim[3] + abcd[3]; + return r; } inline void index2abcd(int32_t dim[4], int32_t ind, int32_t abcd[4]) { - abcd[3] = ind % dim[3]; ind /= dim[3]; - abcd[2] = ind % dim[2]; ind /= dim[2]; - abcd[1] = ind % dim[1]; ind /= dim[1]; - abcd[0] = ind; + abcd[3] = ind % dim[3]; ind /= dim[3]; + abcd[2] = ind % dim[2]; ind /= dim[2]; + abcd[1] = ind % dim[1]; ind /= dim[1]; + abcd[0] = ind; } inline void trans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) { - for (int i = 0; i < 4; i++) { - opos[i] = ipos[trans[i]]; - } + for (int i = 0; i < 4; i++) { + opos[i] = ipos[trans[i]]; + } } inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) { - for (int i = 0; i < 4; i++) { - opos[trans[i]] = ipos[i]; - } + for (int i = 0; i < 4; i++) { + opos[trans[i]] = ipos[i]; + } } struct MetalConvParam { - short offsetX; - short offsetY; - short offsetZ; - ushort strideX; - ushort strideY; - ushort dilationX; - ushort dilationY; + short offsetX; + short offsetY; + short offsetZ; + ushort strideX; + ushort strideY; + ushort dilationX; + ushort dilationY; }; diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.inc.metal index 2b070fc48b..ff8bd3d7a3 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.inc.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.inc.metal @@ -42,73 +42,73 @@ // uint3 gid [[thread_position_in_grid]]) { //} kernel void FUNC(concat, R, N, VV, P)(texture2d_array in0 [[texture(0)]], - texture2d_array in1 [[texture(1)]], + texture2d_array in1 [[texture(1)]], #if N >= 3 - texture2d_array in2 [[texture(2)]], + texture2d_array in2 [[texture(2)]], #endif #if N >= 4 - texture2d_array in3 [[texture(3)]], + texture2d_array in3 [[texture(3)]], #endif #if N >= 5 - texture2d_array in4 [[texture(4)]], + texture2d_array in4 [[texture(4)]], #endif #if N >= 6 - texture2d_array in5 [[texture(5)]], + texture2d_array in5 [[texture(5)]], #endif - texture2d_array inx [[texture(N)]], - texture2d_array out [[texture(N+1)]], - constant ConcatParam & pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - - ConcatParam cp = pm; - int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4]; - VECTOR(P, 4) r = inx.read(gid.xy, gid.z); - for (int i = 0; i < 4; i++) { - xyzn[3] = i; + texture2d_array inx [[texture(N)]], + texture2d_array out [[texture(N+1)]], + constant ConcatParam & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + ConcatParam cp = pm; + int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4]; + VECTOR(P, 4) r = inx.read(gid.xy, gid.z); + for (int i = 0; i < 4; i++) { + xyzn[3] = i; #if R == 4 - xyzn2abcd_4(cp.odim[3], xyzn, abcd); + xyzn2abcd_4(cp.odim[3], xyzn, abcd); #else - FUNC_R(xyzn2abcd, R)(xyzn, abcd); + FUNC_R(xyzn2abcd, R)(xyzn, abcd); #endif - int k = abcd[cp.axis] - cp.offset; - if (k < 0) continue; - int j = 0; - for (; j < N; j++) { - if (k < cp.vdim[j]) { - break; - } - k -= cp.vdim[j]; - } - if (j == N) { - continue; - } - int ta = cp.odim[cp.axis]; - abcd[cp.axis] = k; - cp.odim[cp.axis] = cp.vdim[j]; + int k = abcd[cp.axis] - cp.offset; + if (k < 0) continue; + int j = 0; + for (; j < N; j++) { + if (k < cp.vdim[j]) { + break; + } + k -= cp.vdim[j]; + } + if (j == N) { + continue; + } + int ta = cp.odim[cp.axis]; + abcd[cp.axis] = k; + cp.odim[cp.axis] = cp.vdim[j]; #if R == 4 - abcd2xyzn_4(cp.odim[3], abcd, oxyzn); + abcd2xyzn_4(cp.odim[3], abcd, oxyzn); #else - FUNC_R(abcd2xyzn, R)(abcd, oxyzn); + FUNC_R(abcd2xyzn, R)(abcd, oxyzn); #endif - cp.odim[cp.axis] = ta; - switch (j) { - case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; - case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; + cp.odim[cp.axis] = ta; + switch (j) { + case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; + case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; #if N >= 3 - case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; + case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; #endif #if N >= 4 - case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; + case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; #endif #if N >= 5 - case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; + case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; #endif #if N >= 6 - case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; + case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; #endif - } - } - out.write(r, gid.xy, gid.z); + } + } + out.write(r, gid.xy, gid.z); } #endif // V == NORMAL @@ -117,66 +117,66 @@ kernel void FUNC(concat, R, N, VV, P)(texture2d_array in0 [[tex #if V == VX kernel void FUNC(concat, R, N, VV, P)(texture2d_array in0 [[texture(0)]], - texture2d_array in1 [[texture(1)]], + texture2d_array in1 [[texture(1)]], #if N >= 3 - texture2d_array in2 [[texture(2)]], + texture2d_array in2 [[texture(2)]], #endif // N >= 3 #if N >= 4 - texture2d_array in3 [[texture(3)]], + texture2d_array in3 [[texture(3)]], #endif // N >= 4 #if N >= 5 - texture2d_array in4 [[texture(4)]], + texture2d_array in4 [[texture(4)]], #endif // N >= 5 #if N >= 6 - texture2d_array in5 [[texture(5)]], + texture2d_array in5 [[texture(5)]], #endif // N >= 6 - texture2d_array out [[texture(N)]], - constant ConcatParam & pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - int x = gid.x - pm.offset; - if (x < 0) return; - if (x < pm.vdim[0]) { - VECTOR(P, 4) r = in0.read(gid.xy, gid.z); - out.write(r, gid.xy, gid.z); - return; - } - x -= pm.vdim[0]; - if (x < pm.vdim[1]) { - VECTOR(P, 4) r = in1.read(uint2(x, gid.y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } + texture2d_array out [[texture(N)]], + constant ConcatParam & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + int x = gid.x - pm.offset; + if (x < 0) return; + if (x < pm.vdim[0]) { + VECTOR(P, 4) r = in0.read(gid.xy, gid.z); + out.write(r, gid.xy, gid.z); + return; + } + x -= pm.vdim[0]; + if (x < pm.vdim[1]) { + VECTOR(P, 4) r = in1.read(uint2(x, gid.y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } #if N >= 3 - x -= pm.vdim[1]; - if (x < pm.vdim[2]) { - VECTOR(P, 4) r = in2.read(uint2(x, gid.y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } + x -= pm.vdim[1]; + if (x < pm.vdim[2]) { + VECTOR(P, 4) r = in2.read(uint2(x, gid.y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } #endif // N >= 3 #if N >= 4 - x -= pm.vdim[2]; - if (x < pm.vdim[3]) { - VECTOR(P, 4) r = in3.read(uint2(x, gid.y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } + x -= pm.vdim[2]; + if (x < pm.vdim[3]) { + VECTOR(P, 4) r = in3.read(uint2(x, gid.y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } #endif // N >= 4 #if N >= 5 - x -= pm.vdim[3]; - if (x < pm.vdim[4]) { - VECTOR(P, 4) r = in4.read(uint2(x, gid.y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } + x -= pm.vdim[3]; + if (x < pm.vdim[4]) { + VECTOR(P, 4) r = in4.read(uint2(x, gid.y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } #endif // N >= 5 #if N >= 6 - x -= pm.vdim[4]; - if (x < pm.vdim[5]) { - VECTOR(P, 4) r = in5.read(uint2(x, gid.y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } + x -= pm.vdim[4]; + if (x < pm.vdim[5]) { + VECTOR(P, 4) r = in5.read(uint2(x, gid.y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } #endif // N >= 6 } #endif // V == VX @@ -199,50 +199,50 @@ kernel void FUNC(concat, R, N, VV, P)(texture2d_array in0 [[tex texture2d_array out [[texture(N)]], constant ConcatParam & pm [[buffer(0)]], uint3 gid [[thread_position_in_grid]]) { - int y = gid.y - pm.offset; - if (y < 0) return; - if (y < pm.vdim[0]) { - VECTOR(P, 4) r = in0.read(gid.xy, gid.z); - out.write(r, gid.xy, gid.z); - return; - } - y -= pm.vdim[0]; - if (y < pm.vdim[1]) { - VECTOR(P, 4) r = in1.read(uint2(gid.x, y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } + int y = gid.y - pm.offset; + if (y < 0) return; + if (y < pm.vdim[0]) { + VECTOR(P, 4) r = in0.read(gid.xy, gid.z); + out.write(r, gid.xy, gid.z); + return; + } + y -= pm.vdim[0]; + if (y < pm.vdim[1]) { + VECTOR(P, 4) r = in1.read(uint2(gid.x, y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } #if N >= 3 - y -= pm.vdim[1]; - if (y < pm.vdim[2]) { - VECTOR(P, 4) r = in2.read(uint2(gid.x, y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } + y -= pm.vdim[1]; + if (y < pm.vdim[2]) { + VECTOR(P, 4) r = in2.read(uint2(gid.x, y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } #endif // N >= 3 #if N >= 4 - y -= pm.vdim[2]; - if (y < pm.vdim[3]) { - VECTOR(P, 4) r = in3.read(uint2(gid.x, y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } + y -= pm.vdim[2]; + if (y < pm.vdim[3]) { + VECTOR(P, 4) r = in3.read(uint2(gid.x, y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } #endif // N >= 4 #if N >= 5 - y -= pm.vdim[3]; - if (y < pm.vdim[4]) { - VECTOR(P, 4) r = in4.read(uint2(gid.x, y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } + y -= pm.vdim[3]; + if (y < pm.vdim[4]) { + VECTOR(P, 4) r = in4.read(uint2(gid.x, y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } #endif // N >= 5 #if N >= 6 - y -= pm.vdim[4]; - if (y < pm.vdim[5]) { - VECTOR(P, 4) r = in5.read(uint2(gid.x, y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } + y -= pm.vdim[4]; + if (y < pm.vdim[5]) { + VECTOR(P, 4) r = in5.read(uint2(gid.x, y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } #endif // N >= 6 } #endif // V == VY @@ -265,50 +265,50 @@ kernel void FUNC(concat, R, N, VV, P)(texture2d_array in0 [[tex texture2d_array out [[texture(N)]], constant ConcatParam & pm [[buffer(0)]], uint3 gid [[thread_position_in_grid]]) { - int z = gid.z - pm.offset; - if (z < 0) return; - if (z < pm.vdim[0]) { - VECTOR(P, 4) r = in0.read(gid.xy, gid.z); - out.write(r, gid.xy, gid.z); - return; - } - z -= pm.vdim[0]; - if (z < pm.vdim[1]) { - VECTOR(P, 4) r = in1.read(gid.xy, z); - out.write(r, gid.xy, gid.z); - return; - } + int z = gid.z - pm.offset; + if (z < 0) return; + if (z < pm.vdim[0]) { + VECTOR(P, 4) r = in0.read(gid.xy, gid.z); + out.write(r, gid.xy, gid.z); + return; + } + z -= pm.vdim[0]; + if (z < pm.vdim[1]) { + VECTOR(P, 4) r = in1.read(gid.xy, z); + out.write(r, gid.xy, gid.z); + return; + } #if N >= 3 - z -= pm.vdim[1]; - if (z < pm.vdim[2]) { - VECTOR(P, 4) r = in2.read(gid.xy, z); - out.write(r, gid.xy, gid.z); - return; - } + z -= pm.vdim[1]; + if (z < pm.vdim[2]) { + VECTOR(P, 4) r = in2.read(gid.xy, z); + out.write(r, gid.xy, gid.z); + return; + } #endif // N >= 3 #if N >= 4 - z -= pm.vdim[2]; - if (z < pm.vdim[3]) { - VECTOR(P, 4) r = in3.read(gid.xy, z); - out.write(r, gid.xy, gid.z); - return; - } + z -= pm.vdim[2]; + if (z < pm.vdim[3]) { + VECTOR(P, 4) r = in3.read(gid.xy, z); + out.write(r, gid.xy, gid.z); + return; + } #endif // N >= 4 #if N >= 5 - z -= pm.vdim[3]; - if (z < pm.vdim[4]) { - VECTOR(P, 4) r = in4.read(gid.xy, z); - out.write(r, gid.xy, gid.z); - return; - } + z -= pm.vdim[3]; + if (z < pm.vdim[4]) { + VECTOR(P, 4) r = in4.read(gid.xy, z); + out.write(r, gid.xy, gid.z); + return; + } #endif // N >= 5 #if N >= 6 - z -= pm.vdim[4]; - if (z < pm.vdim[5]) { - VECTOR(P, 4) r = in5.read(gid.xy, z); - out.write(r, gid.xy, gid.z); - return; - } + z -= pm.vdim[4]; + if (z < pm.vdim[5]) { + VECTOR(P, 4) r = in5.read(gid.xy, z); + out.write(r, gid.xy, gid.z); + return; + } #endif // N >= 6 } #endif // V == VZ diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal index b7d17f2d25..8a0390e624 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal @@ -18,11 +18,11 @@ using namespace metal; struct ConcatParam { - int32_t odim[4]; - int32_t axis; - int32_t offset; - int32_t trans[4]; - int32_t vdim[6]; + int32_t odim[4]; + int32_t axis; + int32_t offset; + int32_t trans[4]; + int32_t vdim[6]; }; #define VNORMAL 1 @@ -41,129 +41,129 @@ struct ConcatParam { // ssd-ar: (R=3, N=5, V=x) #define V VX - #define R 3 - #define N 5 - #define P float - #include "ConcatKernel.inc.metal" - #undef P - #define P half - #include "ConcatKernel.inc.metal" - #undef P - #undef N - #undef R +#define R 3 +#define N 5 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R #undef V // ssd-ar: (R=2, N=5, V=x) #define V VX - #define R 2 - #define N 5 - #define P float - #include "ConcatKernel.inc.metal" - #undef P - #define P half - #include "ConcatKernel.inc.metal" - #undef P - #undef N - #undef R +#define R 2 +#define N 5 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R #undef V // ssd-ar: (R=3, N=2, V=y) #define V VY - #define R 3 - #define N 2 - #define P float - #include "ConcatKernel.inc.metal" - #undef P - #define P half - #include "ConcatKernel.inc.metal" - #undef P - #undef N - #undef R +#define R 3 +#define N 2 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R #undef V // ssd-ar: (R=4, N=3, V=z) #define V VZ - #define R 4 - #define N 3 - #define P float - #include "ConcatKernel.inc.metal" - #undef P - #define P half - #include "ConcatKernel.inc.metal" - #undef P - #undef N - #undef R +#define R 4 +#define N 3 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R #undef V // ssd: (R=2, N=6, V=y) #define V VY - #define R 2 - #define N 6 - #define P float - #include "ConcatKernel.inc.metal" - #undef P - #define P half - #include "ConcatKernel.inc.metal" - #undef P - #undef N - #undef R +#define R 2 +#define N 6 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R #undef V // ssd: (R=3, N=6, V=y) #define V VY - #define R 3 - #define N 6 - #define P float - #include "ConcatKernel.inc.metal" - #undef P - #define P half - #include "ConcatKernel.inc.metal" - #undef P - #undef N - #undef R +#define R 3 +#define N 6 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R #undef V #define V VNORMAL - #define R 4 - #define N 2 - #define P float - #include "ConcatKernel.inc.metal" - #undef P - #define P half - #include "ConcatKernel.inc.metal" - #undef P - #undef N - #undef R +#define R 4 +#define N 2 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R #undef V #define V VY - #define R 2 - #define N 2 - #define P float - #include "ConcatKernel.inc.metal" - #undef P - #define P half - #include "ConcatKernel.inc.metal" - #undef P - #undef N - #undef R +#define R 2 +#define N 2 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R #undef V #define V VY - #define R 2 - #define N 5 - #define P float - #include "ConcatKernel.inc.metal" - #undef P - #define P half - #include "ConcatKernel.inc.metal" - #undef P - #undef N - #undef R +#define R 2 +#define N 5 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R #undef V diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddBNReluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddBNReluKernel.metal index 87b60a64fc..f55386096f 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddBNReluKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddBNReluKernel.metal @@ -18,147 +18,147 @@ using namespace metal; kernel void conv_add_batch_norm_relu_1x1_half( - texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - const device half4 *new_scale [[buffer(3)]], - const device half4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - half4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); + texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + const device half4 *new_scale [[buffer(3)]], + const device half4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); + float4 output = float4(0.0); - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); - outTexture.write(half4(output), gid.xy, gid.z); + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); } kernel void conv_add_batch_norm_relu_3x3_half( - texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - const device half4 *new_scale [[buffer(3)]], - const device half4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - half4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); - for (int j = 0; j < 9; ++j) { - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); + texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + const device half4 *new_scale [[buffer(3)]], + const device half4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } } - } - output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); - outTexture.write(half4(output), gid.xy, gid.z); + output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); } kernel void depthwise_conv_add_batch_norm_relu_3x3_half( - texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - const device half4 *new_scale [[buffer(3)]], - const device half4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - float4 output = float4(0.0); - half4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - half4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); - outTexture.write(half4(output), gid.xy, gid.z); + texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + const device half4 *new_scale [[buffer(3)]], + const device half4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); } @@ -175,41 +175,41 @@ kernel void conv_add_batch_norm_relu_1x1(texture2d_array const device float4 *new_scale [[buffer(3)]], const device float4 *new_biase [[buffer(4)]], uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); + float4 output = float4(0.0); - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); + float4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); } kernel void conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], @@ -220,50 +220,50 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array const device float4 *new_scale [[buffer(3)]], const device float4 *new_biase [[buffer(4)]], uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); - for (int j = 0; j < 9; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } } - } - output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); + output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); } kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], @@ -274,37 +274,37 @@ kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - float4 output = float4(0.0); - float4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - float4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + float4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + float4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal index 274e416576..e2513e1b1e 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal @@ -24,41 +24,41 @@ kernel void conv_add_1x1(texture2d_array inTexture [[text const device float4 *weights [[buffer(1)]], const device float4 *biase [[buffer(2)]], uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = biase[gid.z]; - - float4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } -// output = output + biase[gid.z]; - outTexture.write(output, gid.xy, gid.z); + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = biase[gid.z]; + + float4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + // output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); } kernel void conv_add_3x3(texture2d_array inTexture [[texture(0)]], @@ -67,66 +67,66 @@ kernel void conv_add_3x3(texture2d_array inTexture [[text const device float4 *weights [[buffer(1)]], const device float4 *biase [[buffer(2)]], uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - - const uint kernelHXW = 9; - - uint input_arr_size = inTexture.get_array_size(); - - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = biase[gid.z]; - - ushort dilation_x = param.dilationX; - ushort dilation_y = param.dilationY; - - float4 input[9]; - - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); - - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); - - input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); - - input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); - - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - - input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); - - input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); - - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); - - input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); - for (int j = 0; j < 9; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 9; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = biase[gid.z]; + + ushort dilation_x = param.dilationX; + ushort dilation_y = param.dilationY; + + float4 input[9]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); + + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); + + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } } - } -// output = output + biase[gid.z]; - outTexture.write(output, gid.xy, gid.z); + // output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); } kernel void conv_add_5x1(texture2d_array inTexture [[texture(0)]], @@ -135,56 +135,56 @@ kernel void conv_add_5x1(texture2d_array inTexture [[text const device float4 *weights [[buffer(1)]], const device float4 *biase [[buffer(2)]], uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - - const uint kernelHXW = 5; - - uint input_arr_size = inTexture.get_array_size(); - - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = biase[gid.z]; - - ushort dilation_y = param.dilationY; - float4 input[5]; - - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i); - - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); - - input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - - input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); - - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i); - - for (int j = 0; j < 5; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = biase[gid.z]; + + ushort dilation_y = param.dilationY; + float4 input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i); + + for (int j = 0; j < 5; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } } - } -// output = output + biase[gid.z]; - outTexture.write(output, gid.xy, gid.z); + // output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); } @@ -194,56 +194,56 @@ kernel void conv_add_1x5(texture2d_array inTexture [[text const device float4 *weights [[buffer(1)]], const device float4 *biase [[buffer(2)]], uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - - const uint kernelHXW = 5; - - uint input_arr_size = inTexture.get_array_size(); - - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = biase[gid.z]; - - ushort dilation_x = param.dilationX; - float4 input[5]; - - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i); - - input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); - - input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - - input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); - - input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i); - - for (int j = 0; j < 5; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; } - } -// output = output + biase[gid.z]; - outTexture.write(output, gid.xy, gid.z); + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = biase[gid.z]; + + ushort dilation_x = param.dilationX; + float4 input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i); + + for (int j = 0; j < 5; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + // output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); } @@ -253,297 +253,297 @@ kernel void depthwise_conv_add_3x3(texture2d_array inText const device float *weights [[buffer(1)]], const device float4 *biase [[buffer(2)]], uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - float4 output = biase[gid.z]; - float4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - float4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } -// output = output + biase[gid.z]; - outTexture.write(output, gid.xy, gid.z); + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = biase[gid.z]; + float4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + float4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + // output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); } #pragma mark - half kernel void conv_add_1x1_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - half4 output = biase[gid.z]; - - half4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } -// output = output + float4(biase[gid.z]); - outTexture.write(output, gid.xy, gid.z); + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(biase[gid.z]); + + float4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = float4(inTexture.sample(sample, float2(posInInput.x, posInInput.y), i)); + float4 weight_x = float4(weights[weithTo + 0 * kernelHXW * input_arr_size + i]); + output.x += dot(input, weight_x); + + float4 weight_y = float4(weights[weithTo + 1 * kernelHXW * input_arr_size + i]); + output.y += dot(input, weight_y); + + float4 weight_z = float4(weights[weithTo + 2 * kernelHXW * input_arr_size + i]); + output.z += dot(input, weight_z); + + float4 weight_w = float4(weights[weithTo + 3 * kernelHXW * input_arr_size + i]); + output.w += dot(input, weight_w); + } + // output = output + float4(biase[gid.z]); + outTexture.write(half4(output), gid.xy, gid.z); } kernel void conv_add_3x3_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - half4 output = biase[gid.z]; - - ushort dilation_x = param.dilationX; - ushort dilation_y = param.dilationY; - - half4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); - for (int j = 0; j < 9; ++j) { - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(float4(input[j]), float4(weight_x)); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(float4(input[j]), float4(weight_y)); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(float4(input[j]), float4(weight_z)); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(float4(input[j]), float4(weight_w)); + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; } - } -// output = output + float4(biase[gid.z]); - outTexture.write(output, gid.xy, gid.z); + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + half4 output = biase[gid.z]; + + ushort dilation_x = param.dilationX; + ushort dilation_y = param.dilationY; + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); + for (int j = 0; j < 9; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(float4(input[j]), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(float4(input[j]), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(float4(input[j]), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(float4(input[j]), float4(weight_w)); + } + } + // output = output + float4(biase[gid.z]); + outTexture.write(output, gid.xy, gid.z); } kernel void depthwise_conv_add_3x3_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - half4 output = biase[gid.z]; - half4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - half4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } -// output = output + float4(biase[gid.z]); - outTexture.write(output, gid.xy, gid.z); + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + half4 output = biase[gid.z]; + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + // output = output + float4(biase[gid.z]); + outTexture.write(output, gid.xy, gid.z); } kernel void conv_add_5x1_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - - const uint kernelHXW = 5; - - uint input_arr_size = inTexture.get_array_size(); - - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - half4 output = biase[gid.z]; - - ushort dilation_y = param.dilationY; - half4 input[5]; - - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i); - - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); - - input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - - input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); - - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i); - - for (int j = 0; j < 5; ++j) { - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + half4 output = biase[gid.z]; + + ushort dilation_y = param.dilationY; + half4 input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i); + + for (int j = 0; j < 5; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } } - } -// output = output + float4(biase[gid.z]); - outTexture.write(output, gid.xy, gid.z); + // output = output + float4(biase[gid.z]); + outTexture.write(output, gid.xy, gid.z); } kernel void conv_add_1x5_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - - const uint kernelHXW = 5; - - uint input_arr_size = inTexture.get_array_size(); - - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - half4 output = biase[gid.z]; - - ushort dilation_x = param.dilationX; - half4 input[5]; - - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i); - - input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); - - input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - - input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); - - input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i); - - for (int j = 0; j < 5; ++j) { - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + half4 output = biase[gid.z]; + + ushort dilation_x = param.dilationX; + half4 input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i); + + for (int j = 0; j < 5; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } } - } -// output = output + float4(biase[gid.z]); - outTexture.write(output, gid.xy, gid.z); + // output = output + float4(biase[gid.z]); + outTexture.write(output, gid.xy, gid.z); } @@ -553,69 +553,69 @@ kernel void test_conv_add_3x3(texture2d_array inTexture [ const device float4 *weights [[buffer(1)]], const device float4 *biase [[buffer(2)]], uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - if (gid.x > 0 || gid.y > 0 || gid.z > 0) { return; } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - - const uint kernelHXW = 9; - - uint input_arr_size = inTexture.get_array_size(); - - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - ushort dilation_x = param.dilationX; - ushort dilation_y = param.dilationY; - - float4 input[9]; - - for (uint i = 0; i < input_arr_size; ++i) { - - input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); - - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); - - input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); - - input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); - - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - - input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); - - input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); - - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); - - input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); - for (int j = 0; j < 9; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + if (gid.x > 0 || gid.y > 0 || gid.z > 0) { return; } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 9; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + ushort dilation_x = param.dilationX; + ushort dilation_y = param.dilationY; + + float4 input[9]; + + for (uint i = 0; i < input_arr_size; ++i) { + + input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); + + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); + + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } } - } - // output = output + biase[gid.z]; - outTexture.write(output, gid.xy, gid.z); + // output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPrelu.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPrelu.inc.metal index 069daa20e8..e2b8834cc5 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPrelu.inc.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPrelu.inc.metal @@ -19,428 +19,428 @@ #pragma mark - convAdd kernel void FUNC3_(conv_add_1x1, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device VECTOR(P, 4) *weights [[buffer(1)]], - const device VECTOR(P, 4) *biase [[buffer(2)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device VECTOR(P, 4) *weights [[buffer(1)]], + const device VECTOR(P, 4) *biase [[buffer(2)]], #ifdef PRELU_CHANNEL - const device VECTOR(P, 4) *alpha [[buffer(3)]], + const device VECTOR(P, 4) *alpha [[buffer(3)]], #endif #ifdef PRELU_ELEMENT - const device VECTOR(P, 4) *alpha [[buffer(3)]], + const device VECTOR(P, 4) *alpha [[buffer(3)]], #endif #ifdef PRELU_OTHER - const device P *alpha [[buffer(3)]], + const device P *alpha [[buffer(3)]], #endif - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - VECTOR(P, 4) output = biase[gid.z]; - - VECTOR(P, 4) input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample,float2(posInInput.x, posInInput.y), i); - VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - -// output = output + float4(biase[gid.z]); - + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + VECTOR(P, 4) output = biase[gid.z]; + + VECTOR(P, 4) input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample,float2(posInInput.x, posInInput.y), i); + VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + + // output = output + float4(biase[gid.z]); + #ifdef PRELU_CHANNEL - VECTOR(P, 4) alpha_value = alpha[gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); #endif #ifdef PRELU_ELEMENT - int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); - VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); #endif #ifdef PRELU_OTHER - P alpha_value = alpha[0]; - output.x = output.x > 0 ? output.x : (alpha_value * output.x); - output.y = output.y > 0 ? output.y : (alpha_value * output.y); - output.z = output.z > 0 ? output.z : (alpha_value * output.z); - output.w = output.w > 0 ? output.w : (alpha_value * output.w); + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? output.w : (alpha_value * output.w); #endif - outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); } kernel void FUNC3_(conv_add_3x3, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device VECTOR(P, 4) *weights [[buffer(1)]], - const device VECTOR(P, 4) *biase [[buffer(2)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device VECTOR(P, 4) *weights [[buffer(1)]], + const device VECTOR(P, 4) *biase [[buffer(2)]], #ifdef PRELU_CHANNEL - const device VECTOR(P, 4) *alpha [[buffer(3)]], + const device VECTOR(P, 4) *alpha [[buffer(3)]], #endif #ifdef PRELU_ELEMENT - const device VECTOR(P, 4) *alpha [[buffer(3)]], + const device VECTOR(P, 4) *alpha [[buffer(3)]], #endif #ifdef PRELU_OTHER - const device P *alpha [[buffer(3)]], + const device P *alpha [[buffer(3)]], #endif - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - - const uint kernelHXW = 9; - - uint input_arr_size = inTexture.get_array_size(); - - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - VECTOR(P, 4) output = biase[gid.z]; - - ushort dilation_x = param.dilationX; - ushort dilation_y = param.dilationY; - - VECTOR(P, 4) input[9]; - - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); - - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); - - input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); - - input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); - - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - - input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); - - input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); - - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); - - input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); - - for (int j = 0; j < 9; ++j) { - VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 9; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + VECTOR(P, 4) output = biase[gid.z]; + + ushort dilation_x = param.dilationX; + ushort dilation_y = param.dilationY; + + VECTOR(P, 4) input[9]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); + + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); + + for (int j = 0; j < 9; ++j) { + VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } } - } -// output = output + float4(biase[gid.z]); - + // output = output + float4(biase[gid.z]); + #ifdef PRELU_CHANNEL - VECTOR(P, 4) alpha_value = alpha[gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); #endif #ifdef PRELU_ELEMENT - int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); - VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); #endif #ifdef PRELU_OTHER - P alpha_value = alpha[0]; - output.x = output.x > 0 ? output.x : (alpha_value * output.x); - output.y = output.y > 0 ? output.y : (alpha_value * output.y); - output.z = output.z > 0 ? output.z : (alpha_value * output.z); - output.w = output.w > 0 ? output.w : (alpha_value * output.w); + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? output.w : (alpha_value * output.w); #endif - outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); } kernel void FUNC3_(conv_add_5x1, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device VECTOR(P, 4) *weights [[buffer(1)]], - const device VECTOR(P, 4) *biase [[buffer(2)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device VECTOR(P, 4) *weights [[buffer(1)]], + const device VECTOR(P, 4) *biase [[buffer(2)]], #ifdef PRELU_CHANNEL - const device VECTOR(P, 4) *alpha [[buffer(3)]], + const device VECTOR(P, 4) *alpha [[buffer(3)]], #endif #ifdef PRELU_ELEMENT - const device VECTOR(P, 4) *alpha [[buffer(3)]], + const device VECTOR(P, 4) *alpha [[buffer(3)]], #endif #ifdef PRELU_OTHER - const device P *alpha [[buffer(3)]], + const device P *alpha [[buffer(3)]], #endif - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - - const uint kernelHXW = 5; - - uint input_arr_size = inTexture.get_array_size(); - - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - VECTOR(P, 4) output = biase[gid.z];; - - ushort dilation_y = param.dilationY; - VECTOR(P, 4) input[5]; - - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i); - - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); - - input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - - input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); - - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i); - - for (int j = 0; j < 5; ++j) { - VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + VECTOR(P, 4) output = biase[gid.z];; + + ushort dilation_y = param.dilationY; + VECTOR(P, 4) input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i); + + for (int j = 0; j < 5; ++j) { + VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } } - } - + #ifdef PRELU_CHANNEL - VECTOR(P, 4) alpha_value = alpha[gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); #endif #ifdef PRELU_ELEMENT - int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); - VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); #endif #ifdef PRELU_OTHER - P alpha_value = alpha[0]; - output.x = output.x > 0 ? output.x : (alpha_value * output.x); - output.y = output.y > 0 ? output.y : (alpha_value * output.y); - output.z = output.z > 0 ? output.z : (alpha_value * output.z); - output.w = output.w > 0 ? output.w : (alpha_value * output.w); + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? output.w : (alpha_value * output.w); #endif - outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); } kernel void FUNC3_(conv_add_1x5, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device VECTOR(P, 4) *weights [[buffer(1)]], - const device VECTOR(P, 4) *biase [[buffer(2)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device VECTOR(P, 4) *weights [[buffer(1)]], + const device VECTOR(P, 4) *biase [[buffer(2)]], #ifdef PRELU_CHANNEL - const device VECTOR(P, 4) *alpha [[buffer(3)]], + const device VECTOR(P, 4) *alpha [[buffer(3)]], #endif #ifdef PRELU_ELEMENT - const device VECTOR(P, 4) *alpha [[buffer(3)]], + const device VECTOR(P, 4) *alpha [[buffer(3)]], #endif #ifdef PRELU_OTHER - const device P *alpha [[buffer(3)]], + const device P *alpha [[buffer(3)]], #endif - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - - const uint kernelHXW = 5; - - uint input_arr_size = inTexture.get_array_size(); - - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - VECTOR(P, 4) output = biase[gid.z]; - - ushort dilation_x = param.dilationX; - VECTOR(P, 4) input[5]; - - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i); - - input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); - - input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - - input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); - - input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i); - - for (int j = 0; j < 5; ++j) { - VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + VECTOR(P, 4) output = biase[gid.z]; + + ushort dilation_x = param.dilationX; + VECTOR(P, 4) input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i); + + for (int j = 0; j < 5; ++j) { + VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } } - } - + #ifdef PRELU_CHANNEL - VECTOR(P, 4) alpha_value = alpha[gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); #endif #ifdef PRELU_ELEMENT - int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); - VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); #endif #ifdef PRELU_OTHER - P alpha_value = alpha[0]; - output.x = output.x > 0 ? output.x : (alpha_value * output.x); - output.y = output.y > 0 ? output.y : (alpha_value * output.y); - output.z = output.z > 0 ? output.z : (alpha_value * output.z); - output.w = output.w > 0 ? output.w : (alpha_value * output.w); + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? output.w : (alpha_value * output.w); #endif - outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); } kernel void FUNC3_(depthwise_conv_add_3x3, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device P *weights [[buffer(1)]], - const device VECTOR(P, 4) *biase [[buffer(2)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device P *weights [[buffer(1)]], + const device VECTOR(P, 4) *biase [[buffer(2)]], #ifdef PRELU_CHANNEL - const device VECTOR(P, 4) *alpha [[buffer(3)]], + const device VECTOR(P, 4) *alpha [[buffer(3)]], #endif #ifdef PRELU_ELEMENT - const device VECTOR(P, 4) *alpha [[buffer(3)]], + const device VECTOR(P, 4) *alpha [[buffer(3)]], #endif #ifdef PRELU_OTHER - const device P *alpha [[buffer(3)]], + const device P *alpha [[buffer(3)]], #endif - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - VECTOR(P, 4) output = biase[gid.z]; - VECTOR(P, 4) inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - VECTOR(P, 4) input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + VECTOR(P, 4) output = biase[gid.z]; + VECTOR(P, 4) inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + VECTOR(P, 4) input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + #ifdef PRELU_CHANNEL - VECTOR(P, 4) alpha_value = alpha[gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); #endif #ifdef PRELU_ELEMENT - int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); - VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); #endif #ifdef PRELU_OTHER - P alpha_value = alpha[0]; - output.x = output.x > 0 ? output.x : (alpha_value * output.x); - output.y = output.y > 0 ? output.y : (alpha_value * output.y); - output.z = output.z > 0 ? output.z : (alpha_value * output.z); - output.w = output.w > 0 ? output.w : (alpha_value * output.w); + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? output.w : (alpha_value * output.w); #endif - outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); } #endif diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPreluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPreluKernel.metal index f03a1d5b62..407b8385b7 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPreluKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPreluKernel.metal @@ -18,45 +18,45 @@ using namespace metal; #define P float - #define PRELU_CHANNEL prelu_channel - #define PRELU_TYPE prelu_channel - #include "ConvAddPrelu.inc.metal" - #undef PRELU_TYPE - #undef PRELU_CHANNEL +#define PRELU_CHANNEL prelu_channel +#define PRELU_TYPE prelu_channel +#include "ConvAddPrelu.inc.metal" +#undef PRELU_TYPE +#undef PRELU_CHANNEL - #define PRELU_ELEMENT prelu_element - #define PRELU_TYPE prelu_element - #include "ConvAddPrelu.inc.metal" - #undef PRELU_TYPE - #undef PRELU_ELEMENT +#define PRELU_ELEMENT prelu_element +#define PRELU_TYPE prelu_element +#include "ConvAddPrelu.inc.metal" +#undef PRELU_TYPE +#undef PRELU_ELEMENT - #define PRELU_OTHER prelu_other - #define PRELU_TYPE prelu_other - #include "ConvAddPrelu.inc.metal" - #undef PRELU_TYPE - #undef PRELU_OTHER +#define PRELU_OTHER prelu_other +#define PRELU_TYPE prelu_other +#include "ConvAddPrelu.inc.metal" +#undef PRELU_TYPE +#undef PRELU_OTHER #undef P #define P half - #define PRELU_CHANNEL prelu_channel - #define PRELU_TYPE prelu_channel - #include "ConvAddPrelu.inc.metal" - #undef PRELU_TYPE - #undef PRELU_CHANNEL +#define PRELU_CHANNEL prelu_channel +#define PRELU_TYPE prelu_channel +#include "ConvAddPrelu.inc.metal" +#undef PRELU_TYPE +#undef PRELU_CHANNEL - #define PRELU_ELEMENT prelu_element - #define PRELU_TYPE prelu_element - #include "ConvAddPrelu.inc.metal" - #undef PRELU_TYPE - #undef PRELU_ELEMENT +#define PRELU_ELEMENT prelu_element +#define PRELU_TYPE prelu_element +#include "ConvAddPrelu.inc.metal" +#undef PRELU_TYPE +#undef PRELU_ELEMENT - #define PRELU_OTHER prelu_other - #define PRELU_TYPE prelu_other - #include "ConvAddPrelu.inc.metal" - #undef PRELU_TYPE - #undef PRELU_OTHER +#define PRELU_OTHER prelu_other +#define PRELU_TYPE prelu_other +#include "ConvAddPrelu.inc.metal" +#undef PRELU_TYPE +#undef PRELU_OTHER #undef P diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvBNReluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvBNReluKernel.metal index 4b97b7829a..6851f8aa98 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvBNReluKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvBNReluKernel.metal @@ -25,41 +25,41 @@ kernel void conv_batch_norm_relu_1x1(texture2d_array inTe const device float4 *new_scale [[buffer(2)]], const device float4 *new_biase [[buffer(3)]], uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); + float4 output = float4(0.0); + + float4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); } kernel void conv_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], @@ -69,50 +69,50 @@ kernel void conv_batch_norm_relu_3x3(texture2d_array inTe const device float4 *new_scale [[buffer(2)]], const device float4 *new_biase [[buffer(3)]], uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); - for (int j = 0; j < 9; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; } - } - output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); } kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], @@ -122,176 +122,176 @@ kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - float4 output = float4(0.0); - float4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - float4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + float4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + float4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); } #pragma mark - half kernel void conv_batch_norm_relu_1x1_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *new_scale [[buffer(2)]], - const device half4 *new_biase [[buffer(3)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - half4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(float4(input), float4(weight_x)); + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *new_scale [[buffer(2)]], + const device half4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(float4(input), float4(weight_y)); + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(float4(input), float4(weight_z)); + float4 output = float4(0.0); - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(float4(input), float4(weight_w)); - } - output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); - outTexture.write(half4(output), gid.xy, gid.z); + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(float4(input), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(float4(input), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(float4(input), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(float4(input), float4(weight_w)); + } + output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); } kernel void conv_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *new_scale [[buffer(2)]], - const device half4 *new_biase [[buffer(3)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - half4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); - for (int j = 0; j < 9; ++j) { - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(float4(input[j]), float4(weight_x)); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(float4(input[j]), float4(weight_y)); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(float4(input[j]), float4(weight_z)); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(float4(input[j]), float4(weight_w)); + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *new_scale [[buffer(2)]], + const device half4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; } - } - output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); - outTexture.write(half4(output), gid.xy, gid.z); + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(float4(input[j]), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(float4(input[j]), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(float4(input[j]), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(float4(input[j]), float4(weight_w)); + } + } + output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); } kernel void depthwise_conv_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half *weights [[buffer(1)]], - const device half4 *new_scale [[buffer(2)]], - const device half4 *new_biase [[buffer(3)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - float4 output = float4(0.0); - half4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - half4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); - outTexture.write(half4(output), gid.xy, gid.z); + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + const device half4 *new_scale [[buffer(2)]], + const device half4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal index c07515c13d..c7b3f792d6 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal @@ -23,49 +23,49 @@ kernel void conv_3x3(texture2d_array inTexture [[texture( constant MetalConvParam ¶m [[buffer(0)]], const device float4 *weights [[buffer(1)]], uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); - for (int j = 0; j < 9; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } } - } - outTexture.write(output, gid.xy, gid.z); + outTexture.write(output, gid.xy, gid.z); } kernel void depthwise_conv_3x3(texture2d_array inTexture [[texture(0)]], @@ -73,37 +73,37 @@ kernel void depthwise_conv_3x3(texture2d_array inTexture constant MetalConvParam ¶m [[buffer(0)]], const device float *weights [[buffer(1)]], uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - float4 output = float4(0.0); - float4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - float4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - outTexture.write(output, gid.xy, gid.z); + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + float4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + float4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + outTexture.write(output, gid.xy, gid.z); } kernel void conv_1x1(texture2d_array inTexture [[texture(0)]], @@ -111,170 +111,170 @@ kernel void conv_1x1(texture2d_array inTexture [[texture( constant MetalConvParam ¶m [[buffer(0)]], const device float4 *weights [[buffer(1)]], uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); + float4 output = float4(0.0); - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - outTexture.write(output, gid.xy, gid.z); + float4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + outTexture.write(output, gid.xy, gid.z); } kernel void conv_3x3_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - half4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); - for (int j = 0; j < 9; ++j) { - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(float4(input[j]), float4(weight_x)); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(float4(input[j]), float4(weight_y)); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(float4(input[j]), float4(weight_z)); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(float4(input[j]), float4(weight_w)); + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(float4(input[j]), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(float4(input[j]), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(float4(input[j]), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(float4(input[j]), float4(weight_w)); + } } - } - outTexture.write(half4(output), gid.xy, gid.z); + outTexture.write(half4(output), gid.xy, gid.z); } kernel void depthwise_conv_3x3_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half *weights [[buffer(1)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - float4 output = float4(0.0); - half4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - half4 input = inputs[j]; - output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]); - output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]); - output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]); - output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]); - } - outTexture.write(half4(output), gid.xy, gid.z); + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]); + output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]); + output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]); + output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]); + } + outTexture.write(half4(output), gid.xy, gid.z); } kernel void conv_1x1_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - half4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(float4(input), float4(weight_x)); + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(float4(input), float4(weight_y)); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(float4(input), float4(weight_z)); + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(float4(input), float4(weight_w)); - } - outTexture.write(half4(output), gid.xy, gid.z); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(float4(input), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(float4(input), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(float4(input), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(float4(input), float4(weight_w)); + } + outTexture.write(half4(output), gid.xy, gid.z); } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvTransposeKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvTransposeKernel.metal index baf3f31157..a324fac188 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvTransposeKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvTransposeKernel.metal @@ -16,17 +16,17 @@ using namespace metal; struct MetalConvTransposeParam{ - ushort kernelW; - ushort kernelH; - - ushort strideX; - ushort strideY; - - ushort paddingX; - ushort paddingY; - - ushort dilationX; - ushort dilationY; + ushort kernelW; + ushort kernelH; + + ushort strideX; + ushort strideY; + + ushort paddingX; + ushort paddingY; + + ushort dilationX; + ushort dilationY; }; kernel void conv_transpose2x2_stride2(texture2d_array inTexture [[texture(0)]], @@ -34,83 +34,83 @@ kernel void conv_transpose2x2_stride2(texture2d_array inT constant MetalConvTransposeParam ¶m [[buffer(0)]], const device float4 *weights [[buffer(1)]], uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - int input_array_size = inTexture.get_array_size(); - int kernel_index_x = gid.x % 2; - int kernel_index_y = gid.y % 2; - int kernel_index = kernel_index_y * 2 + kernel_index_x; - int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size); - int input_x = gid.x / 2; - int input_y = gid.y / 2; - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - float4 output = float4(0.0); - for (int i = 0; i < input_array_size; ++i) { - - float4 input = inTexture.sample(sample, float2(input_x, input_y), i); - - float4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i]; - float4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i]; - float4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i]; - float4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i]; - - output.x += dot(input, kernel_slice0); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } - output.y += dot(input, kernel_slice1); + int input_array_size = inTexture.get_array_size(); + int kernel_index_x = gid.x % 2; + int kernel_index_y = gid.y % 2; + int kernel_index = kernel_index_y * 2 + kernel_index_x; + int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size); + int input_x = gid.x / 2; + int input_y = gid.y / 2; - output.z += dot(input, kernel_slice2); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 output = float4(0.0); + for (int i = 0; i < input_array_size; ++i) { + + float4 input = inTexture.sample(sample, float2(input_x, input_y), i); + + float4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i]; + float4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i]; + float4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i]; + float4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i]; + + output.x += dot(input, kernel_slice0); + + output.y += dot(input, kernel_slice1); + + output.z += dot(input, kernel_slice2); + + output.w += dot(input, kernel_slice3); + } - output.w += dot(input, kernel_slice3); - } - - outTexture.write(output, gid.xy, gid.z); + outTexture.write(output, gid.xy, gid.z); } kernel void conv_transpose2x2_stride2_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvTransposeParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - int input_array_size = inTexture.get_array_size(); - int kernel_index_x = gid.x % 2; - int kernel_index_y = gid.y % 2; - int kernel_index = kernel_index_y * 2 + kernel_index_x; - int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size); - int input_x = gid.x / 2; - int input_y = gid.y / 2; - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - float4 output = float4(0.0); - for (int i = 0; i < input_array_size; ++i) { - - half4 input = inTexture.sample(sample, float2(input_x, input_y), i); - - half4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i]; - half4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i]; - half4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i]; - half4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i]; - - output.x += dot(float4(input), float4(kernel_slice0)); + texture2d_array outTexture [[texture(1)]], + constant MetalConvTransposeParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } - output.y += dot(float4(input), float4(kernel_slice1)); + int input_array_size = inTexture.get_array_size(); + int kernel_index_x = gid.x % 2; + int kernel_index_y = gid.y % 2; + int kernel_index = kernel_index_y * 2 + kernel_index_x; + int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size); + int input_x = gid.x / 2; + int input_y = gid.y / 2; - output.z += dot(float4(input), float4(kernel_slice2)); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 output = float4(0.0); + for (int i = 0; i < input_array_size; ++i) { + + half4 input = inTexture.sample(sample, float2(input_x, input_y), i); + + half4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i]; + half4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i]; + half4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i]; + half4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i]; + + output.x += dot(float4(input), float4(kernel_slice0)); + + output.y += dot(float4(input), float4(kernel_slice1)); + + output.z += dot(float4(input), float4(kernel_slice2)); + + output.w += dot(float4(input), float4(kernel_slice3)); + } - output.w += dot(float4(input), float4(kernel_slice3)); - } - - outTexture.write(half4(output), gid.xy, gid.z); + outTexture.write(half4(output), gid.xy, gid.z); } //kernel void conv_transpose(texture2d_array inTexture [[texture(0)]], diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal index b152df8281..40cad28df1 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal @@ -18,13 +18,13 @@ using namespace metal; struct ElementwiseAddParam { - int32_t fast; - int32_t axis; - int32_t ylen; - int32_t xdim[4]; - int32_t xtrans[4]; - int32_t ydim[4]; - int32_t ytrans[4]; + int32_t fast; + int32_t axis; + int32_t ylen; + int32_t xdim[4]; + int32_t xtrans[4]; + int32_t ydim[4]; + int32_t ytrans[4]; }; kernel void elementwise_add(texture2d_array inputX [[texture(0)]], @@ -32,69 +32,69 @@ kernel void elementwise_add(texture2d_array inputX [[textur texture2d_array outTexture [[texture(2)]], constant ElementwiseAddParam &pm [[buffer(0)]], uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - float4 rx, ry; - - if (pm.fast == 1) { - rx = inputX.read(gid.xy, gid.z); - ry = inputY.read(gid.xy, gid.z); - } else { - rx = inputX.read(gid.xy, gid.z); - int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; - int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; - int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; - int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]}; - int32_t yshift = 4 - pm.ylen - pm.axis; - for (int n = 0; n < 4; n++) { - x_xyzn[3] = n; - xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); - invtrans(xtrans, x_abcd, t_abcd); - for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) { - y_abcd[yshift+k] = t_abcd[k]; - } - trans(ytrans, y_abcd, t_abcd); - abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn); - ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + float4 rx, ry; + + if (pm.fast == 1) { + rx = inputX.read(gid.xy, gid.z); + ry = inputY.read(gid.xy, gid.z); + } else { + rx = inputX.read(gid.xy, gid.z); + int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; + int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; + int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; + int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]}; + int32_t yshift = 4 - pm.ylen - pm.axis; + for (int n = 0; n < 4; n++) { + x_xyzn[3] = n; + xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); + invtrans(xtrans, x_abcd, t_abcd); + for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) { + y_abcd[yshift+k] = t_abcd[k]; + } + trans(ytrans, y_abcd, t_abcd); + abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn); + ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; + } } - } - float4 r = rx + ry; - outTexture.write(r, gid.xy, gid.z); + float4 r = rx + ry; + outTexture.write(r, gid.xy, gid.z); } kernel void elementwise_add_half(texture2d_array inputX [[texture(0)]], - texture2d_array inputY [[texture(1)]], - texture2d_array outTexture [[texture(2)]], - constant ElementwiseAddParam &pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - half4 rx, ry; - - if (pm.fast == 1) { - rx = inputX.read(gid.xy, gid.z); - ry = inputY.read(gid.xy, gid.z); - } else { - rx = inputX.read(gid.xy, gid.z); - int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; - int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; - int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; - int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]}; - int32_t yshift = 4 - pm.ylen - pm.axis; - for (int n = 0; n < 4; n++) { - x_xyzn[3] = n; - xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); - invtrans(xtrans, x_abcd, t_abcd); - for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) { - y_abcd[yshift+k] = t_abcd[k]; - } - trans(ytrans, y_abcd, t_abcd); - abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn); - ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; + texture2d_array inputY [[texture(1)]], + texture2d_array outTexture [[texture(2)]], + constant ElementwiseAddParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + half4 rx, ry; + + if (pm.fast == 1) { + rx = inputX.read(gid.xy, gid.z); + ry = inputY.read(gid.xy, gid.z); + } else { + rx = inputX.read(gid.xy, gid.z); + int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; + int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; + int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; + int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]}; + int32_t yshift = 4 - pm.ylen - pm.axis; + for (int n = 0; n < 4; n++) { + x_xyzn[3] = n; + xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); + invtrans(xtrans, x_abcd, t_abcd); + for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) { + y_abcd[yshift+k] = t_abcd[k]; + } + trans(ytrans, y_abcd, t_abcd); + abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn); + ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; + } } - } - half4 r = rx + ry; - outTexture.write(r, gid.xy, gid.z); + half4 r = rx + ry; + outTexture.write(r, gid.xy, gid.z); } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.inc.metal index b1d68d6809..65566952ef 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.inc.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.inc.metal @@ -20,72 +20,72 @@ using namespace metal; kernel void FUNC3_(elementwise_add, PRELU_TYPE, P)(texture2d_array inputX [[texture(0)]], - texture2d_array inputY [[texture(1)]], - texture2d_array outTexture [[texture(2)]], - constant ElementwiseAddParam &pm [[buffer(0)]], + texture2d_array inputY [[texture(1)]], + texture2d_array outTexture [[texture(2)]], + constant ElementwiseAddParam &pm [[buffer(0)]], #ifdef PRELU_CHANNEL - const device VECTOR(P, 4) *alpha [[buffer(1)]], + const device VECTOR(P, 4) *alpha [[buffer(1)]], #endif #ifdef PRELU_ELEMENT - const device VECTOR(P, 4) *alpha [[buffer(1)]], + const device VECTOR(P, 4) *alpha [[buffer(1)]], #endif #ifdef PRELU_OTHER - const device P *alpha [[buffer(1)]], + const device P *alpha [[buffer(1)]], #endif - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - VECTOR(P, 4) rx, ry; - - if (pm.fast == 1) { - rx = inputX.read(gid.xy, gid.z); - ry = inputY.read(gid.xy, gid.z); + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + VECTOR(P, 4) rx, ry; + + if (pm.fast == 1) { + rx = inputX.read(gid.xy, gid.z); + ry = inputY.read(gid.xy, gid.z); } else { - rx = inputX.read(gid.xy, gid.z); - int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; - int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; - int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; - int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]}; - int32_t yshift = 4 - pm.ylen - pm.axis; - for (int n = 0; n < 4; n++) { - x_xyzn[3] = n; - xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); - invtrans(xtrans, x_abcd, t_abcd); - for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) { - y_abcd[yshift+k] = t_abcd[k]; + rx = inputX.read(gid.xy, gid.z); + int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; + int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; + int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; + int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]}; + int32_t yshift = 4 - pm.ylen - pm.axis; + for (int n = 0; n < 4; n++) { + x_xyzn[3] = n; + xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); + invtrans(xtrans, x_abcd, t_abcd); + for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) { + y_abcd[yshift+k] = t_abcd[k]; + } + trans(ytrans, y_abcd, t_abcd); + abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn); + ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; } - trans(ytrans, y_abcd, t_abcd); - abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn); - ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; - } - } - VECTOR(P, 4) output = rx + ry; - + } + VECTOR(P, 4) output = rx + ry; + #ifdef PRELU_CHANNEL - VECTOR(P, 4) alpha_value = alpha[gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); #endif #ifdef PRELU_ELEMENT - int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); - VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); #endif #ifdef PRELU_OTHER - P alpha_value = alpha[0]; - output.x = output.x > 0 ? output.x : (alpha_value * output.x); - output.y = output.y > 0 ? output.y : (alpha_value * output.y); - output.z = output.z > 0 ? output.z : (alpha_value * output.z); - output.w = output.w > 0 ? output.w : (alpha_value * output.w); + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? output.w : (alpha_value * output.w); #endif - - outTexture.write(output, gid.xy, gid.z); + + outTexture.write(output, gid.xy, gid.z); } #endif diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal index 8fd1a9fdab..cca11e8086 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal @@ -17,13 +17,13 @@ using namespace metal; struct ElementwiseAddParam { - int32_t fast; - int32_t axis; - int32_t ylen; - int32_t xdim[4]; - int32_t xtrans[4]; - int32_t ydim[4]; - int32_t ytrans[4]; + int32_t fast; + int32_t axis; + int32_t ylen; + int32_t xdim[4]; + int32_t xtrans[4]; + int32_t ydim[4]; + int32_t ytrans[4]; }; #define P float diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal index 9655b0fc1a..114aa15664 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal @@ -23,38 +23,38 @@ #define VECTOR(p, n) CONCAT2(p, n) kernel void FUNC_T(fetch, P)(texture2d_array inTexture [[texture(0)]], - device float *output [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= inTexture.get_width() || - gid.y >= inTexture.get_height() || - gid.z >= inTexture.get_array_size()) { - return; - } - - int input_width = inTexture.get_width(); - int input_height = inTexture.get_height(); - const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z); - int output_to = 4 * input_width * input_height; - - output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x; - - output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y; - output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z; - output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w; + device float *output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height() || + gid.z >= inTexture.get_array_size()) { + return; + } + + int input_width = inTexture.get_width(); + int input_height = inTexture.get_height(); + const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z); + int output_to = 4 * input_width * input_height; + + output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x; + + output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y; + output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z; + output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w; } kernel void FUNC(fetch, 1or2, P)(texture2d_array inTexture [[texture(0)]], - device float4 *output [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= inTexture.get_width() || - gid.y >= inTexture.get_height() || - gid.z >= inTexture.get_array_size()) { - return; - } - - int input_width = inTexture.get_width(); - const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z); - output[gid.y * input_width + gid.x] = float4(input); + device float4 *output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height() || + gid.z >= inTexture.get_array_size()) { + return; + } + + int input_width = inTexture.get_width(); + const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z); + output[gid.y * input_width + gid.x] = float4(input); } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.metal index 87d304302f..df2de98648 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.metal @@ -31,7 +31,7 @@ using namespace metal; kernel void fetch_placeholder(texture2d_array inTexture [[texture(0)]], device float *output [[buffer(0)]], uint3 gid [[thread_position_in_grid]]) { - + } kernel void fetch_placeholder_half(texture2d_array inTexture [[texture(0)]], diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Kernels.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Kernels.metal index 368509f001..06bf42697e 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Kernels.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Kernels.metal @@ -23,47 +23,47 @@ kernel void place_holder(texture2d inTexture [[texture(0)]], } struct OutputDim { - ushort width; - ushort height; - ushort strideX; - ushort strideY; + ushort width; + ushort height; + ushort strideX; + ushort strideY; }; kernel void resize(texture2d inTexture [[texture(0)]], texture2d_array outTexture [[texture(1)]], constant OutputDim ¶ms [[buffer(0)]], uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - - constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint2 pos = gid.xy * uint2(params.strideX, params.strideY); - const half4 input = inTexture.read(pos); - outTexture.write(half4(input.x, input.y, input.z, input.w), gid.xy, gid.z); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + + constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint2 pos = gid.xy * uint2(params.strideX, params.strideY); + const half4 input = inTexture.read(pos); + outTexture.write(half4(input.x, input.y, input.z, input.w), gid.xy, gid.z); } kernel void texture2d_to_2d_array(texture2d inTexture [[texture(0)]], texture2d_array outTexture [[texture(1)]], uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= inTexture.get_width() || - gid.y >= inTexture.get_height()){ - return; - } - const float4 input = inTexture.read(gid.xy); - outTexture.write(input, gid.xy, 0); + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height()){ + return; + } + const float4 input = inTexture.read(gid.xy); + outTexture.write(input, gid.xy, 0); } kernel void texture2d_to_2d_array_half(texture2d inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= inTexture.get_width() || - gid.y >= inTexture.get_height()){ - return; - } - const half4 input = inTexture.read(gid.xy); - outTexture.write(input, gid.xy, 0); + texture2d_array outTexture [[texture(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height()){ + return; + } + const half4 input = inTexture.read(gid.xy); + outTexture.write(input, gid.xy, 0); } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/NMSFetchResultKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/NMSFetchResultKernel.metal index 44c57440e1..e32c98cc29 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/NMSFetchResultKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/NMSFetchResultKernel.metal @@ -16,65 +16,65 @@ using namespace metal; kernel void nms_fetch_result(texture2d_array inTexture [[texture(0)]], - device float *output [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= inTexture.get_width() || - gid.y >= inTexture.get_height() || - gid.z >= inTexture.get_array_size()) { - return; - } - - int input_width = inTexture.get_width(); - const float4 input = inTexture.read(gid.xy, gid.z); - output[gid.y * input_width + gid.x] = input.x; - + device float *output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height() || + gid.z >= inTexture.get_array_size()) { + return; + } + + int input_width = inTexture.get_width(); + const float4 input = inTexture.read(gid.xy, gid.z); + output[gid.y * input_width + gid.x] = input.x; + } kernel void nms_fetch_result_half(texture2d_array inTexture [[texture(0)]], - device float *output [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= inTexture.get_width() || - gid.y >= inTexture.get_height() || - gid.z >= inTexture.get_array_size()) { - return; - } - - int input_width = inTexture.get_width(); - const half4 input = inTexture.read(gid.xy, gid.z); - output[gid.y * input_width + gid.x] = input.x; + device float *output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height() || + gid.z >= inTexture.get_array_size()) { + return; + } + + int input_width = inTexture.get_width(); + const half4 input = inTexture.read(gid.xy, gid.z); + output[gid.y * input_width + gid.x] = input.x; } kernel void nms_fetch_bbox(texture2d_array inTexture [[texture(0)]], - device float4 *output [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= inTexture.get_width() || - gid.y >= inTexture.get_height() || - gid.z >= inTexture.get_array_size()) { - return; - } - - int input_width = inTexture.get_width(); -// int input_height = inTexture.get_height(); - const float4 input = inTexture.read(gid.xy, gid.z); - output[gid.y * input_width + gid.x] = input; + device float4 *output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height() || + gid.z >= inTexture.get_array_size()) { + return; + } + + int input_width = inTexture.get_width(); + // int input_height = inTexture.get_height(); + const float4 input = inTexture.read(gid.xy, gid.z); + output[gid.y * input_width + gid.x] = input; } kernel void nms_fetch_bbox_half(texture2d_array inTexture [[texture(0)]], - device float4 *output [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= inTexture.get_width() || - gid.y >= inTexture.get_height() || - gid.z >= inTexture.get_array_size()) { - return; - } - - int input_width = inTexture.get_width(); -// int input_height = inTexture.get_height(); - const half4 input = inTexture.read(gid.xy, gid.z); - output[gid.y * input_width + gid.x] = float4(input); + device float4 *output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height() || + gid.z >= inTexture.get_array_size()) { + return; + } + + int input_width = inTexture.get_width(); + // int input_height = inTexture.get_height(); + const half4 input = inTexture.read(gid.xy, gid.z); + output[gid.y * input_width + gid.x] = float4(input); } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.inc.metal index 3c36ba06f5..05146b8d14 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.inc.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.inc.metal @@ -15,36 +15,36 @@ #ifdef P kernel void FUNC2_(pool, P)(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant PoolParam &pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - int xmin = gid.x * pm.strideX - pm.paddingX; - int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width())); - xmin = max(xmin, 0); - int ymin = gid.y * pm.strideX - pm.paddingX; - int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height())); - ymin = max(ymin, 0); - - VECTOR(P, 4) r = 0; - if (pm.poolType == 0) { - r = inTexture.read(uint2(xmin, ymin), gid.z); - for (int x = xmin; x < xmax; x++) { - for (int y = ymin; y < ymax; y++) { - r = fmax(r, inTexture.read(uint2(x, y), gid.z)); - } + texture2d_array outTexture [[texture(1)]], + constant PoolParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + int xmin = gid.x * pm.strideX - pm.paddingX; + int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width())); + xmin = max(xmin, 0); + int ymin = gid.y * pm.strideX - pm.paddingX; + int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height())); + ymin = max(ymin, 0); + + VECTOR(P, 4) r = 0; + if (pm.poolType == 0) { + r = inTexture.read(uint2(xmin, ymin), gid.z); + for (int x = xmin; x < xmax; x++) { + for (int y = ymin; y < ymax; y++) { + r = fmax(r, inTexture.read(uint2(x, y), gid.z)); + } + } + } else if (pm.poolType == 1) { + for (int x = xmin; x < xmax; x++) { + for (int y = ymin; y < ymax; y++) { + r += inTexture.read(uint2(x, y), gid.z); + } + } + r /= (xmax - xmin) * (ymax - ymin); } - } else if (pm.poolType == 1) { - for (int x = xmin; x < xmax; x++) { - for (int y = ymin; y < ymax; y++) { - r += inTexture.read(uint2(x, y), gid.z); - } - } - r /= (xmax - xmin) * (ymax - ymin); - } - outTexture.write(r, gid.xy, gid.z); + outTexture.write(r, gid.xy, gid.z); } #endif diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.metal index e76b4ac742..30111b7bcb 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.metal @@ -18,13 +18,13 @@ using namespace metal; struct PoolParam { - int ksizeX; - int ksizeY; - int strideX; - int strideY; - int paddingX; - int paddingY; - int poolType; + int ksizeX; + int ksizeY; + int strideX; + int strideY; + int paddingX; + int paddingY; + int poolType; }; #define P half diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PreluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PreluKernel.metal index 5978041377..6279821436 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PreluKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PreluKernel.metal @@ -16,136 +16,136 @@ using namespace metal; kernel void prelu_channel(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - const device float4 *alpha [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]){ - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); - float4 alpha_value = alpha[gid.z]; - float4 output; - output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); - output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); - output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); - output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); - outTexture.write(output, gid.xy, gid.z); + texture2d_array outTexture [[texture(1)]], + const device float4 *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + float4 alpha_value = alpha[gid.z]; + float4 output; + output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); + output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); + output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); + output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); + outTexture.write(output, gid.xy, gid.z); } kernel void prelu_element(texture2d_array inTexture [[texture(0)]], texture2d_array outTexture [[texture(1)]], const device float4 *alpha [[buffer(0)]], uint3 gid [[thread_position_in_grid]]){ - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); - - int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size(); - float4 alpha_value = alpha[alpha_to + gid.z]; - - float4 output; - output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); - output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); - output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); - output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); - outTexture.write(output, gid.xy, gid.z); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + + int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size(); + float4 alpha_value = alpha[alpha_to + gid.z]; + + float4 output; + output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); + output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); + output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); + output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); + outTexture.write(output, gid.xy, gid.z); } kernel void prelu_other(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - const device float *alpha [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]){ - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); - float alpha_value = alpha[0]; - float4 output; - output.x = input.x > 0 ? input.x : (alpha_value * input.x); - output.y = input.y > 0 ? input.y : (alpha_value * input.y); - output.z = input.z > 0 ? input.z : (alpha_value * input.z); - output.w = input.w > 0 ? input.w : (alpha_value * input.w); - outTexture.write(output, gid.xy, gid.z); + texture2d_array outTexture [[texture(1)]], + const device float *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + float alpha_value = alpha[0]; + float4 output; + output.x = input.x > 0 ? input.x : (alpha_value * input.x); + output.y = input.y > 0 ? input.y : (alpha_value * input.y); + output.z = input.z > 0 ? input.z : (alpha_value * input.z); + output.w = input.w > 0 ? input.w : (alpha_value * input.w); + outTexture.write(output, gid.xy, gid.z); } kernel void prelu_channel_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - const device half4 *alpha [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]){ - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); - half4 alpha_value = alpha[gid.z]; - half4 output; - output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); - output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); - output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); - output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); - outTexture.write(output, gid.xy, gid.z); + texture2d_array outTexture [[texture(1)]], + const device half4 *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + half4 alpha_value = alpha[gid.z]; + half4 output; + output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); + output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); + output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); + output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); + outTexture.write(output, gid.xy, gid.z); } kernel void prelu_element_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - const device half4 *alpha [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]){ - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); - - int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size(); - half4 alpha_value = alpha[alpha_to + gid.z]; - - half4 output; - output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); - output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); - output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); - output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); - outTexture.write(output, gid.xy, gid.z); + texture2d_array outTexture [[texture(1)]], + const device half4 *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + + int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size(); + half4 alpha_value = alpha[alpha_to + gid.z]; + + half4 output; + output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); + output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); + output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); + output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); + outTexture.write(output, gid.xy, gid.z); } kernel void prelu_other_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - const device half *alpha [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]){ - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); - half alpha_value = alpha[0]; - half4 output; - output.x = input.x > 0 ? input.x : (alpha_value * input.x); - output.y = input.y > 0 ? input.y : (alpha_value * input.y); - output.z = input.z > 0 ? input.z : (alpha_value * input.z); - output.w = input.w > 0 ? input.w : (alpha_value * input.w); - outTexture.write(output, gid.xy, gid.z); + texture2d_array outTexture [[texture(1)]], + const device half *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + half alpha_value = alpha[0]; + half4 output; + output.x = input.x > 0 ? input.x : (alpha_value * input.x); + output.y = input.y > 0 ? input.y : (alpha_value * input.y); + output.z = input.z > 0 ? input.z : (alpha_value * input.z); + output.w = input.w > 0 ? input.w : (alpha_value * input.w); + outTexture.write(output, gid.xy, gid.z); } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PriorBoxKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PriorBoxKernel.metal index 7630febf77..c7f97043bf 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PriorBoxKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PriorBoxKernel.metal @@ -16,20 +16,20 @@ using namespace metal; struct PriorBoxMetalParam { - float offset; - float stepWidth; - float stepHeight; - float minSize; - float maxSize; - float imageWidth; - float imageHeight; - - bool clip; - - uint numPriors; - uint aspecRatiosSize; - uint minSizeSize; - uint maxSizeSize; + float offset; + float stepWidth; + float stepHeight; + float minSize; + float maxSize; + float imageWidth; + float imageHeight; + + bool clip; + + uint numPriors; + uint aspecRatiosSize; + uint minSizeSize; + uint maxSizeSize; }; kernel void prior_box(texture2d_array inTexture [[texture(0)]], @@ -39,329 +39,329 @@ kernel void prior_box(texture2d_array inTexture [[texture(0 constant PriorBoxMetalParam ¶m [[buffer(1)]], const device float4 *variances [[buffer(2)]], uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outBoxTexture.get_width() || - gid.y >= outBoxTexture.get_height() || - gid.z >= outBoxTexture.get_array_size()) return; - - float center_x = (gid.x + param.offset) * param.stepWidth; - float center_y = (gid.y + param.offset) * param.stepHeight; - - float box_width, box_height; - - if (gid.z < param.aspecRatiosSize) { - float ar = aspect_ratios[gid.z]; - box_width = param.minSize * sqrt(ar) / 2; - box_height = param.minSize / sqrt(ar) / 2; - float4 box; - box.x = (center_x - box_width) / param.imageWidth; - box.y = (center_y - box_height) / param.imageHeight; - box.z = (center_x + box_width) / param.imageWidth; - box.w = (center_y + box_height) / param.imageHeight; + if (gid.x >= outBoxTexture.get_width() || + gid.y >= outBoxTexture.get_height() || + gid.z >= outBoxTexture.get_array_size()) return; - float4 res; - if (param.clip) { - res = fmin(fmax(box, 0.0), 1.0); - } else { - res = box; + float center_x = (gid.x + param.offset) * param.stepWidth; + float center_y = (gid.y + param.offset) * param.stepHeight; + + float box_width, box_height; + + if (gid.z < param.aspecRatiosSize) { + float ar = aspect_ratios[gid.z]; + box_width = param.minSize * sqrt(ar) / 2; + box_height = param.minSize / sqrt(ar) / 2; + float4 box; + box.x = (center_x - box_width) / param.imageWidth; + box.y = (center_y - box_height) / param.imageHeight; + box.z = (center_x + box_width) / param.imageWidth; + box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = fmin(fmax(box, 0.0), 1.0); + } else { + res = box; + } + + outBoxTexture.write(res, gid.xy, gid.z); + } else if (gid.z >= param.aspecRatiosSize) { + if (param.maxSizeSize > 0) { + box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; + float4 max_box; + max_box.x = (center_x - box_width) / param.imageWidth; + max_box.y = (center_y - box_height) / param.imageHeight; + max_box.z = (center_x + box_width) / param.imageWidth; + max_box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = min(max(max_box, 0.0), 1.0); + } else { + res = max_box; + } + outBoxTexture.write(max_box, gid.xy, gid.z); + } } - outBoxTexture.write(res, gid.xy, gid.z); - } else if (gid.z >= param.aspecRatiosSize) { - if (param.maxSizeSize > 0) { - box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; - float4 max_box; - max_box.x = (center_x - box_width) / param.imageWidth; - max_box.y = (center_y - box_height) / param.imageHeight; - max_box.z = (center_x + box_width) / param.imageWidth; - max_box.w = (center_y + box_height) / param.imageHeight; - - float4 res; - if (param.clip) { - res = min(max(max_box, 0.0), 1.0); - } else { - res = max_box; - } - outBoxTexture.write(max_box, gid.xy, gid.z); + float4 variance = variances[0]; + if (gid.z < param.numPriors) { + float4 variances_output; + variances_output.x = variance.x; + variances_output.y = variance.y; + variances_output.z = variance.z; + variances_output.w = variance.w; + varianceTexture.write(variances_output, gid.xy, gid.z); } - } - - float4 variance = variances[0]; - if (gid.z < param.numPriors) { - float4 variances_output; - variances_output.x = variance.x; - variances_output.y = variance.y; - variances_output.z = variance.z; - variances_output.w = variance.w; - varianceTexture.write(variances_output, gid.xy, gid.z); - } } kernel void prior_box_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outBoxTexture [[texture(1)]], - texture2d_array varianceTexture [[texture(2)]], - const device half *aspect_ratios [[buffer(0)]], - constant PriorBoxMetalParam ¶m [[buffer(1)]], - const device float4 *variances [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outBoxTexture.get_width() || - gid.y >= outBoxTexture.get_height() || - gid.z >= outBoxTexture.get_array_size()) return; - - float center_x = (gid.x + param.offset) * param.stepWidth; - float center_y = (gid.y + param.offset) * param.stepHeight; - - float box_width, box_height; - - if (gid.z < param.aspecRatiosSize) { - half ar = aspect_ratios[gid.z]; - box_width = param.minSize * sqrt(ar) / 2; - box_height = param.minSize / sqrt(ar) / 2; - float4 box; - box.x = (center_x - box_width) / param.imageWidth; - box.y = (center_y - box_height) / param.imageHeight; - box.z = (center_x + box_width) / param.imageWidth; - box.w = (center_y + box_height) / param.imageHeight; + texture2d_array outBoxTexture [[texture(1)]], + texture2d_array varianceTexture [[texture(2)]], + const device half *aspect_ratios [[buffer(0)]], + constant PriorBoxMetalParam ¶m [[buffer(1)]], + const device float4 *variances [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outBoxTexture.get_width() || + gid.y >= outBoxTexture.get_height() || + gid.z >= outBoxTexture.get_array_size()) return; - float4 res; - if (param.clip) { - res = fmin(fmax(box, 0.0), 1.0); - } else { - res = box; + float center_x = (gid.x + param.offset) * param.stepWidth; + float center_y = (gid.y + param.offset) * param.stepHeight; + + float box_width, box_height; + + if (gid.z < param.aspecRatiosSize) { + half ar = aspect_ratios[gid.z]; + box_width = param.minSize * sqrt(ar) / 2; + box_height = param.minSize / sqrt(ar) / 2; + float4 box; + box.x = (center_x - box_width) / param.imageWidth; + box.y = (center_y - box_height) / param.imageHeight; + box.z = (center_x + box_width) / param.imageWidth; + box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = fmin(fmax(box, 0.0), 1.0); + } else { + res = box; + } + + outBoxTexture.write(half4(res), gid.xy, gid.z); + } else if (gid.z >= param.aspecRatiosSize) { + if (param.maxSizeSize > 0) { + box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; + float4 max_box; + max_box.x = (center_x - box_width) / param.imageWidth; + max_box.y = (center_y - box_height) / param.imageHeight; + max_box.z = (center_x + box_width) / param.imageWidth; + max_box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = min(max(max_box, 0.0), 1.0); + } else { + res = max_box; + } + outBoxTexture.write(half4(max_box), gid.xy, gid.z); + } } - outBoxTexture.write(half4(res), gid.xy, gid.z); - } else if (gid.z >= param.aspecRatiosSize) { - if (param.maxSizeSize > 0) { - box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; - float4 max_box; - max_box.x = (center_x - box_width) / param.imageWidth; - max_box.y = (center_y - box_height) / param.imageHeight; - max_box.z = (center_x + box_width) / param.imageWidth; - max_box.w = (center_y + box_height) / param.imageHeight; - - float4 res; - if (param.clip) { - res = min(max(max_box, 0.0), 1.0); - } else { - res = max_box; - } - outBoxTexture.write(half4(max_box), gid.xy, gid.z); + float4 variance = variances[0]; + if (gid.z < param.numPriors) { + float4 variances_output; + variances_output.x = variance.x; + variances_output.y = variance.y; + variances_output.z = variance.z; + variances_output.w = variance.w; + varianceTexture.write(half4(variances_output), gid.xy, gid.z); } - } - - float4 variance = variances[0]; - if (gid.z < param.numPriors) { - float4 variances_output; - variances_output.x = variance.x; - variances_output.y = variance.y; - variances_output.z = variance.z; - variances_output.w = variance.w; - varianceTexture.write(half4(variances_output), gid.xy, gid.z); - } } kernel void prior_box_MinMaxAspectRatiosOrder(texture2d_array inTexture [[texture(0)]], - texture2d_array outBoxTexture [[texture(1)]], - texture2d_array varianceTexture [[texture(2)]], - const device float *aspect_ratios [[buffer(0)]], - constant PriorBoxMetalParam ¶m [[buffer(1)]], - const device float4 *variances [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outBoxTexture.get_width() || - gid.y >= outBoxTexture.get_height() || - gid.z >= outBoxTexture.get_array_size()) return; - - float center_x = (gid.x + param.offset) * param.stepWidth; - float center_y = (gid.y + param.offset) * param.stepHeight; - - float box_width, box_height; - - - - if (gid.z == 0) { - box_width = box_height = param.minSize / 2; + texture2d_array outBoxTexture [[texture(1)]], + texture2d_array varianceTexture [[texture(2)]], + const device float *aspect_ratios [[buffer(0)]], + constant PriorBoxMetalParam ¶m [[buffer(1)]], + const device float4 *variances [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outBoxTexture.get_width() || + gid.y >= outBoxTexture.get_height() || + gid.z >= outBoxTexture.get_array_size()) return; - float4 box; - box.x = (center_x - box_width) / param.imageWidth; - box.y = (center_y - box_height) / param.imageHeight; - box.z = (center_x + box_width) / param.imageWidth; - box.w = (center_y + box_height) / param.imageHeight; + float center_x = (gid.x + param.offset) * param.stepWidth; + float center_y = (gid.y + param.offset) * param.stepHeight; - float4 res; - if (param.clip) { - res = fmin(fmax(box, 0.0), 1.0); - } else { - res = box; + float box_width, box_height; + + + + if (gid.z == 0) { + box_width = box_height = param.minSize / 2; + + float4 box; + box.x = (center_x - box_width) / param.imageWidth; + box.y = (center_y - box_height) / param.imageHeight; + box.z = (center_x + box_width) / param.imageWidth; + box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = fmin(fmax(box, 0.0), 1.0); + } else { + res = box; + } + + outBoxTexture.write(res, gid.xy, gid.z); } - - outBoxTexture.write(res, gid.xy, gid.z); - } - - if (gid.z == 1 && param.maxSizeSize > 0) { - box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; - float4 max_box; - max_box.x = (center_x - box_width) / param.imageWidth; - max_box.y = (center_y - box_height) / param.imageHeight; - max_box.z = (center_x + box_width) / param.imageWidth; - max_box.w = (center_y + box_height) / param.imageHeight; + if (gid.z == 1 && param.maxSizeSize > 0) { + + box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; + float4 max_box; + max_box.x = (center_x - box_width) / param.imageWidth; + max_box.y = (center_y - box_height) / param.imageHeight; + max_box.z = (center_x + box_width) / param.imageWidth; + max_box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = min(max(max_box, 0.0), 1.0); + } else { + res = max_box; + } + outBoxTexture.write(res, gid.xy, gid.z); + } - float4 res; - if (param.clip) { - res = min(max(max_box, 0.0), 1.0); + int aspect_to = 0; + if (param.maxSizeSize > 0) { + aspect_to = gid.z - 2; } else { - res = max_box; + aspect_to = gid.z - 1; } - outBoxTexture.write(res, gid.xy, gid.z); - } - - int aspect_to = 0; - if (param.maxSizeSize > 0) { - aspect_to = gid.z - 2; - } else { - aspect_to = gid.z - 1; - } - - - - - if (aspect_to >= 0 && aspect_to < int(param.aspecRatiosSize)) { - int skip = 0; - for (int i = 0; i < aspect_to + 1; ++i) { - if (fabs(aspect_ratios[i] - 1.) < 1e-6) { - skip += 1; - } - } - aspect_to += skip; - float ar = aspect_ratios[aspect_to]; - box_width = param.minSize * sqrt(ar) / 2; - box_height = param.minSize / sqrt(ar) / 2; - float4 box; - box.x = (center_x - box_width) / param.imageWidth; - box.y = (center_y - box_height) / param.imageHeight; - box.z = (center_x + box_width) / param.imageWidth; - box.w = (center_y + box_height) / param.imageHeight; - float4 res; - if (param.clip) { - res = fmin(fmax(box, 0.0), 1.0); - } else { - res = box; + if (aspect_to >= 0 && aspect_to < int(param.aspecRatiosSize)) { + + int skip = 0; + for (int i = 0; i < aspect_to + 1; ++i) { + if (fabs(aspect_ratios[i] - 1.) < 1e-6) { + skip += 1; + } + } + aspect_to += skip; + + float ar = aspect_ratios[aspect_to]; + + box_width = param.minSize * sqrt(ar) / 2; + box_height = param.minSize / sqrt(ar) / 2; + float4 box; + box.x = (center_x - box_width) / param.imageWidth; + box.y = (center_y - box_height) / param.imageHeight; + box.z = (center_x + box_width) / param.imageWidth; + box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = fmin(fmax(box, 0.0), 1.0); + } else { + res = box; + } + + outBoxTexture.write(res, gid.xy, gid.z); } - outBoxTexture.write(res, gid.xy, gid.z); - } - - float4 variance = variances[0]; - if (gid.z < param.numPriors) { - float4 variances_output; - variances_output.x = variance.x; - variances_output.y = variance.y; - variances_output.z = variance.z; - variances_output.w = variance.w; - varianceTexture.write(variances_output, gid.xy, gid.z); - } + float4 variance = variances[0]; + if (gid.z < param.numPriors) { + float4 variances_output; + variances_output.x = variance.x; + variances_output.y = variance.y; + variances_output.z = variance.z; + variances_output.w = variance.w; + varianceTexture.write(variances_output, gid.xy, gid.z); + } } kernel void prior_box_MinMaxAspectRatiosOrder_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outBoxTexture [[texture(1)]], - texture2d_array varianceTexture [[texture(2)]], - const device half *aspect_ratios [[buffer(0)]], - constant PriorBoxMetalParam ¶m [[buffer(1)]], - const device float4 *variances [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outBoxTexture.get_width() || - gid.y >= outBoxTexture.get_height() || - gid.z >= outBoxTexture.get_array_size()) return; - - float center_x = (gid.x + param.offset) * param.stepWidth; - float center_y = (gid.y + param.offset) * param.stepHeight; - - float box_width, box_height; - - - - if (gid.z == 0) { - box_width = box_height = param.minSize / 2; + texture2d_array outBoxTexture [[texture(1)]], + texture2d_array varianceTexture [[texture(2)]], + const device half *aspect_ratios [[buffer(0)]], + constant PriorBoxMetalParam ¶m [[buffer(1)]], + const device float4 *variances [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outBoxTexture.get_width() || + gid.y >= outBoxTexture.get_height() || + gid.z >= outBoxTexture.get_array_size()) return; - float4 box; - box.x = (center_x - box_width) / param.imageWidth; - box.y = (center_y - box_height) / param.imageHeight; - box.z = (center_x + box_width) / param.imageWidth; - box.w = (center_y + box_height) / param.imageHeight; + float center_x = (gid.x + param.offset) * param.stepWidth; + float center_y = (gid.y + param.offset) * param.stepHeight; - float4 res; - if (param.clip) { - res = fmin(fmax(box, 0.0), 1.0); - } else { - res = box; - } + float box_width, box_height; - outBoxTexture.write(half4(res), gid.xy, gid.z); - } - - if (gid.z == 1 && param.maxSizeSize > 0) { - box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; - float4 max_box; - max_box.x = (center_x - box_width) / param.imageWidth; - max_box.y = (center_y - box_height) / param.imageHeight; - max_box.z = (center_x + box_width) / param.imageWidth; - max_box.w = (center_y + box_height) / param.imageHeight; - float4 res; - if (param.clip) { - res = min(max(max_box, 0.0), 1.0); - } else { - res = max_box; + if (gid.z == 0) { + box_width = box_height = param.minSize / 2; + + float4 box; + box.x = (center_x - box_width) / param.imageWidth; + box.y = (center_y - box_height) / param.imageHeight; + box.z = (center_x + box_width) / param.imageWidth; + box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = fmin(fmax(box, 0.0), 1.0); + } else { + res = box; + } + + outBoxTexture.write(half4(res), gid.xy, gid.z); } - outBoxTexture.write(half4(res), gid.xy, gid.z); - } - - int aspect_to = 0; - if (param.maxSizeSize > 0) { - aspect_to = gid.z - 2; - } else { - aspect_to = gid.z - 1; - } - - if (aspect_to > 0 && aspect_to < int(param.aspecRatiosSize) && fabs(aspect_ratios[aspect_to] - 1.) > 1e-6) { - float ar = aspect_ratios[aspect_to]; - box_width = param.minSize * sqrt(ar) / 2; - box_height = param.minSize / sqrt(ar) / 2; - float4 box; - box.x = (center_x - box_width) / param.imageWidth; - box.y = (center_y - box_height) / param.imageHeight; - box.z = (center_x + box_width) / param.imageWidth; - box.w = (center_y + box_height) / param.imageHeight; + if (gid.z == 1 && param.maxSizeSize > 0) { + + box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; + float4 max_box; + max_box.x = (center_x - box_width) / param.imageWidth; + max_box.y = (center_y - box_height) / param.imageHeight; + max_box.z = (center_x + box_width) / param.imageWidth; + max_box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = min(max(max_box, 0.0), 1.0); + } else { + res = max_box; + } + outBoxTexture.write(half4(res), gid.xy, gid.z); + } - float4 res; - if (param.clip) { - res = fmin(fmax(box, 0.0), 1.0); + int aspect_to = 0; + if (param.maxSizeSize > 0) { + aspect_to = gid.z - 2; } else { - res = box; + aspect_to = gid.z - 1; } - outBoxTexture.write(half4(res), gid.xy, gid.z); - } - - float4 variance = variances[0]; - if (gid.z < param.numPriors) { - float4 variances_output; - variances_output.x = variance.x; - variances_output.y = variance.y; - variances_output.z = variance.z; - variances_output.w = variance.w; - varianceTexture.write(half4(variances_output), gid.xy, gid.z); - } + if (aspect_to > 0 && aspect_to < int(param.aspecRatiosSize) && fabs(aspect_ratios[aspect_to] - 1.) > 1e-6) { + float ar = aspect_ratios[aspect_to]; + + box_width = param.minSize * sqrt(ar) / 2; + box_height = param.minSize / sqrt(ar) / 2; + float4 box; + box.x = (center_x - box_width) / param.imageWidth; + box.y = (center_y - box_height) / param.imageHeight; + box.z = (center_x + box_width) / param.imageWidth; + box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = fmin(fmax(box, 0.0), 1.0); + } else { + res = box; + } + + outBoxTexture.write(half4(res), gid.xy, gid.z); + } + + float4 variance = variances[0]; + if (gid.z < param.numPriors) { + float4 variances_output; + variances_output.x = variance.x; + variances_output.y = variance.y; + variances_output.z = variance.z; + variances_output.w = variance.w; + varianceTexture.write(half4(variances_output), gid.xy, gid.z); + } } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReluKernel.metal index e725440bbe..725222d75e 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReluKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReluKernel.metal @@ -17,25 +17,25 @@ using namespace metal; kernel void relu_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero); - const half4 input = inTexture.read(gid.xy, gid.z); - const float4 relu = fmax((float4)input, 0.0); - outTexture.write(half4(relu), gid.xy, gid.z); + texture2d_array outTexture [[texture(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero); + const half4 input = inTexture.read(gid.xy, gid.z); + const float4 relu = fmax((float4)input, 0.0); + outTexture.write(half4(relu), gid.xy, gid.z); } kernel void relu(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero); - const float4 input = inTexture.read(gid.xy, gid.z); - const float4 relu = fmax((float4)input, 0.0); - outTexture.write(float4(relu), gid.xy, gid.z); + texture2d_array outTexture [[texture(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero); + const float4 input = inTexture.read(gid.xy, gid.z); + const float4 relu = fmax((float4)input, 0.0); + outTexture.write(float4(relu), gid.xy, gid.z); } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.inc.metal index 7583537c2b..3037e404a3 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.inc.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.inc.metal @@ -24,43 +24,43 @@ #define FUNC_R(f, r) CONCAT2_(f, r) kernel void FUNC(reshape, RIN, ROUT, P)(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant ReshapeParam &rp [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - - int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, oabcd[4], ixyzn[4], iabcd[4]; - ReshapeParam lrp = rp; - int oC = lrp.odim[lrp.otrans[3]]; - int iC = lrp.idim[lrp.itrans[3]]; - int count = lrp.odim[0] * lrp.odim[1] * lrp.odim[2] * lrp.odim[3]; - VECTOR(P, 4) r; - for (int n = 0; n < 4; n++) { - oxyzn[3] = n; + texture2d_array outTexture [[texture(1)]], + constant ReshapeParam &rp [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + + int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, oabcd[4], ixyzn[4], iabcd[4]; + ReshapeParam lrp = rp; + int oC = lrp.odim[lrp.otrans[3]]; + int iC = lrp.idim[lrp.itrans[3]]; + int count = lrp.odim[0] * lrp.odim[1] * lrp.odim[2] * lrp.odim[3]; + VECTOR(P, 4) r; + for (int n = 0; n < 4; n++) { + oxyzn[3] = n; #if ROUT == 4 - xyzn2abcd_4(oC, oxyzn, oabcd); + xyzn2abcd_4(oC, oxyzn, oabcd); #else - FUNC_R(xyzn2abcd, ROUT)(oxyzn, oabcd); + FUNC_R(xyzn2abcd, ROUT)(oxyzn, oabcd); #endif - int tabcd[4]; - invtrans(lrp.otrans, oabcd, tabcd); - int index = abcd2index(lrp.odim, tabcd); - if (index < count) { - index2abcd(lrp.idim, index, tabcd); - trans(lrp.itrans, tabcd, iabcd); + int tabcd[4]; + invtrans(lrp.otrans, oabcd, tabcd); + int index = abcd2index(lrp.odim, tabcd); + if (index < count) { + index2abcd(lrp.idim, index, tabcd); + trans(lrp.itrans, tabcd, iabcd); #if RIN == 4 - abcd2xyzn_4(iC, iabcd, ixyzn); + abcd2xyzn_4(iC, iabcd, ixyzn); #else - FUNC_R(abcd2xyzn, RIN)(iabcd, ixyzn); + FUNC_R(abcd2xyzn, RIN)(iabcd, ixyzn); #endif - r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]]; - } else { - r[n] = 0; + r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]]; + } else { + r[n] = 0; + } } - } - outTexture.write(r, gid.xy, gid.z); + outTexture.write(r, gid.xy, gid.z); } #endif diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.metal index d2f5815d42..bb155a87a3 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.metal @@ -18,10 +18,10 @@ using namespace metal; struct ReshapeParam { - int32_t idim[4]; - int32_t itrans[4]; - int32_t odim[4]; - int32_t otrans[4]; + int32_t idim[4]; + int32_t itrans[4]; + int32_t odim[4]; + int32_t otrans[4]; }; #define P float diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ResizeBilinear.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ResizeBilinear.metal index fbb4e12cb8..3cca15d551 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ResizeBilinear.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ResizeBilinear.metal @@ -16,60 +16,60 @@ using namespace metal; struct resize_bilinear_param { -// int32_t out_h; -// int32_t out_w; - float ratio_h; - float ratio_w; + // int32_t out_h; + // int32_t out_w; + float ratio_h; + float ratio_w; }; kernel void resize_bilinear(texture2d_array input [[texture(0)]], - texture2d_array output [[texture(2)]], - constant resize_bilinear_param & pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - float4 r; - if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) { - r = input.read(gid.xy, gid.z); - } else { - float w = gid.x * pm.ratio_w; - float h = gid.y * pm.ratio_h; - uint w0 = w, h0 = h; - uint w1 = w0 + 1, h1 = h0 + 1; - float w1lambda = w - w0, h1lambda = h - h0; - float w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda; - if (w1 >= input.get_width()) w1 = w0; - if (h1 >= input.get_height()) h1 = h0; - float4 r0 = input.read(uint2(w0, h0), gid.z); - float4 r1 = input.read(uint2(w1, h0), gid.z); - float4 r2 = input.read(uint2(w0, h1), gid.z); - float4 r3 = input.read(uint2(w1, h1), gid.z); - r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3); - } - output.write(r, gid.xy, gid.z); + texture2d_array output [[texture(2)]], + constant resize_bilinear_param & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + float4 r; + if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) { + r = input.read(gid.xy, gid.z); + } else { + float w = gid.x * pm.ratio_w; + float h = gid.y * pm.ratio_h; + uint w0 = w, h0 = h; + uint w1 = w0 + 1, h1 = h0 + 1; + float w1lambda = w - w0, h1lambda = h - h0; + float w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda; + if (w1 >= input.get_width()) w1 = w0; + if (h1 >= input.get_height()) h1 = h0; + float4 r0 = input.read(uint2(w0, h0), gid.z); + float4 r1 = input.read(uint2(w1, h0), gid.z); + float4 r2 = input.read(uint2(w0, h1), gid.z); + float4 r3 = input.read(uint2(w1, h1), gid.z); + r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3); + } + output.write(r, gid.xy, gid.z); } kernel void resize_bilinear_half(texture2d_array input [[texture(0)]], - texture2d_array output [[texture(2)]], - constant resize_bilinear_param & pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - - half4 r; - if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) { - r = input.read(gid.xy, gid.z); - } else { - half w = gid.x * pm.ratio_w; - half h = gid.y * pm.ratio_h; - uint w0 = w, h0 = h; - uint w1 = w0 + 1, h1 = h0 + 1; - half w1lambda = w - w0, h1lambda = h - h0; - half w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda; - if (w1 >= input.get_width()) w1 = w0; - if (h1 >= input.get_height()) h1 = h0; - half4 r0 = input.read(uint2(w0, h0), gid.z); - half4 r1 = input.read(uint2(w1, h0), gid.z); - half4 r2 = input.read(uint2(w0, h1), gid.z); - half4 r3 = input.read(uint2(w1, h1), gid.z); - r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3); - } - output.write(r, gid.xy, gid.z); - output.write(r, gid.xy, gid.z); + texture2d_array output [[texture(2)]], + constant resize_bilinear_param & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + half4 r; + if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) { + r = input.read(gid.xy, gid.z); + } else { + half w = gid.x * pm.ratio_w; + half h = gid.y * pm.ratio_h; + uint w0 = w, h0 = h; + uint w1 = w0 + 1, h1 = h0 + 1; + half w1lambda = w - w0, h1lambda = h - h0; + half w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda; + if (w1 >= input.get_width()) w1 = w0; + if (h1 >= input.get_height()) h1 = h0; + half4 r0 = input.read(uint2(w0, h0), gid.z); + half4 r1 = input.read(uint2(w1, h0), gid.z); + half4 r2 = input.read(uint2(w0, h1), gid.z); + half4 r3 = input.read(uint2(w1, h1), gid.z); + r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3); + } + output.write(r, gid.xy, gid.z); + output.write(r, gid.xy, gid.z); } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Scale.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Scale.metal index ae4ccdef75..62b5fd0c92 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Scale.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Scale.metal @@ -10,21 +10,21 @@ using namespace metal; kernel void scale(texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) return; - float w_stride = inTexture.get_width() / outTexture.get_width(); - float h_stride = inTexture.get_height() / outTexture.get_height(); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - float4 input = inTexture.sample(sample, float2(gid.x * w_stride, gid.y * h_stride), 0); - outTexture.write(input, gid); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) return; + float w_stride = inTexture.get_width() / outTexture.get_width(); + float h_stride = inTexture.get_height() / outTexture.get_height(); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 input = inTexture.sample(sample, float2(gid.x * w_stride, gid.y * h_stride), 0); + outTexture.write(input, gid); } kernel void scale_half(texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) return; - float w_stride = inTexture.get_width() / outTexture.get_width(); - float h_stride = inTexture.get_height() / outTexture.get_height(); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - float4 input = inTexture.sample(sample, float2(gid.x * w_stride, gid.y * h_stride), 0); - outTexture.write(half4(input), gid); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) return; + float w_stride = inTexture.get_width() / outTexture.get_width(); + float h_stride = inTexture.get_height() / outTexture.get_height(); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 input = inTexture.sample(sample, float2(gid.x * w_stride, gid.y * h_stride), 0); + outTexture.write(half4(input), gid); } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.inc.metal index 455cf1471b..3affcadd79 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.inc.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.inc.metal @@ -21,41 +21,41 @@ #define VECTOR(p, n) CONCAT2(p, n) kernel void FUNC(softmax, P)(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant SoftmaxParam &sp [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; -// int zsize = inTexture.get_array_size(); - P maxv = inTexture.read(uint2(0, gid.y), 0)[0]; - int group = sp.K / 4; - int remain = sp.K % 4; - for (int x = 0; x < group; x++) { - VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0); - maxv = max(maxv, max(r[0], max(r[1], max(r[2], r[3])))); - } - if (remain > 0) { - VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0); - for (int i = 0; i < remain; i++) { - maxv = max(maxv, r[i]); + texture2d_array outTexture [[texture(1)]], + constant SoftmaxParam &sp [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + // int zsize = inTexture.get_array_size(); + P maxv = inTexture.read(uint2(0, gid.y), 0)[0]; + int group = sp.K / 4; + int remain = sp.K % 4; + for (int x = 0; x < group; x++) { + VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0); + maxv = max(maxv, max(r[0], max(r[1], max(r[2], r[3])))); } - } - VECTOR(P, 4) rsum = {0, 0, 0, 0}; - for (int x = 0; x < group; x++) { - VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0); - rsum += exp(r - maxv); - } - P sum = rsum[0] + rsum[1] + rsum[2] + rsum[3]; - if (remain > 0) { - VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0); - for (int i = 0; i < remain; i++) { - sum += exp(r[i] - maxv); + if (remain > 0) { + VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0); + for (int i = 0; i < remain; i++) { + maxv = max(maxv, r[i]); + } } - } - VECTOR(P, 4) rr = inTexture.read(gid.xy, gid.z); - rr = exp(rr - maxv) / sum; - outTexture.write(rr, gid.xy, gid.z); + VECTOR(P, 4) rsum = {0, 0, 0, 0}; + for (int x = 0; x < group; x++) { + VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0); + rsum += exp(r - maxv); + } + P sum = rsum[0] + rsum[1] + rsum[2] + rsum[3]; + if (remain > 0) { + VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0); + for (int i = 0; i < remain; i++) { + sum += exp(r[i] - maxv); + } + } + VECTOR(P, 4) rr = inTexture.read(gid.xy, gid.z); + rr = exp(rr - maxv) / sum; + outTexture.write(rr, gid.xy, gid.z); } #endif diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.metal index 67c279a444..f4bc8de4bc 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.metal @@ -16,8 +16,8 @@ using namespace metal; struct SoftmaxParam { - int N; - int K; + int N; + int K; }; #define P float diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.inc.metal index 54e3f21e79..1c9bcc7e18 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.inc.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.inc.metal @@ -36,41 +36,41 @@ #if V == VY kernel void FUNC(split, R, N, VV, P)(texture2d_array input [[texture(0)]], - texture2d_array out1 [[texture(1)]], - texture2d_array out2 [[texture(2)]], + texture2d_array out1 [[texture(1)]], + texture2d_array out2 [[texture(2)]], #if N >= 3 - texture2d_array out3 [[texture(3)]], + texture2d_array out3 [[texture(3)]], #endif // N >= 3 #if N >= 4 - texture2d_array out4 [[texture(4)]], + texture2d_array out4 [[texture(4)]], #endif // N >= 4 - constant SplitParam &sp [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - - VECTOR(P, 4) r = input.read(gid.xy, gid.z); - int y = gid.y - sp.offset; - if (y < sp.vdim[0]) { - out1.write(r, gid.xy, gid.z); - return; - } - y -= sp.vdim[0]; - if (y < sp.vdim[1]) { - out2.write(r, uint2(gid.x, y), gid.z); - return; - } + constant SplitParam &sp [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + VECTOR(P, 4) r = input.read(gid.xy, gid.z); + int y = gid.y - sp.offset; + if (y < sp.vdim[0]) { + out1.write(r, gid.xy, gid.z); + return; + } + y -= sp.vdim[0]; + if (y < sp.vdim[1]) { + out2.write(r, uint2(gid.x, y), gid.z); + return; + } #if N >= 3 - y -= sp.vdim[1]; - if (y < sp.vdim[2]) { - out3.write(r, uint2(gid.x, y), gid.z); - return; - } + y -= sp.vdim[1]; + if (y < sp.vdim[2]) { + out3.write(r, uint2(gid.x, y), gid.z); + return; + } #endif // N >= 3 #if N >= 4 - y -= sp.vdim[2]; - if (y < sp.vdim[3]) { - out4.write(r, uint2(gid.x, y), gid.z); - return; - } + y -= sp.vdim[2]; + if (y < sp.vdim[3]) { + out4.write(r, uint2(gid.x, y), gid.z); + return; + } #endif // N >= 4 } #endif // V == VY @@ -88,30 +88,30 @@ kernel void FUNC(split, R, N, VV, P)(texture2d_array input [[te #endif // N >= 4 constant SplitParam &sp [[buffer(0)]], uint3 gid [[thread_position_in_grid]]) { - VECTOR(P, 4) r = input.read(gid.xy, gid.z); - int x = gid.x; - if (x < sp.vdim[0]) { - out1.write(r, gid.xy, gid.z); - return; - } - x -= sp.vdim[0]; - if (x < sp.vdim[1]) { - out2.write(r, uint2(x, gid.y), gid.z); - return; - } + VECTOR(P, 4) r = input.read(gid.xy, gid.z); + int x = gid.x; + if (x < sp.vdim[0]) { + out1.write(r, gid.xy, gid.z); + return; + } + x -= sp.vdim[0]; + if (x < sp.vdim[1]) { + out2.write(r, uint2(x, gid.y), gid.z); + return; + } #if N >= 3 - x -= sp.vdim[1]; - if (x < sp.vdim[2]) { - out3.write(r, uint2(x, gid.y), gid.z); - return; - } + x -= sp.vdim[1]; + if (x < sp.vdim[2]) { + out3.write(r, uint2(x, gid.y), gid.z); + return; + } #endif // N >= 3 #if N >= 4 - x -= sp.vdim[2]; - if (x < sp.vdim[3]) { - out4.write(r, uint2(x, gid.y), gid.z); - return; - } + x -= sp.vdim[2]; + if (x < sp.vdim[3]) { + out4.write(r, uint2(x, gid.y), gid.z); + return; + } #endif // N >= 4 } #endif // V == VX diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.metal index 4c1e818d2b..d167608fbb 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.metal @@ -18,11 +18,11 @@ using namespace metal; struct SplitParam { - int32_t idim[4]; - int32_t axis; - int32_t offset; - int32_t trans[4]; - int32_t vdim[4]; + int32_t idim[4]; + int32_t axis; + int32_t offset; + int32_t trans[4]; + int32_t vdim[4]; }; #define VNORMAL 1 @@ -36,29 +36,29 @@ struct SplitParam { //// ssd-ar: (R=3, N=2, V=y) #define V VY - #define R 3 - #define N 2 - #define P float - #include "Split.inc.metal" - #undef P - #define P half - #include "Split.inc.metal" - #undef P - #undef N - #undef R +#define R 3 +#define N 2 +#define P float +#include "Split.inc.metal" +#undef P +#define P half +#include "Split.inc.metal" +#undef P +#undef N +#undef R #undef V //// ssd-ar: (R=2, N=2, V=y) #define V VY - #define R 2 - #define N 2 - #define P float - #include "Split.inc.metal" - #undef P - #define P half - #include "Split.inc.metal" - #undef P - #undef N - #undef R +#define R 2 +#define N 2 +#define P float +#include "Split.inc.metal" +#undef P +#define P half +#include "Split.inc.metal" +#undef P +#undef N +#undef R #undef V diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.inc.metal index 534166e45f..d80361da46 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.inc.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.inc.metal @@ -22,39 +22,39 @@ #define VECTOR(p, n) CONCAT2(p, n) kernel void FUNC(transpose, R, P)(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant TransposeParam &pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - VECTOR(P, 4) r; - int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}; - int iabcd[4], oabcd[4], ixyzn[4]; - for (int n = 0; n < 4; n++) { - oxyzn[3] = n; + texture2d_array outTexture [[texture(1)]], + constant TransposeParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + VECTOR(P, 4) r; + int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}; + int iabcd[4], oabcd[4], ixyzn[4]; + for (int n = 0; n < 4; n++) { + oxyzn[3] = n; #if R == 4 - xyzn2abcd_4(pm.oC, oxyzn, iabcd); + xyzn2abcd_4(pm.oC, oxyzn, iabcd); #endif // R == 4 #if R == 3 - xyzn2abcd_3(oxyzn, oabcd); + xyzn2abcd_3(oxyzn, oabcd); #endif // R == 3 #if R == 2 - xyzn2abcd_2(oxyzn, oabcd); + xyzn2abcd_2(oxyzn, oabcd); #endif // R == 2 - iabcd[pm.axis[0]] = oabcd[0]; - iabcd[pm.axis[1]] = oabcd[1]; - iabcd[pm.axis[2]] = oabcd[2]; - iabcd[pm.axis[3]] = oabcd[3]; + iabcd[pm.axis[0]] = oabcd[0]; + iabcd[pm.axis[1]] = oabcd[1]; + iabcd[pm.axis[2]] = oabcd[2]; + iabcd[pm.axis[3]] = oabcd[3]; #if R == 4 - abcd2xyzn_4(pm.iC, iabcd, ixyzn); + abcd2xyzn_4(pm.iC, iabcd, ixyzn); #endif // R == 4 #if R == 3 - abcd2xyzn_3(iabcd, ixyzn); + abcd2xyzn_3(iabcd, ixyzn); #endif // R == 3 #if R == 2 - abcd2xyzn_2(iabcd, ixyzn); + abcd2xyzn_2(iabcd, ixyzn); #endif // R == 2 - r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]]; - } - outTexture.write(r, gid.xy, gid.z); + r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]]; + } + outTexture.write(r, gid.xy, gid.z); } #endif diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.metal index 321663b9b7..66c22f0388 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.metal @@ -17,47 +17,47 @@ using namespace metal; struct TransposeParam { - int iC; - int oC; - int axis[4]; + int iC; + int oC; + int axis[4]; }; kernel void transpose_copy_float(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant TransposeParam &pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z); + texture2d_array outTexture [[texture(1)]], + constant TransposeParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z); } kernel void transpose_copy_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant TransposeParam &pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z); + texture2d_array outTexture [[texture(1)]], + constant TransposeParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z); } #define R 4 - #define P float - #include "TransposeKernel.inc.metal" - #undef P - #define P half - #include "TransposeKernel.inc.metal" - #undef P +#define P float +#include "TransposeKernel.inc.metal" +#undef P +#define P half +#include "TransposeKernel.inc.metal" +#undef P #undef R #define R 3 - #define P float - #include "TransposeKernel.inc.metal" - #undef P - #define P half - #include "TransposeKernel.inc.metal" - #undef P +#define P float +#include "TransposeKernel.inc.metal" +#undef P +#define P half +#include "TransposeKernel.inc.metal" +#undef P #undef R #define R 2 - #define P float - #include "TransposeKernel.inc.metal" - #undef P - #define P half - #include "TransposeKernel.inc.metal" - #undef P +#define P float +#include "TransposeKernel.inc.metal" +#undef P +#define P half +#include "TransposeKernel.inc.metal" +#undef P #undef R diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift index 7817befaed..29730fd3b6 100644 --- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift +++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift @@ -1,11 +1,11 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - + http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -16,35 +16,35 @@ import UIKit @UIApplicationMain class AppDelegate: UIResponder, UIApplicationDelegate { - + var window: UIWindow? - + func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplicationLaunchOptionsKey: Any]?) -> Bool { // Override point for customization after application launch. return true } - + func applicationWillResignActive(_ application: UIApplication) { // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state. // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game. } - + func applicationDidEnterBackground(_ application: UIApplication) { // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later. // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits. } - + func applicationWillEnterForeground(_ application: UIApplication) { // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background. } - + func applicationDidBecomeActive(_ application: UIApplication) { // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface. } - + func applicationWillTerminate(_ application: UIApplication) { // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:. } - - + + } diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift index 98f03affa2..4c5886c7c1 100644 --- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift +++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift @@ -1,11 +1,11 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - + http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -27,9 +27,9 @@ class ViewController: UIViewController { inQueue: queue ) test.testConcat() -// test.testReshape() -// test.testTranspose() + // test.testReshape() + // test.testTranspose() print(" done ") } - + } diff --git a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj index 3aa4e88541..afa580e3cb 100644 --- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj +++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj @@ -741,7 +741,7 @@ CODE_SIGN_IDENTITY = "iPhone Developer"; CODE_SIGN_STYLE = Automatic; DEFINES_MODULE = YES; - DEVELOPMENT_TEAM = ""; + DEVELOPMENT_TEAM = A798K58VVL; DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_CURRENT_VERSION = 1; DYLIB_INSTALL_NAME_BASE = "@rpath"; @@ -778,7 +778,7 @@ CODE_SIGN_IDENTITY = "iPhone Developer"; CODE_SIGN_STYLE = Automatic; DEFINES_MODULE = YES; - DEVELOPMENT_TEAM = ""; + DEVELOPMENT_TEAM = A798K58VVL; DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_CURRENT_VERSION = 1; DYLIB_INSTALL_NAME_BASE = "@rpath"; diff --git a/metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift b/metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift index da66460d8b..ba15d890a4 100644 --- a/metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift +++ b/metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift @@ -15,24 +15,26 @@ import Foundation @objc public enum MetalLoadMode: Int { - case - LoadMetalInPaddleMobile = 1, // 使用 paddle-mobile 中的 metal 代码 - LoadMetalInDefaultLib = 2, // 使用 main bundle 中的 metal 代码 - LoadMetalInCustomMetalLib = 3 // 使用 metal 库文件 + case + LoadMetalInPaddleMobile = 1, // 使用 paddle-mobile 中的 metal 代码 + LoadMetalInDefaultLib = 2, // 使用 main bundle 中的 metal 代码 + LoadMetalInCustomMetalLib = 3 // 使用 metal 库文件 } @objc public enum ComputePrecision: Int { - case - Float32 = 1, - Float16 = 2 + case + Float32 = 1, + Float16 = 2 } @objc public class GlobalConfig: NSObject { - - /// 单例 - @objc public static let shared: GlobalConfig = GlobalConfig.init() - - /// 运算精度, runner 生命周期中不可变 - @objc public var computePrecision: ComputePrecision = .Float16 - + + /// 单例 + @objc public static let shared: GlobalConfig = GlobalConfig.init() + + /// 运算精度, runner 生命周期中不可变 + @objc public var computePrecision: ComputePrecision = .Float16 + + /// 是否开启 log + @objc public var debug: Bool = true; } diff --git a/metal/paddle-mobile/paddle-mobile/API/Net.swift b/metal/paddle-mobile/paddle-mobile/API/Net.swift index 33cedb5712..5087ebfd82 100644 --- a/metal/paddle-mobile/paddle-mobile/API/Net.swift +++ b/metal/paddle-mobile/paddle-mobile/API/Net.swift @@ -17,74 +17,74 @@ import Foundation /// 网络的基类, 参数已经给了默认值,请在子类实现中修改需要改的参数 @objc open class Net: NSObject { - - /// 默认为0, 如果指定个数, 后边 except 个op不使用 GPU 运算, 中间结果会通过 fetchResult 传参过来 - @objc public var except: Int = 0 - - /// 预处理 kernel, 如果输入图像需要预处理, 则指定预处理 kernel - @objc public var preprocessKernel: CusomKernel? = nil - - // 以下四个参数为从内存中读取模型时用到的参数 - /// 模型在内存中的指针 - @objc public var modelPointer: UnsafeMutableRawPointer? = nil - - /// 模型大小 单位: 字节 - @objc public var modelSize: Int = 0 - - /// 权重参数在内存中的指针 - @objc public var paramPointer: UnsafeMutableRawPointer? = nil - - /// 权重大小 单位: 字节 - @objc public var paramSize: Int = 0 - - // 以下两个为从文件中读取模型时用到的参数 - /// 模型文件路径 - @objc public var modelPath: String? = nil - - /// 权重文件路径 - @objc public var paramPath: String? = nil - - /// 代表着 GPU 处理器 - @objc public let device: MTLDevice - - /// metal 代码加载方式 注意: 如果静态库只能使用 LoadMetalInDefaultLib LoadMetalInCustomMetalLib 进行 load metal 代码 - @objc public var metalLoadMode: MetalLoadMode = .LoadMetalInPaddleMobile - - /// 当 metalLoadMode 为 LoadMetalInCustomMetalLib 时, metal library 路径不能为空 - @objc public var metalLibPath: String? = nil - - /// 输入维度,按照 n h w c 方式传入 - @objc public var inputDim: Dim = Dim.init(inDim: []) - - - @objc public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) { - self.paramPointer = inParamPointer - self.paramSize = inParamSize - self.modelPointer = inModelPointer - self.modelSize = inModelSize - self.device = device - super.init() - } - - @objc public init(device: MTLDevice) { - self.device = device - super.init() - } - - @objc open func resultStr(res: [ResultHolder]) -> String { - fatalError() - } - - @objc open func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] { - return paddleMobileRes.map { (gpuRes) -> ResultHolder in - guard let inResPointer = gpuRes.resultPointer else { + + /// 默认为0, 如果指定个数, 后边 except 个op不使用 GPU 运算, 中间结果会通过 fetchResult 传参过来 + @objc public var except: Int = 0 + + /// 预处理 kernel, 如果输入图像需要预处理, 则指定预处理 kernel + @objc public var preprocessKernel: CusomKernel? = nil + + // 以下四个参数为从内存中读取模型时用到的参数 + /// 模型在内存中的指针 + @objc public var modelPointer: UnsafeMutableRawPointer? = nil + + /// 模型大小 单位: 字节 + @objc public var modelSize: Int = 0 + + /// 权重参数在内存中的指针 + @objc public var paramPointer: UnsafeMutableRawPointer? = nil + + /// 权重大小 单位: 字节 + @objc public var paramSize: Int = 0 + + // 以下两个为从文件中读取模型时用到的参数 + /// 模型文件路径 + @objc public var modelPath: String? = nil + + /// 权重文件路径 + @objc public var paramPath: String? = nil + + /// 代表着 GPU 处理器 + @objc public let device: MTLDevice + + /// metal 代码加载方式 注意: 如果静态库只能使用 LoadMetalInDefaultLib LoadMetalInCustomMetalLib 进行 load metal 代码 + @objc public var metalLoadMode: MetalLoadMode = .LoadMetalInPaddleMobile + + /// 当 metalLoadMode 为 LoadMetalInCustomMetalLib 时, metal library 路径不能为空 + @objc public var metalLibPath: String? = nil + + /// 输入维度,按照 n h w c 方式传入 + @objc public var inputDim: Dim = Dim.init(inDim: []) + + + @objc public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) { + self.paramPointer = inParamPointer + self.paramSize = inParamSize + self.modelPointer = inModelPointer + self.modelSize = inModelSize + self.device = device + super.init() + } + + @objc public init(device: MTLDevice) { + self.device = device + super.init() + } + + @objc open func resultStr(res: [ResultHolder]) -> String { fatalError() - } - return ResultHolder.init(inResult: inResPointer, inCapacity: gpuRes.capacity, inDim: gpuRes.dim) } - } - - open func updateProgram(program: Program) { - } - + + @objc open func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] { + return paddleMobileRes.map { (gpuRes) -> ResultHolder in + guard let inResPointer = gpuRes.resultPointer else { + fatalError() + } + return ResultHolder.init(inResult: inResPointer, inCapacity: gpuRes.capacity, inDim: gpuRes.dim) + } + } + + open func updateProgram(program: Program) { + } + } diff --git a/metal/paddle-mobile/paddle-mobile/API/Runner.swift b/metal/paddle-mobile/paddle-mobile/API/Runner.swift index 2d7bf9d190..d6c30805eb 100644 --- a/metal/paddle-mobile/paddle-mobile/API/Runner.swift +++ b/metal/paddle-mobile/paddle-mobile/API/Runner.swift @@ -16,187 +16,187 @@ import MetalKit import Foundation @objc public class ResultHolder: NSObject { - @objc public let result: UnsafeMutablePointer - @objc public let capacity: Int - @objc public let dim: [Int] - - init(inResult: UnsafeMutablePointer, inCapacity: Int, inDim: [Int]) { - result = inResult - capacity = inCapacity - dim = inDim - } - - @objc public func releasePointer() { - result.deinitialize(count: capacity) - result.deallocate() - } + @objc public let result: UnsafeMutablePointer + @objc public let capacity: Int + @objc public let dim: [Int] + + init(inResult: UnsafeMutablePointer, inCapacity: Int, inDim: [Int]) { + result = inResult + capacity = inCapacity + dim = inDim + } + + @objc public func releasePointer() { + result.deinitialize(count: capacity) + result.deallocate() + } } @objc public class Runner: NSObject { - var program: Program? - var executor: Executor? - var queue: MTLCommandQueue? - var textureLoader: MTKTextureLoader? - public let net: Net - let device: MTLDevice? - let numel: Int - - /// 初始化函数 - /// - /// - Parameters: - /// - inNet: 传入自定义的网络 - /// - commandQueue: commandQueue - @objc public init(inNet: Net, commandQueue: MTLCommandQueue?) { - guard inNet.inputDim.cout() == 4 else { - fatalError(" input dim count must 4 ") - } + var program: Program? + var executor: Executor? + var queue: MTLCommandQueue? + var textureLoader: MTKTextureLoader? + public let net: Net + let device: MTLDevice? + let numel: Int - net = inNet - queue = commandQueue - device = queue?.device - if let inDevice = device { - textureLoader = MTKTextureLoader.init(device: inDevice) - } - numel = net.inputDim.numel() - } - - /// load 模型, 返回 true 可进行预测 - /// - /// - Returns: load 成功或失败 - @objc public func load() -> Bool { - guard let inDevice = device, let inQueue = queue else { - print(" paddle mobile gpu load error, need MTLCommandQueue") - return false - } - let loader = Loader.init() - do { - - if let inParamPointer = net.paramPointer, let inModelPointer = net.modelPointer { - guard net.paramSize > 0 && net.modelSize > 0 else { - print(" load from memory param size or model size can't 0 ") - return false - } - program = try loader.load(device: inDevice, paramPointer: inParamPointer, paramSize: net.paramSize,modePointer:inModelPointer,modelSize:net.modelSize) - } else if let inModelPath = net.modelPath, let inParamPath = net.paramPath { - program = try loader.load(device: inDevice, modelPath: inModelPath, paraPath: inParamPath) - } else { - print(" model pointer or model file path need be specified") - return false + /// 初始化函数 + /// + /// - Parameters: + /// - inNet: 传入自定义的网络 + /// - commandQueue: commandQueue + @objc public init(inNet: Net, commandQueue: MTLCommandQueue?) { + guard inNet.inputDim.cout() == 4 else { + fatalError(" input dim count must 4 ") } - let initContext: InitContext = InitContext.init() - initContext.metalLoadMode = net.metalLoadMode - initContext.metalLibPath = net.metalLibPath - executor = try Executor.init(inDevice: inDevice, inQueue: inQueue, inProgram: program!, initContext: initContext) - net.updateProgram(program: program!) - } catch let error { - print(error) - return false - } - return true - } - - /// 预测 - /// - /// - Parameters: - /// - texture: 输入 texture 需要使用 getTexture 获得 - /// - completion: 结果回调, 当 success 为 true 时 result 不为 nil - @objc public func predict(texture: MTLTexture, completion: @escaping ( _ success: Bool, _ result: [ResultHolder]?) -> Void) { - do { - try self.executor?.predict(input: texture, dim: self.net.inputDim, completionHandle: { [weak self] (res) in - guard let SSelf = self else { - fatalError( " self nil " ) + net = inNet + queue = commandQueue + device = queue?.device + if let inDevice = device { + textureLoader = MTKTextureLoader.init(device: inDevice) } - let result = SSelf.net.fetchResult(paddleMobileRes: res) - completion(true, result) - }, preProcessKernle: self.net.preprocessKernel, except: self.net.except) - } catch let error { - print(error) - completion(false, nil) - return - } - } - - /// 清理内存, 调用此函数后, 不能再使用, 需重新 load - @objc public func clear() { - executor?.clear() - executor = nil - program = nil - } - - /// 获取 texture, 对 texture 进行预处理, 预测时使用 - /// - /// - Parameters: - /// - image: 输入图像 - /// - getTexture: 获取 texture 回调 - @objc public func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void) { - let texture = try? textureLoader?.newTexture(cgImage: image, options: [:]) ?! " texture loader error" - scaleTexture(input: texture!, complete: getTexture) - } - - /// 通过 buffer 获取 texture, 内部会使用GPU进行转换操作 - /// - /// - Parameters: - /// - inBuffer: 输入buffer - /// - getTexture: 结果回调 - @objc public func getTexture(inBuffer: MTLBuffer, getTexture: @escaping (MTLTexture) -> Void) { - guard let inQueue = queue, let inDevice = device else { - fatalError( " queue or devcie nil " ) + numel = net.inputDim.numel() } - guard let buffer = inQueue.makeCommandBuffer() else { - fatalError( " make buffer error" ) - } - - let bufferToTextureKernel = BufferToTextureKernel.init(device: inDevice, outputDim: Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: net.inputDim[3]), metalLoadMode: net.metalLoadMode, metalLibPath: net.metalLibPath) - do { - try bufferToTextureKernel.compute(inputBuffer: inBuffer, commandBuffer: buffer) - } catch { - fatalError(" bufferToTextureKernel error ") + /// load 模型, 返回 true 可进行预测 + /// + /// - Returns: load 成功或失败 + @objc public func load() -> Bool { + guard let inDevice = device, let inQueue = queue else { + print(" paddle mobile gpu load error, need MTLCommandQueue") + return false + } + let loader = Loader.init() + do { + + if let inParamPointer = net.paramPointer, let inModelPointer = net.modelPointer { + guard net.paramSize > 0 && net.modelSize > 0 else { + print(" load from memory param size or model size can't 0 ") + return false + } + program = try loader.load(device: inDevice, paramPointer: inParamPointer, paramSize: net.paramSize,modePointer:inModelPointer,modelSize:net.modelSize) + } else if let inModelPath = net.modelPath, let inParamPath = net.paramPath { + program = try loader.load(device: inDevice, modelPath: inModelPath, paraPath: inParamPath) + } else { + print(" model pointer or model file path need be specified") + return false + } + + let initContext: InitContext = InitContext.init() + initContext.metalLoadMode = net.metalLoadMode + initContext.metalLibPath = net.metalLibPath + executor = try Executor.init(inDevice: inDevice, inQueue: inQueue, inProgram: program!, initContext: initContext) + net.updateProgram(program: program!) + } catch let error { + print(error) + return false + } + return true } - buffer.addCompletedHandler { (buffer) in - getTexture(bufferToTextureKernel.outputTexture) + /// 预测 + /// + /// - Parameters: + /// - texture: 输入 texture 需要使用 getTexture 获得 + /// - completion: 结果回调, 当 success 为 true 时 result 不为 nil + @objc public func predict(texture: MTLTexture, completion: @escaping ( _ success: Bool, _ result: [ResultHolder]?) -> Void) { + do { + try self.executor?.predict(input: texture, dim: self.net.inputDim, completionHandle: { [weak self] (res) in + guard let SSelf = self else { + fatalError( " self nil " ) + } + let result = SSelf.net.fetchResult(paddleMobileRes: res) + completion(true, result) + }, preProcessKernle: self.net.preprocessKernel, except: self.net.except) + } catch let error { + print(error) + completion(false, nil) + return + } } - buffer.commit() - } - - /// 更新输入维度, 针对可变长输入模型 - /// - /// - Parameter inDim: 输入维度 - @objc public func updateInputDim(inDim: Dim) { - if net.inputDim != inDim { - guard let inProgram = program else { - fatalError(" need load first ") - } - net.inputDim = inDim - net.updateProgram(program: inProgram) + /// 清理内存, 调用此函数后, 不能再使用, 需重新 load + @objc public func clear() { + executor?.clear() + executor = nil + program = nil } - } - - public func scaleTexture(input: MTLTexture , complete: @escaping (MTLTexture) -> Void) { - guard let inQueue = queue, let inDevice = device else { - fatalError( " queue or devcie nil " ) + /// 获取 texture, 对 texture 进行预处理, 预测时使用 + /// + /// - Parameters: + /// - image: 输入图像 + /// - getTexture: 获取 texture 回调 + @objc public func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void) { + let texture = try? textureLoader?.newTexture(cgImage: image, options: [:]) ?! " texture loader error" + scaleTexture(input: texture!, complete: getTexture) } - guard let buffer = inQueue.makeCommandBuffer() else { - fatalError( " make buffer error" ) + /// 通过 buffer 获取 texture, 内部会使用GPU进行转换操作 + /// + /// - Parameters: + /// - inBuffer: 输入buffer + /// - getTexture: 结果回调 + @objc public func getTexture(inBuffer: MTLBuffer, getTexture: @escaping (MTLTexture) -> Void) { + guard let inQueue = queue, let inDevice = device else { + fatalError( " queue or devcie nil " ) + } + + guard let buffer = inQueue.makeCommandBuffer() else { + fatalError( " make buffer error" ) + } + + let bufferToTextureKernel = BufferToTextureKernel.init(device: inDevice, outputDim: Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: net.inputDim[3]), metalLoadMode: net.metalLoadMode, metalLibPath: net.metalLibPath) + do { + try bufferToTextureKernel.compute(inputBuffer: inBuffer, commandBuffer: buffer) + } catch { + fatalError(" bufferToTextureKernel error ") + } + + buffer.addCompletedHandler { (buffer) in + getTexture(bufferToTextureKernel.outputTexture) + } + + buffer.commit() } - let scaleKernel = ScaleKernel.init(device: inDevice, shape: Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: 3), metalLoadMode: net.metalLoadMode, metalLibPath: net.metalLibPath) - - do { - try scaleKernel.compute(inputTexuture: input, commandBuffer: buffer) - } catch let error { - print(error) - fatalError() + /// 更新输入维度, 针对可变长输入模型 + /// + /// - Parameter inDim: 输入维度 + @objc public func updateInputDim(inDim: Dim) { + if net.inputDim != inDim { + guard let inProgram = program else { + fatalError(" need load first ") + } + net.inputDim = inDim + net.updateProgram(program: inProgram) + } } - buffer.addCompletedHandler { (buffer) in - complete(scaleKernel.outputTexture) + public func scaleTexture(input: MTLTexture , complete: @escaping (MTLTexture) -> Void) { + + guard let inQueue = queue, let inDevice = device else { + fatalError( " queue or devcie nil " ) + } + + guard let buffer = inQueue.makeCommandBuffer() else { + fatalError( " make buffer error" ) + } + + let scaleKernel = ScaleKernel.init(device: inDevice, shape: Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: 3), metalLoadMode: net.metalLoadMode, metalLibPath: net.metalLibPath) + + do { + try scaleKernel.compute(inputTexuture: input, commandBuffer: buffer) + } catch let error { + print(error) + fatalError() + } + + buffer.addCompletedHandler { (buffer) in + complete(scaleKernel.outputTexture) + } + buffer.commit() } - buffer.commit() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/Extensions.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/Extensions.swift index 12bc909be9..64786d0a45 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Common/Extensions.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Common/Extensions.swift @@ -16,128 +16,128 @@ import Foundation // 自定义 ?! 如果 ?! 前的返回值为一个可选值, 则进行隐式解包, 如果有值则返回这个值, 如果为nil 则fatalError 传入的信息 precedencegroup ExecutedOrFatalError{ - associativity: left - higherThan: AssignmentPrecedence + associativity: left + higherThan: AssignmentPrecedence } infix operator ?!: ExecutedOrFatalError public func ?!(option: T?, excuteOrError: @autoclosure () -> String) -> T{ - if let inOpt = option { - return inOpt - }else{ - print(excuteOrError()) - fatalError(excuteOrError()) - } + if let inOpt = option { + return inOpt + }else{ + print(excuteOrError()) + fatalError(excuteOrError()) + } } //Lense struct Lense { - let from: (A) -> B - let to: (B, A) -> A + let from: (A) -> B + let to: (B, A) -> A } precedencegroup CombineLense{ - associativity: left - higherThan: AssignmentPrecedence + associativity: left + higherThan: AssignmentPrecedence } infix operator >>>: CombineLense func >>>(left: Lense, right: Lense) -> Lense { - return Lense.init(from: { (a) -> C in - left.from(right.from(a)) - }, to: { (c, a) -> A in - right.to( left.to(c, right.from(a)),a) - }) + return Lense.init(from: { (a) -> C in + left.from(right.from(a)) + }, to: { (c, a) -> A in + right.to( left.to(c, right.from(a)),a) + }) } protocol CIntIndex { - associatedtype T; - subscript(index: CInt) -> T { get set}; + associatedtype T; + subscript(index: CInt) -> T { get set}; } extension Array: CIntIndex{ - typealias T = Element - subscript(index: CInt) -> T { - get{ - guard Int64(Int.max) >= Int64(index) else{ - fatalError("cint index out of Int range") - } - return self[Int(index)] - } - set{ - guard Int64(Int.max) >= Int64(index) else{ - fatalError("cint index out of Int range") - } - self[Int(index)] = newValue + typealias T = Element + subscript(index: CInt) -> T { + get{ + guard Int64(Int.max) >= Int64(index) else{ + fatalError("cint index out of Int range") + } + return self[Int(index)] + } + set{ + guard Int64(Int.max) >= Int64(index) else{ + fatalError("cint index out of Int range") + } + self[Int(index)] = newValue + } + } - - } } extension Array where Element: AnyObject{ - mutating func remove(element: Element) { - if let index = index(where: { (node) -> Bool in - return unsafeBitCast(element, to: Int.self) == unsafeBitCast(node, to: Int.self) - }) { - remove(at: index) + mutating func remove(element: Element) { + if let index = index(where: { (node) -> Bool in + return unsafeBitCast(element, to: Int.self) == unsafeBitCast(node, to: Int.self) + }) { + remove(at: index) + } } - } - + } //MARK: Array extension extension Array where Element: Comparable{ - - /// 返回数组前 r 个元素, 并将元素处于原数组的位置作为元组的第一个元素返回 - /// - /// - Parameter r: 前 r 个元素 - /// - Returns: [(原有位置, 排好位置的元素)] - public func top(r: Int) -> [(Int, Element)] { - precondition(r <= self.count) - return Array<(Int, Element)>(zip(0.. $1.1 }.prefix(through: r - 1)) - } + + /// 返回数组前 r 个元素, 并将元素处于原数组的位置作为元组的第一个元素返回 + /// + /// - Parameter r: 前 r 个元素 + /// - Returns: [(原有位置, 排好位置的元素)] + public func top(r: Int) -> [(Int, Element)] { + precondition(r <= self.count) + return Array<(Int, Element)>(zip(0.. $1.1 }.prefix(through: r - 1)) + } } extension Array { - public func strideArray(inCount: Int = 20) -> [(Int, Element)] { - if count < inCount { - return (0.. [(Int, Element)] { + if count < inCount { + return (0.., count: Int) -> [Float32] { - var arr: [Float32] = [] - for i in 0.., count: Int) -> [Float32] { + var arr: [Float32] = [] + for i in 0.. [Pointee]{ - var arr: [Pointee] = [] - for i in 0.. [Pointee]{ + var arr: [Pointee] = [] + for i in 0.. UnsafePointer? { - return (self as NSString).utf8String - } + func cStr() -> UnsafePointer? { + return (self as NSString).utf8String + } } func address(o: T) -> String { - return String.init(format: "%018p", unsafeBitCast(o, to: Int.self)) + return String.init(format: "%018p", unsafeBitCast(o, to: Int.self)) } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift index c3ba777b27..35fffb52ec 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift @@ -21,615 +21,615 @@ fileprivate var paddleMobileMetalLibrary: MTLLibrary? fileprivate var customMetalLibrary: MTLLibrary? extension MTLDevice { - func defaultLibrary() -> MTLLibrary { - if defaultMetalLibrary == nil { - defaultMetalLibrary = makeDefaultLibrary() - } - if let inDefaultLib = defaultMetalLibrary { - return inDefaultLib - } else { - fatalError(" default metal libary is nil") - } - } - - func customLibrary(metalLibPath: String) -> MTLLibrary { - if customMetalLibrary == nil { - do { - customMetalLibrary = try makeLibrary(filepath: metalLibPath) - } catch let error { - fatalError("\(error)") - } + func defaultLibrary() -> MTLLibrary { + if defaultMetalLibrary == nil { + defaultMetalLibrary = makeDefaultLibrary() + } + if let inDefaultLib = defaultMetalLibrary { + return inDefaultLib + } else { + fatalError(" default metal libary is nil") + } } - if let inMetalLib = customMetalLibrary { - return inMetalLib - } else { - fatalError(" customlib is nil ") - } - } - - func paddleMobileLibrary() -> MTLLibrary { - if paddleMobileMetalLibrary == nil { - guard let path = Bundle.init(for: Kernel.self).path(forResource: "default", ofType: "metallib") else { - fatalError("Counld't find paddle mobile library") - } - do { - paddleMobileMetalLibrary = try makeLibrary(filepath: path) - } catch _ { - fatalError("Counld't load paddle mobile library") - } + func customLibrary(metalLibPath: String) -> MTLLibrary { + if customMetalLibrary == nil { + do { + customMetalLibrary = try makeLibrary(filepath: metalLibPath) + } catch let error { + fatalError("\(error)") + } + } + + if let inMetalLib = customMetalLibrary { + return inMetalLib + } else { + fatalError(" customlib is nil ") + } } - if let inPaddleMobileLib = paddleMobileMetalLibrary { - return inPaddleMobileLib - } else { - fatalError("PaddleMobile metal libary is nil") - } - } - - func pipeLine(funcName: String, metalLoadMode: MetalLoadMode, metalLibPath: String?) -> MTLComputePipelineState { - let useLib: MTLLibrary - switch metalLoadMode { - case .LoadMetalInDefaultLib: - useLib = defaultLibrary() - case .LoadMetalInPaddleMobile: - useLib = paddleMobileLibrary() - case .LoadMetalInCustomMetalLib: - useLib = customLibrary(metalLibPath: metalLibPath ?! " can't be nil ") - default: - fatalError() + func paddleMobileLibrary() -> MTLLibrary { + if paddleMobileMetalLibrary == nil { + guard let path = Bundle.init(for: Kernel.self).path(forResource: "default", ofType: "metallib") else { + fatalError("Counld't find paddle mobile library") + } + do { + paddleMobileMetalLibrary = try makeLibrary(filepath: path) + } catch _ { + fatalError("Counld't load paddle mobile library") + } + } + + if let inPaddleMobileLib = paddleMobileMetalLibrary { + return inPaddleMobileLib + } else { + fatalError("PaddleMobile metal libary is nil") + } } - guard let function = useLib.makeFunction(name: funcName) else { - fatalError(" function " + funcName + " not found") - } - do { - let pipLine = try makeComputePipelineState(function: function) - return pipLine - } catch let error { - print(error) - fatalError("make pip line error occured : \(error)") + func pipeLine(funcName: String, metalLoadMode: MetalLoadMode, metalLibPath: String?) -> MTLComputePipelineState { + let useLib: MTLLibrary + switch metalLoadMode { + case .LoadMetalInDefaultLib: + useLib = defaultLibrary() + case .LoadMetalInPaddleMobile: + useLib = paddleMobileLibrary() + case .LoadMetalInCustomMetalLib: + useLib = customLibrary(metalLibPath: metalLibPath ?! " can't be nil ") + default: + fatalError() + } + + guard let function = useLib.makeFunction(name: funcName) else { + fatalError(" function " + funcName + " not found") + } + do { + let pipLine = try makeComputePipelineState(function: function) + return pipLine + } catch let error { + print(error) + fatalError("make pip line error occured : \(error)") + } + } - } - - func makeBuffer

(value: [P]) -> MTLBuffer { - let buffer = makeBuffer(length: value.count * MemoryLayout

.size, options: MTLResourceOptions.storageModeShared) - let contents = buffer?.contents().bindMemory(to: P.self, capacity: value.count * MemoryLayout

.size) - for i in 0..(texture: MTLTexture, cb: ([Int], P)->Void) -> Void { - let bpR = texture.width * 4 * MemoryLayout

.size - let bpI = texture.height * bpR - let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: 1)) - for i in 0.. = UnsafeMutablePointer

.allocate(capacity: bpI) - texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i) - for tx in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { - var tdim: [Int] = [1, 1, 1, 1] - for i in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { - var tdim: [Int] = [1, 1, 1, 1] - for i in 0..(value: [P]) -> MTLBuffer { + let buffer = makeBuffer(length: value.count * MemoryLayout

.size, options: MTLResourceOptions.storageModeShared) + let contents = buffer?.contents().bindMemory(to: P.self, capacity: value.count * MemoryLayout

.size) + for i in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { - var tdim: [Int] = [1, 1, 1, 1] - for i in 0..(texture: MTLTexture, cb: ([Int], P)->Void) -> Void { + let bpR = texture.width * 4 * MemoryLayout

.size + let bpI = texture.height * bpR + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: 1)) + for i in 0.. = UnsafeMutablePointer

.allocate(capacity: bpI) + texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i) + for tx in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { - if dim.count == 3 { - return texture2tensor_3(texture: texture, dim: dim, transpose: transpose) - } else if dim.count == 2 { - return texture2tensor_2(texture: texture, dim: dim, transpose: transpose) - } else if dim.count == 1 { - return texture2tensor_1(texture: texture, dim: dim, transpose: transpose) - } - var tdim: [Int] = [1, 1, 1, 1] - for i in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { + var tdim: [Int] = [1, 1, 1, 1] + for i in 0..(value: [P], dim: [Int], transpose: [Int] = [0, 1, 2, 3], inComputePrecision: ComputePrecision = .Float32) -> MTLTexture { - if value.count > 0 { - assert(value.count == dim.reduce(1) { $0 * $1 }) + func texture2tensor_2

(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { + var tdim: [Int] = [1, 1, 1, 1] + for i in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { + var tdim: [Int] = [1, 1, 1, 1] + for i in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { + if dim.count == 3 { + return texture2tensor_3(texture: texture, dim: dim, transpose: transpose) + } else if dim.count == 2 { + return texture2tensor_2(texture: texture, dim: dim, transpose: transpose) + } else if dim.count == 1 { + return texture2tensor_1(texture: texture, dim: dim, transpose: transpose) + } + var tdim: [Int] = [1, 1, 1, 1] + for i in 0.. 0 { - var rcount: Int = (ndim[0] * ndim[3] + 3) / 4 - rcount = rcount * 4 * ndim[1] * ndim[2] - var nvalue: [Float32] = .init(repeating: 0.0, count: rcount) - - for i0 in 0.. = UnsafeMutablePointer(mutating: nvalue) - let outputP: UnsafeMutablePointer = UnsafeMutablePointer(mutating: xvalue) - float32ToFloat16(input: pointer, output: outputP, count: rcount) - let bpR = ndim[2] * 4 * 2 - let bpI = ndim[1] * bpR - for i in 0.. = UnsafeMutablePointer(mutating: nvalue) - let bpR = ndim[2] * 4 * MemoryLayout

.size - let bpI = ndim[1] * bpR - for i in 0..(value: [P], dim: [Int], transpose: [Int] = [0, 1, 2, 3], inComputePrecision: ComputePrecision = .Float32) -> MTLTexture { + if value.count > 0 { + assert(value.count == dim.reduce(1) { $0 * $1 }) + } + + var tdim: [Int] = [1, 1, 1, 1] + for i in 0.. 0 { + var rcount: Int = (ndim[0] * ndim[3] + 3) / 4 + rcount = rcount * 4 * ndim[1] * ndim[2] + var nvalue: [Float32] = .init(repeating: 0.0, count: rcount) + + for i0 in 0.. = UnsafeMutablePointer(mutating: nvalue) + let outputP: UnsafeMutablePointer = UnsafeMutablePointer(mutating: xvalue) + float32ToFloat16(input: pointer, output: outputP, count: rcount) + let bpR = ndim[2] * 4 * 2 + let bpI = ndim[1] * bpR + for i in 0.. = UnsafeMutablePointer(mutating: nvalue) + let bpR = ndim[2] * 4 * MemoryLayout

.size + let bpI = ndim[1] * bpR + for i in 0..(value: [P], textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture{ - - let textureDesc = MTLTextureDescriptor.init() - textureDesc.width = textureWidth - textureDesc.height = textureHeight - textureDesc.depth = 1 - textureDesc.usage = [.shaderRead, .shaderWrite] - textureDesc.pixelFormat = .rgba32Float - textureDesc.textureType = .type2DArray - textureDesc.storageMode = .shared - textureDesc.cpuCacheMode = .defaultCache - textureDesc.arrayLength = arrayLength - let texture = makeTexture(descriptor: textureDesc)! - if value.count >= 4{ - let counts = arrayLength * 4 * textureWidth * textureHeight - let pointer: UnsafeMutablePointer

= UnsafeMutablePointer

.allocate(capacity: counts * MemoryLayout

.size) - for i in 0...size - let bytesPerImage = texture.height * bytesPerRow - let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: texture.depth)) - for i in 0..(value: [P], textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture{ + + let textureDesc = MTLTextureDescriptor.init() + textureDesc.width = textureWidth + textureDesc.height = textureHeight + textureDesc.depth = 1 + textureDesc.usage = [.shaderRead, .shaderWrite] + textureDesc.pixelFormat = .rgba32Float + textureDesc.textureType = .type2DArray + textureDesc.storageMode = .shared + textureDesc.cpuCacheMode = .defaultCache + textureDesc.arrayLength = arrayLength + let texture = makeTexture(descriptor: textureDesc)! + + if value.count >= 4{ + let counts = arrayLength * 4 * textureWidth * textureHeight + let pointer: UnsafeMutablePointer

= UnsafeMutablePointer

.allocate(capacity: counts * MemoryLayout

.size) + for i in 0...size + let bytesPerImage = texture.height * bytesPerRow + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: texture.depth)) + for i in 0..(stridable: Bool = true) -> [(index: Int, value: P)] { - var arr: [P] = floatArray { (p: P) -> P in - return p; - } - var result: [(index: Int, value: P)] = [] - if arr.count > 100 && stridable { - for j in stride(from: 0, to: arr.count , by: arr.count / 100){ - result.append((j, arr[j])) - } - } else { - for j in 0..(res: (P) -> T) -> [T] { - var fArr: [T] = [] - if textureType == .type2DArray { - for i in 0...size, alignment: MemoryLayout

.alignment) - let bytesPerRow = width * depth * 4 * MemoryLayout

.size - let bytesPerImage = width * height * depth * 4 * MemoryLayout

.size - let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) - getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i) - let p = bytes.assumingMemoryBound(to: P.self) - - for j in 0...size, alignment: MemoryLayout

.alignment) - let bytesPerRow = width * depth * 4 * MemoryLayout

.size - let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) - getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0) - let p = bytes.assumingMemoryBound(to: P.self) - - for j in 0..(stridable: Bool = true) -> [(index: Int, value: P)] { + var arr: [P] = floatArray { (p: P) -> P in + return p; + } + var result: [(index: Int, value: P)] = [] + if arr.count > 100 && stridable { + for j in stride(from: 0, to: arr.count , by: arr.count / 100){ + result.append((j, arr[j])) + } + } else { + for j in 0.. [Float32] { - if pixelFormat == .rgba32Float { - let float32Array = floatArray { (f: Float32) -> Float32 in - return f - } - return float32Array - } else if pixelFormat == .rgba16Float { - - var float16Array = floatArray { (f: Float16) -> Float16 in - return f - } - return float16To32(input: &float16Array, count: float16Array.count) - } else { - fatalError() + + func floatArray(res: (P) -> T) -> [T] { + var fArr: [T] = [] + if textureType == .type2DArray { + for i in 0...size, alignment: MemoryLayout

.alignment) + let bytesPerRow = width * depth * 4 * MemoryLayout

.size + let bytesPerImage = width * height * depth * 4 * MemoryLayout

.size + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) + getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i) + let p = bytes.assumingMemoryBound(to: P.self) + + for j in 0...size, alignment: MemoryLayout

.alignment) + let bytesPerRow = width * depth * 4 * MemoryLayout

.size + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) + getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0) + let p = bytes.assumingMemoryBound(to: P.self) + + for j in 0..(header: String = "", stridable: Bool = true) -> T? { - print(header) - print("texture: \(self)") - // let res: [(index: Int, value: T)] = stridableFloatArray(stridable: stridable) - // print(res) - if textureType == .type2DArray { - for i in 0...size, alignment: MemoryLayout.alignment) - let bytesPerRow = width * depth * 4 * MemoryLayout.size - let bytesPerImage = width * height * depth * 4 * MemoryLayout.size - let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) - getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i) - let p = bytes.assumingMemoryBound(to: T.self) - str += "2d array count : \(width * height * depth * 4) \n" - if stridable && width * height * depth * 4 > 20 { - for j in stride(from: 0, to: width * height * depth * 4 , by: width * height * depth * 4 / 20){ - str += " index \(j): \(p[j])" - } + func float32Array() -> [Float32] { + if pixelFormat == .rgba32Float { + let float32Array = floatArray { (f: Float32) -> Float32 in + return f + } + return float32Array + } else if pixelFormat == .rgba16Float { + + var float16Array = floatArray { (f: Float16) -> Float16 in + return f + } + return float16To32(input: &float16Array, count: float16Array.count) } else { - for j in 0...size, alignment: MemoryLayout.alignment) - let bytesPerRow = width * depth * 4 * MemoryLayout.size - let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) - getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0) - let p = bytes.assumingMemoryBound(to: T.self) - str += "2d count : \(width * width * 4) \n" - - if stridable { - for j in stride(from: 0, to: width * height * 4, by: width * height * 4 / 20){ - str += "index \(j): \(p[j]) " - } - } else { - for j in 0.. [Float32] { - var textureArray: [Float32] - if pixelFormat == .rgba32Float { - textureArray = floatArray { (i : Float32) -> Float32 in - return i - } - } else if pixelFormat == .rgba16Float { - - var textureFloat16Array = floatArray { (i : Float16) -> Float16 in - return i - } - textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count) - } else { - fatalError(" 目前还不支持其他类型 ") - } - print(textureArray.count) - var output: [Float32] = [] - for s in 0..(header: String = "", stridable: Bool = true) -> T? { + print(header) + print("texture: \(self)") + // let res: [(index: Int, value: T)] = stridableFloatArray(stridable: stridable) + // print(res) + + if textureType == .type2DArray { + for i in 0...size, alignment: MemoryLayout.alignment) + let bytesPerRow = width * depth * 4 * MemoryLayout.size + let bytesPerImage = width * height * depth * 4 * MemoryLayout.size + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) + getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i) + let p = bytes.assumingMemoryBound(to: T.self) + str += "2d array count : \(width * height * depth * 4) \n" + if stridable && width * height * depth * 4 > 20 { + for j in stride(from: 0, to: width * height * depth * 4 , by: width * height * depth * 4 / 20){ + str += " index \(j): \(p[j])" + } + } else { + for j in 0...size, alignment: MemoryLayout.alignment) + let bytesPerRow = width * depth * 4 * MemoryLayout.size + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) + getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0) + let p = bytes.assumingMemoryBound(to: T.self) + str += "2d count : \(width * width * 4) \n" + + if stridable { + for j in stride(from: 0, to: width * height * 4, by: width * height * 4 / 20){ + str += "index \(j): \(p[j]) " + } + } else { + for j in 0.. [Float32] { -// print("origin dim: \(dim)") -// print("texture: ") -// print(self) - var textureArray: [Float32] - if pixelFormat == .rgba32Float { - textureArray = floatArray { (i : Float32) -> Float32 in - return i - } - } else if pixelFormat == .rgba16Float { - var textureFloat16Array = floatArray { (i : Float16) -> Float16 in - return i - } - textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count) - } else { - fatalError(" 目前还不支持其他类型 ") + // n c h w - dim + func toTensor(dim: (n: Int, c: Int, h: Int, w: Int)) -> [Float32] { + var textureArray: [Float32] + if pixelFormat == .rgba32Float { + textureArray = floatArray { (i : Float32) -> Float32 in + return i + } + } else if pixelFormat == .rgba16Float { + + var textureFloat16Array = floatArray { (i : Float16) -> Float16 in + return i + } + textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count) + } else { + fatalError(" 目前还不支持其他类型 ") + } + print(textureArray.count) + var output: [Float32] = [] + for s in 0.. dim.c { - for i in 0..<(4 - ((sliceIndex * 4 + 4) - dim.c)) { - let value = textureArray[sliceIndex * numOfASlice + h * dim.w * 4 + w * 4 + i] - output.append(value) - } - } else { - for i in 0..<4 { - let value = textureArray[sliceIndex * numOfASlice + h * dim.w * 4 + w * 4 + i] - output.append(value) - } - } - } - } + func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int)) -> [Float32] { + // print("origin dim: \(dim)") + // print("texture: ") + // print(self) + + var textureArray: [Float32] + if pixelFormat == .rgba32Float { + textureArray = floatArray { (i : Float32) -> Float32 in + return i + } + } else if pixelFormat == .rgba16Float { + var textureFloat16Array = floatArray { (i : Float16) -> Float16 in + return i + } + textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count) + } else { + fatalError(" 目前还不支持其他类型 ") + } + + var output: [Float32] = [] + let numOfASlice = dim.h * dim.w * 4 + for h in 0.. dim.c { + for i in 0..<(4 - ((sliceIndex * 4 + 4) - dim.c)) { + let value = textureArray[sliceIndex * numOfASlice + h * dim.w * 4 + w * 4 + i] + output.append(value) + } + } else { + for i in 0..<4 { + let value = textureArray[sliceIndex * numOfASlice + h * dim.w * 4 + w * 4 + i] + output.append(value) + } + } + } + } + } + return output } - return output - } - + } public extension MTLBuffer { - func logDesc(header: String = "", stridable: Bool = true) -> T? { - print(header) - print("MTLBuffer: \(self) ") - var str = "" - if stridable && length/MemoryLayout.stride > 1000{ - for j in stride(from: 0, to: length, by: length/MemoryLayout.stride / 100){ - str += " \(contents().assumingMemoryBound(to: T.self)[j])" - } - } else { - for i in 0...size { - str += " \(contents().assumingMemoryBound(to: T.self)[i])" - } + func logDesc(header: String = "", stridable: Bool = true) -> T? { + print(header) + print("MTLBuffer: \(self) ") + var str = "" + if stridable && length/MemoryLayout.stride > 1000{ + for j in stride(from: 0, to: length, by: length/MemoryLayout.stride / 100){ + str += " \(contents().assumingMemoryBound(to: T.self)[j])" + } + } else { + for i in 0...size { + str += " \(contents().assumingMemoryBound(to: T.self)[i])" + } + } + print(str) + return nil } - print(str) - return nil - } - - func makeTexture(textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture { - let textureDesc = MTLTextureDescriptor.init() - textureDesc.width = textureWidth - textureDesc.height = textureHeight - textureDesc.depth = 1 - textureDesc.usage = [.shaderRead, .shaderWrite] - textureDesc.pixelFormat = .rgba32Float - textureDesc.textureType = .type2DArray - textureDesc.storageMode = .shared - textureDesc.cpuCacheMode = .defaultCache - textureDesc.arrayLength = arrayLength - let texture = makeTexture(descriptor: textureDesc, offset: 0, bytesPerRow: textureWidth * 4 * 4)! - return texture - } - - func array() -> [T] { - var array: [T] = [] - let pointer = contents().bindMemory(to: T.self, capacity: length) - for i in 0..<(length / MemoryLayout.size) { - array.append(pointer[i]) + + func makeTexture(textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture { + let textureDesc = MTLTextureDescriptor.init() + textureDesc.width = textureWidth + textureDesc.height = textureHeight + textureDesc.depth = 1 + textureDesc.usage = [.shaderRead, .shaderWrite] + textureDesc.pixelFormat = .rgba32Float + textureDesc.textureType = .type2DArray + textureDesc.storageMode = .shared + textureDesc.cpuCacheMode = .defaultCache + textureDesc.arrayLength = arrayLength + let texture = makeTexture(descriptor: textureDesc, offset: 0, bytesPerRow: textureWidth * 4 * 4)! + return texture + } + + func array() -> [T] { + var array: [T] = [] + let pointer = contents().bindMemory(to: T.self, capacity: length) + for i in 0..<(length / MemoryLayout.size) { + array.append(pointer[i]) + } + return array; } - return array; - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift index 724a44b0f4..52c27ccead 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift @@ -89,135 +89,135 @@ public class PaddleMobileUnitTest { } public func testConcat() { -// let buffer = queue.makeCommandBuffer() ?! "buffer is nil" -// var it: [[Float32]] = [] -// for _ in 0..<7 { -// it.append((0..<12).map { Float32($0) }) -// } -// let input = it.map { device.tensor2texture(value: $0, dim: [3, 4]) } -// let output = device.tensor2texture(value: [Float32](), dim: [3, 28]) -// -// let param = ConcatTestParam.init( -// input: input, -// output: output, -// dims: [[3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]], -// axis: 1, -// odim: [3, 28] -// ) -// let concatKernel = ConcatKernel.init(device: device, testParam: param) -// concatKernel.test(cmdBuffer: buffer, param: param) -// buffer.addCompletedHandler { (buffer) in -// for i in 0...init(device: device, testParam: param) + // concatKernel.test(cmdBuffer: buffer, param: param) + // buffer.addCompletedHandler { (buffer) in + // for i in 0...init(device: device, testParam: param) -// reshapeKernel.test(commandBuffer: buffer, testParam: param) -// buffer.addCompletedHandler { (buffer) in -// let _: Float32? = inTexture.logDesc() -// let _: Float32? = outTexture.logDesc() -// self.tensorPrint(tensor: input, dim: [2, 3, 4]) -// let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [4, 6]) -// self.tensorPrint(tensor: tx, dim: [4, 6]) -// } + // let buffer = queue.makeCommandBuffer() ?! "buffer is nil" + // let input: [Float32] = (0..<24).map { Float32($0) } + // let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4]) + // let outTexture = device.tensor2texture(value: [Float32](), dim: [4, 6]) + // let mp = ReshapeMetalParam.init( + // idim: (1, 2, 3, 4), + // itrans: (0, 1, 2, 3), + // odim: (1, 1, 4, 6), + // otrans: (0, 1, 2, 3) + // ) + // let param = ReshapeTestParam.init( + // inputTexture: inTexture, + // outputTexture: outTexture, + // param: mp + // ) + // let reshapeKernel = ReshapeKernel.init(device: device, testParam: param) + // reshapeKernel.test(commandBuffer: buffer, testParam: param) + // buffer.addCompletedHandler { (buffer) in + // let _: Float32? = inTexture.logDesc() + // let _: Float32? = outTexture.logDesc() + // self.tensorPrint(tensor: input, dim: [2, 3, 4]) + // let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [4, 6]) + // self.tensorPrint(tensor: tx, dim: [4, 6]) + // } -// let input: [Float32] = (0..<24).map { Float32($0) } -// let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4]) -// let outTexture = device.tensor2texture(value: [Float32](), dim: [24]) -// let mp = ReshapeMetalParam.init( -// idim: (1, 2, 3, 4), -// itrans: (0, 1, 2, 3), -// odim: (1, 1, 1, 24), -// otrans: (0, 1, 2, 3) -// ) -// let param = ReshapeTestParam.init( -// inputTexture: inTexture, -// outputTexture: outTexture, -// param: mp -// ) -// let reshapeKernel = ReshapeKernel.init(device: device, testParam: param) -// reshapeKernel.test(commandBuffer: buffer, testParam: param) -// buffer.addCompletedHandler { (buffer) in -// let _: Float32? = inTexture.logDesc() -// let _: Float32? = outTexture.logDesc() -// self.tensorPrint(tensor: input, dim: [2, 3, 4]) -// let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [24]) -// self.tensorPrint(tensor: tx, dim: [24]) -// } -// -// -// buffer.commit() + // let input: [Float32] = (0..<24).map { Float32($0) } + // let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4]) + // let outTexture = device.tensor2texture(value: [Float32](), dim: [24]) + // let mp = ReshapeMetalParam.init( + // idim: (1, 2, 3, 4), + // itrans: (0, 1, 2, 3), + // odim: (1, 1, 1, 24), + // otrans: (0, 1, 2, 3) + // ) + // let param = ReshapeTestParam.init( + // inputTexture: inTexture, + // outputTexture: outTexture, + // param: mp + // ) + // let reshapeKernel = ReshapeKernel.init(device: device, testParam: param) + // reshapeKernel.test(commandBuffer: buffer, testParam: param) + // buffer.addCompletedHandler { (buffer) in + // let _: Float32? = inTexture.logDesc() + // let _: Float32? = outTexture.logDesc() + // self.tensorPrint(tensor: input, dim: [2, 3, 4]) + // let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [24]) + // self.tensorPrint(tensor: tx, dim: [24]) + // } + // + // + // buffer.commit() } public func testTranspose() { - + let buffer = queue.makeCommandBuffer() ?! "buffer is nil" -// var input: [Float32] = [] -// for i in 0..<72 { -// input.append(Float32(i)) -// } -//// let inputTexture = device.makeFloatTexture(value: input, textureWidth: 3, textureHeight: 2, arrayLength: 3) -// let inputTexture = device.tensor2texture(value: input, dim: [4, 3, 2, 3]); -// // group 1 -// let outputTexture = device.tensor2texture(value: [Float32](), dim: [3, 3, 2, 4]) -// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 3, oC: 4, axis: [3, 1, 2, 0]) -//// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [3, 0, 2, 1]) -//// // group 2 -//// let outputTexture = device.makeFloatTexture(value: [Float32](), textureWidth: 3, textureHeight: 3, arrayLength: 6) -//// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 4, axis: [3, 0, 2, 1]) -//// -// let transposeKernel = TransposeKernel.init(device: device, testParam: param) -// -// transposeKernel.test(commandBuffer: buffer, param: param) -// -// buffer.addCompletedHandler { (buffer) in -// let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false) -// let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false) -// self.tensorPrint(tensor: input, dim: [4, 3, 2, 3]) -// let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 3, 2, 4]) -// self.tensorPrint(tensor: tx, dim: [3, 3, 2, 4]) -// } -// -// let input: [Float32] = (0..<24).map { Float32($0) } -// let inputTexture = device.tensor2texture(value: input, dim: [2, 3, 4]) -// let outputTexture = device.tensor2texture(value: [Float](), dim: [3, 4, 2]) -// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [0, 2, 3, 1]) -// let transposeKernel = TransposeKernel.init(device: device, testParam: param) -// -// transposeKernel.test(commandBuffer: buffer, param: param) -// -// buffer.addCompletedHandler { (buffer) in -// let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false) -// let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false) -// self.tensorPrint(tensor: input, dim: [2, 3, 4]) -// let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 4, 2]) -// self.tensorPrint(tensor: tx, dim: [3, 4, 2]) -// } -// + // var input: [Float32] = [] + // for i in 0..<72 { + // input.append(Float32(i)) + // } + //// let inputTexture = device.makeFloatTexture(value: input, textureWidth: 3, textureHeight: 2, arrayLength: 3) + // let inputTexture = device.tensor2texture(value: input, dim: [4, 3, 2, 3]); + // // group 1 + // let outputTexture = device.tensor2texture(value: [Float32](), dim: [3, 3, 2, 4]) + // let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 3, oC: 4, axis: [3, 1, 2, 0]) + //// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [3, 0, 2, 1]) + //// // group 2 + //// let outputTexture = device.makeFloatTexture(value: [Float32](), textureWidth: 3, textureHeight: 3, arrayLength: 6) + //// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 4, axis: [3, 0, 2, 1]) + //// + // let transposeKernel = TransposeKernel.init(device: device, testParam: param) + // + // transposeKernel.test(commandBuffer: buffer, param: param) + // + // buffer.addCompletedHandler { (buffer) in + // let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false) + // let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false) + // self.tensorPrint(tensor: input, dim: [4, 3, 2, 3]) + // let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 3, 2, 4]) + // self.tensorPrint(tensor: tx, dim: [3, 3, 2, 4]) + // } + // + // let input: [Float32] = (0..<24).map { Float32($0) } + // let inputTexture = device.tensor2texture(value: input, dim: [2, 3, 4]) + // let outputTexture = device.tensor2texture(value: [Float](), dim: [3, 4, 2]) + // let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [0, 2, 3, 1]) + // let transposeKernel = TransposeKernel.init(device: device, testParam: param) + // + // transposeKernel.test(commandBuffer: buffer, param: param) + // + // buffer.addCompletedHandler { (buffer) in + // let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false) + // let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false) + // self.tensorPrint(tensor: input, dim: [2, 3, 4]) + // let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 4, 2]) + // self.tensorPrint(tensor: tx, dim: [3, 4, 2]) + // } + // buffer.commit() } @@ -225,72 +225,72 @@ public class PaddleMobileUnitTest { let buffer = queue.makeCommandBuffer() ?! " buffer is nil " let input: [Float32] = [ - 1.0, 2.0, 3.0, 4.0, - 1.0, 2.0, 3.0, 4.0, - 1.0, 2.0, 3.0, 4.0, - - 1.0, 2.0, 3.0, 4.0, - 1.0, 2.0, 3.0, 4.0, - 1.0, 2.0, 3.0, 4.0, - - 1.0, 2.0, 3.0, 4.0, - 1.0, 2.0, 3.0, 4.0, - 1.0, 2.0, 3.0, 4.0, - ] + 1.0, 2.0, 3.0, 4.0, + 1.0, 2.0, 3.0, 4.0, + 1.0, 2.0, 3.0, 4.0, + + 1.0, 2.0, 3.0, 4.0, + 1.0, 2.0, 3.0, 4.0, + 1.0, 2.0, 3.0, 4.0, + + 1.0, 2.0, 3.0, 4.0, + 1.0, 2.0, 3.0, 4.0, + 1.0, 2.0, 3.0, 4.0, + ] let filter: [Float32] = [ - //1.0 - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - //2.0 - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - //3.0 - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - //4.0 - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - ] + //1.0 + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + //2.0 + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + //3.0 + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + //4.0 + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + ] let biase: [Float32] = [1.0, 1.0, 1.0, 100.0] let newScalue: [Float32] = [1.0, 1.0, 1.0, 1.0] @@ -324,10 +324,10 @@ public class PaddleMobileUnitTest { let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize) - let initContext = InitContext.init() - initContext.metalLoadMode = .LoadMetalInDefaultLib + let initContext = InitContext.init() + initContext.metalLoadMode = .LoadMetalInDefaultLib - let convAddBnReluKernel = ConvAddBatchNormReluKernel.init(device: device, testParam: param, initContext: initContext) + let convAddBnReluKernel = ConvAddBatchNormReluKernel.init(device: device, testParam: param, initContext: initContext) convAddBnReluKernel.test(commandBuffer: buffer, param: param) diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/Types.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/Types.swift index ae7b898a8e..701bb37bf2 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Common/Types.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Common/Types.swift @@ -16,222 +16,222 @@ import Foundation import Accelerate public protocol SummableMultipliable: Equatable { - static func +(lhs: Self, rhs: Self) -> Self - static func *(lhs: Self, rhs: Self) -> Self - static func -(lhs: Self, rhs: Self) -> Self + static func +(lhs: Self, rhs: Self) -> Self + static func *(lhs: Self, rhs: Self) -> Self + static func -(lhs: Self, rhs: Self) -> Self } public protocol PrecisionType: SummableMultipliable{ - init(inFloat: Float32) - init(inFloat16: Float16) - init(_ inP: P) - static var bitSize: UInt { get } + init(inFloat: Float32) + init(inFloat16: Float16) + init(_ inP: P) + static var bitSize: UInt { get } } public typealias Float16 = Int16 extension Float16: PrecisionType { - public static func * (prefix: Float16, postfix: Float16) { - return prefix * postfix - } - - public init

(_ inP: P) where P : PrecisionType { - if P.bitSize == Float32.bitSize { - self = Float16(inFloat: inP as! Float32) - } else if P.bitSize == Float16.bitSize { - self = inP as! Float16 - } else { - fatalError() + public static func * (prefix: Float16, postfix: Float16) { + return prefix * postfix + } + + public init

(_ inP: P) where P : PrecisionType { + if P.bitSize == Float32.bitSize { + self = Float16(inFloat: inP as! Float32) + } else if P.bitSize == Float16.bitSize { + self = inP as! Float16 + } else { + fatalError() + } + } + + public static var bitSize: UInt { + return 16 + } + + public init(inFloat16: Float16) { + self = inFloat16 + } + public init(inFloat: Float32) { + self = Int16(inFloat) } - } - - public static var bitSize: UInt { - return 16 - } - - public init(inFloat16: Float16) { - self = inFloat16 - } - public init(inFloat: Float32) { - self = Int16(inFloat) - } } extension Float32: PrecisionType { - public init

(_ inP: P) where P : PrecisionType { - if P.bitSize == Float32.bitSize { - self = inP as! Float32 - } else if P.bitSize == Float16.bitSize { - self = Float32.init(inP as! Float16) - } else { - fatalError() + public init

(_ inP: P) where P : PrecisionType { + if P.bitSize == Float32.bitSize { + self = inP as! Float32 + } else if P.bitSize == Float16.bitSize { + self = Float32.init(inP as! Float16) + } else { + fatalError() + } + } + + public init(inFloat: Float32) { + self = inFloat + } + + public init(inFloat16: Float16) { + self = Float32.init(inFloat16) + } + + public static var bitSize: UInt { + return 32 } - } - - public init(inFloat: Float32) { - self = inFloat - } - - public init(inFloat16: Float16) { - self = Float32.init(inFloat16) - } - - public static var bitSize: UInt { - return 32 - } } public func float32ToFloat16(input: UnsafeMutablePointer, output: UnsafeMutableRawPointer, count: Int) { - var float32Buffer = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4) - var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2) - guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else { - fatalError(" float 32 to float 16 error ! ") - } + var float32Buffer = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4) + var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2) + guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else { + fatalError(" float 32 to float 16 error ! ") + } } public func float16To32(input: UnsafeMutablePointer, count: Int) -> [Float32] { - var output = Array.init(repeating: 0.0, count: count) - float16to32(input: input, output: &output, count: count) - return output + var output = Array.init(repeating: 0.0, count: count) + float16to32(input: input, output: &output, count: count) + return output } public func float16to32(input: UnsafeMutablePointer, output: UnsafeMutablePointer, count: Int) { - var bufferFloat16 = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 2) - var bufferFloat32 = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 4) - if vImageConvert_Planar16FtoPlanarF(&bufferFloat16, &bufferFloat32, 0) != kvImageNoError { - fatalError(" convert float16 to float32 error") - } + var bufferFloat16 = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 2) + var bufferFloat32 = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 4) + if vImageConvert_Planar16FtoPlanarF(&bufferFloat16, &bufferFloat32, 0) != kvImageNoError { + fatalError(" convert float16 to float32 error") + } } // N - 0 C - 1 H - 2 W - 3 struct DataLayout { - - static func NCHW(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout { - return DataLayout.init([(.N, dim[0]), (.C, dim[1]), (.H, dim[2]), (.W, dim[3])]) - } - - static func NHWC(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout { - return DataLayout.init([(.N, dim[0]), (.H, dim[1]), (.W, dim[2]), (.C, dim[3])]) - } - - func count() -> Int { - return layoutWithDim.count - } - - var N: Int? { - get { - for layoutDim in layoutWithDim { - if layoutDim.0 == .N { - return layoutDim.1 - } - } - return nil + + static func NCHW(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout { + return DataLayout.init([(.N, dim[0]), (.C, dim[1]), (.H, dim[2]), (.W, dim[3])]) } - set { - var newN = (Layout.N, newValue) - if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in - return layout == .N - }) { - fatalError() - } + + static func NHWC(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout { + return DataLayout.init([(.N, dim[0]), (.H, dim[1]), (.W, dim[2]), (.C, dim[3])]) } - } - var C: Int? { - get { - for layoutDim in layoutWithDim { - if layoutDim.0 == .C { - return layoutDim.1 - } - } - return nil + + func count() -> Int { + return layoutWithDim.count } - set { - var newN = (Layout.C, newValue) - if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in - return layout == .N - }) { - fatalError() - } + + var N: Int? { + get { + for layoutDim in layoutWithDim { + if layoutDim.0 == .N { + return layoutDim.1 + } + } + return nil + } + set { + var newN = (Layout.N, newValue) + if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in + return layout == .N + }) { + fatalError() + } + } } - } - var H: Int? { - get { - for layoutDim in layoutWithDim { - if layoutDim.0 == .H { - return layoutDim.1 + var C: Int? { + get { + for layoutDim in layoutWithDim { + if layoutDim.0 == .C { + return layoutDim.1 + } + } + return nil + } + set { + var newN = (Layout.C, newValue) + if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in + return layout == .N + }) { + fatalError() + } } - } - return nil } - set { - var newN = (Layout.H, newValue) - if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in - return layout == .H - }) { - fatalError() - } + var H: Int? { + get { + for layoutDim in layoutWithDim { + if layoutDim.0 == .H { + return layoutDim.1 + } + } + return nil + } + set { + var newN = (Layout.H, newValue) + if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in + return layout == .H + }) { + fatalError() + } + } } - } - var W: Int? { - get { - for layoutDim in layoutWithDim { - if layoutDim.0 == .W { - return layoutDim.1 + var W: Int? { + get { + for layoutDim in layoutWithDim { + if layoutDim.0 == .W { + return layoutDim.1 + } + } + return nil + } + set { + var newN = (Layout.W, newValue) + if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in + return layout == .W + }) { + fatalError() + } } - } - return nil } - set { - var newN = (Layout.W, newValue) - if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in - return layout == .W - }) { - fatalError() - } + + + init(_ inLayout: [(Layout, Int)]) { + layoutWithDim = inLayout } - } - - - init(_ inLayout: [(Layout, Int)]) { - layoutWithDim = inLayout - } - - func layout() -> [Layout] { - return layoutWithDim.map({ (layout: Layout, dim: Int) -> Layout in - return layout - }) - } - - var layoutWithDim: [(Layout, Int)] = [(.N, 0), (.C, 0), (.H, 0), (.W, 0)] - - func convertTo(inLayout: [Layout]) { - } - - enum Layout: Int{ - case N = 0 - case C = 1 - case H = 2 - case W = 3 - static func defaultLayout() -> [Layout] { - return [N, C, H, W] + func layout() -> [Layout] { + return layoutWithDim.map({ (layout: Layout, dim: Int) -> Layout in + return layout + }) + } + + var layoutWithDim: [(Layout, Int)] = [(.N, 0), (.C, 0), (.H, 0), (.W, 0)] + + func convertTo(inLayout: [Layout]) { + + } + + enum Layout: Int{ + case N = 0 + case C = 1 + case H = 2 + case W = 3 + static func defaultLayout() -> [Layout] { + return [N, C, H, W] + } } - } } extension DataLayout: Equatable { - public static func == (lhs: DataLayout, rhs: DataLayout) -> Bool { - if lhs.layoutWithDim.count == rhs.layoutWithDim.count { - var result = true - for i in 0.. Bool { + if lhs.layoutWithDim.count == rhs.layoutWithDim.count { + var result = true + for i in 0.. { - guard let inResultBuffer = resultBuffer else { - fatalError() + var resultBuffer: MTLBuffer? + public var dim: Dim + public var capacity: Int + public var paddedCapacity: Int + + init(inPaddedCapacity: Int, inDim: Dim) { + paddedCapacity = inPaddedCapacity + capacity = inDim.numel() + dim = inDim } - return inResultBuffer.contents().bindMemory(to: Float32.self, capacity: paddedCapacity) - } - + + public func initBuffer(device: MTLDevice) { + resultBuffer = device.makeBuffer(length: paddedCapacity * 4, options: []) + } + + var result: UnsafeMutablePointer { + guard let inResultBuffer = resultBuffer else { + fatalError() + } + return inResultBuffer.contents().bindMemory(to: Float32.self, capacity: paddedCapacity) + } + } extension FetchHolder: CustomStringConvertible, CustomDebugStringConvertible { - public var description: String { - fatalError() -// return "\(result)" - } - - public var debugDescription: String { - fatalError() -// return "\(result)" - } - - + public var description: String { + fatalError() + // return "\(result)" + } + + public var debugDescription: String { + fatalError() + // return "\(result)" + } + + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Dim.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Dim.swift index 1817184bf7..77b67bf16c 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Dim.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Dim.swift @@ -15,41 +15,41 @@ import Foundation @objc public class Dim: NSObject { - private(set) var dims: [Int] - - @objc public init(inDim: [Int]) { - dims = inDim - } - - public func cout() -> Int { - return dims.count - } - - public func numel() -> Int { - return dims.reduce(1) { $0 * $1 } - } - - public static func ==(left: Dim, right: Dim) -> Bool { - return left.dims == right.dims; - } - - public static func !=(left: Dim, right: Dim) -> Bool { - return left.dims != right.dims; - } - - public subscript(index: Int) -> Int { - return dims[index]; - } - - public override var description: String { - return "\(dims)" - } - - func swapeDimAt(index1: Int, index2: Int) { - dims.swapAt(index1, index2) - } - - private override init(){ - fatalError() - } + private(set) var dims: [Int] + + @objc public init(inDim: [Int]) { + dims = inDim + } + + public func cout() -> Int { + return dims.count + } + + public func numel() -> Int { + return dims.reduce(1) { $0 * $1 } + } + + public static func ==(left: Dim, right: Dim) -> Bool { + return left.dims == right.dims; + } + + public static func !=(left: Dim, right: Dim) -> Bool { + return left.dims != right.dims; + } + + public subscript(index: Int) -> Int { + return dims[index]; + } + + public override var description: String { + return "\(dims)" + } + + func swapeDimAt(index1: Int, index2: Int) { + dims.swapAt(index1, index2) + } + + private override init(){ + fatalError() + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift index 8f02bf17bc..9f257200b1 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift @@ -14,136 +14,141 @@ import Foundation - let testTo = 5 var isTest = false @objc public class GPUResultHolder: NSObject{ - @objc public let dim: [Int] - @objc public let capacity: Int - @objc public var resultPointer: UnsafeMutablePointer? - @objc public var intermediateResults: [String : [MTLBuffer]]? - public init(inDim: [Int], inPointer: UnsafeMutablePointer?, inCapacity: Int, inIntermediateResults: [String : [MTLBuffer]]? = nil) { - dim = inDim - capacity = inCapacity + @objc public let dim: [Int] + @objc public let capacity: Int + @objc public var resultPointer: UnsafeMutablePointer? + @objc public var intermediateResults: [String : [MTLBuffer]]? + public init(inDim: [Int], inPointer: UnsafeMutablePointer?, inCapacity: Int, inIntermediateResults: [String : [MTLBuffer]]? = nil) { + dim = inDim + capacity = inCapacity + + if let inInPointer = inPointer { + resultPointer = UnsafeMutablePointer.allocate(capacity: inCapacity) + resultPointer?.initialize(from: inInPointer, count: inCapacity) + } + + intermediateResults = inIntermediateResults + } - if let inInPointer = inPointer { - resultPointer = UnsafeMutablePointer.allocate(capacity: inCapacity) - resultPointer?.initialize(from: inInPointer, count: inCapacity) + public override var description: String { + fatalError() } - intermediateResults = inIntermediateResults - } - - public override var description: String { - fatalError() - } - } public class Executor { - var ops: [Runable & InferShaperable] = [] - var preInputDim: Dim = Dim.init(inDim: []) - let program: Program - let device: MTLDevice - let inflightSemaphore: DispatchSemaphore - let queue: MTLCommandQueue - init(inDevice:MTLDevice, inQueue: MTLCommandQueue, inProgram: Program, initContext: InitContext) throws { - self.inflightSemaphore = DispatchSemaphore(value: 1) - program = inProgram - device = inDevice - queue = inQueue - - for block in inProgram.programDesc.blocks { - //block.ops.count - for i in 0...shared.creat(device: inDevice, opDesc: opDesc, scope: inProgram.scope, initContext: initContext) - ops.append(op) - } catch let error { - throw error + var ops: [Runable & InferShaperable] = [] + var preInputDim: Dim = Dim.init(inDim: []) + let program: Program + let device: MTLDevice + let inflightSemaphore: DispatchSemaphore + let queue: MTLCommandQueue + init(inDevice:MTLDevice, inQueue: MTLCommandQueue, inProgram: Program, initContext: InitContext) throws { + self.inflightSemaphore = DispatchSemaphore(value: 1) + program = inProgram + device = inDevice + queue = inQueue + + for block in inProgram.programDesc.blocks { + //block.ops.count + for i in 0...shared.creat(device: inDevice, opDesc: opDesc, scope: inProgram.scope, initContext: initContext) + ops.append(op) + } catch let error { + throw error + } + } } - } - } - } - - public func predict(input: MTLTexture, dim: Dim, completionHandle: @escaping ([GPUResultHolder]) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws { - inflightSemaphore.wait() - - guard let buffer = queue.makeCommandBuffer() else { - throw PaddleMobileError.predictError(message: "CommandBuffer is nil") - } - - let resInput: MTLTexture - if let inPre = preProcessKernle { - do { - try inPre.compute(inputTexuture: input, commandBuffer: buffer) - resInput = inPre.outputTexture - } catch let error { - throw error - } - } else { - resInput = input } - let inputTexture = InputTexture.init(inMTLTexture: resInput, inExpectDim: dim) - program.scope.setInput(input: inputTexture) - //(ops.count - except) - for i in 0..<(ops.count - except) { - let op = ops[i] - do { - try op.run(device: device, buffer: buffer) - } catch let error { - throw error - } - } - - var outputTextures: [String : [MTLBuffer]]? - if except > 0 { - ops[ops.count - except].computeMiddleResult(device: device, buffer: buffer) - outputTextures = ops[ops.count - except].inputVariant() + public func predict(input: MTLTexture, dim: Dim, completionHandle: @escaping ([GPUResultHolder]) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws { + inflightSemaphore.wait() + + guard let buffer = queue.makeCommandBuffer() else { + throw PaddleMobileError.predictError(message: "CommandBuffer is nil") + } + + let resInput: MTLTexture + if let inPre = preProcessKernle { + do { + try inPre.compute(inputTexuture: input, commandBuffer: buffer) + resInput = inPre.outputTexture + } catch let error { + throw error + } + } else { + resInput = input + } + + let inputTexture = InputTexture.init(inMTLTexture: resInput, inExpectDim: dim) + program.scope.setInput(input: inputTexture) + //(ops.count - except) + for i in 0..<(ops.count - except) { + let op = ops[i] + do { + try op.run(device: device, buffer: buffer) + } catch let error { + throw error + } + } + + var outputTextures: [String : [MTLBuffer]]? + if except > 0 { + ops[ops.count - except].computeMiddleResult(device: device, buffer: buffer) + outputTextures = ops[ops.count - except].inputVariant() + } + + buffer.addCompletedHandler { [weak self] (commandbuffer) in + guard let SSelf = self else { + fatalError() + } + + //将输入写进文件 + /* + + let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2])) + print(dim) + writeToLibrary(fileName: "yolo_input", array: inputArr) + print(" write done ") + return + */ + + + + //输出 op 计算结果 + if GlobalConfig.shared.debug { + for i in 0.. 0 { + resultHolder = GPUResultHolder.init(inDim: [], inPointer: nil, inCapacity: 0, inIntermediateResults: outputTextures) + } else { + let outputVar: Variant = SSelf.program.scope.output()! + let output: FetchHolder = outputVar as! FetchHolder + resultHolder = GPUResultHolder.init(inDim: output.dim.dims, inPointer: output.result, inCapacity: output.capacity) + } + + completionHandle([resultHolder]) + SSelf.inflightSemaphore.signal() + } + + buffer.commit() } - buffer.addCompletedHandler { [weak self] (commandbuffer) in - guard let SSelf = self else { - fatalError() - } - - //将输入写进文件 - /* - let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2])) - print(dim) - writeToLibrary(fileName: "test_image_super", array: inputArr) - print(" write done ") - return - */ - - /* 输出 op 计算结果 - for op in SSelf.ops { - op.delogOutput() - } - */ - - var resultHolder: GPUResultHolder - if except > 0 { - resultHolder = GPUResultHolder.init(inDim: [], inPointer: nil, inCapacity: 0, inIntermediateResults: outputTextures) - } else { - let outputVar: Variant = SSelf.program.scope.output()! - let output: FetchHolder = outputVar as! FetchHolder - resultHolder = GPUResultHolder.init(inDim: output.dim.dims, inPointer: output.result, inCapacity: output.capacity) - } - - completionHandle([resultHolder]) - SSelf.inflightSemaphore.signal() + public func clear() { + program.scope.clear() } - buffer.commit() - } - - public func clear() { - program.scope.clear() - } - } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift index 1d4f0ec14f..790b961480 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift @@ -16,251 +16,251 @@ import Foundation //import SwiftProtobuf public class Loader { - class ParaLoader { - let file: UnsafeMutablePointer - let fileSize: Int - var nowIndex: Int - init(paramPath: String) throws { - guard let tmpFile = fopen(paramPath, "rb") else { - throw PaddleMobileError.loaderError(message: "open param file error" + paramPath) - } - file = tmpFile - fseek(file, 0, SEEK_END) - fileSize = ftell(file) - guard fileSize > 0 else { - throw PaddleMobileError.loaderError(message: "param file size is too small") - } - rewind(file) - nowIndex = 0 - } - - func read(tensor: Tensor

) throws { - guard nowIndex <= fileSize else { - throw PaddleMobileError.loaderError(message: "out of the file range") - } - - func pointerReader(type: T.Type) -> T { - let ptr = UnsafeMutablePointer.allocate(capacity: MemoryLayout.size) - fread(ptr, 1, MemoryLayout.size, file) - nowIndex += MemoryLayout.size - let pointee = ptr.pointee - ptr.deinitialize(count: MemoryLayout.size) - ptr.deallocate() - return pointee - } - - let _ = pointerReader(type: UInt32.self) - let lodLevel = pointerReader(type: UInt64.self) - for _ in 0...size)){ - _ = pointerReader(type: size_t.self) - } - } - - let _ = pointerReader(type: UInt32.self) - - let tensorDescSize = pointerReader(type: Int32.self) - - fseek(file, Int(tensorDescSize), SEEK_CUR) - nowIndex += Int(tensorDescSize) - - /* - 这里没有根据 Data Type 去判断, 而是从外部泛型直接指定了精度 - */ - - //现在模型传入模型为 Float 类型, 这块应该根据模型来 - // let tmpCapacity = MemoryLayout.size * tensor.numel() - // let tmpPointer = UnsafeMutablePointer.allocate(capacity: tmpCapacity); - let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file) - - guard bytesRead == tensor.data.size else { - throw PaddleMobileError.loaderError(message: "param read size error") - } - - // TODO: use script to convert - // let bytesRead = fread(tmpPointer, 1, tmpCapacity, file) - // for i in 0..) throws { - guard nowIndex <= paramSize else { - throw PaddleMobileError.loaderError(message: "out of the file range") - } - var readerIndex: Int = 0 - func pointerReader(type: T.Type) -> T { - let ptr = UnsafeMutablePointer.allocate(capacity: MemoryLayout.size) - memcpy(ptr, paramPointer.advanced(by: Int(readerIndex)), MemoryLayout.size) - nowIndex += MemoryLayout.size - readerIndex += MemoryLayout.size - let pointee = ptr.pointee - ptr.deinitialize(count: MemoryLayout.size) - ptr.deallocate() - - return pointee - } - let _ = pointerReader(type: UInt32.self) - let lodLevel = pointerReader(type: UInt64.self) - for _ in 0...size)){ - _ = pointerReader(type: size_t.self) - } + class ParaLoader { + let file: UnsafeMutablePointer + let fileSize: Int + var nowIndex: Int + init(paramPath: String) throws { + guard let tmpFile = fopen(paramPath, "rb") else { + throw PaddleMobileError.loaderError(message: "open param file error" + paramPath) + } + file = tmpFile + fseek(file, 0, SEEK_END) + fileSize = ftell(file) + guard fileSize > 0 else { + throw PaddleMobileError.loaderError(message: "param file size is too small") + } + rewind(file) + nowIndex = 0 } - let _ = pointerReader(type: UInt32.self) - let tensorDescSize = pointerReader(type: Int32.self) - - paramPointer = paramPointer.advanced(by: Int(readerIndex)) - paramPointer = paramPointer.advanced(by: Int(tensorDescSize)) - nowIndex += Int(tensorDescSize) + func read(tensor: Tensor

) throws { + guard nowIndex <= fileSize else { + throw PaddleMobileError.loaderError(message: "out of the file range") + } + + func pointerReader(type: T.Type) -> T { + let ptr = UnsafeMutablePointer.allocate(capacity: MemoryLayout.size) + fread(ptr, 1, MemoryLayout.size, file) + nowIndex += MemoryLayout.size + let pointee = ptr.pointee + ptr.deinitialize(count: MemoryLayout.size) + ptr.deallocate() + return pointee + } + + let _ = pointerReader(type: UInt32.self) + let lodLevel = pointerReader(type: UInt64.self) + for _ in 0...size)){ + _ = pointerReader(type: size_t.self) + } + } + + let _ = pointerReader(type: UInt32.self) + + let tensorDescSize = pointerReader(type: Int32.self) + + fseek(file, Int(tensorDescSize), SEEK_CUR) + nowIndex += Int(tensorDescSize) + + /* + 这里没有根据 Data Type 去判断, 而是从外部泛型直接指定了精度 + */ + + //现在模型传入模型为 Float 类型, 这块应该根据模型来 + // let tmpCapacity = MemoryLayout.size * tensor.numel() + // let tmpPointer = UnsafeMutablePointer.allocate(capacity: tmpCapacity); + let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file) + + guard bytesRead == tensor.data.size else { + throw PaddleMobileError.loaderError(message: "param read size error") + } + + // TODO: use script to convert + // let bytesRead = fread(tmpPointer, 1, tmpCapacity, file) + // for i in 0.. Program { - do { - /// swift protobuf serialized Data to instance class - // let protoProgram = try PaddleMobile_Framework_Proto_ProgramDesc.init( - // serializedData: modelData) + class ParaLoaderWithPointer { + var paramPointer: UnsafeMutableRawPointer + let paramSize: Int + var nowIndex: Int + init(pPointer: UnsafeMutableRawPointer,pSize:Int) throws { + paramPointer = UnsafeMutableRawPointer.init(pPointer) + paramSize = pSize + nowIndex = 0 + } - /// oc protobuf serialized Data to instance class - let protoProgram = try ProgramDesc.init(data: (modelData as NSData) as Data) - - let originProgramDesc = PMProgramDesc.init(protoProgram: protoProgram) - let programDesc = ProgramOptimize

.init().optimize(originProgramDesc: originProgramDesc) - -// let programDesc = PMProgramDesc.init(protoProgram: protoProgram) - - print(programDesc) - - guard programDesc.blocks.count > 0 else { - throw PaddleMobileError.loaderError(message: "count of blocks must greater than 0") - } - - // to get feed key and fetch key - let block = programDesc.blocks[0] - guard let firstOp = block.ops.first, let lastOp = block.ops.last else { - throw PaddleMobileError.loaderError(message: "at least two operator") - } - - guard firstOp.type == gFeedType, lastOp.type == gFetchType else { - throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch") - } - - guard let inputKey = opInfos[gFeedType]?.inputs.first, let outKey = opInfos[gFetchType]?.outputs.first else { - throw PaddleMobileError.loaderError(message: "the feed input key or fetch output key not found") - } - guard let feedKey = firstOp.inputs[inputKey]?.first, let fetchKey = lastOp.outputs[outKey]?.first else { - throw PaddleMobileError.loaderError(message: "feed key or fetch key not found") - } - - let scope = Scope.init(inFeedKey: feedKey, inFetchKey: fetchKey) - - // to load memory - for block in programDesc.blocks { - for varDesc in block.vars { - if (varDesc.type == .LodTensor) { - guard let tensorDesc = varDesc.tensorDesc else { - throw PaddleMobileError.loaderError(message: "get tensor desc failed") + func read(tensor: Tensor

) throws { + guard nowIndex <= paramSize else { + throw PaddleMobileError.loaderError(message: "out of the file range") } - - if (varDesc.persistable - && varDesc.type != .FeedMiniBatch - && varDesc.type != .FetchList) { - let dimArr = tensorDesc.dims - - guard dimArr.count > 0 else { - throw PaddleMobileError.loaderError(message: "tensor desc dim size error") - } - - let dim = Dim.init(inDim: dimArr) - let tensor = Tensor

.init(inDim: dim, inLayout: tensorDesc.dataLayout) - do { - if paraLoaderPointer != nil { - try paraLoaderPointer!.read(tensor: tensor) - } + var readerIndex: Int = 0 + func pointerReader(type: T.Type) -> T { + let ptr = UnsafeMutablePointer.allocate(capacity: MemoryLayout.size) + memcpy(ptr, paramPointer.advanced(by: Int(readerIndex)), MemoryLayout.size) + nowIndex += MemoryLayout.size + readerIndex += MemoryLayout.size + let pointee = ptr.pointee + ptr.deinitialize(count: MemoryLayout.size) + ptr.deallocate() - if paraLoader != nil { - try paraLoader!.read(tensor: tensor) - } - } catch let error { - throw error - } - // tensor.convert(to: DataLayout.NHWC()) - // tensor.initBuffer(device: device) - scope[varDesc.name] = tensor - } else { - let dim = Dim.init(inDim: tensorDesc.dims) - scope[varDesc.name] = Texture.init(device: device, inDim: dim) + return pointee } - } else { - if varDesc.name == fetchKey { -// scope[varDesc.name] = ResultHolder.init(inDim: [], inResult: [], inCapacity: <#Int#>, inElapsedTime: 0.0) - } else if varDesc.name == feedKey { + let _ = pointerReader(type: UInt32.self) + let lodLevel = pointerReader(type: UInt64.self) + for _ in 0...size)){ + _ = pointerReader(type: size_t.self) + } } - } + + let _ = pointerReader(type: UInt32.self) + let tensorDescSize = pointerReader(type: Int32.self) + + paramPointer = paramPointer.advanced(by: Int(readerIndex)) + paramPointer = paramPointer.advanced(by: Int(tensorDescSize)) + nowIndex += Int(tensorDescSize) + + let _ = memcpy(tensor.data.pointer, paramPointer, tensor.data.size) + paramPointer = paramPointer.advanced(by: Int(tensor.data.size)) + nowIndex += tensor.data.size + } + deinit { } - } - - let program = Program.init(inProgramDesc: programDesc, inScope: scope) - - return program - } catch _ { - throw PaddleMobileError.loaderError(message: "protobuf decoder error") - } - } - public func load(device:MTLDevice, paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) throws -> Program { - let modelData = Data.init(bytes:modePointer, count:modelSize) - guard let paraLoader = try? ParaLoaderWithPointer.init(pPointer: paramPointer,pSize: paramSize) else { - throw PaddleMobileError.loaderError(message: "load para error") - } - do { - let program = try loadModelandParam(device,modelData,paraLoader,nil) - return program - } catch let error { - throw error } - } - - public func load(device: MTLDevice, modelPath: String, paraPath: String) throws -> Program{ - guard let modelData = try? Data.init(contentsOf: URL.init(fileURLWithPath: modelPath)) else { - throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !") + public init(){} + func loadModelandParam(_ device:MTLDevice,_ modelData:Data, _ paraLoaderPointer:ParaLoaderWithPointer?, _ paraLoader:ParaLoader?) throws -> Program { + do { + /// swift protobuf serialized Data to instance class + // let protoProgram = try PaddleMobile_Framework_Proto_ProgramDesc.init( + // serializedData: modelData) + + /// oc protobuf serialized Data to instance class + let protoProgram = try ProgramDesc.init(data: (modelData as NSData) as Data) + + let originProgramDesc = PMProgramDesc.init(protoProgram: protoProgram) + let programDesc = ProgramOptimize

.init().optimize(originProgramDesc: originProgramDesc) + + // let programDesc = PMProgramDesc.init(protoProgram: protoProgram) + + print(programDesc) + + guard programDesc.blocks.count > 0 else { + throw PaddleMobileError.loaderError(message: "count of blocks must greater than 0") + } + + // to get feed key and fetch key + let block = programDesc.blocks[0] + guard let firstOp = block.ops.first, let lastOp = block.ops.last else { + throw PaddleMobileError.loaderError(message: "at least two operator") + } + + guard firstOp.type == gFeedType, lastOp.type == gFetchType else { + throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch") + } + + guard let inputKey = opInfos[gFeedType]?.inputs.first, let outKey = opInfos[gFetchType]?.outputs.first else { + throw PaddleMobileError.loaderError(message: "the feed input key or fetch output key not found") + } + guard let feedKey = firstOp.inputs[inputKey]?.first, let fetchKey = lastOp.outputs[outKey]?.first else { + throw PaddleMobileError.loaderError(message: "feed key or fetch key not found") + } + + let scope = Scope.init(inFeedKey: feedKey, inFetchKey: fetchKey) + + // to load memory + for block in programDesc.blocks { + for varDesc in block.vars { + if (varDesc.type == .LodTensor) { + guard let tensorDesc = varDesc.tensorDesc else { + throw PaddleMobileError.loaderError(message: "get tensor desc failed") + } + + if (varDesc.persistable + && varDesc.type != .FeedMiniBatch + && varDesc.type != .FetchList) { + let dimArr = tensorDesc.dims + + guard dimArr.count > 0 else { + throw PaddleMobileError.loaderError(message: "tensor desc dim size error") + } + + let dim = Dim.init(inDim: dimArr) + let tensor = Tensor

.init(inDim: dim, inLayout: tensorDesc.dataLayout) + do { + if paraLoaderPointer != nil { + try paraLoaderPointer!.read(tensor: tensor) + } + + if paraLoader != nil { + try paraLoader!.read(tensor: tensor) + } + } catch let error { + throw error + } + // tensor.convert(to: DataLayout.NHWC()) + // tensor.initBuffer(device: device) + scope[varDesc.name] = tensor + } else { + let dim = Dim.init(inDim: tensorDesc.dims) + scope[varDesc.name] = Texture.init(device: device, inDim: dim) + } + } else { + if varDesc.name == fetchKey { + // scope[varDesc.name] = ResultHolder.init(inDim: [], inResult: [], inCapacity: <#Int#>, inElapsedTime: 0.0) + } else if varDesc.name == feedKey { + } + } + } + } + + let program = Program.init(inProgramDesc: programDesc, inScope: scope) + + return program + } catch _ { + throw PaddleMobileError.loaderError(message: "protobuf decoder error") + } } - guard let paraLoader = try? ParaLoader.init(paramPath: paraPath) else { - throw PaddleMobileError.loaderError(message: "load para error") + public func load(device:MTLDevice, paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) throws -> Program { + let modelData = Data.init(bytes:modePointer, count:modelSize) + guard let paraLoader = try? ParaLoaderWithPointer.init(pPointer: paramPointer,pSize: paramSize) else { + throw PaddleMobileError.loaderError(message: "load para error") + } + do { + let program = try loadModelandParam(device,modelData,paraLoader,nil) + return program + } catch let error { + throw error + } } - do { - let program = try loadModelandParam(device,modelData,nil,paraLoader) - return program - } catch let error { - throw error + public func load(device: MTLDevice, modelPath: String, paraPath: String) throws -> Program{ + guard let modelData = try? Data.init(contentsOf: URL.init(fileURLWithPath: modelPath)) else { + throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !") + } + guard let paraLoader = try? ParaLoader.init(paramPath: paraPath) else { + throw PaddleMobileError.loaderError(message: "load para error") + } + + do { + let program = try loadModelandParam(device,modelData,nil,paraLoader) + return program + } catch let error { + throw error + } } - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift index 97fe0a8fba..adce101552 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift @@ -17,337 +17,337 @@ import MetalKit import CoreMedia protocol Tensorial: Variant { - var dim: Dim { get set } - func numel() -> Int - var layout: DataLayout { get } + var dim: Dim { get set } + func numel() -> Int + var layout: DataLayout { get } } extension Tensorial { - func numel() -> Int { - return dim.numel() - } + func numel() -> Int { + return dim.numel() + } } class Tensor: Tensorial { - - var data: Data - var dim: Dim - var buffer: MTLBuffer! - private(set) var layout: DataLayout - - class Data { - init(inSize: Int, inPointer: UnsafeMutablePointer

) { - size = inSize - pointer = inPointer - } - let size: Int - var pointer: UnsafeMutablePointer

- subscript(index: Int) -> P{ - get { - return pointer[index] - } - set { - pointer[index] = newValue - } - } - func release() { - pointer.deinitialize(count: size) - pointer.deallocate() - } - deinit { - // release() - } - } - - init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) { - dim = inDim - let size = inDim.numel() * MemoryLayout

.size - let pointer = UnsafeMutablePointer

.allocate(capacity: size) - data = Data.init(inSize: size, inPointer: pointer) - layout = inLayout - } - - func convert(to: DataLayout) { - guard to != layout else { - return - } - guard dim.cout() == 4 else { - return - } + var data: Data + var dim: Dim + var buffer: MTLBuffer! + private(set) var layout: DataLayout - guard layout == DataLayout.NCHW() && to == DataLayout.NHWC() else { - // other not support - return - } - let newPointer = UnsafeMutablePointer

.allocate(capacity: data.size) - - if layout == DataLayout.NCHW() { - NCHW2NHWC(newPtr: newPointer) + class Data { + init(inSize: Int, inPointer: UnsafeMutablePointer

) { + size = inSize + pointer = inPointer + } + let size: Int + var pointer: UnsafeMutablePointer

+ subscript(index: Int) -> P{ + get { + return pointer[index] + } + set { + pointer[index] = newValue + } + } + func release() { + pointer.deinitialize(count: size) + pointer.deallocate() + } + deinit { + // release() + } } - data.release() - data.pointer = newPointer - layout = to - } - - - - func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16, padWhenOneC: Bool = false, convertToNHWC: Bool = true, withTranspose: Bool = false) { - if convertToNHWC { -// print(layout) - convert(to: DataLayout.NHWC()) + init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) { + dim = inDim + let size = inDim.numel() * MemoryLayout

.size + let pointer = UnsafeMutablePointer

.allocate(capacity: size) + data = Data.init(inSize: size, inPointer: pointer) + layout = inLayout } - if withTranspose { - let transposePointer = UnsafeMutablePointer

.allocate(capacity: numel()) - let n = dim[0] - let hwc = numel()/n - for j in 0...allocate(capacity: data.size) + + if layout == DataLayout.NCHW() { + NCHW2NHWC(newPtr: newPointer) + } + + data.release() + data.pointer = newPointer + layout = to } - guard let floatPointer = data.pointer as? UnsafeMutablePointer else { - fatalError(" not support yet ") - } - let precisionSize: Int - switch precision { - case .Float32: - precisionSize = 4 - case .Float16: - precisionSize = 2 - } - if dim.cout() == 4 { - if layout == DataLayout.NHWC() { - let C = dim[3] - let cSlices = (C + 3) / 4 - let paddedC = cSlices * 4 - let count = paddedC * dim[0] * dim[1] * dim[2] - if C == paddedC { - buffer = device.makeBuffer(length: count * precisionSize) - switch precision { - case .Float32: - buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout

.stride) - case .Float16: - float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count) - } - } else if C == 1 && !padWhenOneC { - buffer = device.makeBuffer(length: numel() * precisionSize) - switch precision { - case .Float32: - buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout

.stride) - case .Float16: - float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel()) - } - } else { - buffer = device.makeBuffer(length: count * precisionSize) - let convertedPointer = UnsafeMutablePointer.allocate(capacity: count) - var tmpPointer = floatPointer - var dstPtr = convertedPointer - for _ in 0...allocate(capacity: numel()) + let n = dim[0] + let hwc = numel()/n + for j in 0...stride) - case .Float16: - float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count) - } - - convertedPointer.deinitialize(count: count) - convertedPointer.deallocate() + + dim.swapeDimAt(index1: 0, index2: 3) + data.release() + data.pointer = transposePointer } - } else { - let C = dim[3] - let cSlices = (C + 3) / 4 - let paddedC = cSlices * 4 - let count = paddedC * dim[0] * dim[1] * dim[2] - if C == paddedC { - buffer = device.makeBuffer(length: count * precisionSize) - switch precision { - case .Float32: - buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout

.stride) - case .Float16: - float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count) - } - } else if C == 1 { - fatalError(" not support ") - } else { - buffer = device.makeBuffer(length: count * precisionSize) - let convertedPointer = UnsafeMutablePointer.allocate(capacity: count) - var tmpPointer = floatPointer - var dstPtr = convertedPointer - for _ in 0.. else { + fatalError(" not support yet ") + } + + let precisionSize: Int + switch precision { + case .Float32: + precisionSize = 4 + case .Float16: + precisionSize = 2 + } + + if dim.cout() == 4 { + if layout == DataLayout.NHWC() { + let C = dim[3] + let cSlices = (C + 3) / 4 + let paddedC = cSlices * 4 + let count = paddedC * dim[0] * dim[1] * dim[2] + if C == paddedC { + buffer = device.makeBuffer(length: count * precisionSize) + switch precision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout

.stride) + case .Float16: + float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count) + } + } else if C == 1 && !padWhenOneC { + buffer = device.makeBuffer(length: numel() * precisionSize) + switch precision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout

.stride) + case .Float16: + float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel()) + } + } else { + buffer = device.makeBuffer(length: count * precisionSize) + let convertedPointer = UnsafeMutablePointer.allocate(capacity: count) + var tmpPointer = floatPointer + var dstPtr = convertedPointer + for _ in 0...stride) + case .Float16: + float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count) + } + + convertedPointer.deinitialize(count: count) + convertedPointer.deallocate() + } + } else { + let C = dim[3] + let cSlices = (C + 3) / 4 + let paddedC = cSlices * 4 + let count = paddedC * dim[0] * dim[1] * dim[2] + if C == paddedC { + buffer = device.makeBuffer(length: count * precisionSize) + switch precision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout

.stride) + case .Float16: + float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count) + } + } else if C == 1 { + fatalError(" not support ") + } else { + buffer = device.makeBuffer(length: count * precisionSize) + let convertedPointer = UnsafeMutablePointer.allocate(capacity: count) + var tmpPointer = floatPointer + var dstPtr = convertedPointer + for _ in 0...stride) + case .Float16: + float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count) + } + convertedPointer.deinitialize(count: count) + convertedPointer.deallocate() + } } - tmpPointer += C - dstPtr += paddedC - } - - switch precision { - case .Float32: - buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout

.stride) - case .Float16: - float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count) - } - convertedPointer.deinitialize(count: count) - convertedPointer.deallocate() + } else if dim.cout() == 1 { + let num = ((numel() + 3) / 4) * 4 + buffer = device.makeBuffer(length: num * precisionSize) + switch precision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: num * MemoryLayout

.stride) + case .Float16: + float32ToFloat16(input: floatPointer, output: buffer.contents(), count: num) + } + } else { + fatalError(" not support !") } - } - } else if dim.cout() == 1 { - let num = ((numel() + 3) / 4) * 4 - buffer = device.makeBuffer(length: num * precisionSize) - switch precision { - case .Float32: - buffer?.contents().copyMemory(from: data.pointer, byteCount: num * MemoryLayout

.stride) - case .Float16: - float32ToFloat16(input: floatPointer, output: buffer.contents(), count: num) - } - } else { - fatalError(" not support !") + //TODO: release + data.release() } - //TODO: release - data.release() - } - - var n: Int { - get { - if dim.cout() == 4 { - if layout == DataLayout.NCHW() { - return dim[0] - } else if layout == DataLayout.NHWC() { - return dim[0] - } else { - fatalError(" unsupport ") + + var n: Int { + get { + if dim.cout() == 4 { + if layout == DataLayout.NCHW() { + return dim[0] + } else if layout == DataLayout.NHWC() { + return dim[0] + } else { + fatalError(" unsupport ") + } + } else { + fatalError() + } } - } else { - fatalError() - } } - } - - var width: Int { - get { - if dim.cout() == 4 { - if layout == DataLayout.NHWC() { - return dim[2] - } else if layout == DataLayout.NCHW() { - return dim[3] - } else { - fatalError(" unsupport ") + + var width: Int { + get { + if dim.cout() == 4 { + if layout == DataLayout.NHWC() { + return dim[2] + } else if layout == DataLayout.NCHW() { + return dim[3] + } else { + fatalError(" unsupport ") + } + } else { + fatalError() + } } - } else { - fatalError() - } } - } - - var height: Int { - get { - if dim.cout() == 4 { - if layout == DataLayout.NHWC() { - return dim[1] - } else if layout == DataLayout.NCHW() { - return dim[2] - } else { - fatalError(" unsupport ") + + var height: Int { + get { + if dim.cout() == 4 { + if layout == DataLayout.NHWC() { + return dim[1] + } else if layout == DataLayout.NCHW() { + return dim[2] + } else { + fatalError(" unsupport ") + } + } else { + fatalError() + } } - } else { - fatalError() - } } - } - - var channel: Int { - get { - if dim.cout() == 4 { - if layout == DataLayout.NHWC() { - return dim[3] - } else if layout == DataLayout.NCHW() { - return dim[1] - } else { - fatalError(" unsupport ") + + var channel: Int { + get { + if dim.cout() == 4 { + if layout == DataLayout.NHWC() { + return dim[3] + } else if layout == DataLayout.NCHW() { + return dim[1] + } else { + fatalError(" unsupport ") + } + } else { + fatalError() + } } - } else { - fatalError() - } } - } - - - func NCHW2NHWC(newPtr: UnsafeMutablePointer

) { - let N = dim[0] - let C = dim[1] - let H = dim[2] - let W = dim[3] - let HXW = H * W - let CXHXW = C * H * W - var index: Int = 0 - for n in 0..) { + let N = dim[0] + let C = dim[1] + let H = dim[2] + let W = dim[3] + let HXW = H * W + let CXHXW = C * H * W + + var index: Int = 0 + for n in 0...size { - str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])" + + var debugDescription: String { + var str = "dim: \(dim) \n" + str += "MTLBuffer: \(self.buffer) \n" + for i in 0...size { + str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])" + } + return str + } + + func logDataPointer(header: String = "") { + print(header) + var str = "" + str += "data size: \(data.size) \n" + str += "dim: \(dim) \n" + for i in 0.. [1, 1, a, b] transpose 必须为 [0, 1, x, x] -// [a] -> [1, 1, 1, a] transpose 必须为 [0, 1, 2, 3] -// [a, b, c] -> [1, a, b, c] tranpose 必须为 [0, x, x, x] - -3 维 tensor [a, b, c] 对应的 texture_2darray, -.width = c -.height = b -.len = a + 3 / 4 + // TODO transpose 对于低维 tensor 的扩展原则。。。 + // [a, b] -> [1, 1, a, b] transpose 必须为 [0, 1, x, x] + // [a] -> [1, 1, 1, a] transpose 必须为 [0, 1, 2, 3] + // [a, b, c] -> [1, a, b, c] tranpose 必须为 [0, x, x, x] + + 3 维 tensor [a, b, c] 对应的 texture_2darray, + .width = c + .height = b + .len = a + 3 / 4 2 维 tensor [a, b] 对应的 texture_2darray .width = b + 3 / 4 @@ -69,136 +69,136 @@ extension InputTexture { .len = 1 */ public class Texture: Tensorial { - public var dim: Dim - public var tensorDim: Dim - - /// tensor dim pad to four - public var padToFourDim: Dim - private var textureDesc: MTLTextureDescriptor! - public var metalTexture: MTLTexture! - var transpose: [Int] = [0, 1, 2, 3] - - func elementCount() -> Int { - return metalTexture.width * metalTexture.height * metalTexture.arrayLength * 4 - } - - func toTensor() -> [Float32] { - guard padToFourDim.cout() == 4 else { - fatalError("- not support -") - } - return metalTexture.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2])) - } - - func realNHWC() -> [Float32] { - guard padToFourDim.cout() == 4 else { - fatalError(" - not support - ") - } - return metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) - } - - public func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: ComputePrecision = .Float16) { - transpose = inTranspose - for i in 0..<(4 - tensorDim.cout()) { - if i != inTranspose[i] { - fatalError() - } - } + public var dim: Dim + public var tensorDim: Dim - let newDim = transpose.map { padToFourDim[$0] } - let newLayout = transpose.map { layout.layoutWithDim[$0] } + /// tensor dim pad to four + public var padToFourDim: Dim + private var textureDesc: MTLTextureDescriptor! + public var metalTexture: MTLTexture! + var transpose: [Int] = [0, 1, 2, 3] - layout = DataLayout.init(newLayout) - dim = Dim.init(inDim: newDim) + func elementCount() -> Int { + return metalTexture.width * metalTexture.height * metalTexture.arrayLength * 4 + } - let tmpTextureDes = MTLTextureDescriptor.init() - tmpTextureDes.textureType = .type2DArray - tmpTextureDes.depth = 1 + func toTensor() -> [Float32] { + guard padToFourDim.cout() == 4 else { + fatalError("- not support -") + } + return metalTexture.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2])) + } - switch tensorDim.cout() { - case 4: - tmpTextureDes.width = newDim[2] - tmpTextureDes.height = newDim[1] - tmpTextureDes.arrayLength = ((newDim[0]) * (newDim[3]) + 3) / 4 - case 3: - tmpTextureDes.width = newDim[3] - tmpTextureDes.height = newDim[2] - tmpTextureDes.arrayLength = (newDim[1] + 3) / 4 - case 2, 1: - tmpTextureDes.width = (newDim[3] + 3) / 4 - tmpTextureDes.height = newDim[2] - tmpTextureDes.arrayLength = 1 - default: - fatalError("unreachable") + func realNHWC() -> [Float32] { + guard padToFourDim.cout() == 4 else { + fatalError(" - not support - ") + } + return metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) } - - if computePrecision == .Float16 { - tmpTextureDes.pixelFormat = .rgba16Float - } else if computePrecision == .Float32 { - tmpTextureDes.pixelFormat = .rgba32Float + + public func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: ComputePrecision = .Float16) { + transpose = inTranspose + for i in 0..<(4 - tensorDim.cout()) { + if i != inTranspose[i] { + fatalError() + } + } + + let newDim = transpose.map { padToFourDim[$0] } + let newLayout = transpose.map { layout.layoutWithDim[$0] } + + layout = DataLayout.init(newLayout) + dim = Dim.init(inDim: newDim) + + let tmpTextureDes = MTLTextureDescriptor.init() + tmpTextureDes.textureType = .type2DArray + tmpTextureDes.depth = 1 + + switch tensorDim.cout() { + case 4: + tmpTextureDes.width = newDim[2] + tmpTextureDes.height = newDim[1] + tmpTextureDes.arrayLength = ((newDim[0]) * (newDim[3]) + 3) / 4 + case 3: + tmpTextureDes.width = newDim[3] + tmpTextureDes.height = newDim[2] + tmpTextureDes.arrayLength = (newDim[1] + 3) / 4 + case 2, 1: + tmpTextureDes.width = (newDim[3] + 3) / 4 + tmpTextureDes.height = newDim[2] + tmpTextureDes.arrayLength = 1 + default: + fatalError("unreachable") + } + + if computePrecision == .Float16 { + tmpTextureDes.pixelFormat = .rgba16Float + } else if computePrecision == .Float32 { + tmpTextureDes.pixelFormat = .rgba32Float + } + + tmpTextureDes.usage = [.shaderRead, .shaderWrite] + tmpTextureDes.storageMode = .shared + textureDesc = tmpTextureDes + metalTexture = device.makeTexture(descriptor: tmpTextureDes) ?! " texture nil " } - tmpTextureDes.usage = [.shaderRead, .shaderWrite] - tmpTextureDes.storageMode = .shared - textureDesc = tmpTextureDes - metalTexture = device.makeTexture(descriptor: tmpTextureDes) ?! " texture nil " - } - - public func updateDims(inTensorDim: Dim, inDim: Dim) { - var fourDim: Dim - if inDim.cout() == 4 { - fourDim = inDim - } else if inDim.cout() < 4 { - var fourDimNum: [Int] = [] - for _ in 0..<(4 - inDim.cout()) { - fourDimNum.append(1) - } - fourDimNum.append(contentsOf: inDim.dims) - fourDim = Dim.init(inDim: fourDimNum) - } else { - fatalError(" not support ") + public func updateDims(inTensorDim: Dim, inDim: Dim) { + var fourDim: Dim + if inDim.cout() == 4 { + fourDim = inDim + } else if inDim.cout() < 4 { + var fourDimNum: [Int] = [] + for _ in 0..<(4 - inDim.cout()) { + fourDimNum.append(1) + } + fourDimNum.append(contentsOf: inDim.dims) + fourDim = Dim.init(inDim: fourDimNum) + } else { + fatalError(" not support ") + } + + tensorDim = inTensorDim + dim = fourDim + padToFourDim = fourDim } - tensorDim = inTensorDim - dim = fourDim - padToFourDim = fourDim - } - - // 初始化时 dim padToFourDim 模型中的维度(一般来说 nchw),前面补全0 - init(device: MTLDevice, inDim: Dim) { - print(" in dim > \(inDim)") - var fourDim: Dim - if inDim.cout() == 4 { - fourDim = inDim - } else if inDim.cout() < 4 { - var fourDimNum: [Int] = [] - for _ in 0..<(4 - inDim.cout()) { - fourDimNum.append(1) - } - fourDimNum.append(contentsOf: inDim.dims) - fourDim = Dim.init(inDim: fourDimNum) - } else { - fatalError(" not support ") + // 初始化时 dim padToFourDim 模型中的维度(一般来说 nchw),前面补全0 + init(device: MTLDevice, inDim: Dim) { + print(" in dim > \(inDim)") + var fourDim: Dim + if inDim.cout() == 4 { + fourDim = inDim + } else if inDim.cout() < 4 { + var fourDimNum: [Int] = [] + for _ in 0..<(4 - inDim.cout()) { + fourDimNum.append(1) + } + fourDimNum.append(contentsOf: inDim.dims) + fourDim = Dim.init(inDim: fourDimNum) + } else { + fatalError(" not support ") + } + tensorDim = inDim + dim = fourDim + padToFourDim = fourDim + layout = DataLayout.init([(.N, fourDim[0]), (.C, fourDim[1]), (.H, fourDim[2]), (.W, fourDim[3])]) } - tensorDim = inDim - dim = fourDim - padToFourDim = fourDim - layout = DataLayout.init([(.N, fourDim[0]), (.C, fourDim[1]), (.H, fourDim[2]), (.W, fourDim[3])]) - } - - private(set) var layout: DataLayout + + private(set) var layout: DataLayout } extension Texture { - public var description: String { - return debugDescription - } - - public var debugDescription: String{ - var str = "" - str += "Dim: \(dim) \n value:[ " - str += "\(metalTexture)" - str += " ]" - return str - } - + public var description: String { + return debugDescription + } + + public var debugDescription: String{ + var str = "" + str += "Dim: \(dim) \n value:[ " + str += "\(metalTexture)" + str += " ]" + return str + } + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift index fcedbd36f7..f16344e500 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift @@ -27,7 +27,7 @@ class OpCreator { } } - func creat(device: MTLDevice, opDesc: PMOpDesc, scope: Scope, initContext: InitContext) throws -> Runable & InferShaperable { + func creat(device: MTLDevice, opDesc: PMOpDesc, scope: Scope, initContext: InitContext) throws -> Runable & InferShaperable { guard let opCreator = opCreators[opDesc.type] else { throw PaddleMobileError.opError(message: "there is no " + opDesc.type + " yet") } @@ -69,6 +69,6 @@ class OpCreator { gConvAddAddPreluType : ConvAddAddPreluOp

.creat, gElementwiseAddPreluType : ElementwiseAddPreluOp

.creat, gFusionConvAddType : ConvAddOp

.creat] - + private init(){} } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpParam.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpParam.swift index 01c2216664..0af90e411b 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpParam.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpParam.swift @@ -22,199 +22,199 @@ import Foundation */ protocol OpParam { - associatedtype OutputType: Variant - var output: OutputType { get set } - func outputDesc() -> String - - //associatedtype ParamPrecisionType: PrecisionType - init(opDesc: PMOpDesc, inScope: Scope) throws - static func getFirstTensor(key: String, map: [String : [String]], from: Scope) throws -> VarType - static func inputX(inputs: [String : [String]], from: Scope) throws -> VarType - static func inputBiase(inputs: [String : [String]], from: Scope) throws -> VarType - static func inputMean(inputs: [String : [String]], from: Scope) throws -> VarType - static func inputScale(inputs: [String : [String]], from: Scope) throws -> VarType - static func inputVariance(inputs: [String : [String]], from: Scope) throws -> VarType - static func inputFilter(paraInputs: [String : [String]], from: Scope) throws -> VarType - static func input(inputs: [String : [String]], from: Scope) throws -> VarType - static func output(outputs: [String : [String]], from: Scope) throws -> VarType - static func outputY(outputs: [String : [String]], from: Scope) throws -> VarType - static func inputY(inputs: [String : [String]], from: Scope) throws -> VarType - - static func inputImage(inputs: [String : [String]], from: Scope) throws -> VarType - - static func outputBoxes(outputs: [String : [String]], from: Scope) throws -> VarType - - static func outputOut(outputs: [String : [String]], from: Scope) throws -> VarType - - static func outputVariances(outputs: [String : [String]], from: Scope) throws -> VarType - - static func getAttr(key: String, attrs: [String : Attr]) throws -> T - - static func paramInputAlpha(inputs: [String : [String]], from: Scope) throws -> VarType - + associatedtype OutputType: Variant + var output: OutputType { get set } + func outputDesc() -> String + + //associatedtype ParamPrecisionType: PrecisionType + init(opDesc: PMOpDesc, inScope: Scope) throws + static func getFirstTensor(key: String, map: [String : [String]], from: Scope) throws -> VarType + static func inputX(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputBiase(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputMean(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputScale(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputVariance(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputFilter(paraInputs: [String : [String]], from: Scope) throws -> VarType + static func input(inputs: [String : [String]], from: Scope) throws -> VarType + static func output(outputs: [String : [String]], from: Scope) throws -> VarType + static func outputY(outputs: [String : [String]], from: Scope) throws -> VarType + static func inputY(inputs: [String : [String]], from: Scope) throws -> VarType + + static func inputImage(inputs: [String : [String]], from: Scope) throws -> VarType + + static func outputBoxes(outputs: [String : [String]], from: Scope) throws -> VarType + + static func outputOut(outputs: [String : [String]], from: Scope) throws -> VarType + + static func outputVariances(outputs: [String : [String]], from: Scope) throws -> VarType + + static func getAttr(key: String, attrs: [String : Attr]) throws -> T + + static func paramInputAlpha(inputs: [String : [String]], from: Scope) throws -> VarType + } extension OpParam { - func outputDesc() -> String { - return output.debugDescription - } - - static func getFirstTensor(key: String, map: [String : [String]], from: Scope) throws -> VarType { - guard let mapKeys = map[key], mapKeys.count > 0 else { - throw PaddleMobileError.paramError(message: key + " not found in \(map) or maped values is empty") + func outputDesc() -> String { + return output.debugDescription + } + + static func getFirstTensor(key: String, map: [String : [String]], from: Scope) throws -> VarType { + guard let mapKeys = map[key], mapKeys.count > 0 else { + throw PaddleMobileError.paramError(message: key + " not found in \(map) or maped values is empty") + } + guard let variant = from[mapKeys[0]] else { + throw PaddleMobileError.paramError(message: mapKeys[0] + " not found in scope") + } + + guard let v = variant as? VarType else { + throw PaddleMobileError.paramError(message: " type error") + + } + return v } - guard let variant = from[mapKeys[0]] else { - throw PaddleMobileError.paramError(message: mapKeys[0] + " not found in scope") + + static func outputVariances(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorVariances: VarType = try getFirstTensor(key: "Variances", map: outputs, from: from) + return tensorVariances + } catch let error { + throw error + } } - guard let v = variant as? VarType else { - throw PaddleMobileError.paramError(message: " type error") - + static func paramInputAlpha(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let alphaTensor: VarType = try getFirstTensor(key: "Alpha", map: inputs, from: from) + return alphaTensor + } catch let error { + throw error + } + } + + + static func inputImage(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorImage: VarType = try getFirstTensor(key: "Image", map: inputs, from: from) + return tensorImage + } catch let error { + throw error + } + } + + static func inputX(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorX: VarType = try getFirstTensor(key: "X", map: inputs, from: from) + return tensorX + } catch let error { + throw error + } + } + + static func outputBoxes(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorBox: VarType = try getFirstTensor(key: "Boxes", map: outputs, from: from) + return tensorBox + } catch let error { + throw error + } + } + + static func input(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorInput: VarType = try getFirstTensor(key: "Input", map: inputs, from: from) + return tensorInput + } catch let error { + throw error + } + } + + static func output(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorOutput: VarType = try getFirstTensor(key: "Output", map: outputs, from: from) + return tensorOutput + } catch let error { + throw error + } + } + static func outputY(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorOutputY: VarType = try getFirstTensor(key: "Y", map: outputs, from: from) + return tensorOutputY + } catch let error { + throw error + } + } + static func inputY(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorY: VarType = try getFirstTensor(key: "Y", map: inputs, from: from) + return tensorY + } catch let error { + throw error + } + } + + static func outputOut(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let out: VarType = try getFirstTensor(key: "Out", map: outputs, from: from) + return out + } catch let error { + throw error + } + } + static func inputFilter(paraInputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorFilter: VarType = try getFirstTensor(key: "Filter", map: paraInputs, from: from) + return tensorFilter + } catch let error { + throw error + } + } + + static func inputBiase(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorBias: VarType = try getFirstTensor(key: "Bias", map: inputs, from: from) + return tensorBias + } catch let error { + throw error + } + } + + static func inputMean(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorMean: VarType = try getFirstTensor(key: "Mean", map: inputs, from: from) + return tensorMean + } catch let error { + throw error + } + } + + static func inputScale(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorScale: VarType = try getFirstTensor(key: "Scale", map: inputs, from: from) + return tensorScale + } catch let error { + throw error + } + } + + static func inputVariance(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorVariance: VarType = try getFirstTensor(key: "Variance", map: inputs, from: from) + return tensorVariance + } catch let error { + throw error + } + } + + static func getAttr(key: String, attrs: [String : Attr]) throws -> T{ + guard let attr = attrs[key] else { + throw PaddleMobileError.paramError(message: "attr \(key) can't found in: \(attrs)" ) + } + + guard let tAttr = attr as? T else { + throw PaddleMobileError.paramError(message: "key: \(key) attr: \(attr) type error" ) + } + return tAttr } - return v - } - - static func outputVariances(outputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorVariances: VarType = try getFirstTensor(key: "Variances", map: outputs, from: from) - return tensorVariances - } catch let error { - throw error - } - } - - static func paramInputAlpha(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let alphaTensor: VarType = try getFirstTensor(key: "Alpha", map: inputs, from: from) - return alphaTensor - } catch let error { - throw error - } - } - - - static func inputImage(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorImage: VarType = try getFirstTensor(key: "Image", map: inputs, from: from) - return tensorImage - } catch let error { - throw error - } - } - - static func inputX(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorX: VarType = try getFirstTensor(key: "X", map: inputs, from: from) - return tensorX - } catch let error { - throw error - } - } - - static func outputBoxes(outputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorBox: VarType = try getFirstTensor(key: "Boxes", map: outputs, from: from) - return tensorBox - } catch let error { - throw error - } - } - - static func input(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorInput: VarType = try getFirstTensor(key: "Input", map: inputs, from: from) - return tensorInput - } catch let error { - throw error - } - } - - static func output(outputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorOutput: VarType = try getFirstTensor(key: "Output", map: outputs, from: from) - return tensorOutput - } catch let error { - throw error - } - } - static func outputY(outputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorOutputY: VarType = try getFirstTensor(key: "Y", map: outputs, from: from) - return tensorOutputY - } catch let error { - throw error - } - } - static func inputY(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorY: VarType = try getFirstTensor(key: "Y", map: inputs, from: from) - return tensorY - } catch let error { - throw error - } - } - - static func outputOut(outputs: [String : [String]], from: Scope) throws -> VarType { - do { - let out: VarType = try getFirstTensor(key: "Out", map: outputs, from: from) - return out - } catch let error { - throw error - } - } - static func inputFilter(paraInputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorFilter: VarType = try getFirstTensor(key: "Filter", map: paraInputs, from: from) - return tensorFilter - } catch let error { - throw error - } - } - - static func inputBiase(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorBias: VarType = try getFirstTensor(key: "Bias", map: inputs, from: from) - return tensorBias - } catch let error { - throw error - } - } - - static func inputMean(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorMean: VarType = try getFirstTensor(key: "Mean", map: inputs, from: from) - return tensorMean - } catch let error { - throw error - } - } - - static func inputScale(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorScale: VarType = try getFirstTensor(key: "Scale", map: inputs, from: from) - return tensorScale - } catch let error { - throw error - } - } - - static func inputVariance(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorVariance: VarType = try getFirstTensor(key: "Variance", map: inputs, from: from) - return tensorVariance - } catch let error { - throw error - } - } - - static func getAttr(key: String, attrs: [String : Attr]) throws -> T{ - guard let attr = attrs[key] else { - throw PaddleMobileError.paramError(message: "attr \(key) can't found in: \(attrs)" ) - } - - guard let tAttr = attr as? T else { - throw PaddleMobileError.paramError(message: "key: \(key) attr: \(attr) type error" ) - } - return tAttr - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift index 532d1b661d..df7a765d2d 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift @@ -16,129 +16,129 @@ import Metal import Foundation protocol Fusion { - static func fusionNode() -> Node - static func change() -> [String : [(from: String, to: String)]] - static func fusionType() -> String - static func needCheck() -> [(Int, String)] + static func fusionNode() -> Node + static func change() -> [String : [(from: String, to: String)]] + static func fusionType() -> String + static func needCheck() -> [(Int, String)] } extension Fusion { - static func needCheck() -> [(Int, String)] { - return [] - } + static func needCheck() -> [(Int, String)] { + return [] + } } protocol Runable { - func run(device: MTLDevice, buffer: MTLCommandBuffer) throws - func runImpl(device: MTLDevice,buffer: MTLCommandBuffer) throws - func delogOutput() - func inputVariant() -> [String : [MTLBuffer]] - func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) + func run(device: MTLDevice, buffer: MTLCommandBuffer) throws + func runImpl(device: MTLDevice,buffer: MTLCommandBuffer) throws + func delogOutput() + func inputVariant() -> [String : [MTLBuffer]] + func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) } extension Runable where Self: OperatorProtocol{ - func run(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try runImpl(device: device, buffer: buffer) - } catch let error { - throw error + func run(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try runImpl(device: device, buffer: buffer) + } catch let error { + throw error + } + } + + func inputVariant() -> [String : [MTLBuffer]] { + // return [:] + fatalError(" op \(type) need implement inputVariant") + } + + func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) { + fatalError(" need implement ") } - } - - func inputVariant() -> [String : [MTLBuffer]] { -// return [:] - fatalError(" op \(type) need implement inputVariant") - } - - func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) { - fatalError(" need implement ") - } - - func delogOutput() { - print(type + ": has no implementation" ) - } + func delogOutput() { + + print(type + ": has no implementation" ) + } } public class InitContext { - /// metal 代码加载方式 - var metalLoadMode: MetalLoadMode = .LoadMetalInDefaultLib - /// 当 metalLoadMode 为 LoadMetalInCustomMetalLib 时, metal library 路径不能为空 - var metalLibPath: String? = nil - init() { - metalLoadMode = .LoadMetalInDefaultLib - metalLibPath = nil - } + /// metal 代码加载方式 + var metalLoadMode: MetalLoadMode = .LoadMetalInDefaultLib + /// 当 metalLoadMode 为 LoadMetalInCustomMetalLib 时, metal library 路径不能为空 + var metalLibPath: String? = nil + init() { + metalLoadMode = .LoadMetalInDefaultLib + metalLibPath = nil + } } protocol Creator where Self: OperatorProtocol{ - associatedtype OpType: OperatorProtocol & Runable & InferShaperable - static func creat(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> OpType + associatedtype OpType: OperatorProtocol & Runable & InferShaperable + static func creat(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> OpType } extension Creator where Self: OperatorProtocol { - static func creat(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> OpType { - do { - return try OpType.provide(device:device, opDesc: opDesc, inScope: inScope, initContext: initContext) - } catch let error { - throw error + static func creat(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> OpType { + do { + return try OpType.provide(device:device, opDesc: opDesc, inScope: inScope, initContext: initContext) + } catch let error { + throw error + } } - } } protocol InferShaperable { - func inferShape() + func inferShape() } protocol OperatorProtocol { - associatedtype ParamType - associatedtype KerType: Computable where Self.KerType.ParamType == ParamType - var type: String { get } - var scope: Scope { get } - var inputs: [String : [String]] { get } - var paraInputs: [String : [String]] { get set } - var outpus: [String : [String]] { get } - var attrs: [String : Attr] { get } - var para: ParamType { get } - var kernel: KerType { get } - init(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws + associatedtype ParamType + associatedtype KerType: Computable where Self.KerType.ParamType == ParamType + var type: String { get } + var scope: Scope { get } + var inputs: [String : [String]] { get } + var paraInputs: [String : [String]] { get set } + var outpus: [String : [String]] { get } + var attrs: [String : Attr] { get } + var para: ParamType { get } + var kernel: KerType { get } + init(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws } extension OperatorProtocol { - static func provide(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> Self { - do { - return try Self.init(device: device, opDesc: opDesc, inScope: inScope, initContext: initContext) - } catch let error { - throw error + static func provide(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> Self { + do { + return try Self.init(device: device, opDesc: opDesc, inScope: inScope, initContext: initContext) + } catch let error { + throw error + } } - } } class Operator : OperatorProtocol where KernelType.ParamType == ParameterType { - required init(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws { - type = opDesc.type - scope = inScope - inputs = opDesc.inputs - outpus = opDesc.outputs - attrs = opDesc.attrs - paraInputs = opDesc.paraInputs - do { - para = try ParamType.init(opDesc:opDesc, inScope: inScope) - } catch let error { - throw error + required init(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws { + type = opDesc.type + scope = inScope + inputs = opDesc.inputs + outpus = opDesc.outputs + attrs = opDesc.attrs + paraInputs = opDesc.paraInputs + do { + para = try ParamType.init(opDesc:opDesc, inScope: inScope) + } catch let error { + throw error + } + kernel = KernelType.init(device: device, param: para, initContext: initContext) } - kernel = KernelType.init(device: device, param: para, initContext: initContext) - } - - typealias ParamType = ParameterType - typealias KerType = KernelType - let type: String - let inputs: [String : [String]] - var paraInputs: [String : [String]] - let outpus: [String : [String]] - let attrs: [String : Attr] - let para: ParamType - let scope: Scope - var kernel: KerType + + typealias ParamType = ParameterType + typealias KerType = KernelType + let type: String + let inputs: [String : [String]] + var paraInputs: [String : [String]] + let outpus: [String : [String]] + let attrs: [String : Attr] + let para: ParamType + let scope: Scope + var kernel: KerType } // op infos @@ -202,4 +202,4 @@ let opInfos = [gConvType : (inputs: ["Input"], outputs: ["Out gConvAddAddPreluType : (inputs: ["Input"], outputs: ["Out"]), gElementwiseAddPreluType : (inputs: ["X"], outputs: ["Out"]), gFusionConvAddType : (inputs: ["Input"], outputs: ["Out"]) - ] +] diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/BatchNormOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/BatchNormOp.swift index a877620416..904e04c468 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/BatchNormOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/BatchNormOp.swift @@ -16,52 +16,52 @@ import Foundation import Metal class BatchNormParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try BatchNormParam.inputX(inputs: opDesc.inputs, from: inScope) - if input.transpose != [0, 2, 3, 1] { - fatalError("batch norm only accepts NHWC") - } - output = try BatchNormParam.outputY(outputs: opDesc.outputs, from: inScope) - bias = try BatchNormParam.getFirstTensor(key: "Bias", map: opDesc.paraInputs, from: inScope) - mean = try BatchNormParam.getFirstTensor(key: "Mean", map: opDesc.paraInputs, from: inScope) - scale = try BatchNormParam.getFirstTensor(key: "Scale", map: opDesc.paraInputs, from: inScope) - variance = try BatchNormParam.getFirstTensor(key: "Variance", map: opDesc.paraInputs, from: inScope) - epsilon = try BatchNormParam.getAttr(key: "epsilon", attrs: opDesc.attrs) - momentum = try BatchNormParam.getAttr(key: "momentum", attrs: opDesc.attrs) - } catch let error { - throw error + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try BatchNormParam.inputX(inputs: opDesc.inputs, from: inScope) + if input.transpose != [0, 2, 3, 1] { + fatalError("batch norm only accepts NHWC") + } + output = try BatchNormParam.outputY(outputs: opDesc.outputs, from: inScope) + bias = try BatchNormParam.getFirstTensor(key: "Bias", map: opDesc.paraInputs, from: inScope) + mean = try BatchNormParam.getFirstTensor(key: "Mean", map: opDesc.paraInputs, from: inScope) + scale = try BatchNormParam.getFirstTensor(key: "Scale", map: opDesc.paraInputs, from: inScope) + variance = try BatchNormParam.getFirstTensor(key: "Variance", map: opDesc.paraInputs, from: inScope) + epsilon = try BatchNormParam.getAttr(key: "epsilon", attrs: opDesc.attrs) + momentum = try BatchNormParam.getAttr(key: "momentum", attrs: opDesc.attrs) + } catch let error { + throw error + } } - } - let input: Texture - var output: Texture - let bias: Tensor

- let mean: Tensor

- let scale: Tensor

- let variance: Tensor

- let epsilon: Float - let momentum: Float + let input: Texture + var output: Texture + let bias: Tensor

+ let mean: Tensor

+ let scale: Tensor

+ let variance: Tensor

+ let epsilon: Float + let momentum: Float } class BatchNormOp: Operator, BatchNormParam

>, Runable, Creator, InferShaperable{ - typealias OpType = BatchNormOp

- - func inferShape() { - para.output.dim = para.input.dim - } - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + typealias OpType = BatchNormOp

+ + func inferShape() { + para.output.dim = para.input.dim + } + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) } - } - - func delogOutput() { - print(" \(type) output: ") - let device = para.output.metalTexture!.device - let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) - print(outputArray.strideArray()) - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/BilinearInterpOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/BilinearInterpOp.swift index a19dd10390..e44a49d900 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/BilinearInterpOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/BilinearInterpOp.swift @@ -16,50 +16,50 @@ import Foundation import Metal class BilinearInterpParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try BilinearInterpParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try BilinearInterpParam.outputOut(outputs: opDesc.outputs, from: inScope) - out_h = try BilinearInterpParam.getAttr(key: "out_h", attrs: opDesc.attrs) - out_w = try BilinearInterpParam.getAttr(key: "out_w", attrs: opDesc.attrs) - } catch let error { - throw error + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try BilinearInterpParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try BilinearInterpParam.outputOut(outputs: opDesc.outputs, from: inScope) + out_h = try BilinearInterpParam.getAttr(key: "out_h", attrs: opDesc.attrs) + out_w = try BilinearInterpParam.getAttr(key: "out_w", attrs: opDesc.attrs) + } catch let error { + throw error + } + if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) { + fatalError() + } } - if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) { - fatalError() - } - } - let input: Texture - var output: Texture - let out_h: Int - let out_w: Int + let input: Texture + var output: Texture + let out_h: Int + let out_w: Int } class BilinearInterpOp: Operator, BilinearInterpParam

>, Runable, Creator, InferShaperable{ - - typealias OpType = BilinearInterpOp

- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + typealias OpType = BilinearInterpOp

+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + // print(outputArray) + print(outputArray.strideArray()) } - } - - func delogOutput() { - print(" \(type) output: ") - let device = para.output.metalTexture!.device - let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) -// print(outputArray) - print(outputArray.strideArray()) - } - + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/BoxcoderOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/BoxcoderOp.swift index 4679885ab6..442d1af9ea 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/BoxcoderOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/BoxcoderOp.swift @@ -15,69 +15,69 @@ import Foundation class BoxcoderParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - priorBox = try BoxcoderParam.getFirstTensor(key: "PriorBox", map: opDesc.inputs, from: inScope) - priorBoxVar = try BoxcoderParam.getFirstTensor(key: "PriorBoxVar", map: opDesc.inputs, from: inScope) - targetBox = try BoxcoderParam.getFirstTensor(key: "TargetBox", map: opDesc.inputs, from: inScope) - output = try BoxcoderParam.getFirstTensor(key: "OutputBox", map: opDesc.outputs, from: inScope) - codeType = try BoxcoderParam.getAttr(key: "code_type", attrs: opDesc.attrs) - boxNormalized = try BoxcoderParam.getAttr(key: "box_normalized", attrs: opDesc.attrs) - } catch let error { - throw error + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + priorBox = try BoxcoderParam.getFirstTensor(key: "PriorBox", map: opDesc.inputs, from: inScope) + priorBoxVar = try BoxcoderParam.getFirstTensor(key: "PriorBoxVar", map: opDesc.inputs, from: inScope) + targetBox = try BoxcoderParam.getFirstTensor(key: "TargetBox", map: opDesc.inputs, from: inScope) + output = try BoxcoderParam.getFirstTensor(key: "OutputBox", map: opDesc.outputs, from: inScope) + codeType = try BoxcoderParam.getAttr(key: "code_type", attrs: opDesc.attrs) + boxNormalized = try BoxcoderParam.getAttr(key: "box_normalized", attrs: opDesc.attrs) + } catch let error { + throw error + } + assert(priorBox.tensorDim.cout() == 2) + assert(priorBoxVar.tensorDim.cout() == 2) + assert(targetBox.tensorDim.cout() == 3) + assert(output.tensorDim.cout() == 3) + assert(priorBox.transpose == [0, 1, 2, 3]) + assert(priorBoxVar.transpose == [0, 1, 2, 3]) + assert(targetBox.transpose == [0, 1, 2, 3]) + assert(codeType == "decode_center_size") // encode_center_size is not implemented + assert((targetBox.tensorDim.cout() == 3) && (targetBox.tensorDim[0] == 1)) // N must be 1 (only handle batch size = 1) } - assert(priorBox.tensorDim.cout() == 2) - assert(priorBoxVar.tensorDim.cout() == 2) - assert(targetBox.tensorDim.cout() == 3) - assert(output.tensorDim.cout() == 3) - assert(priorBox.transpose == [0, 1, 2, 3]) - assert(priorBoxVar.transpose == [0, 1, 2, 3]) - assert(targetBox.transpose == [0, 1, 2, 3]) - assert(codeType == "decode_center_size") // encode_center_size is not implemented - assert((targetBox.tensorDim.cout() == 3) && (targetBox.tensorDim[0] == 1)) // N must be 1 (only handle batch size = 1) - } - let priorBox: Texture - let priorBoxVar: Texture - let targetBox: Texture - var output: Texture - let codeType: String - let boxNormalized: Bool + let priorBox: Texture + let priorBoxVar: Texture + let targetBox: Texture + var output: Texture + let codeType: String + let boxNormalized: Bool } class BoxcoderOp: Operator, BoxcoderParam

>, Runable, Creator, InferShaperable{ - - typealias OpType = BoxcoderOp

- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + typealias OpType = BoxcoderOp

+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let pbv : [Float32] = device.texture2tensor(texture: para.priorBoxVar.metalTexture!, dim: para.priorBoxVar.tensorDim.dims, transpose: para.priorBoxVar.transpose) + let pb : [Float32] = device.texture2tensor(texture: para.priorBox.metalTexture!, dim: para.priorBox.tensorDim.dims, transpose: para.priorBox.transpose) + let tb : [Float32] = device.texture2tensor(texture: para.targetBox.metalTexture!, dim: para.targetBox.tensorDim.dims, transpose: para.targetBox.transpose) + let out : [Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(" prior box var ") + print(pbv.strideArray()) + print(" target box ") + print(tb.strideArray()) + print(" prior box ") + print(pb.strideArray()) + print(" output ") + print(out.strideArray()) } - } - - func delogOutput() { - print(" \(type) output: ") - let device = para.output.metalTexture!.device - let pbv : [Float32] = device.texture2tensor(texture: para.priorBoxVar.metalTexture!, dim: para.priorBoxVar.tensorDim.dims, transpose: para.priorBoxVar.transpose) - let pb : [Float32] = device.texture2tensor(texture: para.priorBox.metalTexture!, dim: para.priorBox.tensorDim.dims, transpose: para.priorBox.transpose) - let tb : [Float32] = device.texture2tensor(texture: para.targetBox.metalTexture!, dim: para.targetBox.tensorDim.dims, transpose: para.targetBox.transpose) - let out : [Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: para.output.transpose) - print(" prior box var ") - print(pbv.strideArray()) - print(" target box ") - print(tb.strideArray()) - print(" prior box ") - print(pb.strideArray()) - print(" output ") - print(out.strideArray()) - } - + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift index c2c22d55af..a8034c681f 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift @@ -15,62 +15,62 @@ import Foundation class ConcatParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - guard let xlist = opDesc.inputs["X"] else { - fatalError() - } - for x in xlist { - guard let variant = inScope[x], let v = variant as? Texture else { - fatalError() + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + guard let xlist = opDesc.inputs["X"] else { + fatalError() + } + for x in xlist { + guard let variant = inScope[x], let v = variant as? Texture else { + fatalError() + } + if transpose.count == 0 { + transpose = v.transpose + } + if v.transpose != transpose { + fatalError() + } + + input.append(v) + } + axis = try ConcatParam.getAttr(key: "axis", attrs: opDesc.attrs) + output = try ConcatParam.outputOut(outputs: opDesc.outputs, from: inScope) + } catch let error { + throw error } - if transpose.count == 0 { - transpose = v.transpose - } - if v.transpose != transpose { - fatalError() - } - - input.append(v) - } - axis = try ConcatParam.getAttr(key: "axis", attrs: opDesc.attrs) - output = try ConcatParam.outputOut(outputs: opDesc.outputs, from: inScope) - } catch let error { - throw error } - } - var input: [Texture] = [] - var output: Texture - var transpose: [Int] = [] - let axis: Int + var input: [Texture] = [] + var output: Texture + var transpose: [Int] = [] + let axis: Int } class ConcatOp: Operator, ConcatParam

>, Runable, Creator, InferShaperable{ - - typealias OpType = ConcatOp

- - func inferShape() { - // let dim = para.input.reduce([0, 0]) {[$0[0] + $1.dim[0], $1.dim[1]]} - // para.output.dim = Dim.init(inDim: dim) - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + typealias OpType = ConcatOp

+ + func inferShape() { + // let dim = para.input.reduce([0, 0]) {[$0[0] + $1.dim[0], $1.dim[1]]} + // para.output.dim = Dim.init(inDim: dim) + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) } - } - - func delogOutput() { - print(" \(type) output: ") - let device = para.output.metalTexture!.device - let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) - print(outputArray.strideArray()) - } - } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddAddPreluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddAddPreluOp.swift index 552d72f436..e7865045e5 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddAddPreluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddAddPreluOp.swift @@ -16,94 +16,94 @@ import Foundation import Metal class ConvAddAddPreluParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - filter = try ConvAddAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) - input = try ConvAddAddPreluParam.input(inputs: opDesc.inputs, from: inScope) - output = try ConvAddAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope) - stride = try ConvAddAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs) - paddings = try ConvAddAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs) - dilations = try ConvAddAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs) - groups = try ConvAddAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs) - alpha = try ConvAddAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) - mode = try ConvAddAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs) - y = try ConvAddAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) - } catch let error { - throw error + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + filter = try ConvAddAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvAddAddPreluParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvAddAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvAddAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvAddAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvAddAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs) + groups = try ConvAddAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs) + alpha = try ConvAddAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) + mode = try ConvAddAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs) + y = try ConvAddAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } } - } - - let input: Texture - let y: Tensor

- let filter: Tensor

- let mode: String - let alpha: Tensor

- var output: Texture - let stride: [Int32] - let paddings: [Int32] - let dilations: [Int32] - let groups: Int + + let input: Texture + let y: Tensor

+ let filter: Tensor

+ let mode: String + let alpha: Tensor

+ var output: Texture + let stride: [Int32] + let paddings: [Int32] + let dilations: [Int32] + let groups: Int } class ConvAddAddPreluOp: Operator, ConvAddAddPreluParam

>, Runable, Creator, InferShaperable, Fusion{ - typealias OpType = ConvAddAddPreluOp

- - static func fusionNode() -> Node { - let beginNode = Node.init(inType: gConvType) - _ = beginNode - --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType) - return beginNode - } - - static func change() -> [String : [(from: String, to: String)]] { - return [:] - } - - static func fusionType() -> String { - return gConvAddAddPreluType - } - - static func needCheck() -> [(Int, String)] { - return [(2, "Y"), (2, "X")] - } - - - - func inferShape() { - let inDims = para.input.dim - let filterDim = para.filter.dim - let strides = para.stride - let paddings = para.paddings - let dilations = para.dilations + typealias OpType = ConvAddAddPreluOp

+ + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType) + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gConvAddAddPreluType + } + + static func needCheck() -> [(Int, String)] { + return [(2, "Y"), (2, "X")] + } + + + + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - - filter = try ConvAddBatchNormReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) - input = try ConvAddBatchNormReluParam.input(inputs: opDesc.inputs, from: inScope) - output = try ConvAddBatchNormReluParam.outputOut(outputs: opDesc.outputs, from: inScope) - stride = try ConvAddBatchNormReluParam.getAttr(key: "strides", attrs: opDesc.attrs) - paddings = try ConvAddBatchNormReluParam.getAttr(key: "paddings", attrs: opDesc.attrs) - dilations = try ConvAddBatchNormReluParam.getAttr(key: "dilations", attrs: opDesc.attrs) - epsilon = try ConvAddBatchNormReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs) - - groups = try ConvAddBatchNormReluParam.getAttr(key: "groups", attrs: opDesc.attrs) - variance = try ConvAddBatchNormReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope) - bias = try ConvAddBatchNormReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope) - - scale = try ConvAddBatchNormReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope) - mean = try ConvAddBatchNormReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope) - y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.paraInputs, from: inScope) - } catch let error { - throw error + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + + filter = try ConvAddBatchNormReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvAddBatchNormReluParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvAddBatchNormReluParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvAddBatchNormReluParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvAddBatchNormReluParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvAddBatchNormReluParam.getAttr(key: "dilations", attrs: opDesc.attrs) + epsilon = try ConvAddBatchNormReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs) + + groups = try ConvAddBatchNormReluParam.getAttr(key: "groups", attrs: opDesc.attrs) + variance = try ConvAddBatchNormReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope) + bias = try ConvAddBatchNormReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope) + + scale = try ConvAddBatchNormReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope) + mean = try ConvAddBatchNormReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope) + y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } } - } - - let input: Texture - - let variance: Tensor

- let bias: Tensor

- let mean: Tensor

- let scale: Tensor

- let y: Tensor

- let filter: Tensor

- let epsilon: Float32 - var newScale: MTLBuffer? - var newBiase: MTLBuffer? - - var output: Texture - let stride: [Int32] - let paddings: [Int32] - let dilations: [Int32] - let groups: Int + + let input: Texture + + let variance: Tensor

+ let bias: Tensor

+ let mean: Tensor

+ let scale: Tensor

+ let y: Tensor

+ let filter: Tensor

+ let epsilon: Float32 + var newScale: MTLBuffer? + var newBiase: MTLBuffer? + + var output: Texture + let stride: [Int32] + let paddings: [Int32] + let dilations: [Int32] + let groups: Int } class ConvAddBatchNormReluOp: Operator, ConvAddBatchNormReluParam

>, Runable, Creator, InferShaperable, Fusion{ - - typealias OpType = ConvAddBatchNormReluOp

- - func inferShape() { - let inDims = para.input.dim - let filterDim = para.filter.dim - let strides = para.stride - let paddings = para.paddings - let dilations = para.dilations - var outDim = [inDims[0]] - for i in 0.. + + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0.. Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gElementwiseAddType) + --> Node.init(inType: gBatchNormType) + --> Node.init(inType: gReluType) + return beginNode } - } - - static func fusionNode() -> Node { - let beginNode = Node.init(inType: gConvType) - _ = beginNode - --> Node.init(inType: gElementwiseAddType) - --> Node.init(inType: gBatchNormType) - --> Node.init(inType: gReluType) - return beginNode - } - - static func change() -> [String : [(from: String, to: String)]] { - return [:] - } - - static func fusionType() -> String { - return gConvAddBatchNormReluType - } - - func delogOutput() { - print(" conv add batchnorm relu output ") - print(para.output.toTensor().strideArray()) - // let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false) - // para.filter.logDataPointer(header: "filter data pointer: ") - // print("filter: \(para.filter)") - // print("biase: \(para.y)") - // print("padding: \(para.paddings)") - // print("stride: \(para.stride)") + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } - // let _: P? = para.y.buffer?.logDesc(header: " biase: ", stridable: false) - // let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false) - // let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false) + static func fusionType() -> String { + return gConvAddBatchNormReluType + } - // let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false) - } + func delogOutput() { + print(" conv add batchnorm relu output ") + print(para.output.toTensor().strideArray()) + // let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false) + // para.filter.logDataPointer(header: "filter data pointer: ") + // print("filter: \(para.filter)") + + // print("biase: \(para.y)") + // print("padding: \(para.paddings)") + // print("stride: \(para.stride)") + + // let _: P? = para.y.buffer?.logDesc(header: " biase: ", stridable: false) + // let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false) + // let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false) + + // let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false) + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift index 923c2c210d..7b9958a066 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift @@ -15,103 +15,103 @@ import Foundation class ConvAddParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - filter = try ConvAddParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) - input = try ConvAddParam.input(inputs: opDesc.inputs, from: inScope) - output = try ConvAddParam.outputOut(outputs: opDesc.outputs, from: inScope) - stride = try ConvAddParam.getAttr(key: "strides", attrs: opDesc.attrs) - paddings = try ConvAddParam.getAttr(key: "paddings", attrs: opDesc.attrs) - dilations = try ConvAddParam.getAttr(key: "dilations", attrs: opDesc.attrs) - groups = try ConvAddParam.getAttr(key: "groups", attrs: opDesc.attrs) - - y = try ConvAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) - } catch let error { - throw error + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + filter = try ConvAddParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvAddParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvAddParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvAddParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvAddParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvAddParam.getAttr(key: "dilations", attrs: opDesc.attrs) + groups = try ConvAddParam.getAttr(key: "groups", attrs: opDesc.attrs) + + y = try ConvAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } } - } - - let input: Texture - let y: Tensor

- let filter: Tensor

- - var output: Texture - let stride: [Int32] - let paddings: [Int32] - let dilations: [Int32] - let groups: Int + + let input: Texture + let y: Tensor

+ let filter: Tensor

+ + var output: Texture + let stride: [Int32] + let paddings: [Int32] + let dilations: [Int32] + let groups: Int } class ConvAddOp: Operator, ConvAddParam

>, Runable, Creator, InferShaperable, Fusion{ - typealias OpType = ConvAddOp

- - static func fusionNode() -> Node { - let beginNode = Node.init(inType: gConvType) - _ = beginNode - --> Node.init(inType: gElementwiseAddType) - return beginNode - } - - static func change() -> [String : [(from: String, to: String)]] { - return [:] - } - - static func fusionType() -> String { - return gConvAddType - } - - func inferShape() { + typealias OpType = ConvAddOp

+ + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gElementwiseAddType) + return beginNode + } - let inDims = para.input.dim - let filterDim = para.filter.dim - let strides = para.stride - let paddings = para.paddings - let dilations = para.dilations + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } - var outDim = [inDims[0]] - for i in 0.. String { + return gConvAddType } - outDim.append(filterDim[0]) - para.output.dim = Dim.init(inDim: outDim) - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + func inferShape() { + + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - filter = try ConvAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) - input = try ConvAddPreluParam.input(inputs: opDesc.inputs, from: inScope) - output = try ConvAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope) - stride = try ConvAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs) - paddings = try ConvAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs) - dilations = try ConvAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs) - groups = try ConvAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs) - alpha = try ConvAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) - mode = try ConvAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs) - y = try ConvAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) - } catch let error { - throw error + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + filter = try ConvAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvAddPreluParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs) + groups = try ConvAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs) + alpha = try ConvAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) + mode = try ConvAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs) + y = try ConvAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } } - } - - let input: Texture - let y: Tensor

- let filter: Tensor

- let mode: String - let alpha: Tensor

- var output: Texture - let stride: [Int32] - let paddings: [Int32] - let dilations: [Int32] - let groups: Int + + let input: Texture + let y: Tensor

+ let filter: Tensor

+ let mode: String + let alpha: Tensor

+ var output: Texture + let stride: [Int32] + let paddings: [Int32] + let dilations: [Int32] + let groups: Int } class ConvAddPreluOp: Operator, ConvAddPreluParam

>, Runable, Creator, InferShaperable, Fusion{ - typealias OpType = ConvAddPreluOp

- - static func fusionNode() -> Node { - let beginNode = Node.init(inType: gConvType) - _ = beginNode - --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType) - return beginNode - } - - static func change() -> [String : [(from: String, to: String)]] { - return [:] - } - - static func fusionType() -> String { - return gConvAddPreluType - } - - func inferShape() { - let inDims = para.input.dim - let filterDim = para.filter.dim - let strides = para.stride - let paddings = para.paddings - let dilations = para.dilations + typealias OpType = ConvAddPreluOp

+ + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType) + return beginNode + } - var outDim = [inDims[0]] - for i in 0.. [String : [(from: String, to: String)]] { + return [:] } - outDim.append(filterDim[0]) - para.output.dim = Dim.init(inDim: outDim) - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + static func fusionType() -> String { + return gConvAddPreluType + } + + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - filter = try ConvBNReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) - input = try ConvBNReluParam.input(inputs: opDesc.inputs, from: inScope) - output = try ConvBNReluParam.outputOut(outputs: opDesc.outputs, from: inScope) - stride = try ConvBNReluParam.getAttr(key: "strides", attrs: opDesc.attrs) - paddings = try ConvBNReluParam.getAttr(key: "paddings", attrs: opDesc.attrs) - dilations = try ConvBNReluParam.getAttr(key: "dilations", attrs: opDesc.attrs) - epsilon = try ConvBNReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs) - - groups = try ConvBNReluParam.getAttr(key: "groups", attrs: opDesc.attrs) - variance = try ConvBNReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope) - bias = try ConvBNReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope) - scale = try ConvBNReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope) - mean = try ConvBNReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope) - } catch let error { - throw error + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + filter = try ConvBNReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvBNReluParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvBNReluParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvBNReluParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvBNReluParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvBNReluParam.getAttr(key: "dilations", attrs: opDesc.attrs) + epsilon = try ConvBNReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs) + + groups = try ConvBNReluParam.getAttr(key: "groups", attrs: opDesc.attrs) + variance = try ConvBNReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope) + bias = try ConvBNReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope) + scale = try ConvBNReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope) + mean = try ConvBNReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } } - } - - let input: Texture - let variance: Tensor

- let bias: Tensor

- let mean: Tensor

- let scale: Tensor

- let filter: Tensor

- let epsilon: Float32 - var newScale: MTLBuffer? - var newBiase: MTLBuffer? - - var output: Texture - let stride: [Int32] - let paddings: [Int32] - let dilations: [Int32] - let groups: Int + + let input: Texture + let variance: Tensor

+ let bias: Tensor

+ let mean: Tensor

+ let scale: Tensor

+ let filter: Tensor

+ let epsilon: Float32 + var newScale: MTLBuffer? + var newBiase: MTLBuffer? + + var output: Texture + let stride: [Int32] + let paddings: [Int32] + let dilations: [Int32] + let groups: Int } class ConvBNReluOp: Operator, ConvBNReluParam

>, Runable, Creator, InferShaperable, Fusion{ - typealias OpType = ConvBNReluOp

- - func inputs() -> [Variant] { - return [para.input, para.variance, para.bias, para.mean, para.scale, para.filter] - } - - - func inferShape() { - let inDims = para.input.dim - let filterDim = para.filter.dim - let strides = para.stride - let paddings = para.paddings - let dilations = para.dilations + typealias OpType = ConvBNReluOp

+ + func inputs() -> [Variant] { + return [para.input, para.variance, para.bias, para.mean, para.scale, para.filter] + } - var outDim = [inDims[0]] - for i in 0.. Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gBatchNormType) + --> Node.init(inType: gReluType) + return beginNode } - } - - static func fusionNode() -> Node { - let beginNode = Node.init(inType: gConvType) - _ = beginNode - --> Node.init(inType: gBatchNormType) - --> Node.init(inType: gReluType) - return beginNode - } - - static func change() -> [String : [(from: String, to: String)]] { - return [:] - } - - static func fusionType() -> String { - return gConvBnReluType - } - - func delogOutput() { - print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) - } - + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gConvBnReluType + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) + } + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift index c66813b166..2d402ae431 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift @@ -15,67 +15,67 @@ import Foundation class ConvParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - filter = try ConvParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) - input = try ConvParam.input(inputs: opDesc.inputs, from: inScope) - output = try ConvParam.output(outputs: opDesc.outputs, from: inScope) - stride = try ConvParam.getAttr(key: "strides", attrs: opDesc.attrs) - paddings = try ConvParam.getAttr(key: "paddings", attrs: opDesc.attrs) - dilations = try ConvParam.getAttr(key: "dilations", attrs: opDesc.attrs) - groups = try ConvParam.getAttr(key: "groups", attrs: opDesc.attrs) - - } catch let error { - throw error + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + filter = try ConvParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvParam.output(outputs: opDesc.outputs, from: inScope) + stride = try ConvParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvParam.getAttr(key: "dilations", attrs: opDesc.attrs) + groups = try ConvParam.getAttr(key: "groups", attrs: opDesc.attrs) + + } catch let error { + throw error + } } - } - - let input: Texture - let filter: Tensor

- var output: Texture - let stride: [Int32] - let paddings: [Int32] - let dilations: [Int32] - let groups: Int + + let input: Texture + let filter: Tensor

+ var output: Texture + let stride: [Int32] + let paddings: [Int32] + let dilations: [Int32] + let groups: Int } class ConvOp: Operator, ConvParam

>, Runable, Creator, InferShaperable { - typealias OpType = ConvOp

- - func inferShape() { - let inDims = para.input.dim - let filterDim = para.filter.dim - let strides = para.stride - let paddings = para.paddings - let dilations = para.dilations + typealias OpType = ConvOp

- var outDim = [inDims[0]] - for i in 0..: ConvParam

{ - //typealias ParamPrecisionType = P + //typealias ParamPrecisionType = P required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - try super.init(opDesc: opDesc, inScope: inScope) - } catch let error { - throw error + do { + try super.init(opDesc: opDesc, inScope: inScope) + } catch let error { + throw error + } } - } } class ConvTransposeOp: Operator, ConvTransposeParam

>, Runable, Creator, InferShaperable{ - - typealias OpType = ConvTransposeOp

- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + typealias OpType = ConvTransposeOp

+ + func inferShape() { + // para.output.dim = para.input.dim } - } - - func delogOutput() { - - print(" \(type) output: ") - let padToFourDim = para.output.padToFourDim - if para.output.transpose == [0, 1, 2, 3] { - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) - print(outputArray.strideArray()) - } else if para.output.transpose == [0, 2, 3, 1] { - let output = para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])) - print(output.strideArray()) - } else { - print(" not implement") + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + + print(" \(type) output: ") + let padToFourDim = para.output.padToFourDim + if para.output.transpose == [0, 1, 2, 3] { + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) + print(outputArray.strideArray()) + } else if para.output.transpose == [0, 2, 3, 1] { + let output = para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])) + print(output.strideArray()) + } else { + print(" not implement") + } } - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/DepthwiseConvOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/DepthwiseConvOp.swift index 96818a9fd8..4686501fdd 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/DepthwiseConvOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/DepthwiseConvOp.swift @@ -15,41 +15,41 @@ import Foundation class DepthConvOp: Operator, ConvParam

>, Runable, Creator, InferShaperable { - - typealias OpType = DepthConvOp

- - func inferShape() { - let inDims = para.input.dim - let filterDim = para.filter.dim - let strides = para.stride - let paddings = para.paddings - let dilations = para.dilations - var outDim = [inDims[0]] - for i in 0.. + + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..: Operator, ConvBNReluParam

>, Runable, Creator, InferShaperable, Fusion{ - typealias OpType = ConvBNReluOp

- - func inferShape() { - let inDims = para.input.dim - let filterDim = para.filter.dim - let strides = para.stride - let paddings = para.paddings - let dilations = para.dilations + typealias OpType = ConvBNReluOp

- var outDim = [inDims[0]] - for i in 0.. Node { + let beginNode = Node.init(inType: gDepthConvType) + _ = beginNode + --> Node.init(inType: gBatchNormType) + --> Node.init(inType: gReluType) + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gDwConvBnReluType + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) } - } - - static func fusionNode() -> Node { - let beginNode = Node.init(inType: gDepthConvType) - _ = beginNode - --> Node.init(inType: gBatchNormType) - --> Node.init(inType: gReluType) - return beginNode - } - - static func change() -> [String : [(from: String, to: String)]] { - return [:] - } - - static func fusionType() -> String { - return gDwConvBnReluType - } - - func delogOutput() { - print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddOp.swift index 5fa69d4f44..cd5307b584 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddOp.swift @@ -16,80 +16,80 @@ import Foundation import Metal class ElementwiseAddParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - inputX = try ElementwiseAddParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try ElementwiseAddParam.outputOut(outputs: opDesc.outputs, from: inScope) - axis = try ElementwiseAddParam.getAttr(key: "axis", attrs: opDesc.attrs) - } catch let error { - throw error + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + inputX = try ElementwiseAddParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try ElementwiseAddParam.outputOut(outputs: opDesc.outputs, from: inScope) + axis = try ElementwiseAddParam.getAttr(key: "axis", attrs: opDesc.attrs) + } catch let error { + throw error + } + do { + inputY = try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch _ { + let tensorY: Tensor

= try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) + let device = inputX.metalTexture!.device + inputY = Texture.init(device: device, inDim: tensorY.dim) + let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel())) + inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision) + } + + // required init(device: MTLDevice, param: ElementwiseAddParam

) { + // param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) + // if computePrecision == .Float32 { + // super.init(device: device, inFunctionName: "elementwise_add") + // } else if computePrecision == .Float16 { + // super.init(device: device, inFunctionName: "elementwise_add_half") + // } else { + // fatalError() + // } + // } + + var offset = axis + if axis == -1 { + offset = inputX.tensorDim.cout() - inputY.tensorDim.cout() + } + for i in 0..<(inputY.tensorDim.cout()) { + assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i]) + } } - do { - inputY = try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) - } catch _ { - let tensorY: Tensor

= try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) - let device = inputX.metalTexture!.device - inputY = Texture.init(device: device, inDim: tensorY.dim) - let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel())) - inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision) - } - -// required init(device: MTLDevice, param: ElementwiseAddParam

) { -// param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) -// if computePrecision == .Float32 { -// super.init(device: device, inFunctionName: "elementwise_add") -// } else if computePrecision == .Float16 { -// super.init(device: device, inFunctionName: "elementwise_add_half") -// } else { -// fatalError() -// } -// } - var offset = axis - if axis == -1 { - offset = inputX.tensorDim.cout() - inputY.tensorDim.cout() - } - for i in 0..<(inputY.tensorDim.cout()) { - assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i]) - } - } - - var inputX: Texture - var inputY: Texture - var output: Texture - var axis: Int + var inputX: Texture + var inputY: Texture + var output: Texture + var axis: Int } class ElementwiseAddOp: Operator, ElementwiseAddParam

>, Runable, Creator, InferShaperable{ - typealias OpType = ElementwiseAddOp

- - func inferShape() { -// para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + typealias OpType = ElementwiseAddOp

+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } } - } - - func delogOutput() { - print(" \(type) output: ") - print(para.output) - let padToFourDim = para.output.padToFourDim - if para.output.transpose == [0, 1, 2, 3] { - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) - print(outputArray.strideArray()) - } else if para.output.transpose == [0, 2, 3, 1] { - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) - } else { - print(" not implement") + func delogOutput() { + print(" \(type) output: ") + print(para.output) + + let padToFourDim = para.output.padToFourDim + if para.output.transpose == [0, 1, 2, 3] { + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) + print(outputArray.strideArray()) + } else if para.output.transpose == [0, 2, 3, 1] { + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + } else { + print(" not implement") + } } - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddPreluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddPreluOp.swift index 6a49d7bfa2..bd853f6c0f 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddPreluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddPreluOp.swift @@ -16,101 +16,101 @@ import Foundation import Metal class ElementwiseAddPreluParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - alpha = try ElementwiseAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) - mode = try ElementwiseAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs) - inputX = try ElementwiseAddPreluParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try ElementwiseAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope) - axis = try ElementwiseAddPreluParam.getAttr(key: "axis", attrs: opDesc.attrs) - } catch let error { - throw error + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + alpha = try ElementwiseAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) + mode = try ElementwiseAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs) + inputX = try ElementwiseAddPreluParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try ElementwiseAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope) + axis = try ElementwiseAddPreluParam.getAttr(key: "axis", attrs: opDesc.attrs) + } catch let error { + throw error + } + do { + inputY = try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch _ { + let tensorY: Tensor

= try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + let device = inputX.metalTexture!.device + inputY = Texture.init(device: device, inDim: tensorY.dim) + let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel())) + inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision) + } + + // required init(device: MTLDevice, param: ElementwiseAddParam

) { + // param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) + // if computePrecision == .Float32 { + // super.init(device: device, inFunctionName: "elementwise_add") + // } else if computePrecision == .Float16 { + // super.init(device: device, inFunctionName: "elementwise_add_half") + // } else { + // fatalError() + // } + // } + + var offset = axis + if axis == -1 { + offset = inputX.tensorDim.cout() - inputY.tensorDim.cout() + } + for i in 0..<(inputY.tensorDim.cout()) { + assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i]) + } } - do { - inputY = try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) - } catch _ { - let tensorY: Tensor

= try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) - let device = inputX.metalTexture!.device - inputY = Texture.init(device: device, inDim: tensorY.dim) - let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel())) - inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision) + + let mode: String + let alpha: Tensor

+ var inputX: Texture + var inputY: Texture + var output: Texture + var axis: Int +} + +class ElementwiseAddPreluOp: Operator, ElementwiseAddPreluParam

>, Runable, Creator, InferShaperable, Fusion{ + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gElementwiseAddType) + _ = beginNode + --> Node.init(inType: gPreluType) + return beginNode } - // required init(device: MTLDevice, param: ElementwiseAddParam

) { - // param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) - // if computePrecision == .Float32 { - // super.init(device: device, inFunctionName: "elementwise_add") - // } else if computePrecision == .Float16 { - // super.init(device: device, inFunctionName: "elementwise_add_half") - // } else { - // fatalError() - // } - // } + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } - var offset = axis - if axis == -1 { - offset = inputX.tensorDim.cout() - inputY.tensorDim.cout() + static func fusionType() -> String { + return gElementwiseAddPreluType } - for i in 0..<(inputY.tensorDim.cout()) { - assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i]) + + typealias OpType = ElementwiseAddPreluOp

+ + func inferShape() { + // para.output.dim = para.input.dim } - } - - let mode: String - let alpha: Tensor

- var inputX: Texture - var inputY: Texture - var output: Texture - var axis: Int -} - -class ElementwiseAddPreluOp: Operator, ElementwiseAddPreluParam

>, Runable, Creator, InferShaperable, Fusion{ - static func fusionNode() -> Node { - let beginNode = Node.init(inType: gElementwiseAddType) - _ = beginNode - --> Node.init(inType: gPreluType) - return beginNode - } - - static func change() -> [String : [(from: String, to: String)]] { - return [:] - } - - static func fusionType() -> String { - return gElementwiseAddPreluType - } - - typealias OpType = ElementwiseAddPreluOp

- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } } - } - - - - func delogOutput() { - print(" \(type) output: ") - print(para.output) - let padToFourDim = para.output.padToFourDim - if para.output.transpose == [0, 1, 2, 3] { - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) - print(outputArray.strideArray()) - } else if para.output.transpose == [0, 2, 3, 1] { - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) - } else { - print(" not implement") + + + func delogOutput() { + print(" \(type) output: ") + print(para.output) + + let padToFourDim = para.output.padToFourDim + if para.output.transpose == [0, 1, 2, 3] { + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) + print(outputArray.strideArray()) + } else if para.output.transpose == [0, 2, 3, 1] { + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + } else { + print(" not implement") + } } - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift index 46defcb583..bab3d8dce7 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift @@ -17,54 +17,54 @@ import MetalKit import CoreMedia class FeedParam: OpParam{ - var output: Texture - var input: InputTexture { - return scope.input() as! InputTexture - } - let scope: Scope - - required init(opDesc: PMOpDesc, inScope: Scope) throws { - scope = inScope - do { - output = try FeedParam.outputOut(outputs: opDesc.outputs, from: inScope) - } catch let error { - throw error + var output: Texture + var input: InputTexture { + return scope.input() as! InputTexture } - } - - //typealias ParamPrecisionType = P + let scope: Scope + + required init(opDesc: PMOpDesc, inScope: Scope) throws { + scope = inScope + do { + output = try FeedParam.outputOut(outputs: opDesc.outputs, from: inScope) + } catch let error { + throw error + } + } + + //typealias ParamPrecisionType = P } class FeedOp: Operator, FeedParam

>, Runable, Creator, InferShaperable { - typealias OpType = FeedOp

- - func inferShape() { - // print("feed input: \(para.input.expectDim)") - print("feed output: \(para.output.dim)") - // para.output.dim = - // para.output.dim = para.input.expectDim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + typealias OpType = FeedOp

+ + func inferShape() { + // print("feed input: \(para.input.expectDim)") + print("feed output: \(para.output.dim)") + // para.output.dim = + // para.output.dim = para.input.expectDim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + + // let resizeKernel = ResizeKernel

.init(device: device) + // let resizeParam = ResizeParam.init(input: para.input.mtlTexture, output: para.output.metalTexture, expectDim: para.input.expectDim) + // do { + // try resizeKernel.compute(commandBuffer: buffer, param: resizeParam) + // } catch let error { + // throw error + // } } - // let resizeKernel = ResizeKernel

.init(device: device) - // let resizeParam = ResizeParam.init(input: para.input.mtlTexture, output: para.output.metalTexture, expectDim: para.input.expectDim) - // do { - // try resizeKernel.compute(commandBuffer: buffer, param: resizeParam) - // } catch let error { - // throw error - // } - } - - func delogOutput() { - print(" \(type) output: ") - print(para.output.metalTexture) - print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[3], h: para.output.padToFourDim[2], w: para.output.padToFourDim[1])).strideArray()) - } + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture) + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[3], h: para.output.padToFourDim[2], w: para.output.padToFourDim[1])).strideArray()) + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift index a5d04a4b03..671c2f33fa 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift @@ -16,43 +16,43 @@ import Foundation import Metal class FetchParam: OpParam{ - var output: FetchHolder - let input: Texture - let scope: Scope - required init(opDesc: PMOpDesc, inScope: Scope) throws { - scope = inScope - do { - input = try FetchParam.inputX(inputs: opDesc.inputs, from: inScope) - output = FetchHolder.init(inPaddedCapacity: input.elementCount(), inDim: input.tensorDim) - scope.setOutput(output: output) - } catch let error { - throw error + var output: FetchHolder + let input: Texture + let scope: Scope + required init(opDesc: PMOpDesc, inScope: Scope) throws { + scope = inScope + do { + input = try FetchParam.inputX(inputs: opDesc.inputs, from: inScope) + output = FetchHolder.init(inPaddedCapacity: input.elementCount(), inDim: input.tensorDim) + scope.setOutput(output: output) + } catch let error { + throw error + } } - } - - //typealias ParamPrecisionType = P + + //typealias ParamPrecisionType = P } class FetchOp: Operator< FetchKernel

, FetchParam

>, Runable, Creator, InferShaperable { - - typealias OpType = FetchOp

- - func inferShape() { - print(para.input.dim) - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + typealias OpType = FetchOp

+ + func inferShape() { + print(para.input.dim) + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print("fetch output: ") + let resArr = self.para.output.result.floatArr(count: self.para.output.capacity) + print(resArr.strideArray()) } - } - - func delogOutput() { - print("fetch output: ") - let resArr = self.para.output.result.floatArr(count: self.para.output.capacity) - print(resArr.strideArray()) - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/FlattenOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/FlattenOp.swift index 8500798adc..b982990851 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/FlattenOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/FlattenOp.swift @@ -15,45 +15,45 @@ import Foundation class FlattenParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try FlattenParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try FlattenParam.outputOut(outputs: opDesc.outputs, from: inScope) - axis = try FlattenParam.getAttr(key: "axis", attrs: opDesc.attrs) - } catch let error { - throw error + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try FlattenParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try FlattenParam.outputOut(outputs: opDesc.outputs, from: inScope) + axis = try FlattenParam.getAttr(key: "axis", attrs: opDesc.attrs) + } catch let error { + throw error + } } - } - let input: Texture - var output: Texture - let axis: Int + let input: Texture + var output: Texture + let axis: Int } class FlattenOp: Operator, FlattenParam

>, Runable, Creator, InferShaperable{ - - typealias OpType = FlattenOp

- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + typealias OpType = FlattenOp

+ + func inferShape() { + // para.output.dim = para.input.dim } - } - - func delogOutput() { - print(" \(type) output: ") - let device = para.output.metalTexture!.device - let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) - print(outputArray.strideArray()) - } - + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) + } + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift index a7aaa9eddc..43ce7927eb 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift @@ -19,125 +19,125 @@ public protocol TestParam { } public protocol Testable { - associatedtype TestParamType: TestParam - func test(commandBuffer: MTLCommandBuffer, param: TestParamType) - init(device: MTLDevice, testParam: TestParamType, initContext: InitContext) + associatedtype TestParamType: TestParam + func test(commandBuffer: MTLCommandBuffer, param: TestParamType) + init(device: MTLDevice, testParam: TestParamType, initContext: InitContext) } protocol Computable { - associatedtype ParamType: OpParam - func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws - init(device: MTLDevice, param: ParamType, initContext: InitContext) + associatedtype ParamType: OpParam + func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws + init(device: MTLDevice, param: ParamType, initContext: InitContext) } protocol KernelProtocol { - var pipline: MTLComputePipelineState { get set } - var functionName: String { get set } - + var pipline: MTLComputePipelineState { get set } + var functionName: String { get set } + } @objc open class Kernel: NSObject{ - let pipline: MTLComputePipelineState - let functionName: String - public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = false, initContext: InitContext) { - pipline = device.pipeLine(funcName: inFunctionName, metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath) - functionName = inFunctionName - } + let pipline: MTLComputePipelineState + let functionName: String + public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = false, initContext: InitContext) { + pipline = device.pipeLine(funcName: inFunctionName, metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath) + functionName = inFunctionName + } } @objc public class Shape: NSObject { - public let width: Int - public let height: Int - public let channel: Int - @objc public init(inWidth: Int, inHeight: Int, inChannel: Int){ - width = inWidth - height = inHeight - channel = inChannel - } + public let width: Int + public let height: Int + public let channel: Int + @objc public init(inWidth: Int, inHeight: Int, inChannel: Int){ + width = inWidth + height = inHeight + channel = inChannel + } } open class BufferToTextureKernel: Kernel { - public let outputTexture: MTLTexture - - public init(device: MTLDevice, outputDim: Shape, metalLoadMode: MetalLoadMode, metalLibPath: String?) { - let textureDesc = MTLTextureDescriptor.init() - textureDesc.textureType = .type2D - textureDesc.width = outputDim.width - textureDesc.height = outputDim.height - textureDesc.depth = (outputDim.channel + 3) / 4 + public let outputTexture: MTLTexture - if GlobalConfig.shared.computePrecision == .Float16 { - textureDesc.pixelFormat = .rgba16Float - } else if GlobalConfig.shared.computePrecision == .Float32 { - textureDesc.pixelFormat = .rgba32Float - } else { - fatalError() + public init(device: MTLDevice, outputDim: Shape, metalLoadMode: MetalLoadMode, metalLibPath: String?) { + let textureDesc = MTLTextureDescriptor.init() + textureDesc.textureType = .type2D + textureDesc.width = outputDim.width + textureDesc.height = outputDim.height + textureDesc.depth = (outputDim.channel + 3) / 4 + + if GlobalConfig.shared.computePrecision == .Float16 { + textureDesc.pixelFormat = .rgba16Float + } else if GlobalConfig.shared.computePrecision == .Float32 { + textureDesc.pixelFormat = .rgba32Float + } else { + fatalError() + } + + textureDesc.usage = [.shaderRead, .shaderWrite] + textureDesc.storageMode = .shared + outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error " + let initContext = InitContext.init() + initContext.metalLibPath = metalLibPath + initContext.metalLoadMode = metalLoadMode + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "buffer_to_texture_kernel", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "buffer_to_texture_kernel_half", initContext: initContext) + } } - textureDesc.usage = [.shaderRead, .shaderWrite] - textureDesc.storageMode = .shared - outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error " - let initContext = InitContext.init() - initContext.metalLibPath = metalLibPath - initContext.metalLoadMode = metalLoadMode - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "buffer_to_texture_kernel", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "buffer_to_texture_kernel_half", initContext: initContext) - } - } - - public func compute(inputBuffer: MTLBuffer , commandBuffer: MTLCommandBuffer) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + public func compute(inputBuffer: MTLBuffer , commandBuffer: MTLCommandBuffer) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setBuffer(inputBuffer, offset: 0, index: 0) + encoder.setTexture(outputTexture, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: outputTexture) + encoder.endEncoding() } - encoder.setBuffer(inputBuffer, offset: 0, index: 0) - encoder.setTexture(outputTexture, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: outputTexture) - encoder.endEncoding() - } - } @objc open class CusomKernel: Kernel { - - public let outputTexture: MTLTexture - public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, metalLoadModel: MetalLoadMode, metalLibPath: String?) { - let textureDesc = MTLTextureDescriptor.init() - textureDesc.textureType = .type2D - textureDesc.width = outputDim.width - textureDesc.height = outputDim.height - textureDesc.depth = (outputDim.channel + 3) / 4 - if GlobalConfig.shared.computePrecision == .Float16 { - textureDesc.pixelFormat = .rgba16Float - } else if GlobalConfig.shared.computePrecision == .Float32 { - textureDesc.pixelFormat = .rgba32Float - } else { - fatalError() + public let outputTexture: MTLTexture + public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, metalLoadModel: MetalLoadMode, metalLibPath: String?) { + let textureDesc = MTLTextureDescriptor.init() + textureDesc.textureType = .type2D + textureDesc.width = outputDim.width + textureDesc.height = outputDim.height + textureDesc.depth = (outputDim.channel + 3) / 4 + + if GlobalConfig.shared.computePrecision == .Float16 { + textureDesc.pixelFormat = .rgba16Float + } else if GlobalConfig.shared.computePrecision == .Float32 { + textureDesc.pixelFormat = .rgba32Float + } else { + fatalError() + } + + textureDesc.usage = [.shaderRead, .shaderWrite] + textureDesc.storageMode = .shared + outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error " + + let context = InitContext.init() + context.metalLoadMode = metalLoadModel + context.metalLibPath = metalLibPath + super.init(device: device, inFunctionName: inFunctionName, initContext: context) } - textureDesc.usage = [.shaderRead, .shaderWrite] - textureDesc.storageMode = .shared - outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error " - - let context = InitContext.init() - context.metalLoadMode = metalLoadModel - context.metalLibPath = metalLibPath - super.init(device: device, inFunctionName: inFunctionName, initContext: context) - } - - public func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + public func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(inputTexuture, index: 0) + encoder.setTexture(outputTexture, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: outputTexture) + encoder.endEncoding() } - encoder.setTexture(inputTexuture, index: 0) - encoder.setTexture(outputTexture, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: outputTexture) - encoder.endEncoding() - } - + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormKernel.swift index 9eeb2aff9c..0e2005b024 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormKernel.swift @@ -15,39 +15,39 @@ import Foundation class BatchNormKernel: Kernel, Computable { - required init(device: MTLDevice, param: BatchNormParam

, initContext: InitContext) { - let count = param.variance.dim.numel() - let varianceP = param.variance.data.pointer - let meanP = param.mean.data.pointer - let scaleP = param.scale.data.pointer - let biasP = param.bias.data.pointer - for i in 0.., initContext: InitContext) { + let count = param.variance.dim.numel() + let varianceP = param.variance.data.pointer + let meanP = param.mean.data.pointer + let scaleP = param.scale.data.pointer + let biasP = param.bias.data.pointer + for i in 0..) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") + + func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBuffer(param.scale.buffer, offset: 0, index: 0) + encoder.setBuffer(param.bias.buffer, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBuffer(param.scale.buffer, offset: 0, index: 0) - encoder.setBuffer(param.bias.buffer, offset: 0, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BilinearInterpKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BilinearInterpKernel.swift index 0db2e98651..c8a6519085 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BilinearInterpKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BilinearInterpKernel.swift @@ -15,41 +15,41 @@ import Foundation struct BilinearInterpMetalParam { - var ratio_h: Float32 - var ratio_w: Float32 + var ratio_h: Float32 + var ratio_w: Float32 } class BilinearInterpKernel: Kernel, Computable{ - func compute(commandBuffer: MTLCommandBuffer, param: BilinearInterpParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + func compute(commandBuffer: MTLCommandBuffer, param: BilinearInterpParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + var ratio_h: Float32 = 0 + var ratio_w: Float32 = 0 + if param.output.tensorDim.dims[2] > 1 { + ratio_h = Float32(param.input.tensorDim.dims[2]-1) / Float32(param.output.tensorDim.dims[2]-1) + } + if param.output.tensorDim.dims[3] > 1 { + ratio_w = Float32(param.input.tensorDim.dims[3]-1) / Float32(param.output.tensorDim.dims[3]-1) + } + var p = BilinearInterpMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w) + encoder.setBytes(&p, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - var ratio_h: Float32 = 0 - var ratio_w: Float32 = 0 - if param.output.tensorDim.dims[2] > 1 { - ratio_h = Float32(param.input.tensorDim.dims[2]-1) / Float32(param.output.tensorDim.dims[2]-1) + required init(device: MTLDevice, param: BilinearInterpParam

, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "bilinear_interp_float", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "bilinear_interp_half", initContext: initContext) + } else { + fatalError() + } } - if param.output.tensorDim.dims[3] > 1 { - ratio_w = Float32(param.input.tensorDim.dims[3]-1) / Float32(param.output.tensorDim.dims[3]-1) - } - var p = BilinearInterpMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w) - encoder.setBytes(&p, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - - required init(device: MTLDevice, param: BilinearInterpParam

, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "bilinear_interp_float", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "bilinear_interp_half", initContext: initContext) - } else { - fatalError() - } - } - + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BoxcoderKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BoxcoderKernel.swift index 6e528a5965..8f295672c1 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BoxcoderKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BoxcoderKernel.swift @@ -18,29 +18,29 @@ struct BoxcoderMetalParam { } class BoxcoderKernel: Kernel, Computable{ - func compute(commandBuffer: MTLCommandBuffer, param: BoxcoderParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + func compute(commandBuffer: MTLCommandBuffer, param: BoxcoderParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.priorBox.metalTexture, index: 0) + encoder.setTexture(param.priorBoxVar.metalTexture, index: 1) + encoder.setTexture(param.targetBox.metalTexture, index: 2) + encoder.setTexture(param.output.metalTexture, index: 3) + var bmp = BoxcoderMetalParam.init() + encoder.setBytes(&bmp, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.priorBox.metalTexture, index: 0) - encoder.setTexture(param.priorBoxVar.metalTexture, index: 1) - encoder.setTexture(param.targetBox.metalTexture, index: 2) - encoder.setTexture(param.output.metalTexture, index: 3) - var bmp = BoxcoderMetalParam.init() - encoder.setBytes(&bmp, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - - required init(device: MTLDevice, param: BoxcoderParam

, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: [0, 3, 1, 2], computePrecision: GlobalConfig.shared.computePrecision) - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "boxcoder_float", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "boxcoder_half", initContext: initContext) - } else { - fatalError() + + required init(device: MTLDevice, param: BoxcoderParam

, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: [0, 3, 1, 2], computePrecision: GlobalConfig.shared.computePrecision) + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "boxcoder_float", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "boxcoder_half", initContext: initContext) + } else { + fatalError() + } } - } - + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConcatKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConcatKernel.swift index edb0289688..195366c796 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConcatKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConcatKernel.swift @@ -16,133 +16,133 @@ import Foundation import Metal struct ConcatTestParam: TestParam { - var input: [MTLTexture] - var output: MTLTexture - var dims: [[Int]] - var axis: Int - var odim: [Int] + var input: [MTLTexture] + var output: MTLTexture + var dims: [[Int]] + var axis: Int + var odim: [Int] } struct ConcatMetalParam { - var odim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1) - var axis: Int32 = 0 - var offset: Int32 = 0 - var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) - var vdim: (Int32, Int32, Int32, Int32, Int32, Int32) = (0, 0, 0, 0, 0, 0) + var odim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1) + var axis: Int32 = 0 + var offset: Int32 = 0 + var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) + var vdim: (Int32, Int32, Int32, Int32, Int32, Int32) = (0, 0, 0, 0, 0, 0) } class ConcatKernel: Kernel, Computable{ - var v = "normal" - var pm = ConcatMetalParam.init() - func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam

) throws { - - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - let num = param.input.count - for i in 0...size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - - required init(device: MTLDevice, param: ConcatParam

, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: GlobalConfig.shared.computePrecision) - let orank = param.output.tensorDim.cout() - let num = param.input.count - assert(num <= 6) - var axis = 4 - param.output.tensorDim.cout() + param.axis - for i in 0..<4 { - if param.transpose[i] == axis { - axis = i - break - } - } - pm.axis = Int32(axis) - pm.odim = (Int32(param.output.dim[0]), Int32(param.output.dim[1]), Int32(param.output.dim[2]), Int32(param.output.dim[3])) - pm.trans = (Int32(param.output.transpose[0]), Int32(param.output.transpose[1]), Int32(param.output.transpose[2]), Int32(param.output.transpose[3])) - var vdim: [Int] = [0, 0, 0, 0, 0, 0] - for i in 0..) throws { + + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") } - } - } else if orank == 3 { - if axis == 2 { - v = "y" - } else if axis == 3 { - v = "x" - } else if axis == 1 { - var vz = true + let num = param.input.count for i in 0...size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: ConcatParam

, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: GlobalConfig.shared.computePrecision) + let orank = param.output.tensorDim.cout() + let num = param.input.count + assert(num <= 6) + var axis = 4 - param.output.tensorDim.cout() + param.axis + for i in 0..<4 { + if param.transpose[i] == axis { + axis = i + break + } + } + pm.axis = Int32(axis) + pm.odim = (Int32(param.output.dim[0]), Int32(param.output.dim[1]), Int32(param.output.dim[2]), Int32(param.output.dim[3])) + pm.trans = (Int32(param.output.transpose[0]), Int32(param.output.transpose[1]), Int32(param.output.transpose[2]), Int32(param.output.transpose[3])) + var vdim: [Int] = [0, 0, 0, 0, 0, 0] for i in 0..: Kernel, Computable { - var metalParam: MetalConvParam! - required init(device: MTLDevice, param: ConvAddAddPreluParam

, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) - param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - - if GlobalConfig.shared.computePrecision == .Float16 { - if param.filter.width == 1 && param.filter.height == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half", initContext: initContext) - } + var metalParam: MetalConvParam! + required init(device: MTLDevice, param: ConvAddAddPreluParam

, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) + param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - } else if param.filter.channel == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half", initContext: initContext) - } - } else if param.filter.width == 3 && param.filter.height == 3 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half", initContext: initContext) + if GlobalConfig.shared.computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half", initContext: initContext) + } + + } else if param.filter.channel == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half", initContext: initContext) + } + } else if param.filter.width == 3 && param.filter.height == 3 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half", initContext: initContext) + } + + } else if param.filter.width == 1 && param.filter.height == 5 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half", initContext: initContext) + } + } else if param.filter.width == 5 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half", initContext: initContext) + } + } else { + fatalError(" unsupport yet ") + } + } else if GlobalConfig.shared.computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float", initContext: initContext) + } + } else if param.filter.channel == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float", initContext: initContext) + } + } else if param.filter.width == 3 && param.filter.height == 3 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float", initContext: initContext) + } + + } else if param.filter.width == 1 && param.filter.height == 5 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float", initContext: initContext) + } + } else if param.filter.width == 5 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float", initContext: initContext) + } + } else { + fatalError(" unsupport yet ") + } } else { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half", initContext: initContext) + fatalError() } - } else if param.filter.width == 1 && param.filter.height == 5 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half", initContext: initContext) - } - } else if param.filter.width == 5 && param.filter.height == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half", initContext: initContext) - } - } else { - fatalError(" unsupport yet ") - } - } else if GlobalConfig.shared.computePrecision == .Float32 { - if param.filter.width == 1 && param.filter.height == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float", initContext: initContext) - } - } else if param.filter.channel == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float", initContext: initContext) - } - } else if param.filter.width == 3 && param.filter.height == 3 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float", initContext: initContext) - } + let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) - } else if param.filter.width == 1 && param.filter.height == 5 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float", initContext: initContext) - } - } else if param.filter.width == 5 && param.filter.height == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float", initContext: initContext) - } - } else { - fatalError(" unsupport yet ") - } - } else { - fatalError() + let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) + + // print(" function: \(functionName)") + // print("offset x: \(offsetX)") + // print("offset y: \(offsetY)") + + let offsetZ = 0.0 + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + // print("metal param: ") + // print(inMetalParam) + + metalParam = inMetalParam } - let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) - - let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) - - // print(" function: \(functionName)") - // print("offset x: \(offsetX)") - // print("offset y: \(offsetY)") - - let offsetZ = 0.0 - let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) - // print("metal param: ") - // print(inMetalParam) - - metalParam = inMetalParam - } - - func compute(commandBuffer: MTLCommandBuffer, param: ConvAddAddPreluParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + func compute(commandBuffer: MTLCommandBuffer, param: ConvAddAddPreluParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.y.buffer, offset: 0, index: 2) + encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) - encoder.setBuffer(param.y.buffer, offset: 0, index: 2) - encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift index 6274e3df8f..0ff0b57f6c 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift @@ -16,165 +16,165 @@ import Foundation import Metal struct ConvAddBatchNormReluTestParam: TestParam { - let inputTexture: MTLTexture - let outputTexture: MTLTexture - var metalParam: MetalConvParam - let filterBuffer: MTLBuffer - let biaseBuffer: MTLBuffer - let newScaleBuffer: MTLBuffer - let newBiaseBuffer: MTLBuffer - let filterSize: (width: Int, height: Int, channel: Int) - init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) { - inputTexture = inInputTexture - outputTexture = inOutputTexture - metalParam = inMetalParam - filterBuffer = inFilterBuffer - biaseBuffer = inBiaseBuffer - newScaleBuffer = inNewScaleBuffer - newBiaseBuffer = inNewBiaseBuffer - filterSize = inFilterSize - } + let inputTexture: MTLTexture + let outputTexture: MTLTexture + var metalParam: MetalConvParam + let filterBuffer: MTLBuffer + let biaseBuffer: MTLBuffer + let newScaleBuffer: MTLBuffer + let newBiaseBuffer: MTLBuffer + let filterSize: (width: Int, height: Int, channel: Int) + init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) { + inputTexture = inInputTexture + outputTexture = inOutputTexture + metalParam = inMetalParam + filterBuffer = inFilterBuffer + biaseBuffer = inBiaseBuffer + newScaleBuffer = inNewScaleBuffer + newBiaseBuffer = inNewBiaseBuffer + filterSize = inFilterSize + } } class ConvAddBatchNormReluKernel: Kernel, Computable, Testable { - required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam, initContext: InitContext) { - if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1", initContext: initContext) - } else if testParam.filterSize.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3", initContext: initContext) + required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam, initContext: InitContext) { + if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1", initContext: initContext) + } else if testParam.filterSize.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3", initContext: initContext) + } } - } - - var metalParam: MetalConvParam! - - required init(device: MTLDevice, param: ConvAddBatchNormReluParam

, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) - param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - param.variance.initBuffer(device: device, precision: .Float32) - param.mean.initBuffer(device: device, precision: .Float32) - param.scale.initBuffer(device: device, precision: .Float32) - param.bias.initBuffer(device: device, precision: .Float32) - if GlobalConfig.shared.computePrecision == .Float32 { - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1", initContext: initContext) - } else if param.filter.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3", initContext: initContext) - } else if param.filter.width == 3 && param.filter.height == 3 { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3", initContext: initContext) - } else { - fatalError(" unsupport ") - } - } else if GlobalConfig.shared.computePrecision == .Float16 { - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1_half", initContext: initContext) - } else if param.filter.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3_half", initContext: initContext) - } else if param.filter.width == 3 && param.filter.height == 3 { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3_half", initContext: initContext) - } else { - fatalError(" unsupport ") - } - } else { - fatalError() + var metalParam: MetalConvParam! + + required init(device: MTLDevice, param: ConvAddBatchNormReluParam

, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) + param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + param.variance.initBuffer(device: device, precision: .Float32) + param.mean.initBuffer(device: device, precision: .Float32) + param.scale.initBuffer(device: device, precision: .Float32) + param.bias.initBuffer(device: device, precision: .Float32) + + if GlobalConfig.shared.computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1", initContext: initContext) + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3", initContext: initContext) + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3", initContext: initContext) + } else { + fatalError(" unsupport ") + } + } else if GlobalConfig.shared.computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1_half", initContext: initContext) + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3_half", initContext: initContext) + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3_half", initContext: initContext) + } else { + fatalError(" unsupport ") + } + } else { + fatalError() + } + + let offsetX = param.filter.width/2 - Int(param.paddings[0]) + let offsetY = param.filter.height/2 - Int(param.paddings[1]) + + print("offset x: \(offsetX)") + print("offset y: \(offsetY)") + + let offsetZ = 0.0 + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + + var invs: [P] = [] + let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) + + for i in 0...stride { + let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5) + invs.append(P(inv)) + } + + let newScale: UnsafeMutablePointer

= UnsafeMutablePointer

.allocate(capacity: param.scale.buffer.length) + let newBiase: UnsafeMutablePointer

= UnsafeMutablePointer

.allocate(capacity: param.bias.buffer.length) + + let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self) + let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self) + let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self) + for i in 0...stride { + newScale[i] = invs[i] * scaleContents[i] + newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i] + } + + // var newScaleFP16: UnsafeMutableRawPointer + // + // float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleFP16, count: param.scale.buffer.length / MemoryLayout

.size) + + + // let newBiaseFloat16 = device.makeBuffer(length: <#T##Int#>, options: <#T##MTLResourceOptions#>) + + var newBiaseBuffer: MTLBuffer + var newScaleBuffer: MTLBuffer + + if GlobalConfig.shared.computePrecision == .Float32 { + newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)! + newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)! + } else if GlobalConfig.shared.computePrecision == .Float16 { + + newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + + float32ToFloat16(input: newBiase as! UnsafeMutablePointer, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout

.size) + + float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout

.size) + } else { + fatalError(" unsupport ") + } + + param.newBiase = newBiaseBuffer + param.newScale = newScaleBuffer + + newScale.deinitialize(count: param.scale.buffer.length) + newScale.deallocate() + + newBiase.deinitialize(count: param.bias.buffer.length) + newBiase.deallocate() } - let offsetX = param.filter.width/2 - Int(param.paddings[0]) - let offsetY = param.filter.height/2 - Int(param.paddings[1]) - - print("offset x: \(offsetX)") - print("offset y: \(offsetY)") - - let offsetZ = 0.0 - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) - - var invs: [P] = [] - let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) - - for i in 0...stride { - let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5) - invs.append(P(inv)) + func compute(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.y.buffer, offset: 0, index: 2) + encoder.setBuffer(param.newScale!, offset: 0, index: 3) + encoder.setBuffer(param.newBiase!, offset: 0, index: 4) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - let newScale: UnsafeMutablePointer

= UnsafeMutablePointer

.allocate(capacity: param.scale.buffer.length) - let newBiase: UnsafeMutablePointer

= UnsafeMutablePointer

.allocate(capacity: param.bias.buffer.length) - - let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self) - let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self) - let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self) - for i in 0...stride { - newScale[i] = invs[i] * scaleContents[i] - newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i] + public func test(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluTestParam) { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + fatalError() + } + + encoder.setTexture(param.inputTexture, index: 0) + encoder.setTexture(param.outputTexture, index: 1) + var inMetalParam = param.metalParam + encoder.setBytes(&inMetalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filterBuffer, offset: 0, index: 1) + encoder.setBuffer(param.biaseBuffer, offset: 0, index: 2) + encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3) + encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4) + encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture) + encoder.endEncoding() } - -// var newScaleFP16: UnsafeMutableRawPointer -// -// float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleFP16, count: param.scale.buffer.length / MemoryLayout

.size) - - -// let newBiaseFloat16 = device.makeBuffer(length: <#T##Int#>, options: <#T##MTLResourceOptions#>) - - var newBiaseBuffer: MTLBuffer - var newScaleBuffer: MTLBuffer - - if GlobalConfig.shared.computePrecision == .Float32 { - newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)! - newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)! - } else if GlobalConfig.shared.computePrecision == .Float16 { - - newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! - newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! - - float32ToFloat16(input: newBiase as! UnsafeMutablePointer, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout

.size) - - float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout

.size) - } else { - fatalError(" unsupport ") - } - - param.newBiase = newBiaseBuffer - param.newScale = newScaleBuffer - - newScale.deinitialize(count: param.scale.buffer.length) - newScale.deallocate() - - newBiase.deinitialize(count: param.bias.buffer.length) - newBiase.deallocate() - } - - func compute(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) - encoder.setBuffer(param.y.buffer, offset: 0, index: 2) - encoder.setBuffer(param.newScale!, offset: 0, index: 3) - encoder.setBuffer(param.newBiase!, offset: 0, index: 4) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - - public func test(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluTestParam) { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - fatalError() - } - - encoder.setTexture(param.inputTexture, index: 0) - encoder.setTexture(param.outputTexture, index: 1) - var inMetalParam = param.metalParam - encoder.setBytes(&inMetalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filterBuffer, offset: 0, index: 1) - encoder.setBuffer(param.biaseBuffer, offset: 0, index: 2) - encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3) - encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4) - encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift index 0ba448161f..d40fa7e445 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift @@ -15,74 +15,74 @@ import Foundation class ConvAddKernel: Kernel, Computable { - var metalParam: MetalConvParam! - required init(device: MTLDevice, param: ConvAddParam

, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) - let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1]) - param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC) - param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - - if GlobalConfig.shared.computePrecision == .Float16 { - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_1x1_half", initContext: initContext) - } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half", initContext: initContext) - } else if param.filter.width == 3 && param.filter.height == 3 { - super.init(device: device, inFunctionName: "conv_add_3x3_half", initContext: initContext) - } else if param.filter.width == 1 && param.filter.height == 5 { - super.init(device: device, inFunctionName: "conv_add_5x1_half", initContext: initContext) - } else if param.filter.width == 5 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_1x5_half", initContext: initContext) - } else { - fatalError(" unsupport yet ") - } - } else if GlobalConfig.shared.computePrecision == .Float32 { - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_1x1", initContext: initContext) - } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3", initContext: initContext) - } else if param.filter.width == 1 && param.filter.height == 5 { - super.init(device: device, inFunctionName: "conv_add_5x1", initContext: initContext) - } else if param.filter.width == 5 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_1x5", initContext: initContext) - } else if param.filter.width == 3 && param.filter.height == 3 { - super.init(device: device, inFunctionName: "conv_add_3x3", initContext: initContext) - } else { - fatalError(" unsupport yet ") - } - } else { - fatalError() + var metalParam: MetalConvParam! + required init(device: MTLDevice, param: ConvAddParam

, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) + let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1]) + param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC) + param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + + if GlobalConfig.shared.computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_1x1_half", initContext: initContext) + } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half", initContext: initContext) + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_3x3_half", initContext: initContext) + } else if param.filter.width == 1 && param.filter.height == 5 { + super.init(device: device, inFunctionName: "conv_add_5x1_half", initContext: initContext) + } else if param.filter.width == 5 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_1x5_half", initContext: initContext) + } else { + fatalError(" unsupport yet ") + } + } else if GlobalConfig.shared.computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_1x1", initContext: initContext) + } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3", initContext: initContext) + } else if param.filter.width == 1 && param.filter.height == 5 { + super.init(device: device, inFunctionName: "conv_add_5x1", initContext: initContext) + } else if param.filter.width == 5 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_1x5", initContext: initContext) + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_3x3", initContext: initContext) + } else { + fatalError(" unsupport yet ") + } + } else { + fatalError() + } + + + + let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) + + let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) + + // print(" function: \(functionName)") + // print("offset x: \(offsetX)") + // print("offset y: \(offsetY)") + + let offsetZ = 0.0 + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + // print("metal param: ") + // print(inMetalParam) + + metalParam = inMetalParam } - - - let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) - - let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) - -// print(" function: \(functionName)") -// print("offset x: \(offsetX)") -// print("offset y: \(offsetY)") - - let offsetZ = 0.0 - let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) -// print("metal param: ") -// print(inMetalParam) - - metalParam = inMetalParam - } - - func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.y.buffer, offset: 0, index: 2) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) - encoder.setBuffer(param.y.buffer, offset: 0, index: 2) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift index 1d66696050..1b054cb9ca 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift @@ -15,136 +15,136 @@ import Foundation class ConvAddPreluKernel: Kernel, Computable { - var metalParam: MetalConvParam! - required init(device: MTLDevice, param: ConvAddPreluParam

, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) - param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - - if GlobalConfig.shared.computePrecision == .Float16 { - if param.filter.width == 1 && param.filter.height == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half", initContext: initContext) - } + var metalParam: MetalConvParam! + required init(device: MTLDevice, param: ConvAddPreluParam

, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) + param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - } else if param.filter.channel == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half", initContext: initContext) - } - } else if param.filter.width == 3 && param.filter.height == 3 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half", initContext: initContext) + if GlobalConfig.shared.computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half", initContext: initContext) + } + + } else if param.filter.channel == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half", initContext: initContext) + } + } else if param.filter.width == 3 && param.filter.height == 3 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half", initContext: initContext) + } + + } else if param.filter.width == 1 && param.filter.height == 5 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half", initContext: initContext) + } + } else if param.filter.width == 5 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half", initContext: initContext) + } + } else { + fatalError(" unsupport yet ") + } + } else if GlobalConfig.shared.computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float", initContext: initContext) + } + } else if param.filter.channel == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float", initContext: initContext) + } + } else if param.filter.width == 3 && param.filter.height == 3 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float", initContext: initContext) + } + + } else if param.filter.width == 1 && param.filter.height == 5 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float", initContext: initContext) + } + } else if param.filter.width == 5 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float", initContext: initContext) + } + } else { + fatalError(" unsupport yet ") + } } else { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half", initContext: initContext) + fatalError() } - } else if param.filter.width == 1 && param.filter.height == 5 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half", initContext: initContext) - } - } else if param.filter.width == 5 && param.filter.height == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half", initContext: initContext) - } - } else { - fatalError(" unsupport yet ") - } - } else if GlobalConfig.shared.computePrecision == .Float32 { - if param.filter.width == 1 && param.filter.height == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float", initContext: initContext) - } - } else if param.filter.channel == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float", initContext: initContext) - } - } else if param.filter.width == 3 && param.filter.height == 3 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float", initContext: initContext) - } + let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) - } else if param.filter.width == 1 && param.filter.height == 5 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float", initContext: initContext) - } - } else if param.filter.width == 5 && param.filter.height == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float", initContext: initContext) - } - } else { - fatalError(" unsupport yet ") - } - } else { - fatalError() + let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) + + // print(" function: \(functionName)") + // print("offset x: \(offsetX)") + // print("offset y: \(offsetY)") + + let offsetZ = 0.0 + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + // print("metal param: ") + // print(inMetalParam) + + metalParam = inMetalParam } - let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) - - let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) - - // print(" function: \(functionName)") - // print("offset x: \(offsetX)") - // print("offset y: \(offsetY)") - - let offsetZ = 0.0 - let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) - // print("metal param: ") - // print(inMetalParam) - - metalParam = inMetalParam - } - - func compute(commandBuffer: MTLCommandBuffer, param: ConvAddPreluParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + func compute(commandBuffer: MTLCommandBuffer, param: ConvAddPreluParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.y.buffer, offset: 0, index: 2) + encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) - encoder.setBuffer(param.y.buffer, offset: 0, index: 2) - encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift index 81c53a57a8..415ec94b51 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift @@ -16,165 +16,165 @@ import Foundation import MetalPerformanceShaders struct ConvBNReluTestParam: TestParam { - let inputTexture: MTLTexture - let outputTexture: MTLTexture - var metalParam: MetalConvParam - let filterBuffer: MTLBuffer - let biaseBuffer: MTLBuffer - let newScaleBuffer: MTLBuffer - let newBiaseBuffer: MTLBuffer - let filterSize: (width: Int, height: Int, channel: Int) - init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) { - - inputTexture = inInputTexture - outputTexture = inOutputTexture - metalParam = inMetalParam - filterBuffer = inFilterBuffer - biaseBuffer = inBiaseBuffer - newScaleBuffer = inNewScaleBuffer - newBiaseBuffer = inNewBiaseBuffer - filterSize = inFilterSize - } + let inputTexture: MTLTexture + let outputTexture: MTLTexture + var metalParam: MetalConvParam + let filterBuffer: MTLBuffer + let biaseBuffer: MTLBuffer + let newScaleBuffer: MTLBuffer + let newBiaseBuffer: MTLBuffer + let filterSize: (width: Int, height: Int, channel: Int) + init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) { + + inputTexture = inInputTexture + outputTexture = inOutputTexture + metalParam = inMetalParam + filterBuffer = inFilterBuffer + biaseBuffer = inBiaseBuffer + newScaleBuffer = inNewScaleBuffer + newBiaseBuffer = inNewBiaseBuffer + filterSize = inFilterSize + } } class ConvBNReluKernel: Kernel, Computable, Testable { - required init(device: MTLDevice, testParam: ConvBNReluTestParam, initContext: InitContext) { - if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 { - super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1", initContext: initContext) - } else if testParam.filterSize.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3", initContext: initContext) + required init(device: MTLDevice, testParam: ConvBNReluTestParam, initContext: InitContext) { + if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1", initContext: initContext) + } else if testParam.filterSize.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3", initContext: initContext) + } } - } - - var metalParam: MetalConvParam! - - required init(device: MTLDevice, param: ConvBNReluParam

, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) - param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - param.variance.initBuffer(device: device, precision: .Float32) - param.mean.initBuffer(device: device, precision: .Float32) - param.scale.initBuffer(device: device, precision: .Float32) - param.bias.initBuffer(device: device, precision: .Float32) - - if GlobalConfig.shared.computePrecision == .Float32 { - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1", initContext: initContext) - } else if param.filter.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3", initContext: initContext) - } else if param.filter.width == 3 && param.filter.height == 3 { - super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3", initContext: initContext) - } else { - fatalError(" unsupport ") - } - } else if GlobalConfig.shared.computePrecision == .Float16 { - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half", initContext: initContext) - } else if param.filter.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half", initContext: initContext) - } else if param.filter.width == 3 && param.filter.height == 3 { - super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half", initContext: initContext) - } else { - fatalError(" unsupport ") - } - } else { - fatalError() + var metalParam: MetalConvParam! + + required init(device: MTLDevice, param: ConvBNReluParam

, initContext: InitContext) { + + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) + param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + param.variance.initBuffer(device: device, precision: .Float32) + param.mean.initBuffer(device: device, precision: .Float32) + param.scale.initBuffer(device: device, precision: .Float32) + param.bias.initBuffer(device: device, precision: .Float32) + + if GlobalConfig.shared.computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1", initContext: initContext) + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3", initContext: initContext) + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3", initContext: initContext) + } else { + fatalError(" unsupport ") + } + } else if GlobalConfig.shared.computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half", initContext: initContext) + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half", initContext: initContext) + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half", initContext: initContext) + } else { + fatalError(" unsupport ") + } + } else { + fatalError() + } + + + + let offsetX = param.filter.width/2 - Int(param.paddings[0]) + let offsetY = param.filter.height/2 - Int(param.paddings[1]) + + // print(" param filter width: \(param.filter.width)") + // print(" param filter height: \(param.filter.height)") + // + // print(" param paddings: \(param.paddings)") + // + // print("ConvBNReluKernel offset x: \(offsetX)") + // print("ConvBNReluKernel offset y: \(offsetY)") + + let offsetZ = 0.0 + + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + + var invs: [P] = [] + let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) + + for i in 0...stride { + let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5) + invs.append(P(inv)) + } + + let newScale: UnsafeMutablePointer

= UnsafeMutablePointer

.allocate(capacity: param.scale.buffer.length) + let newBiase: UnsafeMutablePointer

= UnsafeMutablePointer

.allocate(capacity: param.bias.buffer.length) + + let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self) + let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self) + let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self) + for i in 0...stride { + newScale[i] = invs[i] * scaleContents[i] + newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i] + } + + var newBiaseBuffer: MTLBuffer + var newScaleBuffer: MTLBuffer + + if GlobalConfig.shared.computePrecision == .Float32 { + newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)! + newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)! + } else if GlobalConfig.shared.computePrecision == .Float16 { + + newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + + float32ToFloat16(input: newBiase as! UnsafeMutablePointer, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout

.size) + + float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout

.size) + } else { + fatalError(" unsupport ") + } + + param.newBiase = newBiaseBuffer + param.newScale = newScaleBuffer + + newScale.deinitialize(count: param.scale.buffer.length) + newScale.deallocate() + + newBiase.deinitialize(count: param.bias.buffer.length) + newBiase.deallocate() } - - - let offsetX = param.filter.width/2 - Int(param.paddings[0]) - let offsetY = param.filter.height/2 - Int(param.paddings[1]) - -// print(" param filter width: \(param.filter.width)") -// print(" param filter height: \(param.filter.height)") -// -// print(" param paddings: \(param.paddings)") -// -// print("ConvBNReluKernel offset x: \(offsetX)") -// print("ConvBNReluKernel offset y: \(offsetY)") - - let offsetZ = 0.0 - - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) - - var invs: [P] = [] - let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) - - for i in 0...stride { - let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5) - invs.append(P(inv)) + func compute(commandBuffer: MTLCommandBuffer, param: ConvBNReluParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.newScale!, offset: 0, index: 2) + encoder.setBuffer(param.newBiase!, offset: 0, index: 3) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - let newScale: UnsafeMutablePointer

= UnsafeMutablePointer

.allocate(capacity: param.scale.buffer.length) - let newBiase: UnsafeMutablePointer

= UnsafeMutablePointer

.allocate(capacity: param.bias.buffer.length) - - let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self) - let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self) - let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self) - for i in 0...stride { - newScale[i] = invs[i] * scaleContents[i] - newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i] + public func test(commandBuffer: MTLCommandBuffer, param: ConvBNReluTestParam) { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + fatalError() + } + + encoder.setTexture(param.inputTexture, index: 0) + encoder.setTexture(param.outputTexture, index: 1) + var inMetalParam = param.metalParam + encoder.setBytes(&inMetalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filterBuffer, offset: 0, index: 1) + encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 2) + encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 3) + encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture) + encoder.endEncoding() } - - var newBiaseBuffer: MTLBuffer - var newScaleBuffer: MTLBuffer - - if GlobalConfig.shared.computePrecision == .Float32 { - newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)! - newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)! - } else if GlobalConfig.shared.computePrecision == .Float16 { - - newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! - newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! - - float32ToFloat16(input: newBiase as! UnsafeMutablePointer, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout

.size) - - float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout

.size) - } else { - fatalError(" unsupport ") - } - - param.newBiase = newBiaseBuffer - param.newScale = newScaleBuffer - - newScale.deinitialize(count: param.scale.buffer.length) - newScale.deallocate() - - newBiase.deinitialize(count: param.bias.buffer.length) - newBiase.deallocate() - } - - func compute(commandBuffer: MTLCommandBuffer, param: ConvBNReluParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) - encoder.setBuffer(param.newScale!, offset: 0, index: 2) - encoder.setBuffer(param.newBiase!, offset: 0, index: 3) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - - public func test(commandBuffer: MTLCommandBuffer, param: ConvBNReluTestParam) { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - fatalError() - } - - encoder.setTexture(param.inputTexture, index: 0) - encoder.setTexture(param.outputTexture, index: 1) - var inMetalParam = param.metalParam - encoder.setBytes(&inMetalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filterBuffer, offset: 0, index: 1) - encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 2) - encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 3) - encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift index 7571bc155b..7ff040219e 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift @@ -15,48 +15,46 @@ import Foundation public struct MetalConvParam { - let offsetX: Int16 - let offsetY: Int16 - let offsetZ: Int16 - let strideX: UInt16 - let strideY: UInt16 - let dilationX: UInt16 - let dilationY: UInt16 + let offsetX: Int16 + let offsetY: Int16 + let offsetZ: Int16 + let strideX: UInt16 + let strideY: UInt16 + let dilationX: UInt16 + let dilationY: UInt16 } class ConvKernel: Kernel, Computable { - var metalParam: MetalConvParam! - required init(device: MTLDevice, param: ConvParam

, initContext: InitContext) { - param.filter.initBuffer(device: device, precision: ComputePrecision.Float32) - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_1x1", initContext: initContext) - } else if param.filter.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_3x3", initContext: initContext) - } else if param.filter.width == 3 && param.filter.height == 3 { - super.init(device: device, inFunctionName: "conv_3x3", initContext: initContext) - } else { - fatalError(" unsupport ") + var metalParam: MetalConvParam! + required init(device: MTLDevice, param: ConvParam

, initContext: InitContext) { + param.filter.initBuffer(device: device, precision: ComputePrecision.Float32) + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_1x1", initContext: initContext) + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_3x3", initContext: initContext) + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_3x3", initContext: initContext) + } else { + fatalError(" unsupport ") + } + + let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0]) + let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1]) + let offsetZ = 0.0 + + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) } - - let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0]) - let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1]) - let offsetZ = 0.0 - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) - } - - func compute(commandBuffer: MTLCommandBuffer, param: ConvParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + func compute(commandBuffer: MTLCommandBuffer, param: ConvParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } } - - diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvTransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvTransposeKernel.swift index c8b1361649..f1753d0a09 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvTransposeKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvTransposeKernel.swift @@ -15,69 +15,69 @@ import Foundation struct MetalConvTransposeParam { - let kernelW: UInt16; - let kernelH: UInt16; - - let strideX: UInt16; - let strideY: UInt16; - - let paddingX: UInt16; - let paddingY: UInt16; - - let dilationX: UInt16; - let dilationY: UInt16; -} - -class ConvTransposeKernel: Kernel, Computable{ - var metalParam: MetalConvTransposeParam! - required init(device: MTLDevice, param: ConvTransposeParam

, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) - param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, convertToNHWC: false, withTranspose: true) - if GlobalConfig.shared.computePrecision == .Float32 { - if param.stride == [2, 2] && param.stride == [2, 2] { - super.init(device: device, inFunctionName: "conv_transpose2x2_stride2", initContext: initContext) - } else { - fatalError(" -- conv transpose unsupported yet -- ") - } - } else if GlobalConfig.shared.computePrecision == .Float16 { - if param.stride == [2, 2] && param.stride == [2, 2] { - super.init(device: device, inFunctionName: "conv_transpose2x2_stride2_half", initContext: initContext) - } else { - fatalError(" -- conv transpose unsupported yet -- ") - } - } else { - fatalError() - } + let kernelW: UInt16; + let kernelH: UInt16; -// let filter: [Float32] = param.filter.buffer.array() -// print(" conv transpose filter") -// print(filter) - let kernelWidth = UInt16(param.filter.width) - let kernelHeight = UInt16(param.filter.height) + let strideX: UInt16; + let strideY: UInt16; - let strideX = UInt16(param.stride[0]) - let strideY = UInt16(param.stride[1]) - let paddingX = UInt16(param.paddings[0]) - let paddingY = UInt16(param.paddings[1]) - let dilationX = UInt16(param.dilations[0]) - let dilationY = UInt16(param.dilations[1]) + let paddingX: UInt16; + let paddingY: UInt16; - metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY) + let dilationX: UInt16; + let dilationY: UInt16; +} - } - - func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") +class ConvTransposeKernel: Kernel, Computable{ + var metalParam: MetalConvTransposeParam! + required init(device: MTLDevice, param: ConvTransposeParam

, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) + param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, convertToNHWC: false, withTranspose: true) + if GlobalConfig.shared.computePrecision == .Float32 { + if param.stride == [2, 2] && param.stride == [2, 2] { + super.init(device: device, inFunctionName: "conv_transpose2x2_stride2", initContext: initContext) + } else { + fatalError(" -- conv transpose unsupported yet -- ") + } + } else if GlobalConfig.shared.computePrecision == .Float16 { + if param.stride == [2, 2] && param.stride == [2, 2] { + super.init(device: device, inFunctionName: "conv_transpose2x2_stride2_half", initContext: initContext) + } else { + fatalError(" -- conv transpose unsupported yet -- ") + } + } else { + fatalError() + } + + // let filter: [Float32] = param.filter.buffer.array() + // print(" conv transpose filter") + // print(filter) + let kernelWidth = UInt16(param.filter.width) + let kernelHeight = UInt16(param.filter.height) + + let strideX = UInt16(param.stride[0]) + let strideY = UInt16(param.stride[1]) + let paddingX = UInt16(param.paddings[0]) + let paddingY = UInt16(param.paddings[1]) + let dilationX = UInt16(param.dilations[0]) + let dilationY = UInt16(param.dilations[1]) + + metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY) + } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } + func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift index 21108de10e..2a87d4362f 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift @@ -15,59 +15,59 @@ import Foundation struct ElementwiseAddMetalParam { - var fast: Int32 = 0 - var axis: Int32 = 0 - var ylen: Int32 = 0 - var xdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) - var xtrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) - var ydim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) - var ytrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) + var fast: Int32 = 0 + var axis: Int32 = 0 + var ylen: Int32 = 0 + var xdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) + var xtrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) + var ydim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) + var ytrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) } class ElementwiseAddKernel: Kernel, Computable { - var metalParam: ElementwiseAddMetalParam - required init(device: MTLDevice, param: ElementwiseAddParam

, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: GlobalConfig.shared.computePrecision) - - metalParam = ElementwiseAddMetalParam.init() - - let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) } - let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) } - let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) } - let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) } - - metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3]) - metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3]) - metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3]) - metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3]) - if param.axis == -1 { - metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout()) - } else { - metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis) + var metalParam: ElementwiseAddMetalParam + required init(device: MTLDevice, param: ElementwiseAddParam

, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: GlobalConfig.shared.computePrecision) + + metalParam = ElementwiseAddMetalParam.init() + + let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) } + let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) } + let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) } + let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) } + + metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3]) + metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3]) + metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3]) + metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3]) + if param.axis == -1 { + metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout()) + } else { + metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis) + } + metalParam.ylen = Int32(param.inputY.tensorDim.cout()) + if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) { + // print("===> elementwise_add fast!!!") + metalParam.fast = 1 + } + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "elementwise_add", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "elementwise_add_half", initContext: initContext) + } else { + fatalError() + } } - metalParam.ylen = Int32(param.inputY.tensorDim.cout()) - if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) { - // print("===> elementwise_add fast!!!") - metalParam.fast = 1 - } - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "elementwise_add", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "elementwise_add_half", initContext: initContext) - } else { - fatalError() - } - } - - func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + + func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.inputX.metalTexture, index: 0) + encoder.setTexture(param.inputY.metalTexture, index: 1) + encoder.setTexture(param.output.metalTexture, index: 2) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.inputX.metalTexture, index: 0) - encoder.setTexture(param.inputY.metalTexture, index: 1) - encoder.setTexture(param.output.metalTexture, index: 2) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddPreluKernel.swift index a423a119f3..cf83c2e750 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddPreluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddPreluKernel.swift @@ -16,64 +16,64 @@ import Foundation class ElementwiseAddPreluKernel: Kernel, Computable { - var metalParam: ElementwiseAddMetalParam - required init(device: MTLDevice, param: ElementwiseAddPreluParam

, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: GlobalConfig.shared.computePrecision) - param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - - metalParam = ElementwiseAddMetalParam.init() - - let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) } - let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) } - let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) } - let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) } - - metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3]) - metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3]) - metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3]) - metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3]) - if param.axis == -1 { - metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout()) - } else { - metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis) - } - metalParam.ylen = Int32(param.inputY.tensorDim.cout()) - if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) { - // print("===> elementwise_add fast!!!") - metalParam.fast = 1 + var metalParam: ElementwiseAddMetalParam + required init(device: MTLDevice, param: ElementwiseAddPreluParam

, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: GlobalConfig.shared.computePrecision) + param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + + metalParam = ElementwiseAddMetalParam.init() + + let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) } + let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) } + let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) } + let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) } + + metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3]) + metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3]) + metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3]) + metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3]) + if param.axis == -1 { + metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout()) + } else { + metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis) + } + metalParam.ylen = Int32(param.inputY.tensorDim.cout()) + if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) { + // print("===> elementwise_add fast!!!") + metalParam.fast = 1 + } + + if GlobalConfig.shared.computePrecision == .Float32 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "elementwise_add_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "elementwise_add_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "elementwise_add_prelu_float", initContext: initContext) + } + } else if GlobalConfig.shared.computePrecision == .Float16 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext) + } + } else { + fatalError() + } } - if GlobalConfig.shared.computePrecision == .Float32 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "elementwise_add_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "elementwise_add_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "elementwise_add_prelu_float", initContext: initContext) - } - } else if GlobalConfig.shared.computePrecision == .Float16 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext) - } - } else { - fatalError() - } - } - - func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddPreluParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddPreluParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.inputX.metalTexture, index: 0) + encoder.setTexture(param.inputY.metalTexture, index: 1) + encoder.setTexture(param.output.metalTexture, index: 2) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.alpha.buffer, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.inputX.metalTexture, index: 0) - encoder.setTexture(param.inputY.metalTexture, index: 1) - encoder.setTexture(param.output.metalTexture, index: 2) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.alpha.buffer, offset: 0, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FetchKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FetchKernel.swift index 7d6e68e699..616fcc1f2d 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FetchKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FetchKernel.swift @@ -15,47 +15,47 @@ import Foundation class FetchKernel: Kernel, Computable { - - required init(device: MTLDevice, param: FetchParam

, initContext: InitContext) { - param.output.initBuffer(device: device) - if GlobalConfig.shared.computePrecision == .Float16 { - if param.input.transpose == [0, 2, 3, 1] { - super.init(device: device, inFunctionName: "fetch_half", initContext: initContext) - } else if param.input.transpose == [0, 1, 2, 3] { - switch param.input.tensorDim.cout() { - case 1, 2: - super.init(device: device, inFunctionName: "fetch_1or2_half", initContext: initContext) - default: - fatalError(" not support ") + + required init(device: MTLDevice, param: FetchParam

, initContext: InitContext) { + param.output.initBuffer(device: device) + if GlobalConfig.shared.computePrecision == .Float16 { + if param.input.transpose == [0, 2, 3, 1] { + super.init(device: device, inFunctionName: "fetch_half", initContext: initContext) + } else if param.input.transpose == [0, 1, 2, 3] { + switch param.input.tensorDim.cout() { + case 1, 2: + super.init(device: device, inFunctionName: "fetch_1or2_half", initContext: initContext) + default: + fatalError(" not support ") + } + } else { + fatalError(" not support ") + } + } else if GlobalConfig.shared.computePrecision == .Float32 { + if param.input.transpose == [0, 2, 3, 1] { + super.init(device: device, inFunctionName: "fetch_float", initContext: initContext) + } else if param.input.transpose == [0, 1, 2, 3] { + switch param.input.tensorDim.cout() { + case 1, 2: + super.init(device: device, inFunctionName: "fetch_1or2_float", initContext: initContext) + default: + fatalError(" not support ") + } + } else { + fatalError(" not support ") + } + } else { + fatalError(" not support ") } - } else { - fatalError(" not support ") - } - } else if GlobalConfig.shared.computePrecision == .Float32 { - if param.input.transpose == [0, 2, 3, 1] { - super.init(device: device, inFunctionName: "fetch_float", initContext: initContext) - } else if param.input.transpose == [0, 1, 2, 3] { - switch param.input.tensorDim.cout() { - case 1, 2: - super.init(device: device, inFunctionName: "fetch_1or2_float", initContext: initContext) - default: - fatalError(" not support ") - } - } else { - fatalError(" not support ") - } - } else { - fatalError(" not support ") } - } - - func compute(commandBuffer: MTLCommandBuffer, param: FetchParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + + func compute(commandBuffer: MTLCommandBuffer, param: FetchParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setBuffer(param.output.resultBuffer!, offset: 0, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setBuffer(param.output.resultBuffer!, offset: 0, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FlattenKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FlattenKernel.swift index 06a6537e1f..5956806001 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FlattenKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FlattenKernel.swift @@ -15,57 +15,57 @@ import Foundation struct FlattenMetalParam { - var idim: (Int32, Int32, Int32, Int32) - var itrans: (Int32, Int32, Int32, Int32) - var odim: (Int32, Int32, Int32, Int32) - var otrans: (Int32, Int32, Int32, Int32) + var idim: (Int32, Int32, Int32, Int32) + var itrans: (Int32, Int32, Int32, Int32) + var odim: (Int32, Int32, Int32, Int32) + var otrans: (Int32, Int32, Int32, Int32) } class FlattenKernel: Kernel, Computable{ - - var metalParam: FlattenMetalParam - - required init(device: MTLDevice, param: FlattenParam

, initContext: InitContext) { - param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) - var id: [Int32] = [1, 1, 1, 1] - for i in 0..) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") + + var metalParam: FlattenMetalParam + + required init(device: MTLDevice, param: FlattenParam

, initContext: InitContext) { + param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) + var id: [Int32] = [1, 1, 1, 1] + for i in 0...size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } + func compute(commandBuffer: MTLCommandBuffer, param: FlattenParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/MulticlassNMSKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/MulticlassNMSKernel.swift index d3fc5a3ac9..4f59bf9971 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/MulticlassNMSKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/MulticlassNMSKernel.swift @@ -15,41 +15,41 @@ import Foundation class MulticlassNMSKernel: Kernel, Computable{ - let pipline1: MTLComputePipelineState - - required init(device: MTLDevice, param: MulticlassNMSParam

, initContext: InitContext) { + let pipline1: MTLComputePipelineState - param.middleOutput.initBuffer(device: device) - param.bboxOutput.initBuffer(device: device) - if GlobalConfig.shared.computePrecision == .Float32 { - pipline1 = device.pipeLine(funcName: "nms_fetch_bbox", metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath) - super.init(device: device, inFunctionName: "nms_fetch_result", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float16 { - pipline1 = device.pipeLine(funcName: "nms_fetch_bbox_half", metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath) - super.init(device: device, inFunctionName: "nms_fetch_result_half", initContext: initContext) - } else { - fatalError( " unsupport precision " ) + required init(device: MTLDevice, param: MulticlassNMSParam

, initContext: InitContext) { + + param.middleOutput.initBuffer(device: device) + param.bboxOutput.initBuffer(device: device) + if GlobalConfig.shared.computePrecision == .Float32 { + pipline1 = device.pipeLine(funcName: "nms_fetch_bbox", metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath) + super.init(device: device, inFunctionName: "nms_fetch_result", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float16 { + pipline1 = device.pipeLine(funcName: "nms_fetch_bbox_half", metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath) + super.init(device: device, inFunctionName: "nms_fetch_result_half", initContext: initContext) + } else { + fatalError( " unsupport precision " ) + } + } - } - - func compute(commandBuffer: MTLCommandBuffer, param: MulticlassNMSParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + func compute(commandBuffer: MTLCommandBuffer, param: MulticlassNMSParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.scores.metalTexture, index: 0) + encoder.setBuffer(param.middleOutput.resultBuffer!, offset: 0, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.scores.metalTexture) + encoder.endEncoding() + + guard let encoderBox = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoderBox.setTexture(param.bboxes.metalTexture, index: 0) + encoderBox.setBuffer(param.bboxOutput.resultBuffer!, offset: 0, index: 0) + encoderBox.dispatch(computePipline: pipline1, outTexture: param.bboxes.metalTexture) + encoderBox.endEncoding() } - - encoder.setTexture(param.scores.metalTexture, index: 0) - encoder.setBuffer(param.middleOutput.resultBuffer!, offset: 0, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.scores.metalTexture) - encoder.endEncoding() - - guard let encoderBox = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - - encoderBox.setTexture(param.bboxes.metalTexture, index: 0) - encoderBox.setBuffer(param.bboxOutput.resultBuffer!, offset: 0, index: 0) - encoderBox.dispatch(computePipline: pipline1, outTexture: param.bboxes.metalTexture) - encoderBox.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PoolKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PoolKernel.swift index b6833a4f93..37878f26d0 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PoolKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PoolKernel.swift @@ -15,57 +15,57 @@ import Foundation struct PoolMetalParam { - let ksizeX: Int32 - let ksizeY: Int32 - let strideX: Int32 - let strideY: Int32 - let paddingX: Int32 - let paddingY: Int32 - let poolType: Int32 + let ksizeX: Int32 + let ksizeY: Int32 + let strideX: Int32 + let strideY: Int32 + let paddingX: Int32 + let paddingY: Int32 + let poolType: Int32 } class PoolKernel: Kernel, Computable{ - var metalParam: PoolMetalParam - required init(device: MTLDevice, param: PoolParam

, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) - - var poolType: Int32 - switch param.poolType { - case "max": - poolType = 0 - case "avg": - poolType = 1 - default: - fatalError() + var metalParam: PoolMetalParam + required init(device: MTLDevice, param: PoolParam

, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) + + var poolType: Int32 + switch param.poolType { + case "max": + poolType = 0 + case "avg": + poolType = 1 + default: + fatalError() + } + metalParam = PoolMetalParam.init( + ksizeX: param.ksize[0], + ksizeY: param.ksize[1], + strideX: param.stride[0], + strideY: param.stride[1], + paddingX: param.padding[0], + paddingY: param.padding[1], + poolType: poolType + ) + + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "pool_float", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "pool_half", initContext: initContext) + } else { + fatalError() + } } - metalParam = PoolMetalParam.init( - ksizeX: param.ksize[0], - ksizeY: param.ksize[1], - strideX: param.stride[0], - strideY: param.stride[1], - paddingX: param.padding[0], - paddingY: param.padding[1], - poolType: poolType - ) - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "pool_float", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "pool_half", initContext: initContext) - } else { - fatalError() - } - } - - func compute(commandBuffer: MTLCommandBuffer, param: PoolParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") + func compute(commandBuffer: MTLCommandBuffer, param: PoolParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PreluKernel.swift index 61a21331a6..053cb827e3 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PreluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PreluKernel.swift @@ -15,39 +15,39 @@ import Foundation class PreluKernel: Kernel, Computable{ - required init(device: MTLDevice, param: PreluParam

, initContext: InitContext) { - param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) - if GlobalConfig.shared.computePrecision == .Float32 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "prelu_channel", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "prelu_element", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "prelu_other", initContext: initContext) - } - } else if GlobalConfig.shared.computePrecision == .Float16 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "prelu_element_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "prelu_other_half", initContext: initContext) - } - } else { - fatalError() - } - } - - func compute(commandBuffer: MTLCommandBuffer, param: PreluParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") + required init(device: MTLDevice, param: PreluParam

, initContext: InitContext) { + param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) + if GlobalConfig.shared.computePrecision == .Float32 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "prelu_channel", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "prelu_element", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "prelu_other", initContext: initContext) + } + } else if GlobalConfig.shared.computePrecision == .Float16 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "prelu_other_half", initContext: initContext) + } + } else { + fatalError() + } } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBuffer(param.alpha.buffer, offset: 0, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } + func compute(commandBuffer: MTLCommandBuffer, param: PreluParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBuffer(param.alpha.buffer, offset: 0, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PriorBoxKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PriorBoxKernel.swift index 15126bbc83..cb8ef81de3 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PriorBoxKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PriorBoxKernel.swift @@ -15,136 +15,136 @@ import Foundation struct PriorBoxMetalParam { - let offset: Float32 - let stepWidth: Float32 - let stepHeight: Float32 - let minSize: Float32 - let maxSize: Float32 - let imageWidth: Float32 - let imageHeight: Float32 - let clip: Bool - let numPriors: uint - let aspecRatiosSize: uint - let minSizeSize: uint - let maxSizeSize: uint + let offset: Float32 + let stepWidth: Float32 + let stepHeight: Float32 + let minSize: Float32 + let maxSize: Float32 + let imageWidth: Float32 + let imageHeight: Float32 + let clip: Bool + let numPriors: uint + let aspecRatiosSize: uint + let minSizeSize: uint + let maxSizeSize: uint } class PriorBoxKernel: Kernel, Computable{ - var metalParam: PriorBoxMetalParam! - - required init(device: MTLDevice, param: PriorBoxParam

, initContext: InitContext) { - - let originDim = param.output.tensorDim; - - param.output.tensorDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]]) - param.output.padToFourDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]]) - - param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: GlobalConfig.shared.computePrecision) - param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: GlobalConfig.shared.computePrecision) - - if GlobalConfig.shared.computePrecision == .Float32 { - if param.min_max_aspect_ratios_order { - super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "prior_box", initContext: initContext) - } - - } else if GlobalConfig.shared.computePrecision == .Float16 { - if param.min_max_aspect_ratios_order { - super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "prior_box_half", initContext: initContext) - } - } else { - fatalError() - } - - - guard param.minSizes.count == 1 else { - fatalError(" need implement ") - } - -// let n = 1 -// let h = param.output.dim[1] -// let w = param.output.dim[2] -// let c = param.output.dim[3] * param.output.dim[0] -// -// param.output.dim = Dim.init(inDim: [n, h, w, c]) -// param.output.transpose = [0, 1, 2, 3] - - let imageWidth = Float32(param.inputImage.padToFourDim[3]) - let imageHeight = Float32(param.inputImage.padToFourDim[2]) - - let featureWidth = param.input.padToFourDim[3] - let featureHeight = param.input.padToFourDim[2] - - if param.stepW == 0 || param.stepH == 0 { - param.stepW = Float32(imageWidth) / Float32(featureWidth) - param.stepH = Float32(imageHeight) / Float32(featureHeight) - } - - var outputAspectRatior: [Float32] = [] - outputAspectRatior.append(1.0) - - let epsilon = 1e-6 - for ar in param.aspectRatios { - var alreadyExist = false - for outputAr in outputAspectRatior { - if fabs(Double(ar) - Double(outputAr)) < Double(epsilon) { - alreadyExist = true - break + var metalParam: PriorBoxMetalParam! + + required init(device: MTLDevice, param: PriorBoxParam

, initContext: InitContext) { + + let originDim = param.output.tensorDim; + + param.output.tensorDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]]) + param.output.padToFourDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]]) + + param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: GlobalConfig.shared.computePrecision) + param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: GlobalConfig.shared.computePrecision) + + if GlobalConfig.shared.computePrecision == .Float32 { + if param.min_max_aspect_ratios_order { + super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "prior_box", initContext: initContext) + } + + } else if GlobalConfig.shared.computePrecision == .Float16 { + if param.min_max_aspect_ratios_order { + super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "prior_box_half", initContext: initContext) + } + } else { + fatalError() } - } - - if !alreadyExist { - outputAspectRatior.append(ar) - } - if param.flip { - outputAspectRatior.append(1.0 / ar) - } - } - - if GlobalConfig.shared.computePrecision == .Float16 { - let buffer = device.makeBuffer(length: outputAspectRatior.count * MemoryLayout.size) - float32ToFloat16(input: &outputAspectRatior, output:(buffer?.contents())!, count: outputAspectRatior.count) - param.newAspectRatios = buffer - - } else if GlobalConfig.shared.computePrecision == .Float32 { - let buffer = device.makeBuffer(bytes: outputAspectRatior, length: outputAspectRatior.count * MemoryLayout.size, options: []) - param.newAspectRatios = buffer - } else { - fatalError() + + + guard param.minSizes.count == 1 else { + fatalError(" need implement ") + } + + // let n = 1 + // let h = param.output.dim[1] + // let w = param.output.dim[2] + // let c = param.output.dim[3] * param.output.dim[0] + // + // param.output.dim = Dim.init(inDim: [n, h, w, c]) + // param.output.transpose = [0, 1, 2, 3] + + let imageWidth = Float32(param.inputImage.padToFourDim[3]) + let imageHeight = Float32(param.inputImage.padToFourDim[2]) + + let featureWidth = param.input.padToFourDim[3] + let featureHeight = param.input.padToFourDim[2] + + if param.stepW == 0 || param.stepH == 0 { + param.stepW = Float32(imageWidth) / Float32(featureWidth) + param.stepH = Float32(imageHeight) / Float32(featureHeight) + } + + var outputAspectRatior: [Float32] = [] + outputAspectRatior.append(1.0) + + let epsilon = 1e-6 + for ar in param.aspectRatios { + var alreadyExist = false + for outputAr in outputAspectRatior { + if fabs(Double(ar) - Double(outputAr)) < Double(epsilon) { + alreadyExist = true + break + } + } + + if !alreadyExist { + outputAspectRatior.append(ar) + } + if param.flip { + outputAspectRatior.append(1.0 / ar) + } + } + + if GlobalConfig.shared.computePrecision == .Float16 { + let buffer = device.makeBuffer(length: outputAspectRatior.count * MemoryLayout.size) + float32ToFloat16(input: &outputAspectRatior, output:(buffer?.contents())!, count: outputAspectRatior.count) + param.newAspectRatios = buffer + + } else if GlobalConfig.shared.computePrecision == .Float32 { + let buffer = device.makeBuffer(bytes: outputAspectRatior, length: outputAspectRatior.count * MemoryLayout.size, options: []) + param.newAspectRatios = buffer + } else { + fatalError() + } + + let aspectRatiosSize = uint(outputAspectRatior.count) + + let maxSizeSize: uint = uint(param.maxSizes.count) + let minSizeSize: uint = uint(param.minSizes.count) + + let numPriors = aspectRatiosSize * minSizeSize + maxSizeSize + + let minSize = param.minSizes.last ?? 0.0 + let maxSize = param.maxSizes.last ?? 0.0 + + metalParam = PriorBoxMetalParam.init(offset: param.offset, stepWidth: param.stepW, stepHeight: param.stepH, minSize: minSize, maxSize: maxSize, imageWidth: imageWidth, imageHeight: imageHeight, clip: param.clip, numPriors: numPriors, aspecRatiosSize: aspectRatiosSize, minSizeSize: minSizeSize, maxSizeSize: maxSizeSize) + } - let aspectRatiosSize = uint(outputAspectRatior.count) - - let maxSizeSize: uint = uint(param.maxSizes.count) - let minSizeSize: uint = uint(param.minSizes.count) - - let numPriors = aspectRatiosSize * minSizeSize + maxSizeSize - - let minSize = param.minSizes.last ?? 0.0 - let maxSize = param.maxSizes.last ?? 0.0 - - metalParam = PriorBoxMetalParam.init(offset: param.offset, stepWidth: param.stepW, stepHeight: param.stepH, minSize: minSize, maxSize: maxSize, imageWidth: imageWidth, imageHeight: imageHeight, clip: param.clip, numPriors: numPriors, aspecRatiosSize: aspectRatiosSize, minSizeSize: minSizeSize, maxSizeSize: maxSizeSize) - - } - - func compute(commandBuffer: MTLCommandBuffer, param: PriorBoxParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + func compute(commandBuffer: MTLCommandBuffer, param: PriorBoxParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setTexture(param.outputVariances.metalTexture, index: 2) + + encoder.setBuffer(param.newAspectRatios!, offset: 0, index: 0) + + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 1) + + encoder.setBytes(param.variances, length: MemoryLayout.size * param.variances.count, index: 2) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setTexture(param.outputVariances.metalTexture, index: 2) - - encoder.setBuffer(param.newAspectRatios!, offset: 0, index: 0) - - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 1) - - encoder.setBytes(param.variances, length: MemoryLayout.size * param.variances.count, index: 2) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift index 0bde0623ef..06ff7d3990 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift @@ -15,23 +15,23 @@ import Foundation class ReluKernel: Kernel, Computable{ - func compute(commandBuffer: MTLCommandBuffer, param: ReluParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + func compute(commandBuffer: MTLCommandBuffer, param: ReluParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - - required init(device: MTLDevice, param: ReluParam

, initContext: InitContext) { - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "relu", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "relu_half", initContext: initContext) - } else { - fatalError() + + required init(device: MTLDevice, param: ReluParam

, initContext: InitContext) { + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "relu", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "relu_half", initContext: initContext) + } else { + fatalError() + } } - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReshapeKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReshapeKernel.swift index f14db86a3a..954eff9a56 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReshapeKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReshapeKernel.swift @@ -15,83 +15,83 @@ import Foundation struct ReshapeMetalParam { - var idim: (Int32, Int32, Int32, Int32) - var itrans: (Int32, Int32, Int32, Int32) - var odim: (Int32, Int32, Int32, Int32) - var otrans: (Int32, Int32, Int32, Int32) + var idim: (Int32, Int32, Int32, Int32) + var itrans: (Int32, Int32, Int32, Int32) + var odim: (Int32, Int32, Int32, Int32) + var otrans: (Int32, Int32, Int32, Int32) } struct ReshapeTestParam: TestParam { - let inputTexture: MTLTexture - let outputTexture: MTLTexture - let param: ReshapeMetalParam + let inputTexture: MTLTexture + let outputTexture: MTLTexture + let param: ReshapeMetalParam } class ReshapeKernel: Kernel, Computable{ - - var metalParam: ReshapeMetalParam - - required init(device: MTLDevice, param: ReshapeParam

, initContext: InitContext) { - param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) - var id: [Int32] = [1, 1, 1, 1] - for i in 0.., initContext: InitContext) { + param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) + var id: [Int32] = [1, 1, 1, 1] + for i in 0..) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - } - - required init(device: MTLDevice, testParam: ReshapeTestParam, initContext: InitContext) { - metalParam = ReshapeMetalParam.init( - idim: (0, 0, 0, 0), - itrans: (0, 0, 0, 0), - odim: (0, 0, 0, 0), - otrans: (0, 0, 0, 0) - ) - super.init(device: device, inFunctionName: "reshape", initContext: initContext) - } - - func compute(commandBuffer: MTLCommandBuffer, param: ReshapeParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") - } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - -// func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) { -// guard let encoder = commandBuffer.makeComputeCommandEncoder() else { -// fatalError() -// } -// encoder.setTexture(testParam.inputTexture, index: 0) -// encoder.setTexture(testParam.outputTexture, index: 1) -// var pm: ReshapeMetalParam = testParam.param -// encoder.setBytes(&pm, length: MemoryLayout.size, index: 0) -// encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture) -// encoder.endEncoding() -// } + + // func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) { + // guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + // fatalError() + // } + // encoder.setTexture(testParam.inputTexture, index: 0) + // encoder.setTexture(testParam.outputTexture, index: 1) + // var pm: ReshapeMetalParam = testParam.param + // encoder.setBytes(&pm, length: MemoryLayout.size, index: 0) + // encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture) + // encoder.endEncoding() + // } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ResizeBilinearKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ResizeBilinearKernel.swift index a007196b67..7e9105ae57 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ResizeBilinearKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ResizeBilinearKernel.swift @@ -15,37 +15,37 @@ import Foundation struct ResizeBilinearMetalParam { - var ratio_h: Float32 - var ratio_w: Float32 + var ratio_h: Float32 + var ratio_w: Float32 } class ResizeBilinearKernel: Kernel, Computable{ - required init(device: MTLDevice, param: ResizeBilinearParam

, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "resize_bilinear", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "resize_bilinear_half", initContext: initContext) - } else { - fatalError() + required init(device: MTLDevice, param: ResizeBilinearParam

, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "resize_bilinear", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "resize_bilinear_half", initContext: initContext) + } else { + fatalError() + } } - } - - func compute(commandBuffer: MTLCommandBuffer, param: ResizeBilinearParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + + func compute(commandBuffer: MTLCommandBuffer, param: ResizeBilinearParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + let ratio_h: Float32 = Float32(param.input.tensorDim.dims[2]) / Float32(param.output.tensorDim.dims[2]) + let ratio_w: Float32 = Float32(param.input.tensorDim.dims[3]) / Float32(param.output.tensorDim.dims[3]) + var p = ResizeBilinearMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w) + encoder.setBytes(&p, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - let ratio_h: Float32 = Float32(param.input.tensorDim.dims[2]) / Float32(param.output.tensorDim.dims[2]) - let ratio_w: Float32 = Float32(param.input.tensorDim.dims[3]) / Float32(param.output.tensorDim.dims[3]) - var p = ResizeBilinearMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w) - encoder.setBytes(&p, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - - - + + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift index 2afee5607d..4a6a9a3ee4 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift @@ -15,14 +15,14 @@ import Foundation class ScaleKernel: CusomKernel { - init(device: MTLDevice, shape: Shape, metalLoadMode: MetalLoadMode, metalLibPath: String?) { - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "scale", outputDim: shape, metalLoadModel: metalLoadMode, metalLibPath: metalLibPath) - } else if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "scale_half", outputDim: shape, metalLoadModel: metalLoadMode, metalLibPath: metalLibPath) - } else { - fatalError(" unsupport ") + init(device: MTLDevice, shape: Shape, metalLoadMode: MetalLoadMode, metalLibPath: String?) { + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "scale", outputDim: shape, metalLoadModel: metalLoadMode, metalLibPath: metalLibPath) + } else if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "scale_half", outputDim: shape, metalLoadModel: metalLoadMode, metalLibPath: metalLibPath) + } else { + fatalError(" unsupport ") + } } - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ShapeKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ShapeKernel.swift index dfec8f9adf..1d2b80cae4 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ShapeKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ShapeKernel.swift @@ -19,24 +19,24 @@ struct ShapeMetalParam { } class ShapeKernel: Kernel, Computable{ - func compute(commandBuffer: MTLCommandBuffer, param: ShapeParam

) throws { -// print("shape compute") -// guard let encoder = commandBuffer.makeComputeCommandEncoder() else { -// throw PaddleMobileError.predictError(message: " encode is nil") -// } -// encoder.setTexture(param.output.metalTexture, index: 0) -// encoder.endEncoding() - } - - required init(device: MTLDevice, param: ShapeParam

, initContext: InitContext) { - param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "shape", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "shape_half", initContext: initContext) - } else { - fatalError() + func compute(commandBuffer: MTLCommandBuffer, param: ShapeParam

) throws { + // print("shape compute") + // guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + // throw PaddleMobileError.predictError(message: " encode is nil") + // } + // encoder.setTexture(param.output.metalTexture, index: 0) + // encoder.endEncoding() } - } - + + required init(device: MTLDevice, param: ShapeParam

, initContext: InitContext) { + param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "shape", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "shape_half", initContext: initContext) + } else { + fatalError() + } + } + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SoftmaxKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SoftmaxKernel.swift index 1eac43484d..b4f3281425 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SoftmaxKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SoftmaxKernel.swift @@ -15,37 +15,37 @@ import Foundation struct SoftmaxMetalParam { - let N: Int32 - let K: Int32 + let N: Int32 + let K: Int32 } class SoftmaxKernel: Kernel, Computable{ - - var metalParam: SoftmaxMetalParam - required init(device: MTLDevice, param: SoftmaxParam

, initContext: InitContext) { - param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) - metalParam = SoftmaxMetalParam.init( - N: Int32(param.input.tensorDim[0]), - K: Int32(param.input.tensorDim[1]) - ) - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "softmax_float", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "softmax_half", initContext: initContext) - } else { - fatalError() + + var metalParam: SoftmaxMetalParam + required init(device: MTLDevice, param: SoftmaxParam

, initContext: InitContext) { + param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) + metalParam = SoftmaxMetalParam.init( + N: Int32(param.input.tensorDim[0]), + K: Int32(param.input.tensorDim[1]) + ) + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "softmax_float", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "softmax_half", initContext: initContext) + } else { + fatalError() + } } - } - - func compute(commandBuffer: MTLCommandBuffer, param: SoftmaxParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") + + func compute(commandBuffer: MTLCommandBuffer, param: SoftmaxParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SplitKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SplitKernel.swift index 8b07a87406..d15e372962 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SplitKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SplitKernel.swift @@ -15,79 +15,79 @@ import Foundation struct SplitMetalParam { - var idim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1) - var axis: Int32 = 0 - var offset: Int32 = 0 - var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) - var vdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) + var idim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1) + var axis: Int32 = 0 + var offset: Int32 = 0 + var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) + var vdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) } class SplitKernel: Kernel, Computable{ - var smp: SplitMetalParam - func compute(commandBuffer: MTLCommandBuffer, param: SplitParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + var smp: SplitMetalParam + func compute(commandBuffer: MTLCommandBuffer, param: SplitParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + for i in 0...size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.input.metalTexture, index: 0) - for i in 0.., initContext: InitContext) { + // param.output.initTexture(device: device, computePrecision: computePrecision) + let num = param.outputList.count + let rank = param.input.tensorDim.cout() + assert(num >= 2 && num <= 4) + for output in param.outputList { + output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) + } + smp = SplitMetalParam.init() + smp.idim = (Int32(param.input.dim[0]), Int32(param.input.dim[1]), Int32(param.input.dim[2]), Int32(param.input.dim[3])) + smp.axis = Int32(param.axis + param.input.dim.cout() - param.input.tensorDim.cout()) + for i in 0..<4 { + if param.input.transpose[i] == smp.axis { + smp.axis = Int32(i) + break + } + } + smp.trans = (Int32(param.input.transpose[0]), Int32(param.input.transpose[1]), Int32(param.input.transpose[2]), Int32(param.input.transpose[3])) + var vdim: [Int32] = [0, 0, 0, 0] + for i in 0...size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture) - encoder.endEncoding() - } - - required init(device: MTLDevice, param: SplitParam

, initContext: InitContext) { - // param.output.initTexture(device: device, computePrecision: computePrecision) - let num = param.outputList.count - let rank = param.input.tensorDim.cout() - assert(num >= 2 && num <= 4) - for output in param.outputList { - output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) - } - smp = SplitMetalParam.init() - smp.idim = (Int32(param.input.dim[0]), Int32(param.input.dim[1]), Int32(param.input.dim[2]), Int32(param.input.dim[3])) - smp.axis = Int32(param.axis + param.input.dim.cout() - param.input.tensorDim.cout()) - for i in 0..<4 { - if param.input.transpose[i] == smp.axis { - smp.axis = Int32(i) - break - } - } - smp.trans = (Int32(param.input.transpose[0]), Int32(param.input.transpose[1]), Int32(param.input.transpose[2]), Int32(param.input.transpose[3])) - var vdim: [Int32] = [0, 0, 0, 0] - for i in 0..: Kernel, Computable{ - func compute(commandBuffer: MTLCommandBuffer, param: FeedParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - encoder.setTexture(param.input.mtlTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.input.mtlTexture) - encoder.endEncoding() - } - - required init(device: MTLDevice, param: FeedParam

, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) - if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "texture2d_to_2d_array_half", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "texture2d_to_2d_array", initContext: initContext) - } else { - fatalError() + func compute(commandBuffer: MTLCommandBuffer, param: FeedParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.input.mtlTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.input.mtlTexture) + encoder.endEncoding() } - } + required init(device: MTLDevice, param: FeedParam

, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) + if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "texture2d_to_2d_array_half", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "texture2d_to_2d_array", initContext: initContext) + } else { + fatalError() + } + + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/TransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/TransposeKernel.swift index e1490052e7..92947dc278 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/TransposeKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/TransposeKernel.swift @@ -15,65 +15,65 @@ import Foundation struct TransposeMetalParam { - var iC: Int32 = 0 - var oC: Int32 = 0 - var axis: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) + var iC: Int32 = 0 + var oC: Int32 = 0 + var axis: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) } class TransposeKernel: Kernel, Computable { - var metalParam: TransposeMetalParam = TransposeMetalParam.init() - required init(device: MTLDevice, param: TransposeParam

, initContext: InitContext) { - param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) - let rank = param.input.tensorDim.cout() - var axis: [Int] = [0, 1, 2, 3] - for i in 0.., initContext: InitContext) { + param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) + let rank = param.input.tensorDim.cout() + var axis: [Int] = [0, 1, 2, 3] + for i in 0..", kernelFunc) + print(metalParam) + super.init(device: device, inFunctionName: kernelFunc, initContext: initContext) } - print("===========>", kernelFunc) - print(metalParam) - super.init(device: device, inFunctionName: kernelFunc, initContext: initContext) - } - - func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + + func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - - + + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/MulticlassNMSOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/MulticlassNMSOp.swift index 6d2e46b649..b438b3c46c 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/MulticlassNMSOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/MulticlassNMSOp.swift @@ -15,57 +15,57 @@ import Foundation class MulticlassNMSParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - scores = try MulticlassNMSParam.getFirstTensor(key: "Scores", map: opDesc.inputs, from: inScope) - bboxes = try MulticlassNMSParam.getFirstTensor(key: "BBoxes", map: opDesc.inputs, from: inScope) - output = try MulticlassNMSParam.outputOut(outputs: opDesc.outputs, from: inScope) - - middleOutput = FetchHolder.init(inPaddedCapacity: scores.tensorDim.numel(), inDim: scores.tensorDim) - - bboxOutput = FetchHolder.init(inPaddedCapacity: bboxes.tensorDim.numel(), inDim: bboxes.tensorDim) - } catch let error { - throw error + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + scores = try MulticlassNMSParam.getFirstTensor(key: "Scores", map: opDesc.inputs, from: inScope) + bboxes = try MulticlassNMSParam.getFirstTensor(key: "BBoxes", map: opDesc.inputs, from: inScope) + output = try MulticlassNMSParam.outputOut(outputs: opDesc.outputs, from: inScope) + + middleOutput = FetchHolder.init(inPaddedCapacity: scores.tensorDim.numel(), inDim: scores.tensorDim) + + bboxOutput = FetchHolder.init(inPaddedCapacity: bboxes.tensorDim.numel(), inDim: bboxes.tensorDim) + } catch let error { + throw error + } } - } - var bboxOutput: FetchHolder - var middleOutput: FetchHolder - let scores: Texture - let bboxes: Texture - var output: Texture + var bboxOutput: FetchHolder + var middleOutput: FetchHolder + let scores: Texture + let bboxes: Texture + var output: Texture } class MulticlassNMSOp: Operator, MulticlassNMSParam

>, Runable, Creator, InferShaperable{ - - func inputVariant() -> [String : [MTLBuffer]] { - guard let scoreBuffer = para.middleOutput.resultBuffer, let bboxBuffer = para.middleOutput.resultBuffer else { - fatalError() + + func inputVariant() -> [String : [MTLBuffer]] { + guard let scoreBuffer = para.middleOutput.resultBuffer, let bboxBuffer = para.middleOutput.resultBuffer else { + fatalError() + } + return ["Scores" : [scoreBuffer], "BBoxes" : [bboxBuffer]] } - return ["Scores" : [scoreBuffer], "BBoxes" : [bboxBuffer]] - } - - func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let _ { - fatalError() + + func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let _ { + fatalError() + } + } + + func inferShape() { + // para.output.dim = para.input.dim + } + + typealias OpType = MulticlassNMSOp

+ func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + + } + + func delogOutput() { + print(" nms - output: ") + print(para.bboxes.metalTexture.float32Array().strideArray()) } - } - - func inferShape() { - // para.output.dim = para.input.dim - } - - typealias OpType = MulticlassNMSOp

- func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - - } - - func delogOutput() { - print(" nms - output: ") - print(para.bboxes.metalTexture.float32Array().strideArray()) - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/PoolOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/PoolOp.swift index e57c8f48e3..8b212f3b1d 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/PoolOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/PoolOp.swift @@ -15,60 +15,60 @@ import Foundation class PoolParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try PoolParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try PoolParam.outputOut(outputs: opDesc.outputs, from: inScope) - poolType = try PoolParam.getAttr(key: "pooling_type", attrs: opDesc.attrs) - ksize = try PoolParam.getAttr(key: "ksize", attrs: opDesc.attrs) - stride = try PoolParam.getAttr(key: "strides", attrs: opDesc.attrs) - padding = try PoolParam.getAttr(key: "paddings", attrs: opDesc.attrs) - ceilMode = try PoolParam.getAttr(key: "ceil_mode", attrs: opDesc.attrs) - globalPooling = try PoolParam.getAttr(key: "global_pooling", attrs: opDesc.attrs) - assert(input.transpose == [0, 2, 3, 1]) - } catch let error { - throw error + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try PoolParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try PoolParam.outputOut(outputs: opDesc.outputs, from: inScope) + poolType = try PoolParam.getAttr(key: "pooling_type", attrs: opDesc.attrs) + ksize = try PoolParam.getAttr(key: "ksize", attrs: opDesc.attrs) + stride = try PoolParam.getAttr(key: "strides", attrs: opDesc.attrs) + padding = try PoolParam.getAttr(key: "paddings", attrs: opDesc.attrs) + ceilMode = try PoolParam.getAttr(key: "ceil_mode", attrs: opDesc.attrs) + globalPooling = try PoolParam.getAttr(key: "global_pooling", attrs: opDesc.attrs) + assert(input.transpose == [0, 2, 3, 1]) + } catch let error { + throw error + } + // let buffer = input.metalTexture.buffer.contents().assumingMemoryBound(to: P.self) } - // let buffer = input.metalTexture.buffer.contents().assumingMemoryBound(to: P.self) - } - let input: Texture - var output: Texture - var ksize: [Int32] - var stride: [Int32] - var padding: [Int32] - var poolType: String - var ceilMode: Bool - var globalPooling: Bool + let input: Texture + var output: Texture + var ksize: [Int32] + var stride: [Int32] + var padding: [Int32] + var poolType: String + var ceilMode: Bool + var globalPooling: Bool } class PoolOp: Operator, PoolParam

>, Runable, Creator, InferShaperable{ - - typealias OpType = PoolOp

- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + typealias OpType = PoolOp

+ + func inferShape() { + // para.output.dim = para.input.dim } - } - - func delogOutput() { - print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) - -// print("pool2d delog") -// let _: P? = para.input.metalTexture.logDesc(header: "pool2d input: ", stridable: true) -// print(para.ksize) -// print(para.stride) -// print(para.padding) -// print(para.poolType) -// let _: P? = para.output.metalTexture.logDesc(header: "pool2d output: ", stridable: true) - } + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + + + // print("pool2d delog") + // let _: P? = para.input.metalTexture.logDesc(header: "pool2d input: ", stridable: true) + // print(para.ksize) + // print(para.stride) + // print(para.padding) + // print(para.poolType) + // let _: P? = para.output.metalTexture.logDesc(header: "pool2d output: ", stridable: true) + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/PreluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/PreluOp.swift index b7150c2fea..09a6b027e3 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/PreluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/PreluOp.swift @@ -15,51 +15,51 @@ import Foundation class PreluParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try PreluParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try PreluParam.outputOut(outputs: opDesc.outputs, from: inScope) - alpha = try PreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) - mode = try PreluParam.getAttr(key: "mode", attrs: opDesc.attrs) - } catch let error { - throw error + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try PreluParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try PreluParam.outputOut(outputs: opDesc.outputs, from: inScope) + alpha = try PreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) + mode = try PreluParam.getAttr(key: "mode", attrs: opDesc.attrs) + } catch let error { + throw error + } } - } - let mode: String - let alpha: Tensor

- let input: Texture - var output: Texture + let mode: String + let alpha: Tensor

+ let input: Texture + var output: Texture } class PreluOp: Operator, PreluParam

>, Runable, Creator, InferShaperable{ - - typealias OpType = PreluOp

- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + typealias OpType = PreluOp

+ + func inferShape() { + // para.output.dim = para.input.dim } - } - - func delogOutput() { - print(" \(type) input: ") - print(para.input.metalTexture.toTensor(dim: (n: para.input.padToFourDim[0], c: para.input.padToFourDim[1], h: para.input.padToFourDim[2], w: para.input.padToFourDim[3])).strideArray()) - print(" \(type) Alpha: ") - let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false) + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) input: ") + print(para.input.metalTexture.toTensor(dim: (n: para.input.padToFourDim[0], c: para.input.padToFourDim[1], h: para.input.padToFourDim[2], w: para.input.padToFourDim[3])).strideArray()) + + print(" \(type) Alpha: ") + let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false) + + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) + } - print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) - } - -// print("softmax delog") -// let _: P? = para.input.metalTexture.logDesc(header: "softmax input: ", stridable: false) -// let _: P? = para.output.metalTexture.logDesc(header: "softmax output: ", stridable: false) + // print("softmax delog") + // let _: P? = para.input.metalTexture.logDesc(header: "softmax input: ", stridable: false) + // let _: P? = para.output.metalTexture.logDesc(header: "softmax output: ", stridable: false) } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/PriorBoxOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/PriorBoxOp.swift index bff7c9870a..80774f22a9 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/PriorBoxOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/PriorBoxOp.swift @@ -15,109 +15,109 @@ import Foundation class PriorBoxParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - min_max_aspect_ratios_order = try PriorBoxParam.getAttr(key: "min_max_aspect_ratios_order", attrs: opDesc.attrs) - } catch _ { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + min_max_aspect_ratios_order = try PriorBoxParam.getAttr(key: "min_max_aspect_ratios_order", attrs: opDesc.attrs) + } catch _ { + } + + do { + input = try PriorBoxParam.input(inputs: opDesc.inputs, from: inScope) + output = try PriorBoxParam.outputBoxes(outputs: opDesc.outputs, from: inScope) + inputImage = try PriorBoxParam.inputImage(inputs: opDesc.inputs, from: inScope) + outputVariances = try PriorBoxParam.outputVariances(outputs: opDesc.outputs, from: inScope) + minSizes = try PriorBoxParam.getAttr(key: "min_sizes", attrs: opDesc.attrs) + maxSizes = try PriorBoxParam.getAttr(key: "max_sizes", attrs: opDesc.attrs) + aspectRatios = try PriorBoxParam.getAttr(key: "aspect_ratios", attrs: opDesc.attrs) + variances = try PriorBoxParam.getAttr(key: "variances", attrs: opDesc.attrs) + flip = try PriorBoxParam.getAttr(key: "flip", attrs: opDesc.attrs) + clip = try PriorBoxParam.getAttr(key: "clip", attrs: opDesc.attrs) + stepW = try PriorBoxParam.getAttr(key: "step_w", attrs: opDesc.attrs) + stepH = try PriorBoxParam.getAttr(key: "step_h", attrs: opDesc.attrs) + offset = try PriorBoxParam.getAttr(key: "offset", attrs: opDesc.attrs) + } catch let error { + throw error + } } - do { - input = try PriorBoxParam.input(inputs: opDesc.inputs, from: inScope) - output = try PriorBoxParam.outputBoxes(outputs: opDesc.outputs, from: inScope) - inputImage = try PriorBoxParam.inputImage(inputs: opDesc.inputs, from: inScope) - outputVariances = try PriorBoxParam.outputVariances(outputs: opDesc.outputs, from: inScope) - minSizes = try PriorBoxParam.getAttr(key: "min_sizes", attrs: opDesc.attrs) - maxSizes = try PriorBoxParam.getAttr(key: "max_sizes", attrs: opDesc.attrs) - aspectRatios = try PriorBoxParam.getAttr(key: "aspect_ratios", attrs: opDesc.attrs) - variances = try PriorBoxParam.getAttr(key: "variances", attrs: opDesc.attrs) - flip = try PriorBoxParam.getAttr(key: "flip", attrs: opDesc.attrs) - clip = try PriorBoxParam.getAttr(key: "clip", attrs: opDesc.attrs) - stepW = try PriorBoxParam.getAttr(key: "step_w", attrs: opDesc.attrs) - stepH = try PriorBoxParam.getAttr(key: "step_h", attrs: opDesc.attrs) - offset = try PriorBoxParam.getAttr(key: "offset", attrs: opDesc.attrs) - } catch let error { - throw error - } - } - - var min_max_aspect_ratios_order: Bool = false - let minSizes: [Float32] - let maxSizes: [Float32] - let aspectRatios: [Float32] - var newAspectRatios: MTLBuffer? - let variances: [Float32] - let flip: Bool - let clip: Bool - var stepW: Float32 - var stepH: Float32 - let offset: Float32 - - let input: Texture - let inputImage: Texture - var output: Texture - let outputVariances: Texture + var min_max_aspect_ratios_order: Bool = false + let minSizes: [Float32] + let maxSizes: [Float32] + let aspectRatios: [Float32] + var newAspectRatios: MTLBuffer? + let variances: [Float32] + let flip: Bool + let clip: Bool + var stepW: Float32 + var stepH: Float32 + let offset: Float32 + + let input: Texture + let inputImage: Texture + var output: Texture + let outputVariances: Texture } class PriorBoxOp: Operator, PriorBoxParam

>, Runable, Creator, InferShaperable{ - - typealias OpType = PriorBoxOp

- - func inferShape() { - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error - } - } - - func delogOutput() { - - print(" \(type) output: ") - // output -// let outputArray = para.output.metalTexture.float32Array() -// print(outputArray.strideArray()) -// let device = para.input.metalTexture!.device -// let boxes:[Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: [2,0,1,3]) -// let variances:[Float32] = device.texture2tensor(texture: para.outputVariances.metalTexture!, dim: para.outputVariances.tensorDim.dims, transpose: [2,0,1,3]) -// print("boxes: ") -// print(boxes.strideArray()) -// print("variances: ") -// print(variances.strideArray()) - // output - print(" \(type) output: ") - let box = para.output.metalTexture.realNHWC(dim: (para.output.dim[0], para.output.dim[1], para.output.dim[2], para.output.dim[3])) - print(" dim: \(para.output.dim)") - print(box.strideArray()) -// print((0.. -// let padToFourDim = para.output.padToFourDim -// if para.output.transpose == [0, 1, 2, 3] { -// let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]), texturePrecision: computePrecision) -// print(outputArray.strideArray()) -// } else if para.output.transpose == [0, 2, 3, 1] { -// print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3]), texturePrecision: computePrecision).strideArray()) -// } else { -// print(" not implement") -// } - -// writeToLibrary(fileName: "box_out", array: outputArray) - - // output variance -// let outputVarianceArray = para.outputVariances.metalTexture.floatArray { (o: Float32) -> Float32 in -// return o -// } -// -// print(" output variance: \(outputVarianceArray)") + func inferShape() { + } -// writeToLibrary(fileName: "variance_out", array: outputVarianceArray) + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } - } + func delogOutput() { + + print(" \(type) output: ") + // output + // let outputArray = para.output.metalTexture.float32Array() + // print(outputArray.strideArray()) + // let device = para.input.metalTexture!.device + // let boxes:[Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: [2,0,1,3]) + // let variances:[Float32] = device.texture2tensor(texture: para.outputVariances.metalTexture!, dim: para.outputVariances.tensorDim.dims, transpose: [2,0,1,3]) + // print("boxes: ") + // print(boxes.strideArray()) + // print("variances: ") + // print(variances.strideArray()) + // output + print(" \(type) output: ") + + let box = para.output.metalTexture.realNHWC(dim: (para.output.dim[0], para.output.dim[1], para.output.dim[2], para.output.dim[3])) + print(" dim: \(para.output.dim)") + print(box.strideArray()) + // print((0.. Float32 in + // return o + // } + // + // print(" output variance: \(outputVarianceArray)") + + // writeToLibrary(fileName: "variance_out", array: outputVarianceArray) + + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ReluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ReluOp.swift index ef10908106..a286114b3f 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ReluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ReluOp.swift @@ -16,44 +16,44 @@ import Foundation class ReluParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try ReluParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try ReluParam.outputOut(outputs: opDesc.outputs, from: inScope) - } catch let error { - throw error + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try ReluParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try ReluParam.outputOut(outputs: opDesc.outputs, from: inScope) + } catch let error { + throw error + } } - } - let input: Texture - var output: Texture + let input: Texture + var output: Texture } class ReluOp: Operator, ReluParam

>, Runable, Creator, InferShaperable{ - - typealias OpType = ReluOp

- - func inferShape() { - para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + typealias OpType = ReluOp

+ + func inferShape() { + para.output.dim = para.input.dim } - } - - func delogOutput() { - print(" \(type) output: ") - print(para.output.metalTexture) - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) -// let device = para.output.metalTexture!.device -// let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) -// print(outputArray.strideArray()) - } - + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture) + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + // let device = para.output.metalTexture!.device + // let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + // print(outputArray.strideArray()) + } + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ReshapeOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ReshapeOp.swift index e40eae02d0..417344f1da 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ReshapeOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ReshapeOp.swift @@ -16,63 +16,63 @@ import Foundation import Metal class ReshapeParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try ReshapeParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try ReshapeParam.outputOut(outputs: opDesc.outputs, from: inScope) - shape = try ReshapeParam.getAttr(key: "shape", attrs: opDesc.attrs) - - var s: [Int] = shape.map { Int($0) } - - var di = -1 - var ml = 1 - for i in 0..= 0 { + s[di] = input.dim.numel() / ml + } + output.tensorDim = Dim.init(inDim: s) + var dim: [Int] = [1, 1, 1, 1] + for i in 0..= 0 { - s[di] = input.dim.numel() / ml - } - output.tensorDim = Dim.init(inDim: s) - var dim: [Int] = [1, 1, 1, 1] - for i in 0..: Operator, ReshapeParam

>, Runable, Creator, InferShaperable{ - - typealias OpType = ReshapeOp

- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + typealias OpType = ReshapeOp

+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + func delogOutput() { + print("reshape delog") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) + // print(outputArray) } - } - func delogOutput() { - print("reshape delog") - let device = para.output.metalTexture!.device - let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) - print(outputArray.strideArray()) -// print(outputArray) - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ResizeBilinearOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ResizeBilinearOp.swift index 980bb734a7..e71a62b682 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ResizeBilinearOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ResizeBilinearOp.swift @@ -15,50 +15,44 @@ import Foundation class ResizeBilinearParam: OpParam { - typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try ResizeBilinearParam.inputX(inputs: opDesc.inputs, from: inScope) -// if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) { -// fatalError() -// } - output = try ResizeBilinearParam.outputOut(outputs: opDesc.outputs, from: inScope) - out_h = try ResizeBilinearParam.getAttr(key: "out_h", attrs: opDesc.attrs) - out_w = try ResizeBilinearParam.getAttr(key: "out_w", attrs: opDesc.attrs) - } catch let error { - throw error + typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try ResizeBilinearParam.inputX(inputs: opDesc.inputs, from: inScope) + // if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) { + // fatalError() + // } + output = try ResizeBilinearParam.outputOut(outputs: opDesc.outputs, from: inScope) + out_h = try ResizeBilinearParam.getAttr(key: "out_h", attrs: opDesc.attrs) + out_w = try ResizeBilinearParam.getAttr(key: "out_w", attrs: opDesc.attrs) + } catch let error { + throw error + } } - } - let input: Texture - var output: Texture - let out_h: Int32 - let out_w: Int32 + let input: Texture + var output: Texture + let out_h: Int32 + let out_w: Int32 } class ResizeBilinearOp: Operator, ResizeBilinearParam

>, Runable, Creator, InferShaperable{ - - typealias OpType = ResizeBilinearOp

- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + typealias OpType = ResizeBilinearOp

+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") } - } - - func delogOutput() { - print(" \(type) output: ") - } - + } - - - - - - diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ShapeOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ShapeOp.swift index c13c3864e4..fd358a67ae 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ShapeOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ShapeOp.swift @@ -15,39 +15,39 @@ import Foundation class ShapeParam: OpParam { - // typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try ShapeParam.input(inputs: opDesc.inputs, from: inScope) - output = try ShapeParam.outputOut(outputs: opDesc.outputs, from: inScope) - } catch let error { - throw error + // typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try ShapeParam.input(inputs: opDesc.inputs, from: inScope) + output = try ShapeParam.outputOut(outputs: opDesc.outputs, from: inScope) + } catch let error { + throw error + } } - } - var output: Texture - let input: Texture + var output: Texture + let input: Texture } class ShapeOp: Operator, ShapeParam

>, Runable, Creator, InferShaperable{ - - typealias OpType = ShapeOp

- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + typealias OpType = ShapeOp

+ + func inferShape() { + // para.output.dim = para.input.dim } - } - - func delogOutput() { - print(" \(type) output: ") - } - + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + } + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/SoftmaxOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/SoftmaxOp.swift index 2b2455eaa6..f13bf20195 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/SoftmaxOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/SoftmaxOp.swift @@ -16,48 +16,48 @@ import Foundation import Metal class SoftmaxParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try SoftmaxParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try SoftmaxParam.outputOut(outputs: opDesc.outputs, from: inScope) - - //assert(input.tensorDim.dims.count == 2) - //assert(input.transpose == [0, 1, 2, 3]) - - output.dim = input.dim - output.tensorDim = input.tensorDim - output.padToFourDim = input.padToFourDim - } catch let error { - throw error + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try SoftmaxParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try SoftmaxParam.outputOut(outputs: opDesc.outputs, from: inScope) + + //assert(input.tensorDim.dims.count == 2) + //assert(input.transpose == [0, 1, 2, 3]) + + output.dim = input.dim + output.tensorDim = input.tensorDim + output.padToFourDim = input.padToFourDim + } catch let error { + throw error + } } - } - let input: Texture - var output: Texture + let input: Texture + var output: Texture } class SoftmaxOp: Operator, SoftmaxParam

>, Runable, Creator, InferShaperable{ - typealias OpType = SoftmaxOp

- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + typealias OpType = SoftmaxOp

+ + func inferShape() { + // para.output.dim = para.input.dim } - } - - func delogOutput() { - print("softmax delog") - print(para.input) - print(para.output) - let padToFourDim = para.output.padToFourDim - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) - print(outputArray.strideArray()) - } + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print("softmax delog") + print(para.input) + + print(para.output) + let padToFourDim = para.output.padToFourDim + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) + print(outputArray.strideArray()) + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/SplitOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/SplitOp.swift index 4d9933f392..4d5cb9b0be 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/SplitOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/SplitOp.swift @@ -15,63 +15,63 @@ import Foundation class SplitParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try SplitParam.inputX(inputs: opDesc.inputs, from: inScope) - output = Texture.init(device: input.metalTexture!.device, inDim: input.dim) - axis = try SplitParam.getAttr(key: "axis", attrs: opDesc.attrs) - sections = try SplitParam.getAttr(key: "sections", attrs: opDesc.attrs) - if axis < 0 { - axis = input.tensorDim.cout() + axis - } - guard let outlist = opDesc.outputs["Out"] else { - fatalError() - } - for out in outlist { - guard let variant = inScope[out], let v = variant as? Texture else { - fatalError() + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try SplitParam.inputX(inputs: opDesc.inputs, from: inScope) + output = Texture.init(device: input.metalTexture!.device, inDim: input.dim) + axis = try SplitParam.getAttr(key: "axis", attrs: opDesc.attrs) + sections = try SplitParam.getAttr(key: "sections", attrs: opDesc.attrs) + if axis < 0 { + axis = input.tensorDim.cout() + axis + } + guard let outlist = opDesc.outputs["Out"] else { + fatalError() + } + for out in outlist { + guard let variant = inScope[out], let v = variant as? Texture else { + fatalError() + } + outputList.append(v) + sections.append(Int32(v.tensorDim.dims[axis])) + } + } catch let error { + throw error } - outputList.append(v) - sections.append(Int32(v.tensorDim.dims[axis])) - } - } catch let error { - throw error } - } - - var axis: Int - let input: Texture - var output: Texture - var outputList: [Texture] = [] - var sections: [Int32] = [] + + var axis: Int + let input: Texture + var output: Texture + var outputList: [Texture] = [] + var sections: [Int32] = [] } class SplitOp: Operator, SplitParam

>, Runable, Creator, InferShaperable{ - - typealias OpType = SplitOp

- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + typealias OpType = SplitOp

+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } } - } - - func delogOutput() { - print(" \(type) output: ") - let device = para.input.metalTexture!.device - for out in para.outputList { - let arr: [Float32] = device.texture2tensor(texture: out.metalTexture, dim: out.tensorDim.dims, transpose: out.transpose) - print(arr.strideArray()) + + func delogOutput() { + print(" \(type) output: ") + let device = para.input.metalTexture!.device + for out in para.outputList { + let arr: [Float32] = device.texture2tensor(texture: out.metalTexture, dim: out.tensorDim.dims, transpose: out.transpose) + print(arr.strideArray()) + } } - } - + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/TransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/TransposeOp.swift index 064955fcac..c05c080667 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/TransposeOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/TransposeOp.swift @@ -16,43 +16,43 @@ import Foundation import Metal class TransposeParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try TransposeParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try TransposeParam.outputOut(outputs: opDesc.outputs, from: inScope) - axis = try TransposeParam.getAttr(key: "axis", attrs: opDesc.attrs) - } catch let error { - throw error + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try TransposeParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try TransposeParam.outputOut(outputs: opDesc.outputs, from: inScope) + axis = try TransposeParam.getAttr(key: "axis", attrs: opDesc.attrs) + } catch let error { + throw error + } } - } - let input: Texture - var output: Texture - let axis: [Int32] + let input: Texture + var output: Texture + let axis: [Int32] } class TransposeOp: Operator, TransposeParam

>, Runable, Creator, InferShaperable{ - - typealias OpType = TransposeOp

- - func inferShape() { - //para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + typealias OpType = TransposeOp

+ + func inferShape() { + //para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) } - } - - func delogOutput() { - print(" \(type) output: ") - let device = para.output.metalTexture!.device - let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) - print(outputArray.strideArray()) - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/PMBlockDesc.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/PMBlockDesc.swift index b021b09008..27ed620c24 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Program/PMBlockDesc.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Program/PMBlockDesc.swift @@ -45,13 +45,13 @@ public class PMBlockDesc { } extension PMBlockDesc: CustomStringConvertible, CustomDebugStringConvertible { - public var description: String { + public var description: String { var str = "" for i in 0.. Bool) -> [String : [String]] in @@ -58,24 +58,24 @@ class PMOpDesc { } extension PMOpDesc: CustomStringConvertible, CustomDebugStringConvertible { - var description: String { - var str = "" - str += "op type: \(type): \n" - str += " op inputs: \n" - str += " \(inputs) \n" - str += " op para inputs: \n" - str += " \(paraInputs) \n" - str += " op para outputs: \n" - str += " \(outputs) \n" - str += " op attrs: \n" - str += " \(attrs) \n" + var description: String { + var str = "" + str += "op type: \(type): \n" + str += " op inputs: \n" + str += " \(inputs) \n" + str += " op para inputs: \n" + str += " \(paraInputs) \n" + str += " op para outputs: \n" + str += " \(outputs) \n" + str += " op attrs: \n" + str += " \(attrs) \n" + + return str + } + + var debugDescription: String { + return description + } + - return str - } - - var debugDescription: String { - return description - } - - } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/PMVarDesc.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/PMVarDesc.swift index 130e6f49fb..e97f448e29 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Program/PMVarDesc.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Program/PMVarDesc.swift @@ -79,7 +79,7 @@ public class PMVarDesc { } extension PMVarDesc: CustomStringConvertible, CustomDebugStringConvertible { - public var description: String { + public var description: String { var str = "" str += "var name \(name): \n" if let inTensorDesc = tensorDesc { @@ -93,7 +93,7 @@ extension PMVarDesc: CustomStringConvertible, CustomDebugStringConvertible { return str } - public var debugDescription: String { + public var debugDescription: String { return description } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift index dcb065de3d..e4248b6409 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift @@ -15,286 +15,286 @@ import Foundation precedencegroup ChainNode { - associativity: left - higherThan: MultiplicationPrecedence + associativity: left + higherThan: MultiplicationPrecedence } infix operator --> : ChainNode class Node { - var inputs: [Node] = [] - var outputs: [Node] = [] - var type: String - var opDesc: PMOpDesc? - init(inOpDesc: PMOpDesc) { - type = inOpDesc.type - opDesc = inOpDesc - } - - init(inType: String) { - type = inType - } - - subscript(index: Int) -> [Node] { - var nodes: [Node] = [] - getNodesWithLocation(index: index, nowIndex: 0, nodes: &nodes) - return nodes - } - - func getNodesWithLocation(index: Int, nowIndex: Int, nodes: inout [Node]) { - if index == nowIndex { - nodes.append(self) + var inputs: [Node] = [] + var outputs: [Node] = [] + var type: String + var opDesc: PMOpDesc? + init(inOpDesc: PMOpDesc) { + type = inOpDesc.type + opDesc = inOpDesc } - for output in outputs { - output.getNodesWithLocation(index: index, nowIndex: nowIndex + 1, nodes: &nodes) + init(inType: String) { + type = inType } - } - - static func -->(lNode: Node, rNode: Node) -> Node { - lNode.outputs.append(rNode) - rNode.inputs.append(lNode) - return rNode - } - - func depth(begin: UInt = 1) -> UInt { - var beginMax: UInt = 1 - for output in outputs { - let subDepth = output.depth(begin: begin + 1) - beginMax = max(begin, subDepth) - } - beginMax = max(begin, beginMax) - return beginMax - } - - func to(depth: UInt) -> Node { - let beginNode = Node.init(inType: type) - beginNode.opDesc = opDesc - to(depth: depth - 1, withNode: beginNode) - return beginNode - } - - func folderWith(fusion: Fusion.Type, removedNodes: inout [Node]) { - let fusionNode = fusion.fusionNode() - let change = fusion.change() - let inOutputs = outputs - outputs.removeAll() - opDesc?.outputs.removeAll() - for i in 0.. [Node] { + var nodes: [Node] = [] + getNodesWithLocation(index: index, nowIndex: 0, nodes: &nodes) + return nodes } - opDesc?.type = fusion.fusionType() - type = fusion.fusionType() - } - - private func folderWith(beginNode: Node, matchNode: Node, change: [String : [(from: String, to: String)]], removedNodes: inout [Node]) { - guard let inOpdesc = opDesc else { - fatalError() + + func getNodesWithLocation(index: Int, nowIndex: Int, nodes: inout [Node]) { + if index == nowIndex { + nodes.append(self) + } + + for output in outputs { + output.getNodesWithLocation(index: index, nowIndex: nowIndex + 1, nodes: &nodes) + } } - for attr in inOpdesc.attrs { - beginNode.opDesc?.attrs[attr.key] = attr.value - // print(beginNode.opDesc?.attrs) + static func -->(lNode: Node, rNode: Node) -> Node { + lNode.outputs.append(rNode) + rNode.inputs.append(lNode) + return rNode } - for paraInput in inOpdesc.paraInputs { - if let inChanges = change[type] { - for keyChange in inChanges { - if keyChange.from == paraInput.key { - beginNode.opDesc?.paraInputs[keyChange.to] = paraInput.value - } else { - beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value - } + func depth(begin: UInt = 1) -> UInt { + var beginMax: UInt = 1 + for output in outputs { + let subDepth = output.depth(begin: begin + 1) + beginMax = max(begin, subDepth) } - } else { - beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value - } + beginMax = max(begin, beginMax) + return beginMax } - if matchNode.outputs.count == 0 { - beginNode.outputs.append(contentsOf: outputs) - beginNode.opDesc?.outputs = inOpdesc.outputs - + func to(depth: UInt) -> Node { + let beginNode = Node.init(inType: type) + beginNode.opDesc = opDesc + to(depth: depth - 1, withNode: beginNode) + return beginNode } - removedNodes.append(self) - for i in 0.. [String : Node]{ - var map: [String : Node] = [:] - relationship(map: &map) - return map - } - - private func relationship(map: inout [String : Node]) { - guard let inOpDesc = opDesc else { - return + private func to(depth: UInt, withNode: Node) { + if depth < 1 { + return + } + + for output in outputs { + let node = Node.init(inType: output.type) + node.opDesc = output.opDesc + withNode.outputs.append(node) + output.to(depth: depth - 1, withNode: node) + } } - for output in inOpDesc.outputs { - for outputKey in output.value { - map[outputKey] = self - } + func relationship() -> [String : Node]{ + var map: [String : Node] = [:] + relationship(map: &map) + return map } - for output in outputs { - output.relationship(map: &map) + private func relationship(map: inout [String : Node]) { + guard let inOpDesc = opDesc else { + return + } + + for output in inOpDesc.outputs { + for outputKey in output.value { + map[outputKey] = self + } + } + + for output in outputs { + output.relationship(map: &map) + } } - } - + } extension Node: Equatable { - static func == (lhs: Node, rhs: Node) -> Bool { - if lhs.outputs.count != rhs.outputs.count { - return false - } - - if lhs.type != rhs.type { - return false + static func == (lhs: Node, rhs: Node) -> Bool { + if lhs.outputs.count != rhs.outputs.count { + return false + } + + if lhs.type != rhs.type { + return false + } + + for i in 0.. { - // register fusion - let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp

.self, -// ConvAddAddPreluOp

.self, - ConvAddPreluOp

.self, - ConvAddOp

.self, - ConvBNReluOp

.self, - DwConvBNReluOp

.self, - ElementwiseAddPreluOp

.self - ] - - func optimize(originProgramDesc: PMProgramDesc) -> PMProgramDesc { + // register fusion + let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp

.self, + // ConvAddAddPreluOp

.self, + ConvAddPreluOp

.self, + ConvAddOp

.self, + ConvBNReluOp

.self, + DwConvBNReluOp

.self, + ElementwiseAddPreluOp

.self + ] - guard originProgramDesc.blocks.count == 1 else { - fatalError(" not support yet") - } - - var mapForNodeChain: [String : Node] = [:] - var nodes: [Node] = [] - var typeMapNodes: [String : [(node: Node, output: [String : Node])]] = [:] - let block = originProgramDesc.blocks[0] - for opDesc in block.ops { - print(opDesc.type) - guard let opInputKeys = opInfos[opDesc.type]?.inputs, let outputKeys = opInfos[opDesc.type]?.outputs else { - fatalError() - } - - let node = Node.init(inOpDesc: opDesc) - for inputKey in opInputKeys { - if let inputs = opDesc.inputs[inputKey] { - for input in inputs { - if let inputNode = mapForNodeChain[input] { - _ = inputNode --> node - } - } + func optimize(originProgramDesc: PMProgramDesc) -> PMProgramDesc { + + guard originProgramDesc.blocks.count == 1 else { + fatalError(" not support yet") } - } - - for outputKey in outputKeys { - if let outputs = opDesc.outputs[outputKey] { - for output in outputs { - mapForNodeChain[output] = node - } - } - } - - nodes.append(node) - - if var inNodes = typeMapNodes[opDesc.type] { - inNodes.append((node, mapForNodeChain)) - typeMapNodes[opDesc.type] = inNodes - } else { - typeMapNodes[opDesc.type] = [(node, mapForNodeChain)] - } - } - - for fusion in fusionOps { - let fusionNode = fusion.fusionNode() - let depth = fusionNode.depth() - if let toMatchNodes = typeMapNodes[fusionNode.type] { - for node in toMatchNodes { - - let toNode = node.node.to(depth: depth) - if toNode == fusionNode { // match - var canFolder = true - let relationshipMap = toNode.relationship() + + var mapForNodeChain: [String : Node] = [:] + var nodes: [Node] = [] + var typeMapNodes: [String : [(node: Node, output: [String : Node])]] = [:] + let block = originProgramDesc.blocks[0] + for opDesc in block.ops { + print(opDesc.type) + guard let opInputKeys = opInfos[opDesc.type]?.inputs, let outputKeys = opInfos[opDesc.type]?.outputs else { + fatalError() + } - for toCheck in fusion.needCheck() { - // let nodes = toCheck - let checkNodes = toNode[toCheck.0] - - for checkNode in checkNodes { - let inputToChecks = checkNode.opDesc?.inputs[toCheck.1] ?? [] - for inputToCheck in inputToChecks { - if node.output[inputToCheck] == nil { - if relationshipMap[inputToCheck] == nil { - canFolder = false + let node = Node.init(inOpDesc: opDesc) + for inputKey in opInputKeys { + if let inputs = opDesc.inputs[inputKey] { + for input in inputs { + if let inputNode = mapForNodeChain[input] { + _ = inputNode --> node + } } - } } - - let paramInputToChecks = checkNode.opDesc?.paraInputs[toCheck.1] ?? [] - for paramInputToCheck in paramInputToChecks { - if node.output[paramInputToCheck] == nil { - if relationshipMap[paramInputToCheck] == nil { - canFolder = false + } + + for outputKey in outputKeys { + if let outputs = opDesc.outputs[outputKey] { + for output in outputs { + mapForNodeChain[output] = node } - } } - } } - if !canFolder { - continue - } + nodes.append(node) - var removeNodes: [Node] = [] - node.node.folderWith(fusion: fusion, removedNodes: &removeNodes) - for removeNode in removeNodes { - nodes.remove(element: removeNode) + if var inNodes = typeMapNodes[opDesc.type] { + inNodes.append((node, mapForNodeChain)) + typeMapNodes[opDesc.type] = inNodes + } else { + typeMapNodes[opDesc.type] = [(node, mapForNodeChain)] } - } } - } - } - - var ops: [PMOpDesc] = [] - for node in nodes { - ops.append(node.opDesc!) + + for fusion in fusionOps { + let fusionNode = fusion.fusionNode() + let depth = fusionNode.depth() + if let toMatchNodes = typeMapNodes[fusionNode.type] { + for node in toMatchNodes { + + let toNode = node.node.to(depth: depth) + if toNode == fusionNode { // match + var canFolder = true + let relationshipMap = toNode.relationship() + + for toCheck in fusion.needCheck() { + // let nodes = toCheck + let checkNodes = toNode[toCheck.0] + + for checkNode in checkNodes { + let inputToChecks = checkNode.opDesc?.inputs[toCheck.1] ?? [] + for inputToCheck in inputToChecks { + if node.output[inputToCheck] == nil { + if relationshipMap[inputToCheck] == nil { + canFolder = false + } + } + } + + let paramInputToChecks = checkNode.opDesc?.paraInputs[toCheck.1] ?? [] + for paramInputToCheck in paramInputToChecks { + if node.output[paramInputToCheck] == nil { + if relationshipMap[paramInputToCheck] == nil { + canFolder = false + } + } + } + } + } + + if !canFolder { + continue + } + + var removeNodes: [Node] = [] + node.node.folderWith(fusion: fusion, removedNodes: &removeNodes) + for removeNode in removeNodes { + nodes.remove(element: removeNode) + } + } + } + } + } + + var ops: [PMOpDesc] = [] + for node in nodes { + ops.append(node.opDesc!) + } + + let newProgramDesc = PMProgramDesc.init() + let newBlock = PMBlockDesc.init(inVars: block.vars, inOps: ops) + newProgramDesc.blocks.append(newBlock) + return newProgramDesc } - - let newProgramDesc = PMProgramDesc.init() - let newBlock = PMBlockDesc.init(inVars: block.vars, inOps: ops) - newProgramDesc.blocks.append(newBlock) - return newProgramDesc - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/Scope.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/Scope.swift index d73eefd096..478867b08c 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Program/Scope.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Program/Scope.swift @@ -48,7 +48,7 @@ public class Scope { } } - + func clear(){ vars.removeAll() } -- GitLab